[compare-debug] use call loc for nop_endbr
gcc/config/i386/i386.c  (official-gcc.git)
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 #include "x86-tune-costs.h"
97 static rtx legitimize_dllimport_symbol (rtx, bool);
98 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
99 static rtx legitimize_pe_coff_symbol (rtx, bool);
100 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
101 static bool ix86_save_reg (unsigned int, bool, bool);
102 static bool ix86_function_naked (const_tree);
103 static bool ix86_notrack_prefixed_insn_p (rtx);
104 static void ix86_emit_restore_reg_using_pop (rtx);
107 #ifndef CHECK_STACK_LIMIT
108 #define CHECK_STACK_LIMIT (-1)
109 #endif
111 /* Return index of given mode in mult and division cost tables. */
112 #define MODE_INDEX(mode) \
113 ((mode) == QImode ? 0 \
114 : (mode) == HImode ? 1 \
115 : (mode) == SImode ? 2 \
116 : (mode) == DImode ? 3 \
117 : 4)
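/* For example, MODE_INDEX (SImode) yields 2 and selects the SImode row of
   the mult and division cost tables; any mode wider than DImode falls back
   to the catch-all index 4.  */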
120 /* Set by -mtune. */
121 const struct processor_costs *ix86_tune_cost = NULL;
123 /* Set by -mtune or -Os. */
124 const struct processor_costs *ix86_cost = NULL;
126 /* Processor feature/optimization bitmasks. */
127 #define m_386 (1U<<PROCESSOR_I386)
128 #define m_486 (1U<<PROCESSOR_I486)
129 #define m_PENT (1U<<PROCESSOR_PENTIUM)
130 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
131 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
132 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
133 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
134 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
135 #define m_CORE2 (1U<<PROCESSOR_CORE2)
136 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
137 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
138 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
139 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
140 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
141 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
142 #define m_KNL (1U<<PROCESSOR_KNL)
143 #define m_KNM (1U<<PROCESSOR_KNM)
144 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
145 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
146 #define m_INTEL (1U<<PROCESSOR_INTEL)
148 #define m_GEODE (1U<<PROCESSOR_GEODE)
149 #define m_K6 (1U<<PROCESSOR_K6)
150 #define m_K6_GEODE (m_K6 | m_GEODE)
151 #define m_K8 (1U<<PROCESSOR_K8)
152 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
153 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
154 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
155 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
156 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
157 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
158 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
159 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
160 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
161 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
162 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
163 #define m_BTVER (m_BTVER1 | m_BTVER2)
164 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
165 | m_ZNVER1)
167 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
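/* Each m_* value above is a single-bit processor mask.  The tuning
   selectors in x86-tune.def and the initial_ix86_arch_features entries
   below combine them with | and ~; for example ~(m_386 | m_486) selects
   every processor except the 80386 and 80486.  */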
169 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
170 #undef DEF_TUNE
171 #define DEF_TUNE(tune, name, selector) name,
172 #include "x86-tune.def"
173 #undef DEF_TUNE
176 /* Feature tests against the various tunings. */
177 unsigned char ix86_tune_features[X86_TUNE_LAST];
179 /* Feature tests against the various tunings used to create ix86_tune_features
180 based on the processor mask. */
181 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
182 #undef DEF_TUNE
183 #define DEF_TUNE(tune, name, selector) selector,
184 #include "x86-tune.def"
185 #undef DEF_TUNE
188 /* Feature tests against the various architecture variations. */
189 unsigned char ix86_arch_features[X86_ARCH_LAST];
191 /* Feature tests against the various architecture variations, used to create
192 ix86_arch_features based on the processor mask. */
193 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
194 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
195 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
197 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
198 ~m_386,
200 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
201 ~(m_386 | m_486),
203 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
204 ~m_386,
206 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
207 ~m_386,
 210 /* In case the average insn count for a single function invocation is
211 lower than this constant, emit fast (but longer) prologue and
212 epilogue code. */
213 #define FAST_PROLOGUE_INSN_COUNT 20
215 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
216 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
217 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
218 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
220 /* Array of the smallest class containing reg number REGNO, indexed by
221 REGNO. Used by REGNO_REG_CLASS in i386.h. */
223 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
225 /* ax, dx, cx, bx */
226 AREG, DREG, CREG, BREG,
227 /* si, di, bp, sp */
228 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
229 /* FP registers */
230 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
231 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
232 /* arg pointer */
233 NON_Q_REGS,
234 /* flags, fpsr, fpcr, frame */
235 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
236 /* SSE registers */
237 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
238 SSE_REGS, SSE_REGS,
239 /* MMX registers */
240 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
241 MMX_REGS, MMX_REGS,
242 /* REX registers */
243 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
244 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
245 /* SSE REX registers */
246 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
247 SSE_REGS, SSE_REGS,
248 /* AVX-512 SSE registers */
249 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
250 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 /* Mask registers. */
254 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
255 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
256 /* MPX bound registers */
257 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
260 /* The "default" register map used in 32bit mode. */
262 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
264 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
265 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
266 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
267 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
268 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
269 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
270 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
272 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
273 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
274 101, 102, 103, 104, /* bound registers */
277 /* The "default" register map used in 64bit mode. */
279 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
281 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
282 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
283 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
284 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
285 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
286 8,9,10,11,12,13,14,15, /* extended integer registers */
287 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
288 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
289 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
290 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
291 126, 127, 128, 129, /* bound registers */
294 /* Define the register numbers to be used in Dwarf debugging information.
295 The SVR4 reference port C compiler uses the following register numbers
296 in its Dwarf output code:
297 0 for %eax (gcc regno = 0)
298 1 for %ecx (gcc regno = 2)
299 2 for %edx (gcc regno = 1)
300 3 for %ebx (gcc regno = 3)
301 4 for %esp (gcc regno = 7)
302 5 for %ebp (gcc regno = 6)
303 6 for %esi (gcc regno = 4)
304 7 for %edi (gcc regno = 5)
305 The following three DWARF register numbers are never generated by
306 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
 307 believes these numbers have these meanings.
308 8 for %eip (no gcc equivalent)
309 9 for %eflags (gcc regno = 17)
310 10 for %trapno (no gcc equivalent)
311 It is not at all clear how we should number the FP stack registers
312 for the x86 architecture. If the version of SDB on x86/svr4 were
313 a bit less brain dead with respect to floating-point then we would
314 have a precedent to follow with respect to DWARF register numbers
315 for x86 FP registers, but the SDB on x86/svr4 was so completely
316 broken with respect to FP registers that it is hardly worth thinking
317 of it as something to strive for compatibility with.
318 The version of x86/svr4 SDB I had does (partially)
319 seem to believe that DWARF register number 11 is associated with
320 the x86 register %st(0), but that's about all. Higher DWARF
321 register numbers don't seem to be associated with anything in
322 particular, and even for DWARF regno 11, SDB only seemed to under-
323 stand that it should say that a variable lives in %st(0) (when
324 asked via an `=' command) if we said it was in DWARF regno 11,
325 but SDB still printed garbage when asked for the value of the
326 variable in question (via a `/' command).
327 (Also note that the labels SDB printed for various FP stack regs
328 when doing an `x' command were all wrong.)
329 Note that these problems generally don't affect the native SVR4
330 C compiler because it doesn't allow the use of -O with -g and
331 because when it is *not* optimizing, it allocates a memory
332 location for each floating-point variable, and the memory
333 location is what gets described in the DWARF AT_location
334 attribute for the variable in question.
335 Regardless of the severe mental illness of the x86/svr4 SDB, we
336 do something sensible here and we use the following DWARF
337 register numbers. Note that these are all stack-top-relative
338 numbers.
339 11 for %st(0) (gcc regno = 8)
340 12 for %st(1) (gcc regno = 9)
341 13 for %st(2) (gcc regno = 10)
342 14 for %st(3) (gcc regno = 11)
343 15 for %st(4) (gcc regno = 12)
344 16 for %st(5) (gcc regno = 13)
345 17 for %st(6) (gcc regno = 14)
346 18 for %st(7) (gcc regno = 15)
348 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
350 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
351 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
352 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
353 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
354 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
355 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
356 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
358 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
359 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
360 101, 102, 103, 104, /* bound registers */
363 /* Define parameter passing and return registers. */
365 static int const x86_64_int_parameter_registers[6] =
367 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
370 static int const x86_64_ms_abi_int_parameter_registers[4] =
372 CX_REG, DX_REG, R8_REG, R9_REG
375 static int const x86_64_int_return_registers[4] =
377 AX_REG, DX_REG, DI_REG, SI_REG
380 /* Additional registers that are clobbered by SYSV calls. */
382 #define NUM_X86_64_MS_CLOBBERED_REGS 12
383 static int const x86_64_ms_sysv_extra_clobbered_registers
384 [NUM_X86_64_MS_CLOBBERED_REGS] =
386 SI_REG, DI_REG,
387 XMM6_REG, XMM7_REG,
388 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
389 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
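/* These are the registers that the Microsoft x64 ABI treats as
   call-preserved but that the SysV ABI clobbers, so an ms_abi function
   calling a sysv_abi function must save and restore them; the out-of-line
   savms64/resms64 stubs defined below are used for that purpose.  */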
392 enum xlogue_stub {
393 XLOGUE_STUB_SAVE,
394 XLOGUE_STUB_RESTORE,
395 XLOGUE_STUB_RESTORE_TAIL,
396 XLOGUE_STUB_SAVE_HFP,
397 XLOGUE_STUB_RESTORE_HFP,
398 XLOGUE_STUB_RESTORE_HFP_TAIL,
400 XLOGUE_STUB_COUNT
403 enum xlogue_stub_sets {
404 XLOGUE_SET_ALIGNED,
405 XLOGUE_SET_ALIGNED_PLUS_8,
406 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
407 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
409 XLOGUE_SET_COUNT
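/* The four stub sets correspond one-to-one to the s_instances layouts
   constructed below: incoming stack offset 0 or 8 from 16-byte alignment,
   each with or without a hard frame pointer (see
   xlogue_layout::get_instance).  */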
412 /* Register save/restore layout used by out-of-line stubs. */
413 class xlogue_layout {
414 public:
415 struct reginfo
417 unsigned regno;
418 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
419 rsi) to where each register is stored. */
422 unsigned get_nregs () const {return m_nregs;}
423 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
425 const reginfo &get_reginfo (unsigned reg) const
427 gcc_assert (reg < m_nregs);
428 return m_regs[reg];
431 static const char *get_stub_name (enum xlogue_stub stub,
432 unsigned n_extra_args);
434 /* Returns an rtx for the stub's symbol based upon
435 1.) the specified stub (save, restore or restore_ret) and
436 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
 437 3.) whether or not stack alignment is being performed. */
438 static rtx get_stub_rtx (enum xlogue_stub stub);
440 /* Returns the amount of stack space (including padding) that the stub
441 needs to store registers based upon data in the machine_function. */
442 HOST_WIDE_INT get_stack_space_used () const
444 const struct machine_function *m = cfun->machine;
445 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
447 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
448 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
451 /* Returns the offset for the base pointer used by the stub. */
452 HOST_WIDE_INT get_stub_ptr_offset () const
454 return STUB_INDEX_OFFSET + m_stack_align_off_in;
457 static const struct xlogue_layout &get_instance ();
458 static unsigned count_stub_managed_regs ();
459 static bool is_stub_managed_reg (unsigned regno, unsigned count);
461 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
462 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
463 static const unsigned MAX_REGS = 18;
464 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
465 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
466 static const unsigned STUB_NAME_MAX_LEN = 20;
467 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
468 static const unsigned REG_ORDER[MAX_REGS];
469 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
471 private:
472 xlogue_layout ();
473 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
474 xlogue_layout (const xlogue_layout &);
476 /* True if hard frame pointer is used. */
477 bool m_hfp;
 479 /* Max number of registers this layout manages. */
480 unsigned m_nregs;
482 /* Incoming offset from 16-byte alignment. */
483 HOST_WIDE_INT m_stack_align_off_in;
485 /* Register order and offsets. */
486 struct reginfo m_regs[MAX_REGS];
488 /* Lazy-inited cache of symbol names for stubs. */
489 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
490 [STUB_NAME_MAX_LEN];
492 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
495 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
496 "savms64",
497 "resms64",
498 "resms64x",
499 "savms64f",
500 "resms64f",
501 "resms64fx"
504 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
505 /* The below offset values are where each register is stored for the layout
506 relative to incoming stack pointer. The value of each m_regs[].offset will
507 be relative to the incoming base pointer (rax or rsi) used by the stub.
509 s_instances: 0 1 2 3
510 Offset: realigned or aligned + 8
511 Register aligned aligned + 8 aligned w/HFP w/HFP */
512 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
513 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
514 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
515 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
516 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
517 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
518 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
519 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
520 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
521 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
522 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
523 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
524 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
525 BP_REG, /* 0xc0 0xc8 N/A N/A */
526 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
527 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
528 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
529 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
532 /* Instantiate static const values. */
533 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
534 const unsigned xlogue_layout::MIN_REGS;
535 const unsigned xlogue_layout::MAX_REGS;
536 const unsigned xlogue_layout::MAX_EXTRA_REGS;
537 const unsigned xlogue_layout::VARIANT_COUNT;
538 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
540 /* Initialize xlogue_layout::s_stub_names to zero. */
541 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
542 [STUB_NAME_MAX_LEN];
544 /* Instantiates all xlogue_layout instances. */
545 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
546 xlogue_layout (0, false),
547 xlogue_layout (8, false),
548 xlogue_layout (0, true),
549 xlogue_layout (8, true)
552 /* Return an appropriate const instance of xlogue_layout based upon values
553 in cfun->machine and crtl. */
554 const struct xlogue_layout &
555 xlogue_layout::get_instance ()
557 enum xlogue_stub_sets stub_set;
558 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
560 if (stack_realign_fp)
561 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
562 else if (frame_pointer_needed)
563 stub_set = aligned_plus_8
564 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
565 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
566 else
567 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
569 return s_instances[stub_set];
572 /* Determine how many clobbered registers can be saved by the stub.
573 Returns the count of registers the stub will save and restore. */
574 unsigned
575 xlogue_layout::count_stub_managed_regs ()
577 bool hfp = frame_pointer_needed || stack_realign_fp;
578 unsigned i, count;
579 unsigned regno;
581 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
583 regno = REG_ORDER[i];
584 if (regno == BP_REG && hfp)
585 continue;
586 if (!ix86_save_reg (regno, false, false))
587 break;
588 ++count;
590 return count;
593 /* Determine if register REGNO is a stub managed register given the
594 total COUNT of stub managed registers. */
595 bool
596 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
598 bool hfp = frame_pointer_needed || stack_realign_fp;
599 unsigned i;
601 for (i = 0; i < count; ++i)
603 gcc_assert (i < MAX_REGS);
604 if (REG_ORDER[i] == BP_REG && hfp)
605 ++count;
606 else if (REG_ORDER[i] == regno)
607 return true;
609 return false;
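/* Note that when a hard frame pointer is used, BP_REG is not stub-managed
   (count_stub_managed_regs skips it), so the ++count above widens the scan
   window over REG_ORDER to keep it covering COUNT managed registers.  */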
612 /* Constructor for xlogue_layout. */
613 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
614 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
615 m_stack_align_off_in (stack_align_off_in)
617 HOST_WIDE_INT offset = stack_align_off_in;
618 unsigned i, j;
620 for (i = j = 0; i < MAX_REGS; ++i)
622 unsigned regno = REG_ORDER[i];
624 if (regno == BP_REG && hfp)
625 continue;
626 if (SSE_REGNO_P (regno))
628 offset += 16;
629 /* Verify that SSE regs are always aligned. */
630 gcc_assert (!((stack_align_off_in + offset) & 15));
632 else
633 offset += 8;
635 m_regs[j].regno = regno;
636 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
638 gcc_assert (j == m_nregs);
641 const char *
642 xlogue_layout::get_stub_name (enum xlogue_stub stub,
643 unsigned n_extra_regs)
645 const int have_avx = TARGET_AVX;
646 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
648 /* Lazy init */
649 if (!*name)
651 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
652 (have_avx ? "avx" : "sse"),
653 STUB_BASE_NAMES[stub],
654 MIN_REGS + n_extra_regs);
655 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
658 return name;
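/* With the format above, the AVX variant of the plain save stub for the
   minimal 12-register set is named "__avx_savms64_12", while, e.g., the
   SSE tail-call restore stub with two extra registers becomes
   "__sse_resms64x_14".  */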
661 /* Return rtx of a symbol ref for the entry point (based upon
662 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
 663 rtx
 664 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
666 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
667 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
668 gcc_assert (stub < XLOGUE_STUB_COUNT);
669 gcc_assert (crtl->stack_realign_finalized);
671 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
674 /* Define the structure for the machine field in struct function. */
676 struct GTY(()) stack_local_entry {
677 unsigned short mode;
678 unsigned short n;
679 rtx rtl;
680 struct stack_local_entry *next;
683 /* Which cpu are we scheduling for. */
684 enum attr_cpu ix86_schedule;
686 /* Which cpu are we optimizing for. */
687 enum processor_type ix86_tune;
689 /* Which instruction set architecture to use. */
690 enum processor_type ix86_arch;
692 /* True if processor has SSE prefetch instruction. */
693 unsigned char x86_prefetch_sse;
695 /* -mstackrealign option */
696 static const char ix86_force_align_arg_pointer_string[]
697 = "force_align_arg_pointer";
699 static rtx (*ix86_gen_leave) (void);
700 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
701 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
702 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
703 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
704 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_clzero) (rtx);
707 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
709 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
714 /* Preferred alignment for stack boundary in bits. */
715 unsigned int ix86_preferred_stack_boundary;
717 /* Alignment for incoming stack boundary in bits specified at
718 command line. */
719 static unsigned int ix86_user_incoming_stack_boundary;
721 /* Default alignment for incoming stack boundary in bits. */
722 static unsigned int ix86_default_incoming_stack_boundary;
724 /* Alignment for incoming stack boundary in bits. */
725 unsigned int ix86_incoming_stack_boundary;
727 /* Calling abi specific va_list type nodes. */
728 static GTY(()) tree sysv_va_list_type_node;
729 static GTY(()) tree ms_va_list_type_node;
731 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
732 char internal_label_prefix[16];
733 int internal_label_prefix_len;
735 /* Fence to use after loop using movnt. */
736 tree x86_mfence;
738 /* Register class used for passing given 64bit part of the argument.
739 These represent classes as documented by the PS ABI, with the exception
 740 of the SSESF and SSEDF classes, which are basically the SSE class; GCC just
 741 uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
743 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
744 whenever possible (upper half does contain padding). */
745 enum x86_64_reg_class
747 X86_64_NO_CLASS,
748 X86_64_INTEGER_CLASS,
749 X86_64_INTEGERSI_CLASS,
750 X86_64_SSE_CLASS,
751 X86_64_SSESF_CLASS,
752 X86_64_SSEDF_CLASS,
753 X86_64_SSEUP_CLASS,
754 X86_64_X87_CLASS,
755 X86_64_X87UP_CLASS,
756 X86_64_COMPLEX_X87_CLASS,
757 X86_64_MEMORY_CLASS
760 #define MAX_CLASSES 8
762 /* Table of constants used by fldpi, fldln2, etc.... */
763 static REAL_VALUE_TYPE ext_80387_constants_table [5];
764 static bool ext_80387_constants_init;
767 static struct machine_function * ix86_init_machine_status (void);
768 static rtx ix86_function_value (const_tree, const_tree, bool);
769 static bool ix86_function_value_regno_p (const unsigned int);
770 static unsigned int ix86_function_arg_boundary (machine_mode,
771 const_tree);
772 static rtx ix86_static_chain (const_tree, bool);
773 static int ix86_function_regparm (const_tree, const_tree);
774 static void ix86_compute_frame_layout (void);
775 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
776 rtx, rtx, int);
777 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
778 static tree ix86_canonical_va_list_type (tree);
779 static void predict_jump (int);
780 static unsigned int split_stack_prologue_scratch_regno (void);
781 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
783 enum ix86_function_specific_strings
785 IX86_FUNCTION_SPECIFIC_ARCH,
786 IX86_FUNCTION_SPECIFIC_TUNE,
787 IX86_FUNCTION_SPECIFIC_MAX
790 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
791 const char *, const char *, enum fpmath_unit,
792 bool);
793 static void ix86_function_specific_save (struct cl_target_option *,
794 struct gcc_options *opts);
795 static void ix86_function_specific_restore (struct gcc_options *opts,
796 struct cl_target_option *);
797 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
798 static void ix86_function_specific_print (FILE *, int,
799 struct cl_target_option *);
800 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
801 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
802 struct gcc_options *,
803 struct gcc_options *,
804 struct gcc_options *);
805 static bool ix86_can_inline_p (tree, tree);
806 static void ix86_set_current_function (tree);
807 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
809 static enum calling_abi ix86_function_abi (const_tree);
812 #ifndef SUBTARGET32_DEFAULT_CPU
813 #define SUBTARGET32_DEFAULT_CPU "i386"
814 #endif
816 /* Whether -mtune= or -march= were specified */
817 static int ix86_tune_defaulted;
818 static int ix86_arch_specified;
820 /* Vectorization library interface and handlers. */
821 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
823 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
824 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
826 /* Processor target table, indexed by processor number */
827 struct ptt
829 const char *const name; /* processor name */
830 const struct processor_costs *cost; /* Processor costs */
831 const int align_loop; /* Default alignments. */
832 const int align_loop_max_skip;
833 const int align_jump;
834 const int align_jump_max_skip;
835 const int align_func;
838 /* This table must be in sync with enum processor_type in i386.h. */
839 static const struct ptt processor_target_table[PROCESSOR_max] =
841 {"generic", &generic_cost, 16, 10, 16, 10, 16},
842 {"i386", &i386_cost, 4, 3, 4, 3, 4},
843 {"i486", &i486_cost, 16, 15, 16, 15, 16},
844 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
845 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
846 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
847 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
848 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
849 {"core2", &core_cost, 16, 10, 16, 10, 16},
850 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
851 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
852 {"haswell", &core_cost, 16, 10, 16, 10, 16},
853 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
854 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
855 {"knl", &slm_cost, 16, 15, 16, 7, 16},
856 {"knm", &slm_cost, 16, 15, 16, 7, 16},
857 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
858 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
859 {"intel", &intel_cost, 16, 15, 16, 7, 16},
860 {"geode", &geode_cost, 0, 0, 0, 0, 0},
861 {"k6", &k6_cost, 32, 7, 32, 7, 32},
862 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
863 {"k8", &k8_cost, 16, 7, 16, 7, 16},
864 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
865 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
866 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
867 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
868 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
869 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
870 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
871 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
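/* Each row supplies {name, cost table, align_loop, align_loop_max_skip,
   align_jump, align_jump_max_skip, align_func}, in the field order of
   struct ptt above.  */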
874 static unsigned int
875 rest_of_handle_insert_vzeroupper (void)
877 int i;
879 /* vzeroupper instructions are inserted immediately after reload to
880 account for possible spills from 256bit or 512bit registers. The pass
881 reuses mode switching infrastructure by re-running mode insertion
882 pass, so disable entities that have already been processed. */
883 for (i = 0; i < MAX_386_ENTITIES; i++)
884 ix86_optimize_mode_switching[i] = 0;
886 ix86_optimize_mode_switching[AVX_U128] = 1;
888 /* Call optimize_mode_switching. */
889 g->get_passes ()->execute_pass_mode_switching ();
890 return 0;
893 /* Return 1 if INSN uses or defines a hard register.
894 Hard register uses in a memory address are ignored.
895 Clobbers and flags definitions are ignored. */
897 static bool
898 has_non_address_hard_reg (rtx_insn *insn)
900 df_ref ref;
901 FOR_EACH_INSN_DEF (ref, insn)
902 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
903 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
904 && DF_REF_REGNO (ref) != FLAGS_REG)
905 return true;
907 FOR_EACH_INSN_USE (ref, insn)
908 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
909 return true;
911 return false;
914 /* Check if comparison INSN may be transformed
 915 into a vector comparison. Currently we transform
 916 only zero checks that look like:
918 (set (reg:CCZ 17 flags)
919 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
920 (subreg:SI (reg:DI x) 0))
921 (const_int 0 [0]))) */
923 static bool
924 convertible_comparison_p (rtx_insn *insn)
926 if (!TARGET_SSE4_1)
927 return false;
929 rtx def_set = single_set (insn);
931 gcc_assert (def_set);
933 rtx src = SET_SRC (def_set);
934 rtx dst = SET_DEST (def_set);
936 gcc_assert (GET_CODE (src) == COMPARE);
938 if (GET_CODE (dst) != REG
939 || REGNO (dst) != FLAGS_REG
940 || GET_MODE (dst) != CCZmode)
941 return false;
943 rtx op1 = XEXP (src, 0);
944 rtx op2 = XEXP (src, 1);
946 if (op2 != CONST0_RTX (GET_MODE (op2)))
947 return false;
949 if (GET_CODE (op1) != IOR)
950 return false;
952 op2 = XEXP (op1, 1);
953 op1 = XEXP (op1, 0);
955 if (!SUBREG_P (op1)
956 || !SUBREG_P (op2)
957 || GET_MODE (op1) != SImode
958 || GET_MODE (op2) != SImode
959 || ((SUBREG_BYTE (op1) != 0
960 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
961 && (SUBREG_BYTE (op2) != 0
962 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
963 return false;
965 op1 = SUBREG_REG (op1);
966 op2 = SUBREG_REG (op2);
968 if (op1 != op2
969 || !REG_P (op1)
970 || GET_MODE (op1) != DImode)
971 return false;
973 return true;
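/* Such a compare typically results from testing a 64-bit value for zero on
   a 32-bit target, where the two SImode halves of the DImode register are
   IORed together and compared against zero, as in the RTL shown above.  */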
976 /* The DImode version of scalar_to_vector_candidate_p. */
978 static bool
979 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
981 rtx def_set = single_set (insn);
983 if (!def_set)
984 return false;
986 if (has_non_address_hard_reg (insn))
987 return false;
989 rtx src = SET_SRC (def_set);
990 rtx dst = SET_DEST (def_set);
992 if (GET_CODE (src) == COMPARE)
993 return convertible_comparison_p (insn);
995 /* We are interested in DImode promotion only. */
996 if ((GET_MODE (src) != DImode
997 && !CONST_INT_P (src))
998 || GET_MODE (dst) != DImode)
999 return false;
1001 if (!REG_P (dst) && !MEM_P (dst))
1002 return false;
1004 switch (GET_CODE (src))
1006 case ASHIFTRT:
1007 if (!TARGET_AVX512VL)
1008 return false;
1009 /* FALLTHRU */
1011 case ASHIFT:
1012 case LSHIFTRT:
1013 if (!REG_P (XEXP (src, 1))
1014 && (!SUBREG_P (XEXP (src, 1))
1015 || SUBREG_BYTE (XEXP (src, 1)) != 0
1016 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1017 && (!CONST_INT_P (XEXP (src, 1))
1018 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1019 return false;
1021 if (GET_MODE (XEXP (src, 1)) != QImode
1022 && !CONST_INT_P (XEXP (src, 1)))
1023 return false;
1024 break;
1026 case PLUS:
1027 case MINUS:
1028 case IOR:
1029 case XOR:
1030 case AND:
1031 if (!REG_P (XEXP (src, 1))
1032 && !MEM_P (XEXP (src, 1))
1033 && !CONST_INT_P (XEXP (src, 1)))
1034 return false;
1036 if (GET_MODE (XEXP (src, 1)) != DImode
1037 && !CONST_INT_P (XEXP (src, 1)))
1038 return false;
1039 break;
1041 case NEG:
1042 case NOT:
1043 break;
1045 case REG:
1046 return true;
1048 case MEM:
1049 case CONST_INT:
1050 return REG_P (dst);
1052 default:
1053 return false;
1056 if (!REG_P (XEXP (src, 0))
1057 && !MEM_P (XEXP (src, 0))
1058 && !CONST_INT_P (XEXP (src, 0))
1059 /* Check for andnot case. */
1060 && (GET_CODE (src) != AND
1061 || GET_CODE (XEXP (src, 0)) != NOT
1062 || !REG_P (XEXP (XEXP (src, 0), 0))))
1063 return false;
1065 if (GET_MODE (XEXP (src, 0)) != DImode
1066 && !CONST_INT_P (XEXP (src, 0)))
1067 return false;
1069 return true;
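/* In summary, a DImode candidate is a single_set insn that moves, shifts,
   negates, complements or performs PLUS/MINUS/IOR/XOR/AND on DImode values
   using only registers, memory and suitable constants, so the chain pass
   can later rewrite it with V2DImode SSE operations.  */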
1072 /* The TImode version of scalar_to_vector_candidate_p. */
1074 static bool
1075 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1077 rtx def_set = single_set (insn);
1079 if (!def_set)
1080 return false;
1082 if (has_non_address_hard_reg (insn))
1083 return false;
1085 rtx src = SET_SRC (def_set);
1086 rtx dst = SET_DEST (def_set);
1088 /* Only TImode load and store are allowed. */
1089 if (GET_MODE (dst) != TImode)
1090 return false;
1092 if (MEM_P (dst))
 1094 /* Check for store. The memory must be aligned, or an unaligned store
 1095 must be optimal. Only support stores from a register, a standard SSE
1096 constant or CONST_WIDE_INT generated from piecewise store.
1098 ??? Verify performance impact before enabling CONST_INT for
1099 __int128 store. */
1100 if (misaligned_operand (dst, TImode)
1101 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1102 return false;
1104 switch (GET_CODE (src))
1106 default:
1107 return false;
1109 case REG:
1110 case CONST_WIDE_INT:
1111 return true;
1113 case CONST_INT:
1114 return standard_sse_constant_p (src, TImode);
1117 else if (MEM_P (src))
 1119 /* Check for load. The memory must be aligned, or an unaligned load
 1120 must be optimal. */
1121 return (REG_P (dst)
1122 && (!misaligned_operand (src, TImode)
1123 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1126 return false;
 1129 /* Return 1 if INSN may be converted into a vector
1130 instruction. */
1132 static bool
1133 scalar_to_vector_candidate_p (rtx_insn *insn)
1135 if (TARGET_64BIT)
1136 return timode_scalar_to_vector_candidate_p (insn);
1137 else
1138 return dimode_scalar_to_vector_candidate_p (insn);
1141 /* The DImode version of remove_non_convertible_regs. */
1143 static void
1144 dimode_remove_non_convertible_regs (bitmap candidates)
1146 bitmap_iterator bi;
1147 unsigned id;
1148 bitmap regs = BITMAP_ALLOC (NULL);
1150 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1152 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1153 rtx reg = SET_DEST (def_set);
1155 if (!REG_P (reg)
1156 || bitmap_bit_p (regs, REGNO (reg))
1157 || HARD_REGISTER_P (reg))
1158 continue;
1160 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1161 def;
1162 def = DF_REF_NEXT_REG (def))
1164 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1166 if (dump_file)
1167 fprintf (dump_file,
1168 "r%d has non convertible definition in insn %d\n",
1169 REGNO (reg), DF_REF_INSN_UID (def));
1171 bitmap_set_bit (regs, REGNO (reg));
1172 break;
1177 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1179 for (df_ref def = DF_REG_DEF_CHAIN (id);
1180 def;
1181 def = DF_REF_NEXT_REG (def))
1182 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1184 if (dump_file)
1185 fprintf (dump_file, "Removing insn %d from candidates list\n",
1186 DF_REF_INSN_UID (def));
1188 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1192 BITMAP_FREE (regs);
1195 /* For a register REGNO, scan instructions for its defs and uses.
1196 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1198 static void
1199 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1200 unsigned int regno)
1202 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1203 def;
1204 def = DF_REF_NEXT_REG (def))
1206 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1208 if (dump_file)
1209 fprintf (dump_file,
1210 "r%d has non convertible def in insn %d\n",
1211 regno, DF_REF_INSN_UID (def));
1213 bitmap_set_bit (regs, regno);
1214 break;
1218 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1219 ref;
1220 ref = DF_REF_NEXT_REG (ref))
1222 /* Debug instructions are skipped. */
1223 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1224 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1226 if (dump_file)
1227 fprintf (dump_file,
1228 "r%d has non convertible use in insn %d\n",
1229 regno, DF_REF_INSN_UID (ref));
1231 bitmap_set_bit (regs, regno);
1232 break;
1237 /* The TImode version of remove_non_convertible_regs. */
1239 static void
1240 timode_remove_non_convertible_regs (bitmap candidates)
1242 bitmap_iterator bi;
1243 unsigned id;
1244 bitmap regs = BITMAP_ALLOC (NULL);
1246 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1248 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1249 rtx dest = SET_DEST (def_set);
1250 rtx src = SET_SRC (def_set);
1252 if ((!REG_P (dest)
1253 || bitmap_bit_p (regs, REGNO (dest))
1254 || HARD_REGISTER_P (dest))
1255 && (!REG_P (src)
1256 || bitmap_bit_p (regs, REGNO (src))
1257 || HARD_REGISTER_P (src)))
1258 continue;
1260 if (REG_P (dest))
1261 timode_check_non_convertible_regs (candidates, regs,
1262 REGNO (dest));
1264 if (REG_P (src))
1265 timode_check_non_convertible_regs (candidates, regs,
1266 REGNO (src));
1269 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1271 for (df_ref def = DF_REG_DEF_CHAIN (id);
1272 def;
1273 def = DF_REF_NEXT_REG (def))
1274 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1276 if (dump_file)
1277 fprintf (dump_file, "Removing insn %d from candidates list\n",
1278 DF_REF_INSN_UID (def));
1280 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1283 for (df_ref ref = DF_REG_USE_CHAIN (id);
1284 ref;
1285 ref = DF_REF_NEXT_REG (ref))
1286 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1288 if (dump_file)
1289 fprintf (dump_file, "Removing insn %d from candidates list\n",
1290 DF_REF_INSN_UID (ref));
1292 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1296 BITMAP_FREE (regs);
 1299 /* For a given bitmap of insn UIDs, scan all instructions and
 1300 remove an insn from CANDIDATES if it has both convertible
 1301 and non-convertible definitions.
1303 All insns in a bitmap are conversion candidates according to
1304 scalar_to_vector_candidate_p. Currently it implies all insns
1305 are single_set. */
1307 static void
1308 remove_non_convertible_regs (bitmap candidates)
1310 if (TARGET_64BIT)
1311 timode_remove_non_convertible_regs (candidates);
1312 else
1313 dimode_remove_non_convertible_regs (candidates);
1316 class scalar_chain
1318 public:
1319 scalar_chain ();
1320 virtual ~scalar_chain ();
1322 static unsigned max_id;
1324 /* ID of a chain. */
1325 unsigned int chain_id;
1326 /* A queue of instructions to be included into a chain. */
1327 bitmap queue;
1328 /* Instructions included into a chain. */
1329 bitmap insns;
1330 /* All registers defined by a chain. */
1331 bitmap defs;
 1332 /* Registers used in both vector and scalar modes. */
1333 bitmap defs_conv;
1335 void build (bitmap candidates, unsigned insn_uid);
1336 virtual int compute_convert_gain () = 0;
1337 int convert ();
1339 protected:
1340 void add_to_queue (unsigned insn_uid);
1341 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1343 private:
1344 void add_insn (bitmap candidates, unsigned insn_uid);
1345 void analyze_register_chain (bitmap candidates, df_ref ref);
1346 virtual void mark_dual_mode_def (df_ref def) = 0;
1347 virtual void convert_insn (rtx_insn *insn) = 0;
1348 virtual void convert_registers () = 0;
1351 class dimode_scalar_chain : public scalar_chain
1353 public:
1354 int compute_convert_gain ();
1355 private:
1356 void mark_dual_mode_def (df_ref def);
1357 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1358 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1359 void convert_insn (rtx_insn *insn);
1360 void convert_op (rtx *op, rtx_insn *insn);
1361 void convert_reg (unsigned regno);
1362 void make_vector_copies (unsigned regno);
1363 void convert_registers ();
1364 int vector_const_cost (rtx exp);
1367 class timode_scalar_chain : public scalar_chain
1369 public:
 1370 /* Converting from TImode to V1TImode is always faster. */
1371 int compute_convert_gain () { return 1; }
1373 private:
1374 void mark_dual_mode_def (df_ref def);
1375 void fix_debug_reg_uses (rtx reg);
1376 void convert_insn (rtx_insn *insn);
 1377 /* We don't convert registers to a different size. */
1378 void convert_registers () {}
1381 unsigned scalar_chain::max_id = 0;
1383 /* Initialize new chain. */
1385 scalar_chain::scalar_chain ()
1387 chain_id = ++max_id;
1389 if (dump_file)
1390 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1392 bitmap_obstack_initialize (NULL);
1393 insns = BITMAP_ALLOC (NULL);
1394 defs = BITMAP_ALLOC (NULL);
1395 defs_conv = BITMAP_ALLOC (NULL);
1396 queue = NULL;
1399 /* Free chain's data. */
1401 scalar_chain::~scalar_chain ()
1403 BITMAP_FREE (insns);
1404 BITMAP_FREE (defs);
1405 BITMAP_FREE (defs_conv);
1406 bitmap_obstack_release (NULL);
1409 /* Add instruction into chains' queue. */
1411 void
1412 scalar_chain::add_to_queue (unsigned insn_uid)
1414 if (bitmap_bit_p (insns, insn_uid)
1415 || bitmap_bit_p (queue, insn_uid))
1416 return;
1418 if (dump_file)
1419 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1420 insn_uid, chain_id);
1421 bitmap_set_bit (queue, insn_uid);
1424 /* For DImode conversion, mark register defined by DEF as requiring
1425 conversion. */
1427 void
1428 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1430 gcc_assert (DF_REF_REG_DEF_P (def));
1432 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1433 return;
1435 if (dump_file)
1436 fprintf (dump_file,
1437 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1438 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1440 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1443 /* For TImode conversion, it is unused. */
1445 void
1446 timode_scalar_chain::mark_dual_mode_def (df_ref)
1448 gcc_unreachable ();
1451 /* Check REF's chain to add new insns into a queue
1452 and find registers requiring conversion. */
1454 void
1455 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1457 df_link *chain;
1459 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1460 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1461 add_to_queue (DF_REF_INSN_UID (ref));
1463 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1465 unsigned uid = DF_REF_INSN_UID (chain->ref);
1467 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1468 continue;
1470 if (!DF_REF_REG_MEM_P (chain->ref))
1472 if (bitmap_bit_p (insns, uid))
1473 continue;
1475 if (bitmap_bit_p (candidates, uid))
1477 add_to_queue (uid);
1478 continue;
1482 if (DF_REF_REG_DEF_P (chain->ref))
1484 if (dump_file)
1485 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1486 DF_REF_REGNO (chain->ref), uid);
1487 mark_dual_mode_def (chain->ref);
1489 else
1491 if (dump_file)
1492 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1493 DF_REF_REGNO (chain->ref), uid);
1494 mark_dual_mode_def (ref);
1499 /* Add instruction into a chain. */
1501 void
1502 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1504 if (bitmap_bit_p (insns, insn_uid))
1505 return;
1507 if (dump_file)
1508 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1510 bitmap_set_bit (insns, insn_uid);
1512 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1513 rtx def_set = single_set (insn);
1514 if (def_set && REG_P (SET_DEST (def_set))
1515 && !HARD_REGISTER_P (SET_DEST (def_set)))
1516 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1518 df_ref ref;
1519 df_ref def;
1520 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1521 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1522 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1523 def;
1524 def = DF_REF_NEXT_REG (def))
1525 analyze_register_chain (candidates, def);
1526 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1527 if (!DF_REF_REG_MEM_P (ref))
1528 analyze_register_chain (candidates, ref);
1531 /* Build new chain starting from insn INSN_UID recursively
1532 adding all dependent uses and definitions. */
1534 void
1535 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1537 queue = BITMAP_ALLOC (NULL);
1538 bitmap_set_bit (queue, insn_uid);
1540 if (dump_file)
1541 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1543 while (!bitmap_empty_p (queue))
1545 insn_uid = bitmap_first_set_bit (queue);
1546 bitmap_clear_bit (queue, insn_uid);
1547 bitmap_clear_bit (candidates, insn_uid);
1548 add_insn (candidates, insn_uid);
1551 if (dump_file)
1553 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1554 fprintf (dump_file, " insns: ");
1555 dump_bitmap (dump_file, insns);
1556 if (!bitmap_empty_p (defs_conv))
1558 bitmap_iterator bi;
1559 unsigned id;
1560 const char *comma = "";
1561 fprintf (dump_file, " defs to convert: ");
1562 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1564 fprintf (dump_file, "%sr%d", comma, id);
1565 comma = ", ";
1567 fprintf (dump_file, "\n");
1571 BITMAP_FREE (queue);
 1574 /* Return the cost of building a vector constant
1575 instead of using a scalar one. */
 1577 int
 1578 dimode_scalar_chain::vector_const_cost (rtx exp)
1580 gcc_assert (CONST_INT_P (exp));
1582 if (standard_sse_constant_p (exp, V2DImode))
1583 return COSTS_N_INSNS (1);
1584 return ix86_cost->sse_load[1];
1587 /* Compute a gain for chain conversion. */
 1589 int
 1590 dimode_scalar_chain::compute_convert_gain ()
1592 bitmap_iterator bi;
1593 unsigned insn_uid;
1594 int gain = 0;
1595 int cost = 0;
1597 if (dump_file)
1598 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1600 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1602 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1603 rtx def_set = single_set (insn);
1604 rtx src = SET_SRC (def_set);
1605 rtx dst = SET_DEST (def_set);
1607 if (REG_P (src) && REG_P (dst))
1608 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1609 else if (REG_P (src) && MEM_P (dst))
1610 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1611 else if (MEM_P (src) && REG_P (dst))
1612 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1613 else if (GET_CODE (src) == ASHIFT
1614 || GET_CODE (src) == ASHIFTRT
1615 || GET_CODE (src) == LSHIFTRT)
1617 if (CONST_INT_P (XEXP (src, 0)))
1618 gain -= vector_const_cost (XEXP (src, 0));
1619 if (CONST_INT_P (XEXP (src, 1)))
1621 gain += ix86_cost->shift_const;
1622 if (INTVAL (XEXP (src, 1)) >= 32)
1623 gain -= COSTS_N_INSNS (1);
1625 else
1626 /* Additional gain for omitting two CMOVs. */
1627 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1629 else if (GET_CODE (src) == PLUS
1630 || GET_CODE (src) == MINUS
1631 || GET_CODE (src) == IOR
1632 || GET_CODE (src) == XOR
1633 || GET_CODE (src) == AND)
1635 gain += ix86_cost->add;
1636 /* Additional gain for andnot for targets without BMI. */
1637 if (GET_CODE (XEXP (src, 0)) == NOT
1638 && !TARGET_BMI)
1639 gain += 2 * ix86_cost->add;
1641 if (CONST_INT_P (XEXP (src, 0)))
1642 gain -= vector_const_cost (XEXP (src, 0));
1643 if (CONST_INT_P (XEXP (src, 1)))
1644 gain -= vector_const_cost (XEXP (src, 1));
1646 else if (GET_CODE (src) == NEG
1647 || GET_CODE (src) == NOT)
1648 gain += ix86_cost->add - COSTS_N_INSNS (1);
1649 else if (GET_CODE (src) == COMPARE)
1651 /* Assume comparison cost is the same. */
1653 else if (CONST_INT_P (src))
1655 if (REG_P (dst))
1656 gain += COSTS_N_INSNS (2);
1657 else if (MEM_P (dst))
1658 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1659 gain -= vector_const_cost (src);
1661 else
1662 gcc_unreachable ();
1665 if (dump_file)
1666 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1668 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1669 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1671 if (dump_file)
1672 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1674 gain -= cost;
1676 if (dump_file)
1677 fprintf (dump_file, " Total gain: %d\n", gain);
1679 return gain;
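/* The resulting GAIN is the per-insn savings from using SSE instructions
   minus the cost of moving every dual-mode register between the vector and
   integer units (one mmxsse_to_integer penalty per definition); the chain
   is only worth converting when this total is positive.  */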
1682 /* Replace REG in X with a V2DI subreg of NEW_REG. */
 1684 rtx
 1685 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1687 if (x == reg)
1688 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1690 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1691 int i, j;
1692 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1694 if (fmt[i] == 'e')
1695 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1696 else if (fmt[i] == 'E')
1697 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1698 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1699 reg, new_reg);
1702 return x;
1705 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1707 void
1708 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1709 rtx reg, rtx new_reg)
1711 replace_with_subreg (single_set (insn), reg, new_reg);
1714 /* Insert generated conversion instruction sequence INSNS
1715 after instruction AFTER. New BB may be required in case
1716 instruction has EH region attached. */
1718 void
1719 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1721 if (!control_flow_insn_p (after))
1723 emit_insn_after (insns, after);
1724 return;
1727 basic_block bb = BLOCK_FOR_INSN (after);
1728 edge e = find_fallthru_edge (bb->succs);
1729 gcc_assert (e);
1731 basic_block new_bb = split_edge (e);
1732 emit_insn_after (insns, BB_HEAD (new_bb));
1735 /* Make vector copies for all register REGNO definitions
1736 and replace its uses in a chain. */
1738 void
1739 dimode_scalar_chain::make_vector_copies (unsigned regno)
1741 rtx reg = regno_reg_rtx[regno];
1742 rtx vreg = gen_reg_rtx (DImode);
1743 bool count_reg = false;
1744 df_ref ref;
1746 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1747 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1749 df_ref use;
1751 /* Detect the count register of a shift instruction. */
1752 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1753 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1755 rtx_insn *insn = DF_REF_INSN (use);
1756 rtx def_set = single_set (insn);
1758 gcc_assert (def_set);
1760 rtx src = SET_SRC (def_set);
1762 if ((GET_CODE (src) == ASHIFT
1763 || GET_CODE (src) == ASHIFTRT
1764 || GET_CODE (src) == LSHIFTRT)
1765 && !CONST_INT_P (XEXP (src, 1))
1766 && reg_or_subregno (XEXP (src, 1)) == regno)
1767 count_reg = true;
1770 start_sequence ();
1771 if (count_reg)
1773 rtx qreg = gen_lowpart (QImode, reg);
1774 rtx tmp = gen_reg_rtx (SImode);
1776 if (TARGET_ZERO_EXTEND_WITH_AND
1777 && optimize_function_for_speed_p (cfun))
1779 emit_move_insn (tmp, const0_rtx);
1780 emit_insn (gen_movstrictqi
1781 (gen_lowpart (QImode, tmp), qreg));
1783 else
1784 emit_insn (gen_rtx_SET
1785 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1787 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1789 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1790 emit_move_insn (slot, tmp);
1791 tmp = copy_rtx (slot);
1794 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1796 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1798 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1799 emit_move_insn (adjust_address (tmp, SImode, 0),
1800 gen_rtx_SUBREG (SImode, reg, 0));
1801 emit_move_insn (adjust_address (tmp, SImode, 4),
1802 gen_rtx_SUBREG (SImode, reg, 4));
1803 emit_move_insn (vreg, tmp);
1805 else if (TARGET_SSE4_1)
1807 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1808 CONST0_RTX (V4SImode),
1809 gen_rtx_SUBREG (SImode, reg, 0)));
1810 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1811 gen_rtx_SUBREG (V4SImode, vreg, 0),
1812 gen_rtx_SUBREG (SImode, reg, 4),
1813 GEN_INT (2)));
1815 else
1817 rtx tmp = gen_reg_rtx (DImode);
1818 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1819 CONST0_RTX (V4SImode),
1820 gen_rtx_SUBREG (SImode, reg, 0)));
1821 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1822 CONST0_RTX (V4SImode),
1823 gen_rtx_SUBREG (SImode, reg, 4)));
1824 emit_insn (gen_vec_interleave_lowv4si
1825 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1826 gen_rtx_SUBREG (V4SImode, vreg, 0),
1827 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1829 rtx_insn *seq = get_insns ();
1830 end_sequence ();
1831 rtx_insn *insn = DF_REF_INSN (ref);
1832 emit_conversion_insns (seq, insn);
1834 if (dump_file)
1835 fprintf (dump_file,
1836 " Copied r%d to a vector register r%d for insn %d\n",
1837 regno, REGNO (vreg), INSN_UID (insn));
1840 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1841 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1843 rtx_insn *insn = DF_REF_INSN (ref);
1844 if (count_reg)
1846 rtx def_set = single_set (insn);
1847 gcc_assert (def_set);
1849 rtx src = SET_SRC (def_set);
1851 if ((GET_CODE (src) == ASHIFT
1852 || GET_CODE (src) == ASHIFTRT
1853 || GET_CODE (src) == LSHIFTRT)
1854 && !CONST_INT_P (XEXP (src, 1))
1855 && reg_or_subregno (XEXP (src, 1)) == regno)
1856 XEXP (src, 1) = vreg;
1858 else
1859 replace_with_subreg_in_insn (insn, reg, vreg);
1861 if (dump_file)
1862 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1863 regno, REGNO (vreg), INSN_UID (insn));
1867 /* Convert all definitions of register REGNO
1868 and fix its uses. Scalar copies may be created
 1869 in case the register is used in a non-convertible insn. */
1871 void
1872 dimode_scalar_chain::convert_reg (unsigned regno)
1874 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1875 rtx reg = regno_reg_rtx[regno];
1876 rtx scopy = NULL_RTX;
1877 df_ref ref;
1878 bitmap conv;
1880 conv = BITMAP_ALLOC (NULL);
1881 bitmap_copy (conv, insns);
1883 if (scalar_copy)
1884 scopy = gen_reg_rtx (DImode);
1886 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1888 rtx_insn *insn = DF_REF_INSN (ref);
1889 rtx def_set = single_set (insn);
1890 rtx src = SET_SRC (def_set);
1891 rtx reg = DF_REF_REG (ref);
1893 if (!MEM_P (src))
1895 replace_with_subreg_in_insn (insn, reg, reg);
1896 bitmap_clear_bit (conv, INSN_UID (insn));
1899 if (scalar_copy)
1901 start_sequence ();
1902 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1904 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1905 emit_move_insn (tmp, reg);
1906 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1907 adjust_address (tmp, SImode, 0));
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1909 adjust_address (tmp, SImode, 4));
1911 else if (TARGET_SSE4_1)
1913 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1914 emit_insn
1915 (gen_rtx_SET
1916 (gen_rtx_SUBREG (SImode, scopy, 0),
1917 gen_rtx_VEC_SELECT (SImode,
1918 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1920 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1921 emit_insn
1922 (gen_rtx_SET
1923 (gen_rtx_SUBREG (SImode, scopy, 4),
1924 gen_rtx_VEC_SELECT (SImode,
1925 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1927 else
1929 rtx vcopy = gen_reg_rtx (V2DImode);
1930 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1931 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1932 gen_rtx_SUBREG (SImode, vcopy, 0));
1933 emit_move_insn (vcopy,
1934 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1935 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1936 gen_rtx_SUBREG (SImode, vcopy, 0));
1938 rtx_insn *seq = get_insns ();
1939 end_sequence ();
1940 emit_conversion_insns (seq, insn);
1942 if (dump_file)
1943 fprintf (dump_file,
1944 " Copied r%d to a scalar register r%d for insn %d\n",
1945 regno, REGNO (scopy), INSN_UID (insn));
1949 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1950 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1952 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1954 rtx_insn *insn = DF_REF_INSN (ref);
1956 rtx def_set = single_set (insn);
1957 gcc_assert (def_set);
1959 rtx src = SET_SRC (def_set);
1960 rtx dst = SET_DEST (def_set);
1962 if ((GET_CODE (src) == ASHIFT
1963 || GET_CODE (src) == ASHIFTRT
1964 || GET_CODE (src) == LSHIFTRT)
1965 && !CONST_INT_P (XEXP (src, 1))
1966 && reg_or_subregno (XEXP (src, 1)) == regno)
1968 rtx tmp2 = gen_reg_rtx (V2DImode);
1970 start_sequence ();
1972 if (TARGET_SSE4_1)
1973 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1974 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1975 else
1977 rtx vec_cst
1978 = gen_rtx_CONST_VECTOR (V2DImode,
1979 gen_rtvec (2, GEN_INT (0xff),
1980 const0_rtx));
1981 vec_cst
1982 = validize_mem (force_const_mem (V2DImode, vec_cst));
1984 emit_insn (gen_rtx_SET
1985 (tmp2,
1986 gen_rtx_AND (V2DImode,
1987 gen_rtx_SUBREG (V2DImode, reg, 0),
1988 vec_cst)));
1990 rtx_insn *seq = get_insns ();
1991 end_sequence ();
1993 emit_insn_before (seq, insn);
1995 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1997 else if (!MEM_P (dst) || !REG_P (src))
1998 replace_with_subreg_in_insn (insn, reg, reg);
2000 bitmap_clear_bit (conv, INSN_UID (insn));
2003 /* Skip debug insns and uninitialized uses. */
2004 else if (DF_REF_CHAIN (ref)
2005 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2007 gcc_assert (scopy);
2008 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2009 df_insn_rescan (DF_REF_INSN (ref));
2012 BITMAP_FREE (conv);
2015 /* Convert operand OP in INSN.  We should handle
2016 memory operands and uninitialized registers.
2017 All other register uses are converted during
2018 register conversion.  */
2020 void
2021 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2023 *op = copy_rtx_if_shared (*op);
2025 if (GET_CODE (*op) == NOT)
2027 convert_op (&XEXP (*op, 0), insn);
2028 PUT_MODE (*op, V2DImode);
2030 else if (MEM_P (*op))
2032 rtx tmp = gen_reg_rtx (DImode);
2034 emit_insn_before (gen_move_insn (tmp, *op), insn);
2035 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2037 if (dump_file)
2038 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2039 INSN_UID (insn), REGNO (tmp));
2041 else if (REG_P (*op))
2043 /* We may not have converted this register use if
2044 the register has no definition.  Otherwise it
2045 should have been converted in convert_reg.  */
2046 df_ref ref;
2047 FOR_EACH_INSN_USE (ref, insn)
2048 if (DF_REF_REGNO (ref) == REGNO (*op))
2050 gcc_assert (!DF_REF_CHAIN (ref));
2051 break;
2053 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2055 else if (CONST_INT_P (*op))
2057 rtx vec_cst;
2058 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2060 /* Prefer all ones vector in case of -1. */
2061 if (constm1_operand (*op, GET_MODE (*op)))
2062 vec_cst = CONSTM1_RTX (V2DImode);
2063 else
2064 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2065 gen_rtvec (2, *op, const0_rtx));
2067 if (!standard_sse_constant_p (vec_cst, V2DImode))
2069 start_sequence ();
2070 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2071 rtx_insn *seq = get_insns ();
2072 end_sequence ();
2073 emit_insn_before (seq, insn);
2076 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2077 *op = tmp;
2079 else
2081 gcc_assert (SUBREG_P (*op));
2082 gcc_assert (GET_MODE (*op) == V2DImode);
2086 /* Convert INSN to vector mode. */
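/* As an illustrative sketch only (not taken from a real dump): on
   !TARGET_64BIT a DImode logical operation such as
     (set (reg:DI 100) (ior:DI (reg:DI 101) (reg:DI 102)))
   is rewritten into its V2DImode counterpart
     (set (subreg:V2DI (reg:DI 100) 0)
          (ior:V2DI (subreg:V2DI (reg:DI 101) 0)
                    (subreg:V2DI (reg:DI 102) 0)))
   so that it can be carried out by a single SSE instruction.  */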
2088 void
2089 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2091 rtx def_set = single_set (insn);
2092 rtx src = SET_SRC (def_set);
2093 rtx dst = SET_DEST (def_set);
2094 rtx subreg;
2096 if (MEM_P (dst) && !REG_P (src))
2098 /* The converted vector instruction cannot have a memory
2099 destination, so a temporary register is required.  */
2100 rtx tmp = gen_reg_rtx (DImode);
2101 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2102 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2105 switch (GET_CODE (src))
2107 case ASHIFT:
2108 case ASHIFTRT:
2109 case LSHIFTRT:
2110 convert_op (&XEXP (src, 0), insn);
2111 PUT_MODE (src, V2DImode);
2112 break;
2114 case PLUS:
2115 case MINUS:
2116 case IOR:
2117 case XOR:
2118 case AND:
2119 convert_op (&XEXP (src, 0), insn);
2120 convert_op (&XEXP (src, 1), insn);
2121 PUT_MODE (src, V2DImode);
2122 break;
2124 case NEG:
2125 src = XEXP (src, 0);
2126 convert_op (&src, insn);
2127 subreg = gen_reg_rtx (V2DImode);
2128 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2129 src = gen_rtx_MINUS (V2DImode, subreg, src);
2130 break;
2132 case NOT:
2133 src = XEXP (src, 0);
2134 convert_op (&src, insn);
2135 subreg = gen_reg_rtx (V2DImode);
2136 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2137 src = gen_rtx_XOR (V2DImode, src, subreg);
2138 break;
2140 case MEM:
2141 if (!REG_P (dst))
2142 convert_op (&src, insn);
2143 break;
2145 case REG:
2146 if (!MEM_P (dst))
2147 convert_op (&src, insn);
2148 break;
2150 case SUBREG:
2151 gcc_assert (GET_MODE (src) == V2DImode);
2152 break;
2154 case COMPARE:
2155 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2157 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2158 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2160 if (REG_P (src))
2161 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2162 else
2163 subreg = copy_rtx_if_shared (src);
2164 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2165 copy_rtx_if_shared (subreg),
2166 copy_rtx_if_shared (subreg)),
2167 insn);
2168 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2169 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2170 copy_rtx_if_shared (src)),
2171 UNSPEC_PTEST);
2172 break;
2174 case CONST_INT:
2175 convert_op (&src, insn);
2176 break;
2178 default:
2179 gcc_unreachable ();
2182 SET_SRC (def_set) = src;
2183 SET_DEST (def_set) = dst;
2185 /* Drop possible dead definitions. */
2186 PATTERN (insn) = def_set;
2188 INSN_CODE (insn) = -1;
2189 recog_memoized (insn);
2190 df_insn_rescan (insn);
2193 /* Fix uses of converted REG in debug insns. */
2195 void
2196 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2198 if (!flag_var_tracking)
2199 return;
2201 df_ref ref, next;
2202 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2204 rtx_insn *insn = DF_REF_INSN (ref);
2205 /* Make sure the next ref is for a different instruction,
2206 so that we're not affected by the rescan. */
2207 next = DF_REF_NEXT_REG (ref);
2208 while (next && DF_REF_INSN (next) == insn)
2209 next = DF_REF_NEXT_REG (next);
2211 if (DEBUG_INSN_P (insn))
2213 /* It may be a debug insn with a TImode variable in
2214 a register.  */
2215 bool changed = false;
2216 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2218 rtx *loc = DF_REF_LOC (ref);
2219 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2221 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2222 changed = true;
2225 if (changed)
2226 df_insn_rescan (insn);
2231 /* Convert INSN from TImode to V1TImode.  */
2233 void
2234 timode_scalar_chain::convert_insn (rtx_insn *insn)
2236 rtx def_set = single_set (insn);
2237 rtx src = SET_SRC (def_set);
2238 rtx dst = SET_DEST (def_set);
2240 switch (GET_CODE (dst))
2242 case REG:
2244 rtx tmp = find_reg_equal_equiv_note (insn);
2245 if (tmp)
2246 PUT_MODE (XEXP (tmp, 0), V1TImode);
2247 PUT_MODE (dst, V1TImode);
2248 fix_debug_reg_uses (dst);
2250 break;
2251 case MEM:
2252 PUT_MODE (dst, V1TImode);
2253 break;
2255 default:
2256 gcc_unreachable ();
2259 switch (GET_CODE (src))
2261 case REG:
2262 PUT_MODE (src, V1TImode);
2263 /* Call fix_debug_reg_uses only if SRC is never defined. */
2264 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2265 fix_debug_reg_uses (src);
2266 break;
2268 case MEM:
2269 PUT_MODE (src, V1TImode);
2270 break;
2272 case CONST_WIDE_INT:
2273 if (NONDEBUG_INSN_P (insn))
2275 /* Since there are no instructions to store a 128-bit constant,
2276 temporary register usage is required.  */
2277 rtx tmp = gen_reg_rtx (V1TImode);
2278 start_sequence ();
2279 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2280 src = validize_mem (force_const_mem (V1TImode, src));
2281 rtx_insn *seq = get_insns ();
2282 end_sequence ();
2283 if (seq)
2284 emit_insn_before (seq, insn);
2285 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2286 dst = tmp;
2288 break;
2290 case CONST_INT:
2291 switch (standard_sse_constant_p (src, TImode))
2293 case 1:
2294 src = CONST0_RTX (GET_MODE (dst));
2295 break;
2296 case 2:
2297 src = CONSTM1_RTX (GET_MODE (dst));
2298 break;
2299 default:
2300 gcc_unreachable ();
2302 if (NONDEBUG_INSN_P (insn))
2304 rtx tmp = gen_reg_rtx (V1TImode);
2305 /* Since there are no instructions to store a standard SSE
2306 constant, temporary register usage is required.  */
2307 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2308 dst = tmp;
2310 break;
2312 default:
2313 gcc_unreachable ();
2316 SET_SRC (def_set) = src;
2317 SET_DEST (def_set) = dst;
2319 /* Drop possible dead definitions. */
2320 PATTERN (insn) = def_set;
2322 INSN_CODE (insn) = -1;
2323 recog_memoized (insn);
2324 df_insn_rescan (insn);
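/* Convert the registers of the chain: convert_reg is run for every
   register the chain defines, while registers that are used in converted
   form but have no definition inside the chain get vector copies.  */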
2327 void
2328 dimode_scalar_chain::convert_registers ()
2330 bitmap_iterator bi;
2331 unsigned id;
2333 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2334 convert_reg (id);
2336 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2337 make_vector_copies (id);
2340 /* Convert the whole chain, creating the required register
2341 conversions and copies.  */
2344 scalar_chain::convert ()
2346 bitmap_iterator bi;
2347 unsigned id;
2348 int converted_insns = 0;
2350 if (!dbg_cnt (stv_conversion))
2351 return 0;
2353 if (dump_file)
2354 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2356 convert_registers ();
2358 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2360 convert_insn (DF_INSN_UID_GET (id)->insn);
2361 converted_insns++;
2364 return converted_insns;
2367 /* Main STV pass function. Find and convert scalar
2368 instructions into vector mode when profitable. */
2370 static unsigned int
2371 convert_scalars_to_vector ()
2373 basic_block bb;
2374 bitmap candidates;
2375 int converted_insns = 0;
2377 bitmap_obstack_initialize (NULL);
2378 candidates = BITMAP_ALLOC (NULL);
2380 calculate_dominance_info (CDI_DOMINATORS);
2381 df_set_flags (DF_DEFER_INSN_RESCAN);
2382 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2383 df_md_add_problem ();
2384 df_analyze ();
2386 /* Find all instructions we want to convert into vector mode. */
2387 if (dump_file)
2388 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2390 FOR_EACH_BB_FN (bb, cfun)
2392 rtx_insn *insn;
2393 FOR_BB_INSNS (bb, insn)
2394 if (scalar_to_vector_candidate_p (insn))
2396 if (dump_file)
2397 fprintf (dump_file, " insn %d is marked as a candidate\n",
2398 INSN_UID (insn));
2400 bitmap_set_bit (candidates, INSN_UID (insn));
2404 remove_non_convertible_regs (candidates);
2406 if (bitmap_empty_p (candidates))
2407 if (dump_file)
2408 fprintf (dump_file, "There are no candidates for optimization.\n");
2410 while (!bitmap_empty_p (candidates))
2412 unsigned uid = bitmap_first_set_bit (candidates);
2413 scalar_chain *chain;
2415 if (TARGET_64BIT)
2416 chain = new timode_scalar_chain;
2417 else
2418 chain = new dimode_scalar_chain;
2420 /* Find the instruction chain we want to convert to vector mode.
2421 Check all uses and definitions to estimate the required
2422 conversions.  */
2423 chain->build (candidates, uid);
2425 if (chain->compute_convert_gain () > 0)
2426 converted_insns += chain->convert ();
2427 else
2428 if (dump_file)
2429 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2430 chain->chain_id);
2432 delete chain;
2435 if (dump_file)
2436 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2438 BITMAP_FREE (candidates);
2439 bitmap_obstack_release (NULL);
2440 df_process_deferred_rescans ();
2442 /* Conversion means we may have 128-bit register spills/fills,
2443 which require an aligned stack.  */
2444 if (converted_insns)
2446 if (crtl->stack_alignment_needed < 128)
2447 crtl->stack_alignment_needed = 128;
2448 if (crtl->stack_alignment_estimated < 128)
2449 crtl->stack_alignment_estimated = 128;
2450 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2451 if (TARGET_64BIT)
2452 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2453 parm; parm = DECL_CHAIN (parm))
2455 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2456 continue;
2457 if (DECL_RTL_SET_P (parm)
2458 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2460 rtx r = DECL_RTL (parm);
2461 if (REG_P (r))
2462 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2464 if (DECL_INCOMING_RTL (parm)
2465 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2467 rtx r = DECL_INCOMING_RTL (parm);
2468 if (REG_P (r))
2469 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2474 return 0;
2477 namespace {
2479 const pass_data pass_data_insert_vzeroupper =
2481 RTL_PASS, /* type */
2482 "vzeroupper", /* name */
2483 OPTGROUP_NONE, /* optinfo_flags */
2484 TV_MACH_DEP, /* tv_id */
2485 0, /* properties_required */
2486 0, /* properties_provided */
2487 0, /* properties_destroyed */
2488 0, /* todo_flags_start */
2489 TODO_df_finish, /* todo_flags_finish */
2492 class pass_insert_vzeroupper : public rtl_opt_pass
2494 public:
2495 pass_insert_vzeroupper(gcc::context *ctxt)
2496 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2499 /* opt_pass methods: */
2500 virtual bool gate (function *)
2502 return TARGET_AVX
2503 && TARGET_VZEROUPPER && flag_expensive_optimizations
2504 && !optimize_size;
2507 virtual unsigned int execute (function *)
2509 return rest_of_handle_insert_vzeroupper ();
2512 }; // class pass_insert_vzeroupper
2514 const pass_data pass_data_stv =
2516 RTL_PASS, /* type */
2517 "stv", /* name */
2518 OPTGROUP_NONE, /* optinfo_flags */
2519 TV_MACH_DEP, /* tv_id */
2520 0, /* properties_required */
2521 0, /* properties_provided */
2522 0, /* properties_destroyed */
2523 0, /* todo_flags_start */
2524 TODO_df_finish, /* todo_flags_finish */
2527 class pass_stv : public rtl_opt_pass
2529 public:
2530 pass_stv (gcc::context *ctxt)
2531 : rtl_opt_pass (pass_data_stv, ctxt),
2532 timode_p (false)
2535 /* opt_pass methods: */
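/* The pass is expected to be instantiated twice (see set_pass_param
   below): with timode_p set for the 64-bit TImode chains and clear for
   the 32-bit DImode chains, so only one instance runs for a given
   target.  */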
2536 virtual bool gate (function *)
2538 return (timode_p == !!TARGET_64BIT
2539 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2542 virtual unsigned int execute (function *)
2544 return convert_scalars_to_vector ();
2547 opt_pass *clone ()
2549 return new pass_stv (m_ctxt);
2552 void set_pass_param (unsigned int n, bool param)
2554 gcc_assert (n == 0);
2555 timode_p = param;
2558 private:
2559 bool timode_p;
2560 }; // class pass_stv
2562 } // anon namespace
2564 rtl_opt_pass *
2565 make_pass_insert_vzeroupper (gcc::context *ctxt)
2567 return new pass_insert_vzeroupper (ctxt);
2570 rtl_opt_pass *
2571 make_pass_stv (gcc::context *ctxt)
2573 return new pass_stv (ctxt);
2576 /* Inserting ENDBRANCH instructions. */
2578 static unsigned int
2579 rest_of_insert_endbranch (void)
2581 timevar_push (TV_MACH_DEP);
2583 rtx cet_eb;
2584 rtx_insn *insn;
2585 basic_block bb;
2587 /* Currently emit an ENDBRANCH if this is a tracked function, i.e.
2588 'nocf_check' is absent from the function attributes.  Later an
2589 optimization will be introduced to analyze whether the address of a
2590 static function is taken.  A static function whose address is not
2591 taken will get the nocf_check attribute, reducing the number of EBs.  */
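/* ENDBR (endbr32/endbr64) marks valid targets of indirect branches for
   CET indirect-branch tracking; on processors without CET it executes
   as a NOP, so emitting it is safe but costs code size.  */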
2593 if (!lookup_attribute ("nocf_check",
2594 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2595 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2597 cet_eb = gen_nop_endbr ();
2599 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2600 insn = BB_HEAD (bb);
2601 emit_insn_before (cet_eb, insn);
2604 bb = 0;
2605 FOR_EACH_BB_FN (bb, cfun)
2607 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2608 insn = NEXT_INSN (insn))
2610 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2612 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2613 continue;
2614 /* Generate an ENDBRANCH after a CALL that can return more than
2615 once (setjmp-like functions).  */
2617 /* Skip notes and debug insns that must be next to the
2618 call insn. ??? This might skip a lot more than
2619 that... ??? Skipping barriers and emitting code
2620 after them surely looks like a mistake; we probably
2621 won't ever hit it, for we'll hit BB_END first. */
2622 rtx_insn *next_insn = insn;
2623 while ((next_insn != BB_END (bb))
2624 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2625 || NOTE_P (NEXT_INSN (next_insn))
2626 || BARRIER_P (NEXT_INSN (next_insn))))
2627 next_insn = NEXT_INSN (next_insn);
2629 cet_eb = gen_nop_endbr ();
2630 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2631 continue;
2634 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2636 rtx target = JUMP_LABEL (insn);
2637 if (target == NULL_RTX || ANY_RETURN_P (target))
2638 continue;
2640 /* Check whether the jump goes through a switch table.  */
2641 rtx_insn *label = as_a<rtx_insn *> (target);
2642 rtx_insn *table = next_insn (label);
2643 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2644 continue;
2646 /* For an indirect jump, find all the places it can jump to and
2647 insert an ENDBRANCH there.  This is done under a special flag
2648 controlling ENDBRANCH generation for switch statements.  */
2649 edge_iterator ei;
2650 edge e;
2651 basic_block dest_blk;
2653 FOR_EACH_EDGE (e, ei, bb->succs)
2655 rtx_insn *insn;
2657 dest_blk = e->dest;
2658 insn = BB_HEAD (dest_blk);
2659 gcc_assert (LABEL_P (insn));
2660 cet_eb = gen_nop_endbr ();
2661 emit_insn_after (cet_eb, insn);
2663 continue;
2666 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2667 || (NOTE_P (insn)
2668 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2669 /* TODO. Check /s bit also. */
2671 cet_eb = gen_nop_endbr ();
2672 emit_insn_after (cet_eb, insn);
2673 continue;
2678 timevar_pop (TV_MACH_DEP);
2679 return 0;
2682 namespace {
2684 const pass_data pass_data_insert_endbranch =
2686 RTL_PASS, /* type. */
2687 "cet", /* name. */
2688 OPTGROUP_NONE, /* optinfo_flags. */
2689 TV_MACH_DEP, /* tv_id. */
2690 0, /* properties_required. */
2691 0, /* properties_provided. */
2692 0, /* properties_destroyed. */
2693 0, /* todo_flags_start. */
2694 0, /* todo_flags_finish. */
2697 class pass_insert_endbranch : public rtl_opt_pass
2699 public:
2700 pass_insert_endbranch (gcc::context *ctxt)
2701 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2704 /* opt_pass methods: */
2705 virtual bool gate (function *)
2707 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2710 virtual unsigned int execute (function *)
2712 return rest_of_insert_endbranch ();
2715 }; // class pass_insert_endbranch
2717 } // anon namespace
2719 rtl_opt_pass *
2720 make_pass_insert_endbranch (gcc::context *ctxt)
2722 return new pass_insert_endbranch (ctxt);
2725 /* Return true if a red-zone is in use. */
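/* (The 64-bit SysV ABI reserves a 128-byte red zone below the stack
   pointer; the Microsoft x64 ABI has no red zone, hence the
   !TARGET_64BIT_MS_ABI check.)  */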
2727 bool
2728 ix86_using_red_zone (void)
2730 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2733 /* Return a string that documents the current -m options. The caller is
2734 responsible for freeing the string. */
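/* Purely as an illustration, the returned string might look like
   "-march=haswell -mtune=generic -m64 -mavx2 -msse4.2 -mfpmath=sse":
   the -march=/-mtune= entries come first, then the ABI option, then the
   enabled ISA and flag options, and finally -mfpmath=.  */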
2736 static char *
2737 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2738 int flags, int flags2,
2739 const char *arch, const char *tune,
2740 enum fpmath_unit fpmath, bool add_nl_p)
2742 struct ix86_target_opts
2744 const char *option; /* option string */
2745 HOST_WIDE_INT mask; /* isa mask options */
2748 /* This table is ordered so that options like -msse4.2 that imply other
2749 ISAs come first.  The target string will be displayed in the same order.  */
2750 static struct ix86_target_opts isa2_opts[] =
2752 { "-mmpx", OPTION_MASK_ISA_MPX },
2753 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2754 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2755 { "-mvaes", OPTION_MASK_ISA_VAES },
2756 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2757 { "-msgx", OPTION_MASK_ISA_SGX },
2758 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2759 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2760 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2761 { "-mibt", OPTION_MASK_ISA_IBT },
2762 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2764 static struct ix86_target_opts isa_opts[] =
2766 { "-mgfni", OPTION_MASK_ISA_GFNI },
2767 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2768 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2769 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2770 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2771 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2772 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2773 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2774 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2775 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2776 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2777 { "-mfma", OPTION_MASK_ISA_FMA },
2778 { "-mxop", OPTION_MASK_ISA_XOP },
2779 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2780 { "-mf16c", OPTION_MASK_ISA_F16C },
2781 { "-mavx", OPTION_MASK_ISA_AVX },
2782 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2783 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2784 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2785 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2786 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2787 { "-msse3", OPTION_MASK_ISA_SSE3 },
2788 { "-maes", OPTION_MASK_ISA_AES },
2789 { "-msha", OPTION_MASK_ISA_SHA },
2790 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2791 { "-msse2", OPTION_MASK_ISA_SSE2 },
2792 { "-msse", OPTION_MASK_ISA_SSE },
2793 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2794 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2795 { "-mmmx", OPTION_MASK_ISA_MMX },
2796 { "-mrtm", OPTION_MASK_ISA_RTM },
2797 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2798 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2799 { "-madx", OPTION_MASK_ISA_ADX },
2800 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2801 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2802 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2803 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2804 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2805 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2806 { "-mabm", OPTION_MASK_ISA_ABM },
2807 { "-mbmi", OPTION_MASK_ISA_BMI },
2808 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2809 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2810 { "-mtbm", OPTION_MASK_ISA_TBM },
2811 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2812 { "-mcx16", OPTION_MASK_ISA_CX16 },
2813 { "-msahf", OPTION_MASK_ISA_SAHF },
2814 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2815 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2816 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2817 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2818 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2819 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2820 { "-mpku", OPTION_MASK_ISA_PKU },
2821 { "-mlwp", OPTION_MASK_ISA_LWP },
2822 { "-mhle", OPTION_MASK_ISA_HLE },
2823 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2824 { "-mclwb", OPTION_MASK_ISA_CLWB }
2827 /* Flag options. */
2828 static struct ix86_target_opts flag_opts[] =
2830 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2831 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2832 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2833 { "-m80387", MASK_80387 },
2834 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2835 { "-malign-double", MASK_ALIGN_DOUBLE },
2836 { "-mcld", MASK_CLD },
2837 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2838 { "-mieee-fp", MASK_IEEE_FP },
2839 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2840 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2841 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2842 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2843 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2844 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2845 { "-mno-red-zone", MASK_NO_RED_ZONE },
2846 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2847 { "-mrecip", MASK_RECIP },
2848 { "-mrtd", MASK_RTD },
2849 { "-msseregparm", MASK_SSEREGPARM },
2850 { "-mstack-arg-probe", MASK_STACK_PROBE },
2851 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2852 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2853 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2854 { "-mvzeroupper", MASK_VZEROUPPER },
2855 { "-mstv", MASK_STV },
2856 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2857 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2858 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2861 /* Additional flag options. */
2862 static struct ix86_target_opts flag2_opts[] =
2864 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2867 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2868 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2870 char isa_other[40];
2871 char isa2_other[40];
2872 char flags_other[40];
2873 char flags2_other[40];
2874 unsigned num = 0;
2875 unsigned i, j;
2876 char *ret;
2877 char *ptr;
2878 size_t len;
2879 size_t line_len;
2880 size_t sep_len;
2881 const char *abi;
2883 memset (opts, '\0', sizeof (opts));
2885 /* Add -march= option. */
2886 if (arch)
2888 opts[num][0] = "-march=";
2889 opts[num++][1] = arch;
2892 /* Add -mtune= option. */
2893 if (tune)
2895 opts[num][0] = "-mtune=";
2896 opts[num++][1] = tune;
2899 /* Add -m32/-m64/-mx32. */
2900 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2902 if ((isa & OPTION_MASK_ABI_64) != 0)
2903 abi = "-m64";
2904 else
2905 abi = "-mx32";
2906 isa &= ~ (OPTION_MASK_ISA_64BIT
2907 | OPTION_MASK_ABI_64
2908 | OPTION_MASK_ABI_X32);
2910 else
2911 abi = "-m32";
2912 opts[num++][0] = abi;
2914 /* Pick out the options enabled in isa2.  */
2915 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2917 if ((isa2 & isa2_opts[i].mask) != 0)
2919 opts[num++][0] = isa2_opts[i].option;
2920 isa2 &= ~ isa2_opts[i].mask;
2924 if (isa2 && add_nl_p)
2926 opts[num++][0] = isa2_other;
2927 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2930 /* Pick out the options enabled in isa.  */
2931 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2933 if ((isa & isa_opts[i].mask) != 0)
2935 opts[num++][0] = isa_opts[i].option;
2936 isa &= ~ isa_opts[i].mask;
2940 if (isa && add_nl_p)
2942 opts[num++][0] = isa_other;
2943 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2946 /* Add flag options. */
2947 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2949 if ((flags & flag_opts[i].mask) != 0)
2951 opts[num++][0] = flag_opts[i].option;
2952 flags &= ~ flag_opts[i].mask;
2956 if (flags && add_nl_p)
2958 opts[num++][0] = flags_other;
2959 sprintf (flags_other, "(other flags: %#x)", flags);
2962 /* Add additional flag options. */
2963 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2965 if ((flags2 & flag2_opts[i].mask) != 0)
2967 opts[num++][0] = flag2_opts[i].option;
2968 flags2 &= ~ flag2_opts[i].mask;
2972 if (flags2 && add_nl_p)
2974 opts[num++][0] = flags2_other;
2975 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2978 /* Add -fpmath= option. */
2979 if (fpmath)
2981 opts[num][0] = "-mfpmath=";
2982 switch ((int) fpmath)
2984 case FPMATH_387:
2985 opts[num++][1] = "387";
2986 break;
2988 case FPMATH_SSE:
2989 opts[num++][1] = "sse";
2990 break;
2992 case FPMATH_387 | FPMATH_SSE:
2993 opts[num++][1] = "sse+387";
2994 break;
2996 default:
2997 gcc_unreachable ();
3001 /* Any options? */
3002 if (num == 0)
3003 return NULL;
3005 gcc_assert (num < ARRAY_SIZE (opts));
3007 /* Size the string. */
3008 len = 0;
3009 sep_len = (add_nl_p) ? 3 : 1;
3010 for (i = 0; i < num; i++)
3012 len += sep_len;
3013 for (j = 0; j < 2; j++)
3014 if (opts[i][j])
3015 len += strlen (opts[i][j]);
3018 /* Build the string. */
3019 ret = ptr = (char *) xmalloc (len);
3020 line_len = 0;
3022 for (i = 0; i < num; i++)
3024 size_t len2[2];
3026 for (j = 0; j < 2; j++)
3027 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3029 if (i != 0)
3031 *ptr++ = ' ';
3032 line_len++;
3034 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3036 *ptr++ = '\\';
3037 *ptr++ = '\n';
3038 line_len = 0;
3042 for (j = 0; j < 2; j++)
3043 if (opts[i][j])
3045 memcpy (ptr, opts[i][j], len2[j]);
3046 ptr += len2[j];
3047 line_len += len2[j];
3051 *ptr = '\0';
3052 gcc_assert (ret + len >= ptr);
3054 return ret;
3057 /* Return true if profiling code should be emitted before the
3058 prologue, and false otherwise.
3059 Note: for x86 the "hotfix" variant is not supported (sorried).  */
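/* flag_fentry corresponds to -mfentry, which places the profiler call
   (__fentry__) at the very beginning of the function, before the
   prologue.  */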
3060 static bool
3061 ix86_profile_before_prologue (void)
3063 return flag_fentry != 0;
3066 /* Function that is callable from the debugger to print the current
3067 options. */
3068 void ATTRIBUTE_UNUSED
3069 ix86_debug_options (void)
3071 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3072 target_flags, ix86_target_flags,
3073 ix86_arch_string, ix86_tune_string,
3074 ix86_fpmath, true);
3076 if (opts)
3078 fprintf (stderr, "%s\n\n", opts);
3079 free (opts);
3081 else
3082 fputs ("<no options>\n\n", stderr);
3084 return;
3087 /* Return true if T is one of the bytes we should avoid with
3088 -fmitigate-rop. */
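/* (0xc3/0xc2 are the near RET opcodes without/with an immediate stack
   adjustment, and 0xcb/0xca the corresponding far RET opcodes, so these
   bytes make attractive ROP gadget endings.)  */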
3090 static bool
3091 ix86_rop_should_change_byte_p (int t)
3093 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3096 static const char *stringop_alg_names[] = {
3097 #define DEF_ENUM
3098 #define DEF_ALG(alg, name) #name,
3099 #include "stringop.def"
3100 #undef DEF_ENUM
3101 #undef DEF_ALG
3104 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3105 The string is of the following form (or a comma-separated list of such entries):
3107 strategy_alg:max_size:[align|noalign]
3109 where the full size range for the strategy is either [0, max_size] or
3110 [min_size, max_size], in which min_size is the max_size + 1 of the
3111 preceding range. The last size range must have max_size == -1.
3113 Examples:
3116 -mmemcpy-strategy=libcall:-1:noalign
3118 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3122 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3124 This is to tell the compiler to use the following strategy for memset
3125 1) when the expected size is between [1, 16], use rep_8byte strategy;
3126 2) when the size is between [17, 2048], use vector_loop;
3127 3) when the size is > 2048, use libcall. */
3129 struct stringop_size_range
3131 int max;
3132 stringop_alg alg;
3133 bool noalign;
3136 static void
3137 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3139 const struct stringop_algs *default_algs;
3140 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3141 char *curr_range_str, *next_range_str;
3142 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3143 int i = 0, n = 0;
3145 if (is_memset)
3146 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3147 else
3148 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3150 curr_range_str = strategy_str;
3154 int maxs;
3155 char alg_name[128];
3156 char align[16];
3157 next_range_str = strchr (curr_range_str, ',');
3158 if (next_range_str)
3159 *next_range_str++ = '\0';
3161 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3162 alg_name, &maxs, align))
3164 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3165 return;
3168 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3170 error ("size ranges of option %qs should be increasing", opt);
3171 return;
3174 for (i = 0; i < last_alg; i++)
3175 if (!strcmp (alg_name, stringop_alg_names[i]))
3176 break;
3178 if (i == last_alg)
3180 error ("wrong strategy name %qs specified for option %qs",
3181 alg_name, opt);
3183 auto_vec <const char *> candidates;
3184 for (i = 0; i < last_alg; i++)
3185 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3186 candidates.safe_push (stringop_alg_names[i]);
3188 char *s;
3189 const char *hint
3190 = candidates_list_and_hint (alg_name, s, candidates);
3191 if (hint)
3192 inform (input_location,
3193 "valid arguments to %qs are: %s; did you mean %qs?",
3194 opt, s, hint);
3195 else
3196 inform (input_location, "valid arguments to %qs are: %s",
3197 opt, s);
3198 XDELETEVEC (s);
3199 return;
3202 if ((stringop_alg) i == rep_prefix_8_byte
3203 && !TARGET_64BIT)
3205 /* rep; movq isn't available in 32-bit code. */
3206 error ("strategy name %qs specified for option %qs "
3207 "not supported for 32-bit code", alg_name, opt);
3208 return;
3211 input_ranges[n].max = maxs;
3212 input_ranges[n].alg = (stringop_alg) i;
3213 if (!strcmp (align, "align"))
3214 input_ranges[n].noalign = false;
3215 else if (!strcmp (align, "noalign"))
3216 input_ranges[n].noalign = true;
3217 else
3219 error ("unknown alignment %qs specified for option %qs", align, opt);
3220 return;
3222 n++;
3223 curr_range_str = next_range_str;
3225 while (curr_range_str);
3227 if (input_ranges[n - 1].max != -1)
3229 error ("the max value for the last size range should be -1"
3230 " for option %qs", opt);
3231 return;
3234 if (n > MAX_STRINGOP_ALGS)
3236 error ("too many size ranges specified in option %qs", opt);
3237 return;
3240 /* Now override the default algs array. */
3241 for (i = 0; i < n; i++)
3243 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3244 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3245 = input_ranges[i].alg;
3246 *const_cast<int *>(&default_algs->size[i].noalign)
3247 = input_ranges[i].noalign;
3252 /* Parse the -mtune-ctrl= option.  When DUMP is true,
3253 print the features that are explicitly set.  */
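/* The argument is a comma-separated list of tuning feature names, each
   optionally prefixed with '^' to clear the feature instead of setting
   it, e.g. (with illustrative feature names)
   -mtune-ctrl=feature_a,^feature_b.  */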
3255 static void
3256 parse_mtune_ctrl_str (bool dump)
3258 if (!ix86_tune_ctrl_string)
3259 return;
3261 char *next_feature_string = NULL;
3262 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3263 char *orig = curr_feature_string;
3264 int i;
3267 bool clear = false;
3269 next_feature_string = strchr (curr_feature_string, ',');
3270 if (next_feature_string)
3271 *next_feature_string++ = '\0';
3272 if (*curr_feature_string == '^')
3274 curr_feature_string++;
3275 clear = true;
3277 for (i = 0; i < X86_TUNE_LAST; i++)
3279 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3281 ix86_tune_features[i] = !clear;
3282 if (dump)
3283 fprintf (stderr, "Explicitly %s feature %s\n",
3284 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3285 break;
3288 if (i == X86_TUNE_LAST)
3289 error ("unknown parameter to option -mtune-ctrl: %s",
3290 clear ? curr_feature_string - 1 : curr_feature_string);
3291 curr_feature_string = next_feature_string;
3293 while (curr_feature_string);
3294 free (orig);
3297 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3298 processor type. */
3300 static void
3301 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3303 unsigned int ix86_tune_mask = 1u << ix86_tune;
3304 int i;
3306 for (i = 0; i < X86_TUNE_LAST; ++i)
3308 if (ix86_tune_no_default)
3309 ix86_tune_features[i] = 0;
3310 else
3311 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3314 if (dump)
3316 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3317 for (i = 0; i < X86_TUNE_LAST; i++)
3318 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3319 ix86_tune_features[i] ? "on" : "off");
3322 parse_mtune_ctrl_str (dump);
3326 /* Default align_* from the processor table. */
3328 static void
3329 ix86_default_align (struct gcc_options *opts)
3331 if (opts->x_align_loops == 0)
3333 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3334 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3336 if (opts->x_align_jumps == 0)
3338 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3339 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3341 if (opts->x_align_functions == 0)
3343 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3347 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3349 static void
3350 ix86_override_options_after_change (void)
3352 ix86_default_align (&global_options);
3355 /* Override various settings based on options.  If MAIN_ARGS_P, the
3356 options are from the command line, otherwise they are from
3357 attributes.  Return false if there's an error related to the
3358 -march option.  */
3360 static bool
3361 ix86_option_override_internal (bool main_args_p,
3362 struct gcc_options *opts,
3363 struct gcc_options *opts_set)
3365 int i;
3366 unsigned int ix86_arch_mask;
3367 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3369 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3370 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3371 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3372 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3373 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3374 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3375 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3376 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3377 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3378 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3379 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3380 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3381 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3382 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3383 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3384 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3385 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3386 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3387 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3388 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3389 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3390 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3391 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3392 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3393 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3394 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3395 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3396 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3397 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3398 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3399 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3400 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3401 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3402 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3403 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3404 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3405 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3406 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3407 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3408 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3409 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3410 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3411 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3412 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3413 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3414 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3415 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3416 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3417 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3418 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3419 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3420 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3421 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3422 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3423 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3424 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3425 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3426 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3427 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3428 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3429 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3430 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3431 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3432 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3434 #define PTA_CORE2 \
3435 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3436 | PTA_CX16 | PTA_FXSR)
3437 #define PTA_NEHALEM \
3438 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3439 #define PTA_WESTMERE \
3440 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3441 #define PTA_SANDYBRIDGE \
3442 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3443 #define PTA_IVYBRIDGE \
3444 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3445 #define PTA_HASWELL \
3446 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3447 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3448 #define PTA_BROADWELL \
3449 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3450 #define PTA_SKYLAKE \
3451 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3452 #define PTA_SKYLAKE_AVX512 \
3453 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3454 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3455 #define PTA_CANNONLAKE \
3456 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3457 #define PTA_KNL \
3458 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3459 #define PTA_BONNELL \
3460 (PTA_CORE2 | PTA_MOVBE)
3461 #define PTA_SILVERMONT \
3462 (PTA_WESTMERE | PTA_MOVBE)
3463 #define PTA_KNM \
3464 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3466 /* If this reaches 64, the flags field of struct pta below needs widening.  */
3468 static struct pta
3470 const char *const name; /* processor name or nickname. */
3471 const enum processor_type processor;
3472 const enum attr_cpu schedule;
3473 const unsigned HOST_WIDE_INT flags;
3475 const processor_alias_table[] =
3477 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3478 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3479 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3480 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3481 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3482 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3483 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3484 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3485 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3486 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3487 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3488 PTA_MMX | PTA_SSE | PTA_FXSR},
3489 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3490 PTA_MMX | PTA_SSE | PTA_FXSR},
3491 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3492 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3493 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3494 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3495 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3496 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3497 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3498 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3499 PTA_MMX | PTA_SSE | PTA_FXSR},
3500 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3501 PTA_MMX | PTA_SSE | PTA_FXSR},
3502 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3503 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3504 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3505 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3506 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3507 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3508 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3509 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3510 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3511 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3512 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3513 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3514 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3515 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3516 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3517 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3518 PTA_SANDYBRIDGE},
3519 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3520 PTA_SANDYBRIDGE},
3521 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3522 PTA_IVYBRIDGE},
3523 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3524 PTA_IVYBRIDGE},
3525 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3526 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3527 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3528 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3529 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3530 PTA_SKYLAKE_AVX512},
3531 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3532 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3533 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3534 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3535 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3536 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3537 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3538 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3539 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3540 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3541 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3542 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3543 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3544 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3545 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3546 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3547 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3548 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3549 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3550 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3551 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3552 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3553 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3554 {"x86-64", PROCESSOR_K8, CPU_K8,
3555 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3556 {"eden-x2", PROCESSOR_K8, CPU_K8,
3557 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3558 {"nano", PROCESSOR_K8, CPU_K8,
3559 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3560 | PTA_SSSE3 | PTA_FXSR},
3561 {"nano-1000", PROCESSOR_K8, CPU_K8,
3562 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3563 | PTA_SSSE3 | PTA_FXSR},
3564 {"nano-2000", PROCESSOR_K8, CPU_K8,
3565 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3566 | PTA_SSSE3 | PTA_FXSR},
3567 {"nano-3000", PROCESSOR_K8, CPU_K8,
3568 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3569 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3570 {"nano-x2", PROCESSOR_K8, CPU_K8,
3571 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3572 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3573 {"eden-x4", PROCESSOR_K8, CPU_K8,
3574 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3575 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3576 {"nano-x4", PROCESSOR_K8, CPU_K8,
3577 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3578 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3579 {"k8", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3581 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3582 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3583 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3584 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3585 {"opteron", PROCESSOR_K8, CPU_K8,
3586 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3587 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3588 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3589 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3590 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3591 {"athlon64", PROCESSOR_K8, CPU_K8,
3592 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3593 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3594 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3595 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3596 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3597 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3598 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3599 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3600 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3601 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3602 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3603 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3604 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3605 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3606 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3607 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3608 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3609 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3610 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3611 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3612 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3613 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3614 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3615 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3616 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3617 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3618 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3619 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3620 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3621 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3622 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3623 | PTA_XSAVEOPT | PTA_FSGSBASE},
3624 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3625 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3626 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3627 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3628 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3629 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3630 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3631 | PTA_MOVBE | PTA_MWAITX},
3632 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3633 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3634 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3635 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3636 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3637 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3638 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3639 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3640 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3641 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3642 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3643 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3644 | PTA_FXSR | PTA_XSAVE},
3645 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3646 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3647 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3648 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3649 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3650 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3652 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3653 PTA_64BIT
3654 | PTA_HLE /* flags are only used for -march switch. */ },
3657 /* -mrecip options. */
3658 static struct
3660 const char *string; /* option name */
3661 unsigned int mask; /* mask bits to set */
3663 const recip_options[] =
3665 { "all", RECIP_MASK_ALL },
3666 { "none", RECIP_MASK_NONE },
3667 { "div", RECIP_MASK_DIV },
3668 { "sqrt", RECIP_MASK_SQRT },
3669 { "vec-div", RECIP_MASK_VEC_DIV },
3670 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3673 int const pta_size = ARRAY_SIZE (processor_alias_table);
3675 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3676 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3677 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3678 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3679 #ifdef TARGET_BI_ARCH
3680 else
3682 #if TARGET_BI_ARCH == 1
3683 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3684 is on and OPTION_MASK_ABI_X32 is off. We turn off
3685 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3686 -mx32. */
3687 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3688 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3689 #else
3690 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3691 on and OPTION_MASK_ABI_64 is off. We turn off
3692 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3693 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3694 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3695 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3696 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3697 #endif
3698 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3699 && TARGET_IAMCU_P (opts->x_target_flags))
3700 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3701 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3703 #endif
3705 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3707 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3708 OPTION_MASK_ABI_64 for TARGET_X32. */
3709 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3710 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3712 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3713 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3714 | OPTION_MASK_ABI_X32
3715 | OPTION_MASK_ABI_64);
3716 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3718 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3719 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3720 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3721 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3724 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3725 SUBTARGET_OVERRIDE_OPTIONS;
3726 #endif
3728 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3729 SUBSUBTARGET_OVERRIDE_OPTIONS;
3730 #endif
3732 /* -fPIC is the default for x86_64. */
3733 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3734 opts->x_flag_pic = 2;
3736 /* Need to check -mtune=generic first. */
3737 if (opts->x_ix86_tune_string)
3739 /* As special support for cross compilers we read -mtune=native
3740 as -mtune=generic. With native compilers we won't see the
3741 -mtune=native, as it was changed by the driver. */
3742 if (!strcmp (opts->x_ix86_tune_string, "native"))
3744 opts->x_ix86_tune_string = "generic";
3746 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3747 warning (OPT_Wdeprecated,
3748 main_args_p
3749 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3750 "or %<-mtune=generic%> instead as appropriate")
3751 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3752 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3753 " instead as appropriate"));
3755 else
3757 if (opts->x_ix86_arch_string)
3758 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3759 if (!opts->x_ix86_tune_string)
3761 opts->x_ix86_tune_string
3762 = processor_target_table[TARGET_CPU_DEFAULT].name;
3763 ix86_tune_defaulted = 1;
3766 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3767 or defaulted. We need to use a sensible tune option. */
3768 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3770 opts->x_ix86_tune_string = "generic";
3774 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3775 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3777 /* rep; movq isn't available in 32-bit code. */
3778 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3779 opts->x_ix86_stringop_alg = no_stringop;
3782 if (!opts->x_ix86_arch_string)
3783 opts->x_ix86_arch_string
3784 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3785 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3786 else
3787 ix86_arch_specified = 1;
3789 if (opts_set->x_ix86_pmode)
3791 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3792 && opts->x_ix86_pmode == PMODE_SI)
3793 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3794 && opts->x_ix86_pmode == PMODE_DI))
3795 error ("address mode %qs not supported in the %s bit mode",
3796 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3797 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3799 else
3800 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3801 ? PMODE_DI : PMODE_SI;
3803 if (!opts_set->x_ix86_abi)
3804 opts->x_ix86_abi = DEFAULT_ABI;
3806 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3807 error ("-mabi=ms not supported with X32 ABI");
3808 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3810 /* For targets using the MS ABI enable ms-extensions, if not
3811 explicitly turned off.  For the non-MS ABI we turn off this
3812 option.  */
3813 if (!opts_set->x_flag_ms_extensions)
3814 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3816 if (opts_set->x_ix86_cmodel)
3818 switch (opts->x_ix86_cmodel)
3820 case CM_SMALL:
3821 case CM_SMALL_PIC:
3822 if (opts->x_flag_pic)
3823 opts->x_ix86_cmodel = CM_SMALL_PIC;
3824 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3825 error ("code model %qs not supported in the %s bit mode",
3826 "small", "32");
3827 break;
3829 case CM_MEDIUM:
3830 case CM_MEDIUM_PIC:
3831 if (opts->x_flag_pic)
3832 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3833 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3834 error ("code model %qs not supported in the %s bit mode",
3835 "medium", "32");
3836 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3837 error ("code model %qs not supported in x32 mode",
3838 "medium");
3839 break;
3841 case CM_LARGE:
3842 case CM_LARGE_PIC:
3843 if (opts->x_flag_pic)
3844 opts->x_ix86_cmodel = CM_LARGE_PIC;
3845 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3846 error ("code model %qs not supported in the %s bit mode",
3847 "large", "32");
3848 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3849 error ("code model %qs not supported in x32 mode",
3850 "large");
3851 break;
3853 case CM_32:
3854 if (opts->x_flag_pic)
3855 error ("code model %s does not support PIC mode", "32");
3856 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3857 error ("code model %qs not supported in the %s bit mode",
3858 "32", "64");
3859 break;
3861 case CM_KERNEL:
3862 if (opts->x_flag_pic)
3864 error ("code model %s does not support PIC mode", "kernel");
3865 opts->x_ix86_cmodel = CM_32;
3867 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3868 error ("code model %qs not supported in the %s bit mode",
3869 "kernel", "32");
3870 break;
3872 default:
3873 gcc_unreachable ();
3876 else
3878 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3879 use of rip-relative addressing. This eliminates fixups that
3880 would otherwise be needed if this object is to be placed in a
3881 DLL, and is essentially just as efficient as direct addressing. */
3882 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3883 && (TARGET_RDOS || TARGET_PECOFF))
3884 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3885 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3886 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3887 else
3888 opts->x_ix86_cmodel = CM_32;
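/* Example defaults resulting from the logic above: plain -m64 selects
   CM_SMALL, -m64 -fPIC selects CM_SMALL_PIC, 64-bit PE-COFF/RDOS
   targets get CM_MEDIUM_PIC with PIC forced on, and any 32-bit
   compilation uses CM_32.  */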
3890 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3892 error ("-masm=intel not supported in this configuration");
3893 opts->x_ix86_asm_dialect = ASM_ATT;
3895 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3896 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3897 sorry ("%i-bit mode not compiled in",
3898 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3900 for (i = 0; i < pta_size; i++)
3901 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3903 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3905 error (main_args_p
3906 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3907 "switch")
3908 : G_("%<generic%> CPU can be used only for "
3909 "%<target(\"tune=\")%> attribute"));
3910 return false;
3912 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3914 error (main_args_p
3915 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3916 "switch")
3917 : G_("%<intel%> CPU can be used only for "
3918 "%<target(\"tune=\")%> attribute"));
3919 return false;
3922 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3923 && !(processor_alias_table[i].flags & PTA_64BIT))
3925 error ("CPU you selected does not support x86-64 "
3926 "instruction set");
3927 return false;
3930 ix86_schedule = processor_alias_table[i].schedule;
3931 ix86_arch = processor_alias_table[i].processor;
3932 /* Default cpu tuning to the architecture. */
3933 ix86_tune = ix86_arch;
3935 if (processor_alias_table[i].flags & PTA_MMX
3936 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3937 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3938 if (processor_alias_table[i].flags & PTA_3DNOW
3939 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3940 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3941 if (processor_alias_table[i].flags & PTA_3DNOW_A
3942 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3943 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3944 if (processor_alias_table[i].flags & PTA_SSE
3945 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3946 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3947 if (processor_alias_table[i].flags & PTA_SSE2
3948 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3949 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3950 if (processor_alias_table[i].flags & PTA_SSE3
3951 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3952 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3953 if (processor_alias_table[i].flags & PTA_SSSE3
3954 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3955 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3956 if (processor_alias_table[i].flags & PTA_SSE4_1
3957 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3958 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3959 if (processor_alias_table[i].flags & PTA_SSE4_2
3960 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3961 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3962 if (processor_alias_table[i].flags & PTA_AVX
3963 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3964 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3965 if (processor_alias_table[i].flags & PTA_AVX2
3966 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3967 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3968 if (processor_alias_table[i].flags & PTA_FMA
3969 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3970 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3971 if (processor_alias_table[i].flags & PTA_SSE4A
3972 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3973 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3974 if (processor_alias_table[i].flags & PTA_FMA4
3975 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3976 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3977 if (processor_alias_table[i].flags & PTA_XOP
3978 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3979 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3980 if (processor_alias_table[i].flags & PTA_LWP
3981 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3982 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3983 if (processor_alias_table[i].flags & PTA_ABM
3984 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3985 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3986 if (processor_alias_table[i].flags & PTA_BMI
3987 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3988 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3989 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3990 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3991 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3992 if (processor_alias_table[i].flags & PTA_TBM
3993 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3994 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3995 if (processor_alias_table[i].flags & PTA_BMI2
3996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3998 if (processor_alias_table[i].flags & PTA_CX16
3999 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
4000 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
4001 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
4002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4004 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4005 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
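/* The condition above reads: enable SAHF unless we are generating
   64-bit code for a CPU explicitly marked PTA_NO_SAHF (early 64-bit
   chips lacked LAHF/SAHF in long mode); in 32-bit code SAHF is always
   available, so the flag is set unconditionally there.  */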
4008 if (processor_alias_table[i].flags & PTA_MOVBE
4009 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
4010 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
4011 if (processor_alias_table[i].flags & PTA_AES
4012 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4013 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4014 if (processor_alias_table[i].flags & PTA_SHA
4015 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4016 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4017 if (processor_alias_table[i].flags & PTA_PCLMUL
4018 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4019 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4020 if (processor_alias_table[i].flags & PTA_FSGSBASE
4021 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4022 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4023 if (processor_alias_table[i].flags & PTA_RDRND
4024 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4025 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4026 if (processor_alias_table[i].flags & PTA_F16C
4027 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4028 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4029 if (processor_alias_table[i].flags & PTA_RTM
4030 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4031 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4032 if (processor_alias_table[i].flags & PTA_HLE
4033 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
4034 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
4035 if (processor_alias_table[i].flags & PTA_PRFCHW
4036 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4037 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4038 if (processor_alias_table[i].flags & PTA_RDSEED
4039 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4040 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4041 if (processor_alias_table[i].flags & PTA_ADX
4042 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4043 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4044 if (processor_alias_table[i].flags & PTA_FXSR
4045 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4046 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4047 if (processor_alias_table[i].flags & PTA_XSAVE
4048 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4049 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4050 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4051 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4052 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4053 if (processor_alias_table[i].flags & PTA_AVX512F
4054 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4055 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4056 if (processor_alias_table[i].flags & PTA_AVX512ER
4057 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4058 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4059 if (processor_alias_table[i].flags & PTA_AVX512PF
4060 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4061 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4062 if (processor_alias_table[i].flags & PTA_AVX512CD
4063 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4064 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4065 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4066 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4067 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4068 if (processor_alias_table[i].flags & PTA_CLWB
4069 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4070 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4071 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4072 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4073 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4074 if (processor_alias_table[i].flags & PTA_CLZERO
4075 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
4076 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
4077 if (processor_alias_table[i].flags & PTA_XSAVEC
4078 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4079 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4080 if (processor_alias_table[i].flags & PTA_XSAVES
4081 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4082 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4083 if (processor_alias_table[i].flags & PTA_AVX512DQ
4084 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4085 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4086 if (processor_alias_table[i].flags & PTA_AVX512BW
4087 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4088 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4089 if (processor_alias_table[i].flags & PTA_AVX512VL
4090 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4091 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4092 if (processor_alias_table[i].flags & PTA_MPX
4093 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4094 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4095 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4096 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4097 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4098 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4099 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4100 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4102 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4103 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
4104 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4105 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4106 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
4107 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4108 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4109 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4110 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4111 if (processor_alias_table[i].flags & PTA_SGX
4112 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4113 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4115 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4116 x86_prefetch_sse = true;
4117 if (processor_alias_table[i].flags & PTA_MWAITX
4118 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
4119 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
4120 if (processor_alias_table[i].flags & PTA_PKU
4121 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4122 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4124 /* Don't enable x87 instructions if only
4125 general registers are allowed. */
4126 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4127 && !(opts_set->x_target_flags & MASK_80387))
4129 if (processor_alias_table[i].flags & PTA_NO_80387)
4130 opts->x_target_flags &= ~MASK_80387;
4131 else
4132 opts->x_target_flags |= MASK_80387;
4134 break;
4137 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4138 error ("Intel MPX does not support x32");
4143 if (i == pta_size)
4145 error (main_args_p
4146 ? G_("bad value (%qs) for %<-march=%> switch")
4147 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4148 opts->x_ix86_arch_string);
4150 auto_vec <const char *> candidates;
4151 for (i = 0; i < pta_size; i++)
4152 if (strcmp (processor_alias_table[i].name, "generic")
4153 && strcmp (processor_alias_table[i].name, "intel")
4154 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4155 || (processor_alias_table[i].flags & PTA_64BIT)))
4156 candidates.safe_push (processor_alias_table[i].name);
4158 char *s;
4159 const char *hint
4160 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4161 if (hint)
4162 inform (input_location,
4163 main_args_p
4164 ? G_("valid arguments to %<-march=%> switch are: "
4165 "%s; did you mean %qs?")
4166 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4167 "%s; did you mean %qs?"), s, hint);
4168 else
4169 inform (input_location,
4170 main_args_p
4171 ? G_("valid arguments to %<-march=%> switch are: %s")
4172 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4173 "are: %s"), s);
4174 XDELETEVEC (s);
4177 ix86_arch_mask = 1u << ix86_arch;
4178 for (i = 0; i < X86_ARCH_LAST; ++i)
4179 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4181 for (i = 0; i < pta_size; i++)
4182 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4184 ix86_schedule = processor_alias_table[i].schedule;
4185 ix86_tune = processor_alias_table[i].processor;
4186 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4188 if (!(processor_alias_table[i].flags & PTA_64BIT))
4190 if (ix86_tune_defaulted)
4192 opts->x_ix86_tune_string = "x86-64";
4193 for (i = 0; i < pta_size; i++)
4194 if (! strcmp (opts->x_ix86_tune_string,
4195 processor_alias_table[i].name))
4196 break;
4197 ix86_schedule = processor_alias_table[i].schedule;
4198 ix86_tune = processor_alias_table[i].processor;
4200 else
4201 error ("CPU you selected does not support x86-64 "
4202 "instruction set");
4205 /* Intel CPUs have always interpreted SSE prefetch instructions as
4206 NOPs; so, we can enable SSE prefetch instructions even when
4207 -mtune (rather than -march) points us to a processor that has them.
4208 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4209 higher processors. */
4210 if (TARGET_CMOV
4211 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4212 x86_prefetch_sse = true;
4213 break;
4216 if (ix86_tune_specified && i == pta_size)
4218 error (main_args_p
4219 ? G_("bad value (%qs) for %<-mtune=%> switch")
4220 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4221 opts->x_ix86_tune_string);
4223 auto_vec <const char *> candidates;
4224 for (i = 0; i < pta_size; i++)
4225 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4226 || (processor_alias_table[i].flags & PTA_64BIT))
4227 candidates.safe_push (processor_alias_table[i].name);
4229 char *s;
4230 const char *hint
4231 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4232 if (hint)
4233 inform (input_location,
4234 main_args_p
4235 ? G_("valid arguments to %<-mtune=%> switch are: "
4236 "%s; did you mean %qs?")
4237 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4238 "%s; did you mean %qs?"), s, hint);
4239 else
4240 inform (input_location,
4241 main_args_p
4242 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4243 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4244 "are: %s"), s);
4245 XDELETEVEC (s);
4248 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4250 #ifndef USE_IX86_FRAME_POINTER
4251 #define USE_IX86_FRAME_POINTER 0
4252 #endif
4254 #ifndef USE_X86_64_FRAME_POINTER
4255 #define USE_X86_64_FRAME_POINTER 0
4256 #endif
4258 /* Set the default values for switches whose default depends on TARGET_64BIT
4259 in case they weren't overwritten by command line options. */
4260 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4262 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4263 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4264 if (opts->x_flag_asynchronous_unwind_tables
4265 && !opts_set->x_flag_unwind_tables
4266 && TARGET_64BIT_MS_ABI)
4267 opts->x_flag_unwind_tables = 1;
4268 if (opts->x_flag_asynchronous_unwind_tables == 2)
4269 opts->x_flag_unwind_tables
4270 = opts->x_flag_asynchronous_unwind_tables = 1;
4271 if (opts->x_flag_pcc_struct_return == 2)
4272 opts->x_flag_pcc_struct_return = 0;
4274 else
4276 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4277 opts->x_flag_omit_frame_pointer
4278 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4279 if (opts->x_flag_asynchronous_unwind_tables == 2)
4280 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4281 if (opts->x_flag_pcc_struct_return == 2)
4283 /* Intel MCU psABI specifies that -freg-struct-return should
4284 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4285 we check -miamcu so that -freg-struct-return is always
4286 turned on if -miamcu is used. */
4287 if (TARGET_IAMCU_P (opts->x_target_flags))
4288 opts->x_flag_pcc_struct_return = 0;
4289 else
4290 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4294 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4295 /* TODO: ix86_cost should be chosen at instruction or function granularity
4296 so that for cold code we can use size_cost even in !optimize_size compilations. */
4297 if (opts->x_optimize_size)
4298 ix86_cost = &ix86_size_cost;
4299 else
4300 ix86_cost = ix86_tune_cost;
4302 /* Arrange to set up i386_stack_locals for all functions. */
4303 init_machine_status = ix86_init_machine_status;
4305 /* Validate -mregparm= value. */
4306 if (opts_set->x_ix86_regparm)
4308 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4309 warning (0, "-mregparm is ignored in 64-bit mode");
4310 else if (TARGET_IAMCU_P (opts->x_target_flags))
4311 warning (0, "-mregparm is ignored for Intel MCU psABI");
4312 if (opts->x_ix86_regparm > REGPARM_MAX)
4314 error ("-mregparm=%d is not between 0 and %d",
4315 opts->x_ix86_regparm, REGPARM_MAX);
4316 opts->x_ix86_regparm = 0;
4319 if (TARGET_IAMCU_P (opts->x_target_flags)
4320 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4321 opts->x_ix86_regparm = REGPARM_MAX;
4323 /* Default align_* from the processor table. */
4324 ix86_default_align (opts);
4326 /* Provide default for -mbranch-cost= value. */
4327 if (!opts_set->x_ix86_branch_cost)
4328 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4330 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4332 opts->x_target_flags
4333 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4335 /* Enable by default the SSE and MMX builtins. Do allow the user to
4336 explicitly disable any of these. In particular, disabling SSE and
4337 MMX for kernel code is extremely useful. */
4338 if (!ix86_arch_specified)
4339 opts->x_ix86_isa_flags
4340 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4341 | TARGET_SUBTARGET64_ISA_DEFAULT)
4342 & ~opts->x_ix86_isa_flags_explicit);
4344 if (TARGET_RTD_P (opts->x_target_flags))
4345 warning (0,
4346 main_args_p
4347 ? G_("%<-mrtd%> is ignored in 64bit mode")
4348 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4350 else
4352 opts->x_target_flags
4353 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4355 if (!ix86_arch_specified)
4356 opts->x_ix86_isa_flags
4357 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4359 /* The i386 ABI does not specify a red zone. It still makes sense to use
4360 one when the programmer takes care to keep the stack from being clobbered. */
4361 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4362 opts->x_target_flags |= MASK_NO_RED_ZONE;
4365 /* Keep nonleaf frame pointers. */
4366 if (opts->x_flag_omit_frame_pointer)
4367 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4368 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4369 opts->x_flag_omit_frame_pointer = 1;
4371 /* If we're doing fast math, we don't care about comparison order
4372 wrt NaNs. This lets us use a shorter comparison sequence. */
4373 if (opts->x_flag_finite_math_only)
4374 opts->x_target_flags &= ~MASK_IEEE_FP;
4376 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4377 since the insns won't need emulation. */
4378 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4379 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4381 /* Likewise, if the target doesn't have a 387, or we've specified
4382 software floating point, don't use 387 inline intrinsics. */
4383 if (!TARGET_80387_P (opts->x_target_flags))
4384 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4386 /* Turn on MMX builtins for -msse. */
4387 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4388 opts->x_ix86_isa_flags
4389 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4391 /* Enable SSE prefetch. */
4392 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4393 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4394 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4395 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4396 x86_prefetch_sse = true;
4398 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4399 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4400 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4401 opts->x_ix86_isa_flags
4402 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4404 /* Enable lzcnt instruction for -mabm. */
4405 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4406 opts->x_ix86_isa_flags
4407 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4409 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4410 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4411 opts->x_ix86_isa_flags
4412 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4413 & ~opts->x_ix86_isa_flags_explicit);
4415 /* Validate -mpreferred-stack-boundary= value or default it to
4416 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4417 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4418 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4420 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4421 int max = TARGET_SEH ? 4 : 12;
4423 if (opts->x_ix86_preferred_stack_boundary_arg < min
4424 || opts->x_ix86_preferred_stack_boundary_arg > max)
4426 if (min == max)
4427 error ("-mpreferred-stack-boundary is not supported "
4428 "for this target");
4429 else
4430 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4431 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4433 else
4434 ix86_preferred_stack_boundary
4435 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
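/* The argument is the log2 of the boundary in bytes, so e.g.
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 128 bits, i.e. the usual 16-byte stack alignment.  */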
4438 /* Set the default value for -mstackrealign. */
4439 if (!opts_set->x_ix86_force_align_arg_pointer)
4440 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4442 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4444 /* Validate -mincoming-stack-boundary= value or default it to
4445 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4446 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4447 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4449 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4451 if (opts->x_ix86_incoming_stack_boundary_arg < min
4452 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4453 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4454 opts->x_ix86_incoming_stack_boundary_arg, min);
4455 else
4457 ix86_user_incoming_stack_boundary
4458 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4459 ix86_incoming_stack_boundary
4460 = ix86_user_incoming_stack_boundary;
4464 #ifndef NO_PROFILE_COUNTERS
4465 if (flag_nop_mcount)
4466 error ("-mnop-mcount is not compatible with this target");
4467 #endif
4468 if (flag_nop_mcount && flag_pic)
4469 error ("-mnop-mcount is not implemented for -fPIC");
4471 /* Accept -msseregparm only if at least SSE support is enabled. */
4472 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4473 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4474 error (main_args_p
4475 ? G_("%<-msseregparm%> used without SSE enabled")
4476 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4478 if (opts_set->x_ix86_fpmath)
4480 if (opts->x_ix86_fpmath & FPMATH_SSE)
4482 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4484 if (TARGET_80387_P (opts->x_target_flags))
4486 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4487 opts->x_ix86_fpmath = FPMATH_387;
4490 else if ((opts->x_ix86_fpmath & FPMATH_387)
4491 && !TARGET_80387_P (opts->x_target_flags))
4493 warning (0, "387 instruction set disabled, using SSE arithmetics");
4494 opts->x_ix86_fpmath = FPMATH_SSE;
4498 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4499 -mfpmath=387. The latter is nevertheless the default on many targets,
4500 since the extra 80-bit precision of temporaries is considered part of
4501 the ABI. Overwrite the default at least for -ffast-math.
4502 TODO: -mfpmath=both seems to produce equally performing code with
4503 slightly smaller binaries. It is however not clear whether register
4504 allocation is ready for this setting.
4505 Also -mfpmath=387 codegen is overall considerably more compact (about
4506 4-5%) than SSE codegen. We may want to switch to 387 with -ffast-math
4507 for size-optimized functions. */
4508 else if (fast_math_flags_set_p (&global_options)
4509 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4510 opts->x_ix86_fpmath = FPMATH_SSE;
4511 else
4512 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4514 /* Use external vectorized library in vectorizing intrinsics. */
4515 if (opts_set->x_ix86_veclibabi_type)
4516 switch (opts->x_ix86_veclibabi_type)
4518 case ix86_veclibabi_type_svml:
4519 ix86_veclib_handler = ix86_veclibabi_svml;
4520 break;
4522 case ix86_veclibabi_type_acml:
4523 ix86_veclib_handler = ix86_veclibabi_acml;
4524 break;
4526 default:
4527 gcc_unreachable ();
4530 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4531 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4532 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4534 /* If stack probes are required, the space used for large function
4535 arguments on the stack must also be probed, so enable
4536 -maccumulate-outgoing-args so this happens in the prologue. */
4537 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4538 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4540 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4541 warning (0,
4542 main_args_p
4543 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4544 "for correctness")
4545 : G_("stack probing requires "
4546 "%<target(\"accumulate-outgoing-args\")%> for "
4547 "correctness"));
4548 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4551 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4552 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4553 if (fixed_regs[BP_REG]
4554 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4556 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4557 warning (0,
4558 main_args_p
4559 ? G_("fixed ebp register requires "
4560 "%<-maccumulate-outgoing-args%>")
4561 : G_("fixed ebp register requires "
4562 "%<target(\"accumulate-outgoing-args\")%>"));
4563 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4566 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4568 char *p;
4569 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4570 p = strchr (internal_label_prefix, 'X');
4571 internal_label_prefix_len = p - internal_label_prefix;
4572 *p = '\0';
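/* For instance, with the common ELF definition of
   ASM_GENERATE_INTERNAL_LABEL the buffer ends up holding something
   like "*.LX0"; chopping at the 'X' leaves "*.L" as the prefix that
   later output routines compare against to recognize internal labels.  */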
4575 /* When a scheduling description is not available, disable the scheduler
4576 pass so it won't slow down compilation or make x87 code slower. */
4577 if (!TARGET_SCHEDULE)
4578 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4580 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4581 ix86_tune_cost->simultaneous_prefetches,
4582 opts->x_param_values,
4583 opts_set->x_param_values);
4584 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4585 ix86_tune_cost->prefetch_block,
4586 opts->x_param_values,
4587 opts_set->x_param_values);
4588 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4589 ix86_tune_cost->l1_cache_size,
4590 opts->x_param_values,
4591 opts_set->x_param_values);
4592 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4593 ix86_tune_cost->l2_cache_size,
4594 opts->x_param_values,
4595 opts_set->x_param_values);
4597 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4598 if (opts->x_flag_prefetch_loop_arrays < 0
4599 && HAVE_prefetch
4600 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4601 && !opts->x_optimize_size
4602 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4603 opts->x_flag_prefetch_loop_arrays = 1;
4605 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4606 can be optimized to ap = __builtin_next_arg (0). */
4607 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4608 targetm.expand_builtin_va_start = NULL;
4610 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4612 ix86_gen_leave = gen_leave_rex64;
4613 if (Pmode == DImode)
4615 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4616 ix86_gen_tls_local_dynamic_base_64
4617 = gen_tls_local_dynamic_base_64_di;
4619 else
4621 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4622 ix86_gen_tls_local_dynamic_base_64
4623 = gen_tls_local_dynamic_base_64_si;
4626 else
4627 ix86_gen_leave = gen_leave;
4629 if (Pmode == DImode)
4631 ix86_gen_add3 = gen_adddi3;
4632 ix86_gen_sub3 = gen_subdi3;
4633 ix86_gen_sub3_carry = gen_subdi3_carry;
4634 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4635 ix86_gen_andsp = gen_anddi3;
4636 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4637 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4638 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4639 ix86_gen_monitor = gen_sse3_monitor_di;
4640 ix86_gen_monitorx = gen_monitorx_di;
4641 ix86_gen_clzero = gen_clzero_di;
4643 else
4645 ix86_gen_add3 = gen_addsi3;
4646 ix86_gen_sub3 = gen_subsi3;
4647 ix86_gen_sub3_carry = gen_subsi3_carry;
4648 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4649 ix86_gen_andsp = gen_andsi3;
4650 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4651 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4652 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4653 ix86_gen_monitor = gen_sse3_monitor_si;
4654 ix86_gen_monitorx = gen_monitorx_si;
4655 ix86_gen_clzero = gen_clzero_si;
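/* The gen_* hooks above are selected by Pmode, not by TARGET_64BIT:
   a default x32 compilation (Pmode == SImode) therefore picks the
   *_si add/sub/stack-probe/monitor patterns even though the 64-bit
   ISA is enabled.  */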
4658 #ifdef USE_IX86_CLD
4659 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4660 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4661 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4662 #endif
4664 /* Set the default value for -mfentry. */
4665 if (!opts_set->x_flag_fentry)
4666 opts->x_flag_fentry = TARGET_SEH;
4667 else
4669 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4670 && opts->x_flag_fentry)
4671 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4672 "with -fpic");
4673 else if (TARGET_SEH && !opts->x_flag_fentry)
4674 sorry ("-mno-fentry isn%'t compatible with SEH");
4677 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4678 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4680 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4681 && TARGET_EMIT_VZEROUPPER)
4682 opts->x_target_flags |= MASK_VZEROUPPER;
4683 if (!(opts_set->x_target_flags & MASK_STV))
4684 opts->x_target_flags |= MASK_STV;
4685 /* Disable STV if -mpreferred-stack-boundary={2,3},
4686 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the
4687 required stack realignment is an extra cost the pass does not take
4688 into account, and the pass cannot realign the stack itself. */
4689 if (ix86_preferred_stack_boundary < 128
4690 || ix86_incoming_stack_boundary < 128
4691 || opts->x_ix86_force_align_arg_pointer)
4692 opts->x_target_flags &= ~MASK_STV;
4693 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4694 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4695 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4696 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4697 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4698 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4700 /* Enable 128-bit AVX instruction generation
4701 for the auto-vectorizer. */
4702 if (TARGET_AVX128_OPTIMAL
4703 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4704 opts->x_prefer_vector_width_type = PVW_AVX128;
4706 /* Use 256-bit AVX instruction generation
4707 in the auto-vectorizer. */
4708 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4709 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4710 opts->x_prefer_vector_width_type = PVW_AVX256;
4712 if (opts->x_ix86_recip_name)
4714 char *p = ASTRDUP (opts->x_ix86_recip_name);
4715 char *q;
4716 unsigned int mask, i;
4717 bool invert;
4719 while ((q = strtok (p, ",")) != NULL)
4721 p = NULL;
4722 if (*q == '!')
4724 invert = true;
4725 q++;
4727 else
4728 invert = false;
4730 if (!strcmp (q, "default"))
4731 mask = RECIP_MASK_ALL;
4732 else
4734 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4735 if (!strcmp (q, recip_options[i].string))
4737 mask = recip_options[i].mask;
4738 break;
4741 if (i == ARRAY_SIZE (recip_options))
4743 error ("unknown option for -mrecip=%s", q);
4744 invert = false;
4745 mask = RECIP_MASK_NONE;
4749 opts->x_recip_mask_explicit |= mask;
4750 if (invert)
4751 opts->x_recip_mask &= ~mask;
4752 else
4753 opts->x_recip_mask |= mask;
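/* Example of the syntax accepted above: with -mrecip=default,!X the
   "default" token first sets every bit in RECIP_MASK_ALL and "!X"
   then clears the bit named by X, where X has to be one of the names
   in the recip_options[] table defined earlier in this file.  */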
4757 if (TARGET_RECIP_P (opts->x_target_flags))
4758 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4759 else if (opts_set->x_target_flags & MASK_RECIP)
4760 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4762 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4763 for 64-bit Bionic. Also default long double to 64-bit for Intel
4764 MCU psABI. */
4765 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4766 && !(opts_set->x_target_flags
4767 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4768 opts->x_target_flags |= (TARGET_64BIT
4769 ? MASK_LONG_DOUBLE_128
4770 : MASK_LONG_DOUBLE_64);
4772 /* Only one of them can be active. */
4773 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4774 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4776 /* Handle stack protector */
4777 if (!opts_set->x_ix86_stack_protector_guard)
4778 opts->x_ix86_stack_protector_guard
4779 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4781 #ifdef TARGET_THREAD_SSP_OFFSET
4782 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4783 #endif
4785 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4787 char *endp;
4788 const char *str = ix86_stack_protector_guard_offset_str;
4790 errno = 0;
4791 int64_t offset;
4793 #if defined(INT64_T_IS_LONG)
4794 offset = strtol (str, &endp, 0);
4795 #else
4796 offset = strtoll (str, &endp, 0);
4797 #endif
4799 if (!*str || *endp || errno)
4800 error ("%qs is not a valid number "
4801 "in -mstack-protector-guard-offset=", str);
4803 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4804 HOST_WIDE_INT_C (0x7fffffff)))
4805 error ("%qs is not a valid offset "
4806 "in -mstack-protector-guard-offset=", str);
4808 ix86_stack_protector_guard_offset = offset;
4811 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4813 /* The kernel uses a different segment register for performance
4814 reasons; this way a system call does not have to trash the userspace
4815 segment register, which would be expensive. */
4816 if (ix86_cmodel == CM_KERNEL)
4817 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4819 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4821 const char *str = ix86_stack_protector_guard_reg_str;
4822 addr_space_t seg = ADDR_SPACE_GENERIC;
4824 /* Discard optional register prefix. */
4825 if (str[0] == '%')
4826 str++;
4828 if (strlen (str) == 2 && str[1] == 's')
4830 if (str[0] == 'f')
4831 seg = ADDR_SPACE_SEG_FS;
4832 else if (str[0] == 'g')
4833 seg = ADDR_SPACE_SEG_GS;
4836 if (seg == ADDR_SPACE_GENERIC)
4837 error ("%qs is not a valid base register "
4838 "in -mstack-protector-guard-reg=",
4839 ix86_stack_protector_guard_reg_str);
4841 ix86_stack_protector_guard_reg = seg;
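/* Putting the two knobs together, a freestanding/kernel-style build
   might for example pass "-mstack-protector-guard=tls
   -mstack-protector-guard-reg=gs -mstack-protector-guard-offset=40"
   to load the canary from %gs:40 instead of the default TLS slot;
   the exact offset is whatever the runtime reserves for it.  */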
4844 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4845 if (opts->x_ix86_tune_memcpy_strategy)
4847 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4848 ix86_parse_stringop_strategy_string (str, false);
4849 free (str);
4852 if (opts->x_ix86_tune_memset_strategy)
4854 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4855 ix86_parse_stringop_strategy_string (str, true);
4856 free (str);
4859 /* Save the initial options in case the user does function specific
4860 options. */
4861 if (main_args_p)
4862 target_option_default_node = target_option_current_node
4863 = build_target_option_node (opts);
4865 /* Do not support control flow instrumentation if CET is not enabled. */
4866 if (opts->x_flag_cf_protection != CF_NONE)
4868 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4869 || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
4871 if (flag_cf_protection == CF_FULL)
4873 error ("%<-fcf-protection=full%> requires CET support "
4874 "on this target. Use -mcet or one of -mibt, "
4875 "-mshstk options to enable CET");
4877 else if (flag_cf_protection == CF_BRANCH)
4879 error ("%<-fcf-protection=branch%> requires CET support "
4880 "on this target. Use -mcet or one of -mibt, "
4881 "-mshstk options to enable CET");
4883 else if (flag_cf_protection == CF_RETURN)
4885 error ("%<-fcf-protection=return%> requires CET support "
4886 "on this target. Use -mcet or one of -mibt, "
4887 "-mshstk options to enable CET");
4889 flag_cf_protection = CF_NONE;
4890 return false;
4892 opts->x_flag_cf_protection =
4893 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4896 return true;
4899 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4901 static void
4902 ix86_option_override (void)
4904 ix86_option_override_internal (true, &global_options, &global_options_set);
4907 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4908 static char *
4909 ix86_offload_options (void)
4911 if (TARGET_LP64)
4912 return xstrdup ("-foffload-abi=lp64");
4913 return xstrdup ("-foffload-abi=ilp32");
4916 /* Update register usage after having seen the compiler flags. */
4918 static void
4919 ix86_conditional_register_usage (void)
4921 int i, c_mask;
4923 /* If there are no caller-saved registers, preserve all registers
4924 except fixed_regs and the registers used for the function return
4925 value, since aggregate_value_p checks call_used_regs[regno] on the
4926 return value. */
4927 if (cfun && cfun->machine->no_caller_saved_registers)
4928 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4929 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4930 call_used_regs[i] = 0;
4932 /* For 32-bit targets, squash the REX registers. */
4933 if (! TARGET_64BIT)
4935 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4936 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4937 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4938 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4939 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4940 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4943 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4944 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4946 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4948 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4950 /* Set/reset conditionally defined registers from
4951 CALL_USED_REGISTERS initializer. */
4952 if (call_used_regs[i] > 1)
4953 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4955 /* Build the CLOBBERED_REGS register set as the call-used
4956 registers of the GENERAL_REGS register set. */
4957 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4958 && call_used_regs[i])
4959 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4962 /* If MMX is disabled, squash the registers. */
4963 if (! TARGET_MMX)
4964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4966 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4968 /* If SSE is disabled, squash the registers. */
4969 if (! TARGET_SSE)
4970 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4971 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4972 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4974 /* If the FPU is disabled, squash the registers. */
4975 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4976 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4977 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4978 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4980 /* If AVX512F is disabled, squash the registers. */
4981 if (! TARGET_AVX512F)
4983 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4984 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4986 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4987 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4990 /* If MPX is disabled, squash the registers. */
4991 if (! TARGET_MPX)
4992 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4993 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4996 /* Canonicalize a comparison from one we don't have to one we do have. */
4998 static void
4999 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5000 bool op0_preserve_value)
5002 /* The order of operands in an x87 ficom compare is forced by combine in
5003 the simplify_comparison () function. The FLOAT operator is treated as
5004 RTX_OBJ with precedence over other operators and is always placed
5005 first. Swap the condition and operands to match the ficom instruction. */
5006 if (!op0_preserve_value
5007 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5009 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5011 /* We are called only for compares that are split to SAHF instruction.
5012 Ensure that we have setcc/jcc insn for the swapped condition. */
5013 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5015 std::swap (*op0, *op1);
5016 *code = (int) scode;
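/* Concretely: a compare such as (compare (float (mem)) (reg)) is
   rewritten here as (compare (reg) (float (mem))) with the condition
   swapped (LT becomes GT, and so on), so that the memory operand ends
   up in the position the ficom pattern expects.  */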
5021 /* Save the current options */
5023 static void
5024 ix86_function_specific_save (struct cl_target_option *ptr,
5025 struct gcc_options *opts)
5027 ptr->arch = ix86_arch;
5028 ptr->schedule = ix86_schedule;
5029 ptr->prefetch_sse = x86_prefetch_sse;
5030 ptr->tune = ix86_tune;
5031 ptr->branch_cost = ix86_branch_cost;
5032 ptr->tune_defaulted = ix86_tune_defaulted;
5033 ptr->arch_specified = ix86_arch_specified;
5034 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5035 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5036 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5037 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5038 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5039 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5040 ptr->x_ix86_abi = opts->x_ix86_abi;
5041 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5042 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5043 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5044 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5045 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5046 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5047 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5048 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5049 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5050 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5051 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5052 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5053 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5054 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5055 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5056 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5057 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5058 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5059 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5060 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5062 /* The fields are char but the variables are not; make sure the
5063 values fit in the fields. */
5064 gcc_assert (ptr->arch == ix86_arch);
5065 gcc_assert (ptr->schedule == ix86_schedule);
5066 gcc_assert (ptr->tune == ix86_tune);
5067 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5070 /* Restore the current options */
5072 static void
5073 ix86_function_specific_restore (struct gcc_options *opts,
5074 struct cl_target_option *ptr)
5076 enum processor_type old_tune = ix86_tune;
5077 enum processor_type old_arch = ix86_arch;
5078 unsigned int ix86_arch_mask;
5079 int i;
5081 /* We don't change -fPIC. */
5082 opts->x_flag_pic = flag_pic;
5084 ix86_arch = (enum processor_type) ptr->arch;
5085 ix86_schedule = (enum attr_cpu) ptr->schedule;
5086 ix86_tune = (enum processor_type) ptr->tune;
5087 x86_prefetch_sse = ptr->prefetch_sse;
5088 opts->x_ix86_branch_cost = ptr->branch_cost;
5089 ix86_tune_defaulted = ptr->tune_defaulted;
5090 ix86_arch_specified = ptr->arch_specified;
5091 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5092 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5093 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5094 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5095 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5096 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5097 opts->x_ix86_abi = ptr->x_ix86_abi;
5098 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5099 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5100 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5101 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5102 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5103 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5104 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5105 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5106 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5107 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5108 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5109 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5110 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5111 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5112 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5113 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5114 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5115 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5116 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5117 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5118 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5119 /* TODO: ix86_cost should be chosen at instruction or function granularity
5120 so that for cold code we can use size_cost even in !optimize_size compilations. */
5121 if (opts->x_optimize_size)
5122 ix86_cost = &ix86_size_cost;
5123 else
5124 ix86_cost = ix86_tune_cost;
5126 /* Recreate the arch feature tests if the arch changed */
5127 if (old_arch != ix86_arch)
5129 ix86_arch_mask = 1u << ix86_arch;
5130 for (i = 0; i < X86_ARCH_LAST; ++i)
5131 ix86_arch_features[i]
5132 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5135 /* Recreate the tune optimization tests */
5136 if (old_tune != ix86_tune)
5137 set_ix86_tune_features (ix86_tune, false);
5140 /* Adjust target options after streaming them in. This is mainly about
5141 reconciling them with global options. */
5143 static void
5144 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5146 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5147 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5148 for PIC, or error out. */
5149 if (flag_pic)
5150 switch (ptr->x_ix86_cmodel)
5152 case CM_SMALL:
5153 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5154 break;
5156 case CM_MEDIUM:
5157 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5158 break;
5160 case CM_LARGE:
5161 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5162 break;
5164 case CM_KERNEL:
5165 error ("code model %s does not support PIC mode", "kernel");
5166 break;
5168 default:
5169 break;
5171 else
5172 switch (ptr->x_ix86_cmodel)
5174 case CM_SMALL_PIC:
5175 ptr->x_ix86_cmodel = CM_SMALL;
5176 break;
5178 case CM_MEDIUM_PIC:
5179 ptr->x_ix86_cmodel = CM_MEDIUM;
5180 break;
5182 case CM_LARGE_PIC:
5183 ptr->x_ix86_cmodel = CM_LARGE;
5184 break;
5186 default:
5187 break;
5191 /* Print the current options */
5193 static void
5194 ix86_function_specific_print (FILE *file, int indent,
5195 struct cl_target_option *ptr)
5197 char *target_string
5198 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5199 ptr->x_target_flags, ptr->x_ix86_target_flags,
5200 NULL, NULL, ptr->x_ix86_fpmath, false);
5202 gcc_assert (ptr->arch < PROCESSOR_max);
5203 fprintf (file, "%*sarch = %d (%s)\n",
5204 indent, "",
5205 ptr->arch, processor_target_table[ptr->arch].name);
5207 gcc_assert (ptr->tune < PROCESSOR_max);
5208 fprintf (file, "%*stune = %d (%s)\n",
5209 indent, "",
5210 ptr->tune, processor_target_table[ptr->tune].name);
5212 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5214 if (target_string)
5216 fprintf (file, "%*s%s\n", indent, "", target_string);
5217 free (target_string);
5222 /* Inner function to process the attribute ((target (...))); take an
5223 argument and set the current options from that argument. If we have
5224 a list, recursively go over the list. */
5226 static bool
5227 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5228 struct gcc_options *opts,
5229 struct gcc_options *opts_set,
5230 struct gcc_options *enum_opts_set)
5232 char *next_optstr;
5233 bool ret = true;
5235 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5236 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5237 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5238 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5239 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5241 enum ix86_opt_type
5243 ix86_opt_unknown,
5244 ix86_opt_yes,
5245 ix86_opt_no,
5246 ix86_opt_str,
5247 ix86_opt_enum,
5248 ix86_opt_isa
5251 static const struct
5253 const char *string;
5254 size_t len;
5255 enum ix86_opt_type type;
5256 int opt;
5257 int mask;
5258 } attrs[] = {
5259 /* isa options */
5260 IX86_ATTR_ISA ("sgx", OPT_msgx),
5261 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5262 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5263 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5264 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5265 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5267 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5268 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5269 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5270 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5271 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5272 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5273 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5274 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5275 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5276 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5277 IX86_ATTR_ISA ("fma", OPT_mfma),
5278 IX86_ATTR_ISA ("xop", OPT_mxop),
5279 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5280 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5281 IX86_ATTR_ISA ("avx", OPT_mavx),
5282 IX86_ATTR_ISA ("sse4", OPT_msse4),
5283 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5284 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5285 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5286 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5287 IX86_ATTR_ISA ("sse3", OPT_msse3),
5288 IX86_ATTR_ISA ("aes", OPT_maes),
5289 IX86_ATTR_ISA ("sha", OPT_msha),
5290 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5291 IX86_ATTR_ISA ("sse2", OPT_msse2),
5292 IX86_ATTR_ISA ("sse", OPT_msse),
5293 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5294 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5295 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5296 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5297 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5298 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5299 IX86_ATTR_ISA ("adx", OPT_madx),
5300 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5301 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5302 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5303 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5304 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5305 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5306 IX86_ATTR_ISA ("abm", OPT_mabm),
5307 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5308 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5309 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5310 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5311 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5312 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5313 IX86_ATTR_ISA ("sahf", OPT_msahf),
5314 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5315 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5316 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5317 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5318 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5319 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5320 IX86_ATTR_ISA ("pku", OPT_mpku),
5321 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5322 IX86_ATTR_ISA ("hle", OPT_mhle),
5323 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5324 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5325 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5326 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5327 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5328 IX86_ATTR_ISA ("ibt", OPT_mibt),
5329 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5330 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5332 /* enum options */
5333 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5335 /* string options */
5336 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5337 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5339 /* flag options */
5340 IX86_ATTR_YES ("cld",
5341 OPT_mcld,
5342 MASK_CLD),
5344 IX86_ATTR_NO ("fancy-math-387",
5345 OPT_mfancy_math_387,
5346 MASK_NO_FANCY_MATH_387),
5348 IX86_ATTR_YES ("ieee-fp",
5349 OPT_mieee_fp,
5350 MASK_IEEE_FP),
5352 IX86_ATTR_YES ("inline-all-stringops",
5353 OPT_minline_all_stringops,
5354 MASK_INLINE_ALL_STRINGOPS),
5356 IX86_ATTR_YES ("inline-stringops-dynamically",
5357 OPT_minline_stringops_dynamically,
5358 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5360 IX86_ATTR_NO ("align-stringops",
5361 OPT_mno_align_stringops,
5362 MASK_NO_ALIGN_STRINGOPS),
5364 IX86_ATTR_YES ("recip",
5365 OPT_mrecip,
5366 MASK_RECIP),
5370 /* If this is a list, recurse to get the options. */
5371 if (TREE_CODE (args) == TREE_LIST)
5373 bool ret = true;
5375 for (; args; args = TREE_CHAIN (args))
5376 if (TREE_VALUE (args)
5377 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5378 p_strings, opts, opts_set,
5379 enum_opts_set))
5380 ret = false;
5382 return ret;
5385 else if (TREE_CODE (args) != STRING_CST)
5387 error ("attribute %<target%> argument not a string");
5388 return false;
5391 /* Handle multiple arguments separated by commas. */
5392 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
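/* For example, a declaration such as
     __attribute__((target ("arch=haswell,no-avx2")))
   arrives here as the single string "arch=haswell,no-avx2"; the loop
   below splits it at each comma and handles one option at a time.  */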
5394 while (next_optstr && *next_optstr != '\0')
5396 char *p = next_optstr;
5397 char *orig_p = p;
5398 char *comma = strchr (next_optstr, ',');
5399 const char *opt_string;
5400 size_t len, opt_len;
5401 int opt;
5402 bool opt_set_p;
5403 char ch;
5404 unsigned i;
5405 enum ix86_opt_type type = ix86_opt_unknown;
5406 int mask = 0;
5408 if (comma)
5410 *comma = '\0';
5411 len = comma - next_optstr;
5412 next_optstr = comma + 1;
5414 else
5416 len = strlen (p);
5417 next_optstr = NULL;
5420 /* Recognize no-xxx. */
5421 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5423 opt_set_p = false;
5424 p += 3;
5425 len -= 3;
5427 else
5428 opt_set_p = true;
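/* For instance, "no-avx2" reaches this point reduced to "avx2" with
   opt_set_p cleared, so the same table entry below is used to turn the
   ISA flag off rather than on.  */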
5430 /* Find the option. */
5431 ch = *p;
5432 opt = N_OPTS;
5433 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5435 type = attrs[i].type;
5436 opt_len = attrs[i].len;
5437 if (ch == attrs[i].string[0]
5438 && ((type != ix86_opt_str && type != ix86_opt_enum)
5439 ? len == opt_len
5440 : len > opt_len)
5441 && memcmp (p, attrs[i].string, opt_len) == 0)
5443 opt = attrs[i].opt;
5444 mask = attrs[i].mask;
5445 opt_string = attrs[i].string;
5446 break;
5450 /* Process the option. */
5451 if (opt == N_OPTS)
5453 error ("attribute(target(\"%s\")) is unknown", orig_p);
5454 ret = false;
5457 else if (type == ix86_opt_isa)
5459 struct cl_decoded_option decoded;
5461 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5462 ix86_handle_option (opts, opts_set,
5463 &decoded, input_location);
5466 else if (type == ix86_opt_yes || type == ix86_opt_no)
5468 if (type == ix86_opt_no)
5469 opt_set_p = !opt_set_p;
5471 if (opt_set_p)
5472 opts->x_target_flags |= mask;
5473 else
5474 opts->x_target_flags &= ~mask;
5477 else if (type == ix86_opt_str)
5479 if (p_strings[opt])
5481 error ("option(\"%s\") was already specified", opt_string);
5482 ret = false;
5484 else
5485 p_strings[opt] = xstrdup (p + opt_len);
5488 else if (type == ix86_opt_enum)
5490 bool arg_ok;
5491 int value;
5493 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5494 if (arg_ok)
5495 set_option (opts, enum_opts_set, opt, value,
5496 p + opt_len, DK_UNSPECIFIED, input_location,
5497 global_dc);
5498 else
5500 error ("attribute(target(\"%s\")) is unknown", orig_p);
5501 ret = false;
5505 else
5506 gcc_unreachable ();
5509 return ret;
5512 /* Release allocated strings. */
5513 static void
5514 release_options_strings (char **option_strings)
5516 /* Free up memory allocated to hold the strings */
5517 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5518 free (option_strings[i]);
5521 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5523 tree
5524 ix86_valid_target_attribute_tree (tree args,
5525 struct gcc_options *opts,
5526 struct gcc_options *opts_set)
5528 const char *orig_arch_string = opts->x_ix86_arch_string;
5529 const char *orig_tune_string = opts->x_ix86_tune_string;
5530 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5531 int orig_tune_defaulted = ix86_tune_defaulted;
5532 int orig_arch_specified = ix86_arch_specified;
5533 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5534 tree t = NULL_TREE;
5535 struct cl_target_option *def
5536 = TREE_TARGET_OPTION (target_option_default_node);
5537 struct gcc_options enum_opts_set;
5539 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5541 /* Process each of the options on the chain. */
5542 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5543 opts_set, &enum_opts_set))
5544 return error_mark_node;
5546 /* If the changed options are different from the default, rerun
5547 ix86_option_override_internal, and then save the options away.
5548 The string options are attribute options, and will be undone
5549 when we copy the save structure. */
5550 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5551 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5552 || opts->x_target_flags != def->x_target_flags
5553 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5554 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5555 || enum_opts_set.x_ix86_fpmath)
5557 /* If we are using the default tune= or arch=, undo the string assigned,
5558 and use the default. */
5559 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5561 opts->x_ix86_arch_string
5562 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5564 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5565 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5566 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5567 | OPTION_MASK_ABI_64
5568 | OPTION_MASK_ABI_X32
5569 | OPTION_MASK_CODE16);
5570 opts->x_ix86_isa_flags2 = 0;
5572 else if (!orig_arch_specified)
5573 opts->x_ix86_arch_string = NULL;
5575 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5576 opts->x_ix86_tune_string
5577 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5578 else if (orig_tune_defaulted)
5579 opts->x_ix86_tune_string = NULL;
5581 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5582 if (enum_opts_set.x_ix86_fpmath)
5583 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5585 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5586 bool r = ix86_option_override_internal (false, opts, opts_set);
5587 if (!r)
5589 release_options_strings (option_strings);
5590 return error_mark_node;
5593 /* Add any builtin functions with the new isa if any. */
5594 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5596 /* Save the current options unless we are validating options for
5597 #pragma. */
5598 t = build_target_option_node (opts);
5600 opts->x_ix86_arch_string = orig_arch_string;
5601 opts->x_ix86_tune_string = orig_tune_string;
5602 opts_set->x_ix86_fpmath = orig_fpmath_set;
5604 release_options_strings (option_strings);
5607 return t;
5610 /* Hook to validate attribute((target("string"))). */
5612 static bool
5613 ix86_valid_target_attribute_p (tree fndecl,
5614 tree ARG_UNUSED (name),
5615 tree args,
5616 int ARG_UNUSED (flags))
5618 struct gcc_options func_options;
5619 tree new_target, new_optimize;
5620 bool ret = true;
5622 /* attribute((target("default"))) does nothing, beyond
5623 affecting multi-versioning. */
5624 if (TREE_VALUE (args)
5625 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5626 && TREE_CHAIN (args) == NULL_TREE
5627 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5628 return true;
5630 tree old_optimize = build_optimization_node (&global_options);
5632 /* Get the optimization options of the current function. */
5633 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5635 if (!func_optimize)
5636 func_optimize = old_optimize;
5638 /* Init func_options. */
5639 memset (&func_options, 0, sizeof (func_options));
5640 init_options_struct (&func_options, NULL);
5641 lang_hooks.init_options_struct (&func_options);
5643 cl_optimization_restore (&func_options,
5644 TREE_OPTIMIZATION (func_optimize));
5646 /* Initialize func_options to the default before its target options can
5647 be set. */
5648 cl_target_option_restore (&func_options,
5649 TREE_TARGET_OPTION (target_option_default_node));
5651 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5652 &global_options_set);
5654 new_optimize = build_optimization_node (&func_options);
5656 if (new_target == error_mark_node)
5657 ret = false;
5659 else if (fndecl && new_target)
5661 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5663 if (old_optimize != new_optimize)
5664 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5667 finalize_options_struct (&func_options);
5669 return ret;
5673 /* Hook to determine if one function can safely inline another. */
5675 static bool
5676 ix86_can_inline_p (tree caller, tree callee)
5678 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5679 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5680 if (!callee_tree)
5681 callee_tree = target_option_default_node;
5682 if (!caller_tree)
5683 caller_tree = target_option_default_node;
5684 if (callee_tree == caller_tree)
5685 return true;
5687 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5688 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5689 bool ret = false;
5691 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
5692 function can inline an SSE2 function but an SSE2 function can't inline
5693 an SSE4 function. */
5694 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5695 != callee_opts->x_ix86_isa_flags)
5696 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5697 != callee_opts->x_ix86_isa_flags2))
5698 ret = false;
5700 /* See if we have the same non-isa options. */
5701 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5702 ret = false;
5704 /* See if arch, tune, etc. are the same. */
5705 else if (caller_opts->arch != callee_opts->arch)
5706 ret = false;
5708 else if (caller_opts->tune != callee_opts->tune)
5709 ret = false;
5711 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5712 /* If the callee doesn't use FP expressions, differences in
5713 ix86_fpmath can be ignored. We are called from FEs
5714 for multi-versioning call optimization, so beware of
5715 ipa_fn_summaries not available. */
5716 && (! ipa_fn_summaries
5717 || ipa_fn_summaries->get
5718 (cgraph_node::get (callee))->fp_expressions))
5719 ret = false;
5721 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5722 ret = false;
5724 else
5725 ret = true;
5727 return ret;
5731 /* Remember the last target of ix86_set_current_function. */
5732 static GTY(()) tree ix86_previous_fndecl;
5734 /* Set targets globals to the default (or current #pragma GCC target
5735 if active). Invalidate ix86_previous_fndecl cache. */
5737 void
5738 ix86_reset_previous_fndecl (void)
5740 tree new_tree = target_option_current_node;
5741 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5742 if (TREE_TARGET_GLOBALS (new_tree))
5743 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5744 else if (new_tree == target_option_default_node)
5745 restore_target_globals (&default_target_globals);
5746 else
5747 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5748 ix86_previous_fndecl = NULL_TREE;
5751 /* Set the func_type field from the function FNDECL. */
5753 static void
5754 ix86_set_func_type (tree fndecl)
5756 if (cfun->machine->func_type == TYPE_UNKNOWN)
5758 if (lookup_attribute ("interrupt",
5759 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5761 if (ix86_function_naked (fndecl))
5762 error_at (DECL_SOURCE_LOCATION (fndecl),
5763 "interrupt and naked attributes are not compatible");
5765 int nargs = 0;
5766 for (tree arg = DECL_ARGUMENTS (fndecl);
5767 arg;
5768 arg = TREE_CHAIN (arg))
5769 nargs++;
5770 cfun->machine->no_caller_saved_registers = true;
5771 cfun->machine->func_type
5772 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
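/* As an illustration: a handler declared as
     void fn (struct interrupt_frame *)
   has one argument and is treated as TYPE_INTERRUPT, while one declared as
     void fn (struct interrupt_frame *, uword_t error_code)
   has two and is treated as TYPE_EXCEPTION, matching the documented use
   of the interrupt attribute.  */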
5774 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5776 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5777 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5778 sorry ("Only DWARF debug format is supported for interrupt "
5779 "service routine.");
5781 else
5783 cfun->machine->func_type = TYPE_NORMAL;
5784 if (lookup_attribute ("no_caller_saved_registers",
5785 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5786 cfun->machine->no_caller_saved_registers = true;
5791 /* Establish appropriate back-end context for processing the function
5792 FNDECL. The argument might be NULL to indicate processing at top
5793 level, outside of any function scope. */
5794 static void
5795 ix86_set_current_function (tree fndecl)
5797 /* Only change the context if the function changes. This hook is called
5798 several times in the course of compiling a function, and we don't want to
5799 slow things down too much or call target_reinit when it isn't safe. */
5800 if (fndecl == ix86_previous_fndecl)
5802 /* There may be 2 function bodies for the same function FNDECL,
5803 one is extern inline and one isn't. Call ix86_set_func_type
5804 to set the func_type field. */
5805 if (fndecl != NULL_TREE)
5806 ix86_set_func_type (fndecl);
5807 return;
5810 tree old_tree;
5811 if (ix86_previous_fndecl == NULL_TREE)
5812 old_tree = target_option_current_node;
5813 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5814 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5815 else
5816 old_tree = target_option_default_node;
5818 if (fndecl == NULL_TREE)
5820 if (old_tree != target_option_current_node)
5821 ix86_reset_previous_fndecl ();
5822 return;
5825 ix86_set_func_type (fndecl);
5827 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5828 if (new_tree == NULL_TREE)
5829 new_tree = target_option_default_node;
5831 if (old_tree != new_tree)
5833 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5834 if (TREE_TARGET_GLOBALS (new_tree))
5835 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5836 else if (new_tree == target_option_default_node)
5837 restore_target_globals (&default_target_globals);
5838 else
5839 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5841 ix86_previous_fndecl = fndecl;
5843 static bool prev_no_caller_saved_registers;
5845 /* 64-bit MS and SYSV ABI have different set of call used registers.
5846 Avoid expensive re-initialization of init_regs each time we switch
5847 function context. */
5848 if (TARGET_64BIT
5849 && (call_used_regs[SI_REG]
5850 == (cfun->machine->call_abi == MS_ABI)))
5851 reinit_regs ();
5852 /* Need to re-initialize init_regs if caller-saved registers are
5853 changed. */
5854 else if (prev_no_caller_saved_registers
5855 != cfun->machine->no_caller_saved_registers)
5856 reinit_regs ();
5858 if (cfun->machine->func_type != TYPE_NORMAL
5859 || cfun->machine->no_caller_saved_registers)
5861 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5862 may change processor state. */
5863 const char *isa;
5864 if (TARGET_MPX)
5865 isa = "MPX";
5866 else if (TARGET_SSE)
5867 isa = "SSE";
5868 else if (TARGET_MMX)
5869 isa = "MMX/3Dnow";
5870 else if (TARGET_80387)
5871 isa = "80387";
5872 else
5873 isa = NULL;
5874 if (isa != NULL)
5876 if (cfun->machine->func_type != TYPE_NORMAL)
5877 sorry ("%s instructions aren't allowed in %s service routine",
5878 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5879 ? "exception" : "interrupt"));
5880 else
5881 sorry ("%s instructions aren't allowed in function with "
5882 "no_caller_saved_registers attribute", isa);
5883 /* Don't issue the same error twice. */
5884 cfun->machine->func_type = TYPE_NORMAL;
5885 cfun->machine->no_caller_saved_registers = false;
5889 prev_no_caller_saved_registers
5890 = cfun->machine->no_caller_saved_registers;
5894 /* Return true if this goes in large data/bss. */
5896 static bool
5897 ix86_in_large_data_p (tree exp)
5899 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5900 return false;
5902 if (exp == NULL_TREE)
5903 return false;
5905 /* Functions are never large data. */
5906 if (TREE_CODE (exp) == FUNCTION_DECL)
5907 return false;
5909 /* Automatic variables are never large data. */
5910 if (VAR_P (exp) && !is_global_var (exp))
5911 return false;
5913 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5915 const char *section = DECL_SECTION_NAME (exp);
5916 if (strcmp (section, ".ldata") == 0
5917 || strcmp (section, ".lbss") == 0)
5918 return true;
5919 return false;
5921 else
5923 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5925 /* If this is an incomplete type with size 0, then we can't put it
5926 in data because it might be too big when completed. Also,
5927 int_size_in_bytes returns -1 if size can vary or is larger than
5928 an integer, in which case it is also safer to assume that it goes in
5929 large data. */
5930 if (size <= 0 || size > ix86_section_threshold)
5931 return true;
5934 return false;
5937 /* i386-specific section flag to mark large sections. */
5938 #define SECTION_LARGE SECTION_MACH_DEP
5940 /* Switch to the appropriate section for output of DECL.
5941 DECL is either a `VAR_DECL' node or a constant of some sort.
5942 RELOC indicates whether forming the initial value of DECL requires
5943 link-time relocations. */
5945 ATTRIBUTE_UNUSED static section *
5946 x86_64_elf_select_section (tree decl, int reloc,
5947 unsigned HOST_WIDE_INT align)
5949 if (ix86_in_large_data_p (decl))
5951 const char *sname = NULL;
5952 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5953 switch (categorize_decl_for_section (decl, reloc))
5955 case SECCAT_DATA:
5956 sname = ".ldata";
5957 break;
5958 case SECCAT_DATA_REL:
5959 sname = ".ldata.rel";
5960 break;
5961 case SECCAT_DATA_REL_LOCAL:
5962 sname = ".ldata.rel.local";
5963 break;
5964 case SECCAT_DATA_REL_RO:
5965 sname = ".ldata.rel.ro";
5966 break;
5967 case SECCAT_DATA_REL_RO_LOCAL:
5968 sname = ".ldata.rel.ro.local";
5969 break;
5970 case SECCAT_BSS:
5971 sname = ".lbss";
5972 flags |= SECTION_BSS;
5973 break;
5974 case SECCAT_RODATA:
5975 case SECCAT_RODATA_MERGE_STR:
5976 case SECCAT_RODATA_MERGE_STR_INIT:
5977 case SECCAT_RODATA_MERGE_CONST:
5978 sname = ".lrodata";
5979 flags &= ~SECTION_WRITE;
5980 break;
5981 case SECCAT_SRODATA:
5982 case SECCAT_SDATA:
5983 case SECCAT_SBSS:
5984 gcc_unreachable ();
5985 case SECCAT_TEXT:
5986 case SECCAT_TDATA:
5987 case SECCAT_TBSS:
5988 /* We don't split these for the medium model. Place them into
5989 default sections and hope for the best. */
5990 break;
5992 if (sname)
5994 /* We might get called with string constants, but get_named_section
5995 doesn't like them as they are not DECLs. Also, we need to set
5996 flags in that case. */
5997 if (!DECL_P (decl))
5998 return get_section (sname, flags, NULL);
5999 return get_named_section (decl, sname, reloc);
6002 return default_elf_select_section (decl, reloc, align);
6005 /* Select a set of attributes for section NAME based on the properties
6006 of DECL and whether or not RELOC indicates that DECL's initializer
6007 might contain runtime relocations. */
6009 static unsigned int ATTRIBUTE_UNUSED
6010 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6012 unsigned int flags = default_section_type_flags (decl, name, reloc);
6014 if (ix86_in_large_data_p (decl))
6015 flags |= SECTION_LARGE;
6017 if (decl == NULL_TREE
6018 && (strcmp (name, ".ldata.rel.ro") == 0
6019 || strcmp (name, ".ldata.rel.ro.local") == 0))
6020 flags |= SECTION_RELRO;
6022 if (strcmp (name, ".lbss") == 0
6023 || strncmp (name, ".lbss.", 6) == 0
6024 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6025 flags |= SECTION_BSS;
6027 return flags;
6030 /* Build up a unique section name, expressed as a
6031 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6032 RELOC indicates whether the initial value of EXP requires
6033 link-time relocations. */
6035 static void ATTRIBUTE_UNUSED
6036 x86_64_elf_unique_section (tree decl, int reloc)
6038 if (ix86_in_large_data_p (decl))
6040 const char *prefix = NULL;
6041 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6042 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6044 switch (categorize_decl_for_section (decl, reloc))
6046 case SECCAT_DATA:
6047 case SECCAT_DATA_REL:
6048 case SECCAT_DATA_REL_LOCAL:
6049 case SECCAT_DATA_REL_RO:
6050 case SECCAT_DATA_REL_RO_LOCAL:
6051 prefix = one_only ? ".ld" : ".ldata";
6052 break;
6053 case SECCAT_BSS:
6054 prefix = one_only ? ".lb" : ".lbss";
6055 break;
6056 case SECCAT_RODATA:
6057 case SECCAT_RODATA_MERGE_STR:
6058 case SECCAT_RODATA_MERGE_STR_INIT:
6059 case SECCAT_RODATA_MERGE_CONST:
6060 prefix = one_only ? ".lr" : ".lrodata";
6061 break;
6062 case SECCAT_SRODATA:
6063 case SECCAT_SDATA:
6064 case SECCAT_SBSS:
6065 gcc_unreachable ();
6066 case SECCAT_TEXT:
6067 case SECCAT_TDATA:
6068 case SECCAT_TBSS:
6069 /* We don't split these for the medium model. Place them into
6070 default sections and hope for the best. */
6071 break;
6073 if (prefix)
6075 const char *name, *linkonce;
6076 char *string;
6078 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6079 name = targetm.strip_name_encoding (name);
6081 /* If we're using one_only, then there needs to be a .gnu.linkonce
6082 prefix to the section name. */
6083 linkonce = one_only ? ".gnu.linkonce" : "";
6085 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6087 set_decl_section_name (decl, string);
6088 return;
6091 default_unique_section (decl, reloc);
6094 #ifdef COMMON_ASM_OP
6096 #ifndef LARGECOMM_SECTION_ASM_OP
6097 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6098 #endif
6100 /* This says how to output assembler code to declare an
6101 uninitialized external linkage data object.
6103 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
6104 large objects. */
6105 void
6106 x86_elf_aligned_decl_common (FILE *file, tree decl,
6107 const char *name, unsigned HOST_WIDE_INT size,
6108 int align)
6110 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6111 && size > (unsigned int)ix86_section_threshold)
6113 switch_to_section (get_named_section (decl, ".lbss", 0));
6114 fputs (LARGECOMM_SECTION_ASM_OP, file);
6116 else
6117 fputs (COMMON_ASM_OP, file);
6118 assemble_name (file, name);
6119 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6120 size, align / BITS_PER_UNIT);
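/* For a 1 MiB variable under -mcmodel=medium the code above emits, roughly,
	.largecomm	buf,1048576,32
   (the exact alignment depends on DATA_ALIGNMENT), while smaller objects
   keep using the plain COMMON_ASM_OP (.comm) directive.  */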
6122 #endif
6124 /* Utility function for targets to use in implementing
6125 ASM_OUTPUT_ALIGNED_BSS. */
6127 void
6128 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6129 unsigned HOST_WIDE_INT size, int align)
6131 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6132 && size > (unsigned int)ix86_section_threshold)
6133 switch_to_section (get_named_section (decl, ".lbss", 0));
6134 else
6135 switch_to_section (bss_section);
6136 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6137 #ifdef ASM_DECLARE_OBJECT_NAME
6138 last_assemble_variable_decl = decl;
6139 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6140 #else
6141 /* Standard thing is just output label for the object. */
6142 ASM_OUTPUT_LABEL (file, name);
6143 #endif /* ASM_DECLARE_OBJECT_NAME */
6144 ASM_OUTPUT_SKIP (file, size ? size : 1);
6147 /* Decide whether we must probe the stack before any space allocation
6148 on this target. It's essentially TARGET_STACK_PROBE except when
6149 -fstack-check causes the stack to be already probed differently. */
6151 bool
6152 ix86_target_stack_probe (void)
6154 /* Do not probe the stack twice if static stack checking is enabled. */
6155 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6156 return false;
6158 return TARGET_STACK_PROBE;
6161 /* Decide whether we can make a sibling call to a function. DECL is the
6162 declaration of the function being targeted by the call and EXP is the
6163 CALL_EXPR representing the call. */
6165 static bool
6166 ix86_function_ok_for_sibcall (tree decl, tree exp)
6168 tree type, decl_or_type;
6169 rtx a, b;
6170 bool bind_global = decl && !targetm.binds_local_p (decl);
6172 if (ix86_function_naked (current_function_decl))
6173 return false;
6175 /* Sibling call isn't OK if there are no caller-saved registers
6176 since all registers must be preserved before return. */
6177 if (cfun->machine->no_caller_saved_registers)
6178 return false;
6180 /* If we are generating position-independent code, we cannot sibcall
6181 optimize direct calls to global functions, as the PLT requires
6182 %ebx be live. (Darwin does not have a PLT.) */
6183 if (!TARGET_MACHO
6184 && !TARGET_64BIT
6185 && flag_pic
6186 && flag_plt
6187 && bind_global)
6188 return false;
6190 /* If we need to align the outgoing stack, then sibcalling would
6191 unalign the stack, which may break the called function. */
6192 if (ix86_minimum_incoming_stack_boundary (true)
6193 < PREFERRED_STACK_BOUNDARY)
6194 return false;
6196 if (decl)
6198 decl_or_type = decl;
6199 type = TREE_TYPE (decl);
6201 else
6203 /* We're looking at the CALL_EXPR, we need the type of the function. */
6204 type = CALL_EXPR_FN (exp); /* pointer expression */
6205 type = TREE_TYPE (type); /* pointer type */
6206 type = TREE_TYPE (type); /* function type */
6207 decl_or_type = type;
6210 /* Check that the return value locations are the same. Like
6211 if we are returning floats on the 80387 register stack, we cannot
6212 make a sibcall from a function that doesn't return a float to a
6213 function that does or, conversely, from a function that does return
6214 a float to a function that doesn't; the necessary stack adjustment
6215 would not be executed. This is also the place we notice
6216 differences in the return value ABI. Note that it is ok for one
6217 of the functions to have void return type as long as the return
6218 value of the other is passed in a register. */
6219 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6220 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6221 cfun->decl, false);
6222 if (STACK_REG_P (a) || STACK_REG_P (b))
6224 if (!rtx_equal_p (a, b))
6225 return false;
6227 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6229 else if (!rtx_equal_p (a, b))
6230 return false;
6232 if (TARGET_64BIT)
6234 /* The SYSV ABI has more call-clobbered registers;
6235 disallow sibcalls from MS to SYSV. */
6236 if (cfun->machine->call_abi == MS_ABI
6237 && ix86_function_type_abi (type) == SYSV_ABI)
6238 return false;
6240 else
6242 /* If this call is indirect, we'll need to be able to use a
6243 call-clobbered register for the address of the target function.
6244 Make sure that all such registers are not used for passing
6245 parameters. Note that DLLIMPORT functions and call to global
6246 function via GOT slot are indirect. */
6247 if (!decl
6248 || (bind_global && flag_pic && !flag_plt)
6249 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6251 /* Check if regparm >= 3 since arg_reg_available is set to
6252 false if regparm == 0. If regparm is 1 or 2, there is
6253 always a call-clobbered register available.
6255 ??? The symbol indirect call doesn't need a call-clobbered
6256 register. But we don't know if this is a symbol indirect
6257 call or not here. */
6258 if (ix86_function_regparm (type, NULL) >= 3
6259 && !cfun->machine->arg_reg_available)
6260 return false;
6264 /* Otherwise okay. That also includes certain types of indirect calls. */
6265 return true;
6268 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6269 and "sseregparm" calling convention attributes;
6270 arguments as in struct attribute_spec.handler. */
6272 static tree
6273 ix86_handle_cconv_attribute (tree *node, tree name,
6274 tree args,
6275 int,
6276 bool *no_add_attrs)
6278 if (TREE_CODE (*node) != FUNCTION_TYPE
6279 && TREE_CODE (*node) != METHOD_TYPE
6280 && TREE_CODE (*node) != FIELD_DECL
6281 && TREE_CODE (*node) != TYPE_DECL)
6283 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6284 name);
6285 *no_add_attrs = true;
6286 return NULL_TREE;
6289 /* Can combine regparm with all attributes except fastcall and thiscall. */
6290 if (is_attribute_p ("regparm", name))
6292 tree cst;
6294 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6296 error ("fastcall and regparm attributes are not compatible");
6299 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6301 error ("regparam and thiscall attributes are not compatible");
6304 cst = TREE_VALUE (args);
6305 if (TREE_CODE (cst) != INTEGER_CST)
6307 warning (OPT_Wattributes,
6308 "%qE attribute requires an integer constant argument",
6309 name);
6310 *no_add_attrs = true;
6312 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6314 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6315 name, REGPARM_MAX);
6316 *no_add_attrs = true;
6319 return NULL_TREE;
6322 if (TARGET_64BIT)
6324 /* Do not warn when emulating the MS ABI. */
6325 if ((TREE_CODE (*node) != FUNCTION_TYPE
6326 && TREE_CODE (*node) != METHOD_TYPE)
6327 || ix86_function_type_abi (*node) != MS_ABI)
6328 warning (OPT_Wattributes, "%qE attribute ignored",
6329 name);
6330 *no_add_attrs = true;
6331 return NULL_TREE;
6334 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6335 if (is_attribute_p ("fastcall", name))
6337 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6339 error ("fastcall and cdecl attributes are not compatible");
6341 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6343 error ("fastcall and stdcall attributes are not compatible");
6345 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6347 error ("fastcall and regparm attributes are not compatible");
6349 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6351 error ("fastcall and thiscall attributes are not compatible");
6355 /* Can combine stdcall with fastcall (redundant), regparm and
6356 sseregparm. */
6357 else if (is_attribute_p ("stdcall", name))
6359 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6361 error ("stdcall and cdecl attributes are not compatible");
6363 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6365 error ("stdcall and fastcall attributes are not compatible");
6367 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6369 error ("stdcall and thiscall attributes are not compatible");
6373 /* Can combine cdecl with regparm and sseregparm. */
6374 else if (is_attribute_p ("cdecl", name))
6376 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6378 error ("stdcall and cdecl attributes are not compatible");
6380 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6382 error ("fastcall and cdecl attributes are not compatible");
6384 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6386 error ("cdecl and thiscall attributes are not compatible");
6389 else if (is_attribute_p ("thiscall", name))
6391 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6392 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6393 name);
6394 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6396 error ("stdcall and thiscall attributes are not compatible");
6398 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6400 error ("fastcall and thiscall attributes are not compatible");
6402 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6404 error ("cdecl and thiscall attributes are not compatible");
6408 /* Can combine sseregparm with all attributes. */
6410 return NULL_TREE;
6413 /* The transactional memory builtins are implicitly regparm or fastcall
6414 depending on the ABI. Override the generic do-nothing attribute that
6415 these builtins were declared with, and replace it with one of the two
6416 attributes that we expect elsewhere. */
6418 static tree
6419 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6420 int flags, bool *no_add_attrs)
6422 tree alt;
6424 /* In no case do we want to add the placeholder attribute. */
6425 *no_add_attrs = true;
6427 /* The 64-bit ABI is unchanged for transactional memory. */
6428 if (TARGET_64BIT)
6429 return NULL_TREE;
6431 /* ??? Is there a better way to validate 32-bit windows? We have
6432 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6433 if (CHECK_STACK_LIMIT > 0)
6434 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6435 else
6437 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6438 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6440 decl_attributes (node, alt, flags);
6442 return NULL_TREE;
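/* Net effect: on 32-bit Windows-like configurations (detected via
   CHECK_STACK_LIMIT above) the placeholder attribute is replaced by
   fastcall; elsewhere it becomes regparm (2).  */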
6445 /* This function determines from TYPE the calling-convention. */
6447 unsigned int
6448 ix86_get_callcvt (const_tree type)
6450 unsigned int ret = 0;
6451 bool is_stdarg;
6452 tree attrs;
6454 if (TARGET_64BIT)
6455 return IX86_CALLCVT_CDECL;
6457 attrs = TYPE_ATTRIBUTES (type);
6458 if (attrs != NULL_TREE)
6460 if (lookup_attribute ("cdecl", attrs))
6461 ret |= IX86_CALLCVT_CDECL;
6462 else if (lookup_attribute ("stdcall", attrs))
6463 ret |= IX86_CALLCVT_STDCALL;
6464 else if (lookup_attribute ("fastcall", attrs))
6465 ret |= IX86_CALLCVT_FASTCALL;
6466 else if (lookup_attribute ("thiscall", attrs))
6467 ret |= IX86_CALLCVT_THISCALL;
6469 /* Regparm isn't allowed for thiscall and fastcall. */
6470 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6472 if (lookup_attribute ("regparm", attrs))
6473 ret |= IX86_CALLCVT_REGPARM;
6474 if (lookup_attribute ("sseregparm", attrs))
6475 ret |= IX86_CALLCVT_SSEREGPARM;
6478 if (IX86_BASE_CALLCVT(ret) != 0)
6479 return ret;
6482 is_stdarg = stdarg_p (type);
6483 if (TARGET_RTD && !is_stdarg)
6484 return IX86_CALLCVT_STDCALL | ret;
6486 if (ret != 0
6487 || is_stdarg
6488 || TREE_CODE (type) != METHOD_TYPE
6489 || ix86_function_type_abi (type) != MS_ABI)
6490 return IX86_CALLCVT_CDECL | ret;
6492 return IX86_CALLCVT_THISCALL;
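/* Note the -mrtd interaction above: with TARGET_RTD, a non-variadic
   function without an explicit convention attribute is treated as stdcall
   here, so it is expected to pop its own stack arguments.  */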
6495 /* Return 0 if the attributes for two types are incompatible, 1 if they
6496 are compatible, and 2 if they are nearly compatible (which causes a
6497 warning to be generated). */
6499 static int
6500 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6502 unsigned int ccvt1, ccvt2;
6504 if (TREE_CODE (type1) != FUNCTION_TYPE
6505 && TREE_CODE (type1) != METHOD_TYPE)
6506 return 1;
6508 ccvt1 = ix86_get_callcvt (type1);
6509 ccvt2 = ix86_get_callcvt (type2);
6510 if (ccvt1 != ccvt2)
6511 return 0;
6512 if (ix86_function_regparm (type1, NULL)
6513 != ix86_function_regparm (type2, NULL))
6514 return 0;
6516 return 1;
6519 /* Return the regparm value for a function with the indicated TYPE and DECL.
6520 DECL may be NULL when calling function indirectly
6521 or considering a libcall. */
6523 static int
6524 ix86_function_regparm (const_tree type, const_tree decl)
6526 tree attr;
6527 int regparm;
6528 unsigned int ccvt;
6530 if (TARGET_64BIT)
6531 return (ix86_function_type_abi (type) == SYSV_ABI
6532 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6533 ccvt = ix86_get_callcvt (type);
6534 regparm = ix86_regparm;
6536 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6538 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6539 if (attr)
6541 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6542 return regparm;
6545 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6546 return 2;
6547 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6548 return 1;
6550 /* Use register calling convention for local functions when possible. */
6551 if (decl
6552 && TREE_CODE (decl) == FUNCTION_DECL)
6554 cgraph_node *target = cgraph_node::get (decl);
6555 if (target)
6556 target = target->function_symbol ();
6558 /* Caller and callee must agree on the calling convention, so
6559 checking just the optimize setting here would mean that with
6560 __attribute__((optimize (...))) the caller could use the regparm convention
6561 and the callee not, or vice versa. Instead look at whether the callee
6562 is optimized or not. */
6563 if (target && opt_for_fn (target->decl, optimize)
6564 && !(profile_flag && !flag_fentry))
6566 cgraph_local_info *i = &target->local;
6567 if (i && i->local && i->can_change_signature)
6569 int local_regparm, globals = 0, regno;
6571 /* Make sure no regparm register is taken by a
6572 fixed register variable. */
6573 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6574 local_regparm++)
6575 if (fixed_regs[local_regparm])
6576 break;
6578 /* We don't want to use regparm(3) for nested functions as
6579 these use a static chain pointer in the third argument. */
6580 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6581 local_regparm = 2;
6583 /* Save a register for the split stack. */
6584 if (flag_split_stack)
6586 if (local_regparm == 3)
6587 local_regparm = 2;
6588 else if (local_regparm == 2
6589 && DECL_STATIC_CHAIN (target->decl))
6590 local_regparm = 1;
6593 /* Each fixed register usage increases register pressure,
6594 so fewer registers should be used for argument passing.
6595 This functionality can be overridden by an explicit
6596 regparm value. */
6597 for (regno = AX_REG; regno <= DI_REG; regno++)
6598 if (fixed_regs[regno])
6599 globals++;
6601 local_regparm
6602 = globals < local_regparm ? local_regparm - globals : 0;
6604 if (local_regparm > regparm)
6605 regparm = local_regparm;
6610 return regparm;
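/* As an illustration: a purely local, optimized function whose first three
   argument registers are not fixed, which needs no static chain and is not
   built with -fsplit-stack, ends up with regparm 3, i.e. its first three
   integer arguments are passed in %eax, %edx and %ecx.  */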
6613 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6614 DFmode (2) arguments in SSE registers for a function with the
6615 indicated TYPE and DECL. DECL may be NULL when calling function
6616 indirectly or considering a libcall. Return -1 if any FP parameter
6617 should be rejected by error. This is used in situations where we imply the SSE
6618 calling convention but the function is called from another function with
6619 SSE disabled. Otherwise return 0. */
6621 static int
6622 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6624 gcc_assert (!TARGET_64BIT);
6626 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6627 by the sseregparm attribute. */
6628 if (TARGET_SSEREGPARM
6629 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6631 if (!TARGET_SSE)
6633 if (warn)
6635 if (decl)
6636 error ("calling %qD with attribute sseregparm without "
6637 "SSE/SSE2 enabled", decl);
6638 else
6639 error ("calling %qT with attribute sseregparm without "
6640 "SSE/SSE2 enabled", type);
6642 return 0;
6645 return 2;
6648 if (!decl)
6649 return 0;
6651 cgraph_node *target = cgraph_node::get (decl);
6652 if (target)
6653 target = target->function_symbol ();
6655 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6656 (and DFmode for SSE2) arguments in SSE registers. */
6657 if (target
6658 /* TARGET_SSE_MATH */
6659 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6660 && opt_for_fn (target->decl, optimize)
6661 && !(profile_flag && !flag_fentry))
6663 cgraph_local_info *i = &target->local;
6664 if (i && i->local && i->can_change_signature)
6666 /* Refuse to produce wrong code when local function with SSE enabled
6667 is called from SSE disabled function.
6668 FIXME: We need a way to detect these cases cross-ltrans partition
6669 and avoid using SSE calling conventions on local functions called
6670 from function with SSE disabled. For now at least delay the
6671 warning until we know we are going to produce wrong code.
6672 See PR66047 */
6673 if (!TARGET_SSE && warn)
6674 return -1;
6675 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6676 ->x_ix86_isa_flags) ? 2 : 1;
6680 return 0;
6683 /* Return true if EAX is live at the start of the function. Used by
6684 ix86_expand_prologue to determine if we need special help before
6685 calling allocate_stack_worker. */
6687 static bool
6688 ix86_eax_live_at_start_p (void)
6690 /* Cheat. Don't bother working forward from ix86_function_regparm
6691 to the function type to whether an actual argument is located in
6692 eax. Instead just look at cfg info, which is still close enough
6693 to correct at this point. This gives false positives for broken
6694 functions that might use uninitialized data that happens to be
6695 allocated in eax, but who cares? */
6696 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6699 static bool
6700 ix86_keep_aggregate_return_pointer (tree fntype)
6702 tree attr;
6704 if (!TARGET_64BIT)
6706 attr = lookup_attribute ("callee_pop_aggregate_return",
6707 TYPE_ATTRIBUTES (fntype));
6708 if (attr)
6709 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6711 /* For 32-bit MS-ABI the default is to keep aggregate
6712 return pointer. */
6713 if (ix86_function_type_abi (fntype) == MS_ABI)
6714 return true;
6716 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6719 /* Value is the number of bytes of arguments automatically
6720 popped when returning from a subroutine call.
6721 FUNDECL is the declaration node of the function (as a tree),
6722 FUNTYPE is the data type of the function (as a tree),
6723 or for a library call it is an identifier node for the subroutine name.
6724 SIZE is the number of bytes of arguments passed on the stack.
6726 On the 80386, the RTD insn may be used to pop them if the number
6727 of args is fixed, but if the number is variable then the caller
6728 must pop them all. RTD can't be used for library calls now
6729 because the library is compiled with the Unix compiler.
6730 Use of RTD is a selectable option, since it is incompatible with
6731 standard Unix calling sequences. If the option is not selected,
6732 the caller must always pop the args.
6734 The attribute stdcall is equivalent to RTD on a per module basis. */
6736 static int
6737 ix86_return_pops_args (tree fundecl, tree funtype, int size)
6739 unsigned int ccvt;
6741 /* None of the 64-bit ABIs pop arguments. */
6742 if (TARGET_64BIT)
6743 return 0;
6745 ccvt = ix86_get_callcvt (funtype);
6747 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6748 | IX86_CALLCVT_THISCALL)) != 0
6749 && ! stdarg_p (funtype))
6750 return size;
6752 /* Lose any fake structure return argument if it is passed on the stack. */
6753 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6754 && !ix86_keep_aggregate_return_pointer (funtype))
6756 int nregs = ix86_function_regparm (funtype, fundecl);
6757 if (nregs == 0)
6758 return GET_MODE_SIZE (Pmode);
6761 return 0;
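/* For instance, a stdcall function taking two int arguments returns with
   "ret $8", popping its 8 bytes of stack arguments itself, whereas the
   default cdecl convention leaves them for the caller to remove.  */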
6764 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6766 static bool
6767 ix86_legitimate_combined_insn (rtx_insn *insn)
6769 int i;
6771 /* Check operand constraints in case hard registers were propagated
6772 into insn pattern. This check prevents combine pass from
6773 generating insn patterns with invalid hard register operands.
6774 These invalid insns can eventually confuse reload to error out
6775 with a spill failure. See also PRs 46829 and 46843. */
6777 gcc_assert (INSN_CODE (insn) >= 0);
6779 extract_insn (insn);
6780 preprocess_constraints (insn);
6782 int n_operands = recog_data.n_operands;
6783 int n_alternatives = recog_data.n_alternatives;
6784 for (i = 0; i < n_operands; i++)
6786 rtx op = recog_data.operand[i];
6787 machine_mode mode = GET_MODE (op);
6788 const operand_alternative *op_alt;
6789 int offset = 0;
6790 bool win;
6791 int j;
6793 /* A unary operator may be accepted by the predicate, but it
6794 is irrelevant for matching constraints. */
6795 if (UNARY_P (op))
6796 op = XEXP (op, 0);
6798 if (SUBREG_P (op))
6800 if (REG_P (SUBREG_REG (op))
6801 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6802 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6803 GET_MODE (SUBREG_REG (op)),
6804 SUBREG_BYTE (op),
6805 GET_MODE (op));
6806 op = SUBREG_REG (op);
6809 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6810 continue;
6812 op_alt = recog_op_alt;
6814 /* Operand has no constraints, anything is OK. */
6815 win = !n_alternatives;
6817 alternative_mask preferred = get_preferred_alternatives (insn);
6818 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6820 if (!TEST_BIT (preferred, j))
6821 continue;
6822 if (op_alt[i].anything_ok
6823 || (op_alt[i].matches != -1
6824 && operands_match_p
6825 (recog_data.operand[i],
6826 recog_data.operand[op_alt[i].matches]))
6827 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6829 win = true;
6830 break;
6834 if (!win)
6835 return false;
6838 return true;
6841 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6843 static unsigned HOST_WIDE_INT
6844 ix86_asan_shadow_offset (void)
6846 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6847 : HOST_WIDE_INT_C (0x7fff8000))
6848 : (HOST_WIDE_INT_1 << 29);
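/* The ASan runtime computes shadow = (addr >> 3) + this offset, so on LP64
   Linux the usual mapping is shadow = (addr >> 3) + 0x7fff8000; the Mach-O
   and 32-bit values above simply place the shadow region elsewhere.  */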
6851 /* Argument support functions. */
6853 /* Return true when register may be used to pass function parameters. */
6854 bool
6855 ix86_function_arg_regno_p (int regno)
6857 int i;
6858 enum calling_abi call_abi;
6859 const int *parm_regs;
6861 if (TARGET_MPX && BND_REGNO_P (regno))
6862 return true;
6864 if (!TARGET_64BIT)
6866 if (TARGET_MACHO)
6867 return (regno < REGPARM_MAX
6868 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6869 else
6870 return (regno < REGPARM_MAX
6871 || (TARGET_MMX && MMX_REGNO_P (regno)
6872 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6873 || (TARGET_SSE && SSE_REGNO_P (regno)
6874 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6877 if (TARGET_SSE && SSE_REGNO_P (regno)
6878 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6879 return true;
6881 /* TODO: The function should depend on current function ABI but
6882 builtins.c would need updating then. Therefore we use the
6883 default ABI. */
6884 call_abi = ix86_cfun_abi ();
6886 /* RAX is used as hidden argument to va_arg functions. */
6887 if (call_abi == SYSV_ABI && regno == AX_REG)
6888 return true;
6890 if (call_abi == MS_ABI)
6891 parm_regs = x86_64_ms_abi_int_parameter_registers;
6892 else
6893 parm_regs = x86_64_int_parameter_registers;
6895 for (i = 0; i < (call_abi == MS_ABI
6896 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6897 if (regno == parm_regs[i])
6898 return true;
6899 return false;
6902 /* Return true if we do not know how to pass TYPE solely in registers. */
6904 static bool
6905 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6907 if (must_pass_in_stack_var_size_or_pad (mode, type))
6908 return true;
6910 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6911 The layout_type routine is crafty and tries to trick us into passing
6912 currently unsupported vector types on the stack by using TImode. */
6913 return (!TARGET_64BIT && mode == TImode
6914 && type && TREE_CODE (type) != VECTOR_TYPE);
6917 /* Return the size, in bytes, of the area reserved for arguments passed
6918 in registers for the function represented by FNDECL, depending on the
6919 ABI format used. */
6920 int
6921 ix86_reg_parm_stack_space (const_tree fndecl)
6923 enum calling_abi call_abi = SYSV_ABI;
6924 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6925 call_abi = ix86_function_abi (fndecl);
6926 else
6927 call_abi = ix86_function_type_abi (fndecl);
6928 if (TARGET_64BIT && call_abi == MS_ABI)
6929 return 32;
6930 return 0;
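/* The 32 bytes returned for the 64-bit MS ABI correspond to the four
   8-byte "home" slots the caller reserves for the RCX/RDX/R8/R9 register
   arguments.  */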
6933 /* We add this as a workaround in order to use libc_has_function
6934 hook in i386.md. */
6935 bool
6936 ix86_libc_has_function (enum function_class fn_class)
6938 return targetm.libc_has_function (fn_class);
6941 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
6942 call ABI used. */
6943 enum calling_abi
6944 ix86_function_type_abi (const_tree fntype)
6946 enum calling_abi abi = ix86_abi;
6948 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6949 return abi;
6951 if (abi == SYSV_ABI
6952 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6954 static int warned;
6955 if (TARGET_X32 && !warned)
6957 error ("X32 does not support ms_abi attribute");
6958 warned = 1;
6961 abi = MS_ABI;
6963 else if (abi == MS_ABI
6964 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6965 abi = SYSV_ABI;
6967 return abi;
6970 static enum calling_abi
6971 ix86_function_abi (const_tree fndecl)
6973 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6976 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
6977 call ABI used. */
6978 enum calling_abi
6979 ix86_cfun_abi (void)
6981 return cfun ? cfun->machine->call_abi : ix86_abi;
6984 static bool
6985 ix86_function_ms_hook_prologue (const_tree fn)
6987 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6989 if (decl_function_context (fn) != NULL_TREE)
6990 error_at (DECL_SOURCE_LOCATION (fn),
6991 "ms_hook_prologue is not compatible with nested function");
6992 else
6993 return true;
6995 return false;
6998 static bool
6999 ix86_function_naked (const_tree fn)
7001 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7002 return true;
7004 return false;
7007 /* Write the extra assembler code needed to declare a function properly. */
7009 void
7010 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7011 tree decl)
7013 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7015 if (is_ms_hook)
7017 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7018 unsigned int filler_cc = 0xcccccccc;
7020 for (i = 0; i < filler_count; i += 4)
7021 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7024 #ifdef SUBTARGET_ASM_UNWIND_INIT
7025 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7026 #endif
7028 ASM_OUTPUT_LABEL (asm_out_file, fname);
7030 /* Output magic byte marker, if hot-patch attribute is set. */
7031 if (is_ms_hook)
7033 if (TARGET_64BIT)
7035 /* leaq [%rsp + 0], %rsp */
7036 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7037 asm_out_file);
7039 else
7041 /* movl.s %edi, %edi
7042 push %ebp
7043 movl.s %esp, %ebp */
7044 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7049 /* Implementation of the call ABI switching target hook. The call
7050 register sets specific to FNDECL are selected. See also
7051 ix86_conditional_register_usage for more details. */
7052 void
7053 ix86_call_abi_override (const_tree fndecl)
7055 cfun->machine->call_abi = ix86_function_abi (fndecl);
7058 /* Return 1 if pseudo register should be created and used to hold
7059 GOT address for PIC code. */
7060 bool
7061 ix86_use_pseudo_pic_reg (void)
7063 if ((TARGET_64BIT
7064 && (ix86_cmodel == CM_SMALL_PIC
7065 || TARGET_PECOFF))
7066 || !flag_pic)
7067 return false;
7068 return true;
7071 /* Initialize large model PIC register. */
7073 static void
7074 ix86_init_large_pic_reg (unsigned int tmp_regno)
7076 rtx_code_label *label;
7077 rtx tmp_reg;
7079 gcc_assert (Pmode == DImode);
7080 label = gen_label_rtx ();
7081 emit_label (label);
7082 LABEL_PRESERVE_P (label) = 1;
7083 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7084 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7085 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7086 label));
7087 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7088 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7089 pic_offset_table_rtx, tmp_reg));
7090 const char *name = LABEL_NAME (label);
7091 PUT_CODE (label, NOTE);
7092 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7093 NOTE_DELETED_LABEL_NAME (label) = name;
7096 /* Create and initialize PIC register if required. */
7097 static void
7098 ix86_init_pic_reg (void)
7100 edge entry_edge;
7101 rtx_insn *seq;
7103 if (!ix86_use_pseudo_pic_reg ())
7104 return;
7106 start_sequence ();
7108 if (TARGET_64BIT)
7110 if (ix86_cmodel == CM_LARGE_PIC)
7111 ix86_init_large_pic_reg (R11_REG);
7112 else
7113 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7115 else
7117 /* If there is a future mcount call in the function, it is more profitable
7118 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7119 rtx reg = crtl->profile
7120 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7121 : pic_offset_table_rtx;
7122 rtx_insn *insn = emit_insn (gen_set_got (reg));
7123 RTX_FRAME_RELATED_P (insn) = 1;
7124 if (crtl->profile)
7125 emit_move_insn (pic_offset_table_rtx, reg);
7126 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7129 seq = get_insns ();
7130 end_sequence ();
7132 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7133 insert_insn_on_edge (seq, entry_edge);
7134 commit_one_edge_insertion (entry_edge);
7137 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7138 for a call to a function whose data type is FNTYPE.
7139 For a library call, FNTYPE is 0. */
7141 void
7142 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7143 tree fntype, /* tree ptr for function decl */
7144 rtx libname, /* SYMBOL_REF of library name or 0 */
7145 tree fndecl,
7146 int caller)
7148 struct cgraph_local_info *i = NULL;
7149 struct cgraph_node *target = NULL;
7151 memset (cum, 0, sizeof (*cum));
7153 if (fndecl)
7155 target = cgraph_node::get (fndecl);
7156 if (target)
7158 target = target->function_symbol ();
7159 i = cgraph_node::local_info (target->decl);
7160 cum->call_abi = ix86_function_abi (target->decl);
7162 else
7163 cum->call_abi = ix86_function_abi (fndecl);
7165 else
7166 cum->call_abi = ix86_function_type_abi (fntype);
7168 cum->caller = caller;
7170 /* Set up the number of registers to use for passing arguments. */
7171 cum->nregs = ix86_regparm;
7172 if (TARGET_64BIT)
7174 cum->nregs = (cum->call_abi == SYSV_ABI
7175 ? X86_64_REGPARM_MAX
7176 : X86_64_MS_REGPARM_MAX);
7178 if (TARGET_SSE)
7180 cum->sse_nregs = SSE_REGPARM_MAX;
7181 if (TARGET_64BIT)
7183 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7184 ? X86_64_SSE_REGPARM_MAX
7185 : X86_64_MS_SSE_REGPARM_MAX);
7188 if (TARGET_MMX)
7189 cum->mmx_nregs = MMX_REGPARM_MAX;
7190 cum->warn_avx512f = true;
7191 cum->warn_avx = true;
7192 cum->warn_sse = true;
7193 cum->warn_mmx = true;
7195 /* Because the type may mismatch between caller and callee, we need to
7196 use the actual type of the function for local calls.
7197 FIXME: cgraph_analyze can be told to actually record if function uses
7198 va_start so for local functions maybe_vaarg can be made aggressive
7199 helping K&R code.
7200 FIXME: once the type system is fixed, we won't need this code anymore. */
7201 if (i && i->local && i->can_change_signature)
7202 fntype = TREE_TYPE (target->decl);
7203 cum->stdarg = stdarg_p (fntype);
7204 cum->maybe_vaarg = (fntype
7205 ? (!prototype_p (fntype) || stdarg_p (fntype))
7206 : !libname);
7208 cum->bnd_regno = FIRST_BND_REG;
7209 cum->bnds_in_bt = 0;
7210 cum->force_bnd_pass = 0;
7211 cum->decl = fndecl;
7213 cum->warn_empty = !warn_abi || cum->stdarg;
7214 if (!cum->warn_empty && fntype)
7216 function_args_iterator iter;
7217 tree argtype;
7218 bool seen_empty_type = false;
7219 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7221 if (VOID_TYPE_P (argtype))
7222 break;
7223 if (TYPE_EMPTY_P (argtype))
7224 seen_empty_type = true;
7225 else if (seen_empty_type)
7227 cum->warn_empty = true;
7228 break;
7233 if (!TARGET_64BIT)
7235 /* If there are variable arguments, then we won't pass anything
7236 in registers in 32-bit mode. */
7237 if (stdarg_p (fntype))
7239 cum->nregs = 0;
7240 /* Since in 32-bit mode variable arguments are always passed on
7241 the stack, there is a scratch register available for an indirect
7242 sibcall. */
7243 cfun->machine->arg_reg_available = true;
7244 cum->sse_nregs = 0;
7245 cum->mmx_nregs = 0;
7246 cum->warn_avx512f = false;
7247 cum->warn_avx = false;
7248 cum->warn_sse = false;
7249 cum->warn_mmx = false;
7250 return;
7253 /* Use ecx and edx registers if function has fastcall attribute,
7254 else look for regparm information. */
7255 if (fntype)
7257 unsigned int ccvt = ix86_get_callcvt (fntype);
7258 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7260 cum->nregs = 1;
7261 cum->fastcall = 1; /* Same first register as in fastcall. */
7263 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7265 cum->nregs = 2;
7266 cum->fastcall = 1;
7268 else
7269 cum->nregs = ix86_function_regparm (fntype, fndecl);
7272 /* Set up the number of SSE registers used for passing SFmode
7273 and DFmode arguments. Warn for mismatching ABI. */
7274 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7277 cfun->machine->arg_reg_available = (cum->nregs > 0);
7280 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7281 But in the case of vector types, it is some vector mode.
7283 When we have only some of our vector isa extensions enabled, then there
7284 are some modes for which vector_mode_supported_p is false. For these
7285 modes, the generic vector support in gcc will choose some non-vector mode
7286 in order to implement the type. By computing the natural mode, we'll
7287 select the proper ABI location for the operand and not depend on whatever
7288 the middle-end decides to do with these vector types.
7290 The middle-end can't deal with vector types larger than 16 bytes. In this
7291 case, we return the original mode and warn about the ABI change if CUM isn't
7292 NULL.
7294 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7295 available for the function return value. */
7297 static machine_mode
7298 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7299 bool in_return)
7301 machine_mode mode = TYPE_MODE (type);
7303 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7305 HOST_WIDE_INT size = int_size_in_bytes (type);
7306 if ((size == 8 || size == 16 || size == 32 || size == 64)
7307 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7308 && TYPE_VECTOR_SUBPARTS (type) > 1)
7310 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7312 /* There are no XFmode vector modes. */
7313 if (innermode == XFmode)
7314 return mode;
7316 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7317 mode = MIN_MODE_VECTOR_FLOAT;
7318 else
7319 mode = MIN_MODE_VECTOR_INT;
7321 /* Get the mode which has this inner mode and number of units. */
7322 FOR_EACH_MODE_FROM (mode, mode)
7323 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7324 && GET_MODE_INNER (mode) == innermode)
7326 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7328 static bool warnedavx512f;
7329 static bool warnedavx512f_ret;
7331 if (cum && cum->warn_avx512f && !warnedavx512f)
7333 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7334 "without AVX512F enabled changes the ABI"))
7335 warnedavx512f = true;
7337 else if (in_return && !warnedavx512f_ret)
7339 if (warning (OPT_Wpsabi, "AVX512F vector return "
7340 "without AVX512F enabled changes the ABI"))
7341 warnedavx512f_ret = true;
7344 return TYPE_MODE (type);
7346 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7348 static bool warnedavx;
7349 static bool warnedavx_ret;
7351 if (cum && cum->warn_avx && !warnedavx)
7353 if (warning (OPT_Wpsabi, "AVX vector argument "
7354 "without AVX enabled changes the ABI"))
7355 warnedavx = true;
7357 else if (in_return && !warnedavx_ret)
7359 if (warning (OPT_Wpsabi, "AVX vector return "
7360 "without AVX enabled changes the ABI"))
7361 warnedavx_ret = true;
7364 return TYPE_MODE (type);
7366 else if (((size == 8 && TARGET_64BIT) || size == 16)
7367 && !TARGET_SSE
7368 && !TARGET_IAMCU)
7370 static bool warnedsse;
7371 static bool warnedsse_ret;
7373 if (cum && cum->warn_sse && !warnedsse)
7375 if (warning (OPT_Wpsabi, "SSE vector argument "
7376 "without SSE enabled changes the ABI"))
7377 warnedsse = true;
7379 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7381 if (warning (OPT_Wpsabi, "SSE vector return "
7382 "without SSE enabled changes the ABI"))
7383 warnedsse_ret = true;
7386 else if ((size == 8 && !TARGET_64BIT)
7387 && (!cfun
7388 || cfun->machine->func_type == TYPE_NORMAL)
7389 && !TARGET_MMX
7390 && !TARGET_IAMCU)
7392 static bool warnedmmx;
7393 static bool warnedmmx_ret;
7395 if (cum && cum->warn_mmx && !warnedmmx)
7397 if (warning (OPT_Wpsabi, "MMX vector argument "
7398 "without MMX enabled changes the ABI"))
7399 warnedmmx = true;
7401 else if (in_return && !warnedmmx_ret)
7403 if (warning (OPT_Wpsabi, "MMX vector return "
7404 "without MMX enabled changes the ABI"))
7405 warnedmmx_ret = true;
7408 return mode;
7411 gcc_unreachable ();
7415 return mode;
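/* Worked example (a sketch, assuming the generic vector extension):

     typedef int v8si __attribute__ ((vector_size (32)));

   With -mavx the natural mode is V8SImode and such an argument travels in
   a %ymm register.  Without AVX the loop above still finds V8SImode, but
   since TARGET_AVX is false we warn with -Wpsabi and fall back to
   TYPE_MODE (type), so the argument goes to memory; enabling AVX therefore
   changes the ABI, which is what the warning says.  */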
7418 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7419 this may not agree with the mode that the type system has chosen for the
7420 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7421 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7423 static rtx
7424 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7425 unsigned int regno)
7427 rtx tmp;
7429 if (orig_mode != BLKmode)
7430 tmp = gen_rtx_REG (orig_mode, regno);
7431 else
7433 tmp = gen_rtx_REG (mode, regno);
7434 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7435 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7438 return tmp;
7441 /* x86-64 register passing implementation. See the x86-64 psABI for details.
7442 The goal of this code is to classify each eightbyte of an incoming argument
7443 by register class and assign registers accordingly. */
7445 /* Return the union class of CLASS1 and CLASS2.
7446 See the x86-64 PS ABI for details. */
7448 static enum x86_64_reg_class
7449 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7451 /* Rule #1: If both classes are equal, this is the resulting class. */
7452 if (class1 == class2)
7453 return class1;
7455 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7456 the other class. */
7457 if (class1 == X86_64_NO_CLASS)
7458 return class2;
7459 if (class2 == X86_64_NO_CLASS)
7460 return class1;
7462 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7463 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7464 return X86_64_MEMORY_CLASS;
7466 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7467 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7468 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7469 return X86_64_INTEGERSI_CLASS;
7470 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7471 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7472 return X86_64_INTEGER_CLASS;
7474 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7475 MEMORY is used. */
7476 if (class1 == X86_64_X87_CLASS
7477 || class1 == X86_64_X87UP_CLASS
7478 || class1 == X86_64_COMPLEX_X87_CLASS
7479 || class2 == X86_64_X87_CLASS
7480 || class2 == X86_64_X87UP_CLASS
7481 || class2 == X86_64_COMPLEX_X87_CLASS)
7482 return X86_64_MEMORY_CLASS;
7484 /* Rule #6: Otherwise class SSE is used. */
7485 return X86_64_SSE_CLASS;
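/* Small example of the merge rules above (illustration only): for

     struct s { int i; float f; };

   the single eightbyte collects INTEGERSI from the int and SSE from the
   float (the float is not 8-byte aligned within the eightbyte), and rule #4
   merges them to INTEGER, so the whole struct travels in one integer
   register.  A union of an int and a float instead merges INTEGERSI with
   SSESF and stays INTEGERSI.  */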
7488 /* Classify the argument of type TYPE and mode MODE.
7489 CLASSES will be filled by the register class used to pass each word
7490 of the operand. The number of words is returned. In case the parameter
7491 should be passed in memory, 0 is returned. As a special case for zero
7492 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7494 BIT_OFFSET is used internally for handling records; it specifies the
7495 offset in bits, modulo 512, to avoid overflow cases.
7497 See the x86-64 PS ABI for details.
7500 static int
7501 classify_argument (machine_mode mode, const_tree type,
7502 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7504 HOST_WIDE_INT bytes =
7505 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7506 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7508 /* Variable sized entities are always passed/returned in memory. */
7509 if (bytes < 0)
7510 return 0;
7512 if (mode != VOIDmode
7513 && targetm.calls.must_pass_in_stack (mode, type))
7514 return 0;
7516 if (type && AGGREGATE_TYPE_P (type))
7518 int i;
7519 tree field;
7520 enum x86_64_reg_class subclasses[MAX_CLASSES];
7522 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7523 if (bytes > 64)
7524 return 0;
7526 for (i = 0; i < words; i++)
7527 classes[i] = X86_64_NO_CLASS;
7529 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7530 signal the memory class, so handle it as a special case. */
7531 if (!words)
7533 classes[0] = X86_64_NO_CLASS;
7534 return 1;
7537 /* Classify each field of record and merge classes. */
7538 switch (TREE_CODE (type))
7540 case RECORD_TYPE:
7541 /* And now merge the fields of structure. */
7542 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7544 if (TREE_CODE (field) == FIELD_DECL)
7546 int num;
7548 if (TREE_TYPE (field) == error_mark_node)
7549 continue;
7551 /* Bitfields are always classified as integer. Handle them
7552 early, since later code would consider them to be
7553 misaligned integers. */
7554 if (DECL_BIT_FIELD (field))
7556 for (i = (int_bit_position (field)
7557 + (bit_offset % 64)) / 8 / 8;
7558 i < ((int_bit_position (field) + (bit_offset % 64))
7559 + tree_to_shwi (DECL_SIZE (field))
7560 + 63) / 8 / 8; i++)
7561 classes[i] =
7562 merge_classes (X86_64_INTEGER_CLASS,
7563 classes[i]);
7565 else
7567 int pos;
7569 type = TREE_TYPE (field);
7571 /* Flexible array member is ignored. */
7572 if (TYPE_MODE (type) == BLKmode
7573 && TREE_CODE (type) == ARRAY_TYPE
7574 && TYPE_SIZE (type) == NULL_TREE
7575 && TYPE_DOMAIN (type) != NULL_TREE
7576 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7577 == NULL_TREE))
7579 static bool warned;
7581 if (!warned && warn_psabi)
7583 warned = true;
7584 inform (input_location,
7585 "the ABI of passing struct with"
7586 " a flexible array member has"
7587 " changed in GCC 4.4");
7589 continue;
7591 num = classify_argument (TYPE_MODE (type), type,
7592 subclasses,
7593 (int_bit_position (field)
7594 + bit_offset) % 512);
7595 if (!num)
7596 return 0;
7597 pos = (int_bit_position (field)
7598 + (bit_offset % 64)) / 8 / 8;
7599 for (i = 0; i < num && (i + pos) < words; i++)
7600 classes[i + pos] =
7601 merge_classes (subclasses[i], classes[i + pos]);
7605 break;
7607 case ARRAY_TYPE:
7608 /* Arrays are handled as small records. */
7610 int num;
7611 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7612 TREE_TYPE (type), subclasses, bit_offset);
7613 if (!num)
7614 return 0;
7616 /* The partial classes are now full classes. */
7617 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7618 subclasses[0] = X86_64_SSE_CLASS;
7619 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7620 && !((bit_offset % 64) == 0 && bytes == 4))
7621 subclasses[0] = X86_64_INTEGER_CLASS;
7623 for (i = 0; i < words; i++)
7624 classes[i] = subclasses[i % num];
7626 break;
7628 case UNION_TYPE:
7629 case QUAL_UNION_TYPE:
7630 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7632 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7634 if (TREE_CODE (field) == FIELD_DECL)
7636 int num;
7638 if (TREE_TYPE (field) == error_mark_node)
7639 continue;
7641 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7642 TREE_TYPE (field), subclasses,
7643 bit_offset);
7644 if (!num)
7645 return 0;
7646 for (i = 0; i < num && i < words; i++)
7647 classes[i] = merge_classes (subclasses[i], classes[i]);
7650 break;
7652 default:
7653 gcc_unreachable ();
7656 if (words > 2)
7658 /* When the size exceeds 16 bytes, if the first class isn't
7659 X86_64_SSE_CLASS or any of the other classes aren't
7660 X86_64_SSEUP_CLASS, everything should be passed in
7661 memory. */
7662 if (classes[0] != X86_64_SSE_CLASS)
7663 return 0;
7665 for (i = 1; i < words; i++)
7666 if (classes[i] != X86_64_SSEUP_CLASS)
7667 return 0;
7670 /* Final merger cleanup. */
7671 for (i = 0; i < words; i++)
7673 /* If one class is MEMORY, everything should be passed in
7674 memory. */
7675 if (classes[i] == X86_64_MEMORY_CLASS)
7676 return 0;
7678 /* X86_64_SSEUP_CLASS should always be preceded by
7679 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7680 if (classes[i] == X86_64_SSEUP_CLASS
7681 && classes[i - 1] != X86_64_SSE_CLASS
7682 && classes[i - 1] != X86_64_SSEUP_CLASS)
7684 /* The first one should never be X86_64_SSEUP_CLASS. */
7685 gcc_assert (i != 0);
7686 classes[i] = X86_64_SSE_CLASS;
7689 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7690 everything should be passed in memory. */
7691 if (classes[i] == X86_64_X87UP_CLASS
7692 && (classes[i - 1] != X86_64_X87_CLASS))
7694 static bool warned;
7696 /* The first one should never be X86_64_X87UP_CLASS. */
7697 gcc_assert (i != 0);
7698 if (!warned && warn_psabi)
7700 warned = true;
7701 inform (input_location,
7702 "the ABI of passing union with long double"
7703 " has changed in GCC 4.4");
7705 return 0;
7708 return words;
7711 /* Compute the alignment needed. We align all types to their natural boundaries,
7712 with the exception of XFmode, which is aligned to 64 bits. */
7713 if (mode != VOIDmode && mode != BLKmode)
7715 int mode_alignment = GET_MODE_BITSIZE (mode);
7717 if (mode == XFmode)
7718 mode_alignment = 128;
7719 else if (mode == XCmode)
7720 mode_alignment = 256;
7721 if (COMPLEX_MODE_P (mode))
7722 mode_alignment /= 2;
7723 /* Misaligned fields are always returned in memory. */
7724 if (bit_offset % mode_alignment)
7725 return 0;
7728 /* for V1xx modes, just use the base mode */
7729 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7730 && GET_MODE_UNIT_SIZE (mode) == bytes)
7731 mode = GET_MODE_INNER (mode);
7733 /* Classification of atomic types. */
7734 switch (mode)
7736 case E_SDmode:
7737 case E_DDmode:
7738 classes[0] = X86_64_SSE_CLASS;
7739 return 1;
7740 case E_TDmode:
7741 classes[0] = X86_64_SSE_CLASS;
7742 classes[1] = X86_64_SSEUP_CLASS;
7743 return 2;
7744 case E_DImode:
7745 case E_SImode:
7746 case E_HImode:
7747 case E_QImode:
7748 case E_CSImode:
7749 case E_CHImode:
7750 case E_CQImode:
7752 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7754 /* Analyze last 128 bits only. */
7755 size = (size - 1) & 0x7f;
7757 if (size < 32)
7759 classes[0] = X86_64_INTEGERSI_CLASS;
7760 return 1;
7762 else if (size < 64)
7764 classes[0] = X86_64_INTEGER_CLASS;
7765 return 1;
7767 else if (size < 64+32)
7769 classes[0] = X86_64_INTEGER_CLASS;
7770 classes[1] = X86_64_INTEGERSI_CLASS;
7771 return 2;
7773 else if (size < 64+64)
7775 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7776 return 2;
7778 else
7779 gcc_unreachable ();
7781 case E_CDImode:
7782 case E_TImode:
7783 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7784 return 2;
7785 case E_COImode:
7786 case E_OImode:
7787 /* OImode shouldn't be used directly. */
7788 gcc_unreachable ();
7789 case E_CTImode:
7790 return 0;
7791 case E_SFmode:
7792 if (!(bit_offset % 64))
7793 classes[0] = X86_64_SSESF_CLASS;
7794 else
7795 classes[0] = X86_64_SSE_CLASS;
7796 return 1;
7797 case E_DFmode:
7798 classes[0] = X86_64_SSEDF_CLASS;
7799 return 1;
7800 case E_XFmode:
7801 classes[0] = X86_64_X87_CLASS;
7802 classes[1] = X86_64_X87UP_CLASS;
7803 return 2;
7804 case E_TFmode:
7805 classes[0] = X86_64_SSE_CLASS;
7806 classes[1] = X86_64_SSEUP_CLASS;
7807 return 2;
7808 case E_SCmode:
7809 classes[0] = X86_64_SSE_CLASS;
7810 if (!(bit_offset % 64))
7811 return 1;
7812 else
7814 static bool warned;
7816 if (!warned && warn_psabi)
7818 warned = true;
7819 inform (input_location,
7820 "the ABI of passing structure with complex float"
7821 " member has changed in GCC 4.4");
7823 classes[1] = X86_64_SSESF_CLASS;
7824 return 2;
7826 case E_DCmode:
7827 classes[0] = X86_64_SSEDF_CLASS;
7828 classes[1] = X86_64_SSEDF_CLASS;
7829 return 2;
7830 case E_XCmode:
7831 classes[0] = X86_64_COMPLEX_X87_CLASS;
7832 return 1;
7833 case E_TCmode:
7834 /* This mode is larger than 16 bytes. */
7835 return 0;
7836 case E_V8SFmode:
7837 case E_V8SImode:
7838 case E_V32QImode:
7839 case E_V16HImode:
7840 case E_V4DFmode:
7841 case E_V4DImode:
7842 classes[0] = X86_64_SSE_CLASS;
7843 classes[1] = X86_64_SSEUP_CLASS;
7844 classes[2] = X86_64_SSEUP_CLASS;
7845 classes[3] = X86_64_SSEUP_CLASS;
7846 return 4;
7847 case E_V8DFmode:
7848 case E_V16SFmode:
7849 case E_V8DImode:
7850 case E_V16SImode:
7851 case E_V32HImode:
7852 case E_V64QImode:
7853 classes[0] = X86_64_SSE_CLASS;
7854 classes[1] = X86_64_SSEUP_CLASS;
7855 classes[2] = X86_64_SSEUP_CLASS;
7856 classes[3] = X86_64_SSEUP_CLASS;
7857 classes[4] = X86_64_SSEUP_CLASS;
7858 classes[5] = X86_64_SSEUP_CLASS;
7859 classes[6] = X86_64_SSEUP_CLASS;
7860 classes[7] = X86_64_SSEUP_CLASS;
7861 return 8;
7862 case E_V4SFmode:
7863 case E_V4SImode:
7864 case E_V16QImode:
7865 case E_V8HImode:
7866 case E_V2DFmode:
7867 case E_V2DImode:
7868 classes[0] = X86_64_SSE_CLASS;
7869 classes[1] = X86_64_SSEUP_CLASS;
7870 return 2;
7871 case E_V1TImode:
7872 case E_V1DImode:
7873 case E_V2SFmode:
7874 case E_V2SImode:
7875 case E_V4HImode:
7876 case E_V8QImode:
7877 classes[0] = X86_64_SSE_CLASS;
7878 return 1;
7879 case E_BLKmode:
7880 case E_VOIDmode:
7881 return 0;
7882 default:
7883 gcc_assert (VECTOR_MODE_P (mode));
7885 if (bytes > 16)
7886 return 0;
7888 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7890 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7891 classes[0] = X86_64_INTEGERSI_CLASS;
7892 else
7893 classes[0] = X86_64_INTEGER_CLASS;
7894 classes[1] = X86_64_INTEGER_CLASS;
7895 return 1 + (bytes > 8);
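/* A rough worked example of the classification above (not normative):

     struct pt { double x; long n; };

   yields two words with classes[0] == X86_64_SSEDF_CLASS and
   classes[1] == X86_64_INTEGER_CLASS, so construct_container below places
   the double in an SSE register and the long in an integer register.  An
   aggregate larger than 16 bytes that is not a single SSE/SSEUP vector
   returns 0 here and is passed in memory instead.  */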
7899 /* Examine the argument and set the number of registers required in each
7900 class. Return true iff the parameter should be passed in memory. */
7902 static bool
7903 examine_argument (machine_mode mode, const_tree type, int in_return,
7904 int *int_nregs, int *sse_nregs)
7906 enum x86_64_reg_class regclass[MAX_CLASSES];
7907 int n = classify_argument (mode, type, regclass, 0);
7909 *int_nregs = 0;
7910 *sse_nregs = 0;
7912 if (!n)
7913 return true;
7914 for (n--; n >= 0; n--)
7915 switch (regclass[n])
7917 case X86_64_INTEGER_CLASS:
7918 case X86_64_INTEGERSI_CLASS:
7919 (*int_nregs)++;
7920 break;
7921 case X86_64_SSE_CLASS:
7922 case X86_64_SSESF_CLASS:
7923 case X86_64_SSEDF_CLASS:
7924 (*sse_nregs)++;
7925 break;
7926 case X86_64_NO_CLASS:
7927 case X86_64_SSEUP_CLASS:
7928 break;
7929 case X86_64_X87_CLASS:
7930 case X86_64_X87UP_CLASS:
7931 case X86_64_COMPLEX_X87_CLASS:
7932 if (!in_return)
7933 return true;
7934 break;
7935 case X86_64_MEMORY_CLASS:
7936 gcc_unreachable ();
7939 return false;
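/* Quick illustration: for the struct pt example above, examine_argument
   sets *int_nregs = 1 and *sse_nregs = 1 and returns false.  For a type
   containing a long double member it returns true when used as an argument
   (the X87 classes force memory) but false for a return value, which uses
   %st(0).  */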
7942 /* Construct container for the argument used by GCC interface. See
7943 FUNCTION_ARG for the detailed description. */
7945 static rtx
7946 construct_container (machine_mode mode, machine_mode orig_mode,
7947 const_tree type, int in_return, int nintregs, int nsseregs,
7948 const int *intreg, int sse_regno)
7950 /* The following variables hold the static issued_error state. */
7951 static bool issued_sse_arg_error;
7952 static bool issued_sse_ret_error;
7953 static bool issued_x87_ret_error;
7955 machine_mode tmpmode;
7956 int bytes =
7957 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7958 enum x86_64_reg_class regclass[MAX_CLASSES];
7959 int n;
7960 int i;
7961 int nexps = 0;
7962 int needed_sseregs, needed_intregs;
7963 rtx exp[MAX_CLASSES];
7964 rtx ret;
7966 n = classify_argument (mode, type, regclass, 0);
7967 if (!n)
7968 return NULL;
7969 if (examine_argument (mode, type, in_return, &needed_intregs,
7970 &needed_sseregs))
7971 return NULL;
7972 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7973 return NULL;
7975 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7976 some less clueful developer tries to use floating-point anyway. */
7977 if (needed_sseregs && !TARGET_SSE)
7979 if (in_return)
7981 if (!issued_sse_ret_error)
7983 error ("SSE register return with SSE disabled");
7984 issued_sse_ret_error = true;
7987 else if (!issued_sse_arg_error)
7989 error ("SSE register argument with SSE disabled");
7990 issued_sse_arg_error = true;
7992 return NULL;
7995 /* Likewise, error if the ABI requires us to return values in the
7996 x87 registers and the user specified -mno-80387. */
7997 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
7998 for (i = 0; i < n; i++)
7999 if (regclass[i] == X86_64_X87_CLASS
8000 || regclass[i] == X86_64_X87UP_CLASS
8001 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8003 if (!issued_x87_ret_error)
8005 error ("x87 register return with x87 disabled");
8006 issued_x87_ret_error = true;
8008 return NULL;
8011 /* First construct the simple cases. Avoid SCmode, since we want to use
8012 a single register to pass this type. */
8013 if (n == 1 && mode != SCmode)
8014 switch (regclass[0])
8016 case X86_64_INTEGER_CLASS:
8017 case X86_64_INTEGERSI_CLASS:
8018 return gen_rtx_REG (mode, intreg[0]);
8019 case X86_64_SSE_CLASS:
8020 case X86_64_SSESF_CLASS:
8021 case X86_64_SSEDF_CLASS:
8022 if (mode != BLKmode)
8023 return gen_reg_or_parallel (mode, orig_mode,
8024 SSE_REGNO (sse_regno));
8025 break;
8026 case X86_64_X87_CLASS:
8027 case X86_64_COMPLEX_X87_CLASS:
8028 return gen_rtx_REG (mode, FIRST_STACK_REG);
8029 case X86_64_NO_CLASS:
8030 /* Zero sized array, struct or class. */
8031 return NULL;
8032 default:
8033 gcc_unreachable ();
8035 if (n == 2
8036 && regclass[0] == X86_64_SSE_CLASS
8037 && regclass[1] == X86_64_SSEUP_CLASS
8038 && mode != BLKmode)
8039 return gen_reg_or_parallel (mode, orig_mode,
8040 SSE_REGNO (sse_regno));
8041 if (n == 4
8042 && regclass[0] == X86_64_SSE_CLASS
8043 && regclass[1] == X86_64_SSEUP_CLASS
8044 && regclass[2] == X86_64_SSEUP_CLASS
8045 && regclass[3] == X86_64_SSEUP_CLASS
8046 && mode != BLKmode)
8047 return gen_reg_or_parallel (mode, orig_mode,
8048 SSE_REGNO (sse_regno));
8049 if (n == 8
8050 && regclass[0] == X86_64_SSE_CLASS
8051 && regclass[1] == X86_64_SSEUP_CLASS
8052 && regclass[2] == X86_64_SSEUP_CLASS
8053 && regclass[3] == X86_64_SSEUP_CLASS
8054 && regclass[4] == X86_64_SSEUP_CLASS
8055 && regclass[5] == X86_64_SSEUP_CLASS
8056 && regclass[6] == X86_64_SSEUP_CLASS
8057 && regclass[7] == X86_64_SSEUP_CLASS
8058 && mode != BLKmode)
8059 return gen_reg_or_parallel (mode, orig_mode,
8060 SSE_REGNO (sse_regno));
8061 if (n == 2
8062 && regclass[0] == X86_64_X87_CLASS
8063 && regclass[1] == X86_64_X87UP_CLASS)
8064 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8066 if (n == 2
8067 && regclass[0] == X86_64_INTEGER_CLASS
8068 && regclass[1] == X86_64_INTEGER_CLASS
8069 && (mode == CDImode || mode == TImode)
8070 && intreg[0] + 1 == intreg[1])
8071 return gen_rtx_REG (mode, intreg[0]);
8073 /* Otherwise figure out the entries of the PARALLEL. */
8074 for (i = 0; i < n; i++)
8076 int pos;
8078 switch (regclass[i])
8080 case X86_64_NO_CLASS:
8081 break;
8082 case X86_64_INTEGER_CLASS:
8083 case X86_64_INTEGERSI_CLASS:
8084 /* Merge TImodes on aligned occasions here too. */
8085 if (i * 8 + 8 > bytes)
8087 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8088 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8089 /* We've requested 24 bytes, which we
8090 don't have a mode for. Use DImode. */
8091 tmpmode = DImode;
8093 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8094 tmpmode = SImode;
8095 else
8096 tmpmode = DImode;
8097 exp [nexps++]
8098 = gen_rtx_EXPR_LIST (VOIDmode,
8099 gen_rtx_REG (tmpmode, *intreg),
8100 GEN_INT (i*8));
8101 intreg++;
8102 break;
8103 case X86_64_SSESF_CLASS:
8104 exp [nexps++]
8105 = gen_rtx_EXPR_LIST (VOIDmode,
8106 gen_rtx_REG (SFmode,
8107 SSE_REGNO (sse_regno)),
8108 GEN_INT (i*8));
8109 sse_regno++;
8110 break;
8111 case X86_64_SSEDF_CLASS:
8112 exp [nexps++]
8113 = gen_rtx_EXPR_LIST (VOIDmode,
8114 gen_rtx_REG (DFmode,
8115 SSE_REGNO (sse_regno)),
8116 GEN_INT (i*8));
8117 sse_regno++;
8118 break;
8119 case X86_64_SSE_CLASS:
8120 pos = i;
8121 switch (n)
8123 case 1:
8124 tmpmode = DImode;
8125 break;
8126 case 2:
8127 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8129 tmpmode = TImode;
8130 i++;
8132 else
8133 tmpmode = DImode;
8134 break;
8135 case 4:
8136 gcc_assert (i == 0
8137 && regclass[1] == X86_64_SSEUP_CLASS
8138 && regclass[2] == X86_64_SSEUP_CLASS
8139 && regclass[3] == X86_64_SSEUP_CLASS);
8140 tmpmode = OImode;
8141 i += 3;
8142 break;
8143 case 8:
8144 gcc_assert (i == 0
8145 && regclass[1] == X86_64_SSEUP_CLASS
8146 && regclass[2] == X86_64_SSEUP_CLASS
8147 && regclass[3] == X86_64_SSEUP_CLASS
8148 && regclass[4] == X86_64_SSEUP_CLASS
8149 && regclass[5] == X86_64_SSEUP_CLASS
8150 && regclass[6] == X86_64_SSEUP_CLASS
8151 && regclass[7] == X86_64_SSEUP_CLASS);
8152 tmpmode = XImode;
8153 i += 7;
8154 break;
8155 default:
8156 gcc_unreachable ();
8158 exp [nexps++]
8159 = gen_rtx_EXPR_LIST (VOIDmode,
8160 gen_rtx_REG (tmpmode,
8161 SSE_REGNO (sse_regno)),
8162 GEN_INT (pos*8));
8163 sse_regno++;
8164 break;
8165 default:
8166 gcc_unreachable ();
8170 /* Empty aligned struct, union or class. */
8171 if (nexps == 0)
8172 return NULL;
8174 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8175 for (i = 0; i < nexps; i++)
8176 XVECEXP (ret, 0, i) = exp [i];
8177 return ret;
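/* Container illustration (a sketch of the resulting RTL): for
   struct pt { double x; long n; } returned by value, the PARALLEL built
   above looks roughly like

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI ax) (const_int 8))])

   i.e. one EXPR_LIST per eightbyte, pairing a hard register with the byte
   offset it covers within the value.  */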
8180 /* Update the data in CUM to advance over an argument of mode MODE
8181 and data type TYPE. (TYPE is null for libcalls where that information
8182 may not be available.)
8184 Return the number of integer registers advanced over. */
8186 static int
8187 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8188 const_tree type, HOST_WIDE_INT bytes,
8189 HOST_WIDE_INT words)
8191 int res = 0;
8192 bool error_p = false;
8194 if (TARGET_IAMCU)
8196 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8197 bytes in registers. */
8198 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8199 goto pass_in_reg;
8200 return res;
8203 switch (mode)
8205 default:
8206 break;
8208 case E_BLKmode:
8209 if (bytes < 0)
8210 break;
8211 /* FALLTHRU */
8213 case E_DImode:
8214 case E_SImode:
8215 case E_HImode:
8216 case E_QImode:
8217 pass_in_reg:
8218 cum->words += words;
8219 cum->nregs -= words;
8220 cum->regno += words;
8221 if (cum->nregs >= 0)
8222 res = words;
8223 if (cum->nregs <= 0)
8225 cum->nregs = 0;
8226 cfun->machine->arg_reg_available = false;
8227 cum->regno = 0;
8229 break;
8231 case E_OImode:
8232 /* OImode shouldn't be used directly. */
8233 gcc_unreachable ();
8235 case E_DFmode:
8236 if (cum->float_in_sse == -1)
8237 error_p = true;
8238 if (cum->float_in_sse < 2)
8239 break;
8240 /* FALLTHRU */
8241 case E_SFmode:
8242 if (cum->float_in_sse == -1)
8243 error_p = true;
8244 if (cum->float_in_sse < 1)
8245 break;
8246 /* FALLTHRU */
8248 case E_V8SFmode:
8249 case E_V8SImode:
8250 case E_V64QImode:
8251 case E_V32HImode:
8252 case E_V16SImode:
8253 case E_V8DImode:
8254 case E_V16SFmode:
8255 case E_V8DFmode:
8256 case E_V32QImode:
8257 case E_V16HImode:
8258 case E_V4DFmode:
8259 case E_V4DImode:
8260 case E_TImode:
8261 case E_V16QImode:
8262 case E_V8HImode:
8263 case E_V4SImode:
8264 case E_V2DImode:
8265 case E_V4SFmode:
8266 case E_V2DFmode:
8267 if (!type || !AGGREGATE_TYPE_P (type))
8269 cum->sse_words += words;
8270 cum->sse_nregs -= 1;
8271 cum->sse_regno += 1;
8272 if (cum->sse_nregs <= 0)
8274 cum->sse_nregs = 0;
8275 cum->sse_regno = 0;
8278 break;
8280 case E_V8QImode:
8281 case E_V4HImode:
8282 case E_V2SImode:
8283 case E_V2SFmode:
8284 case E_V1TImode:
8285 case E_V1DImode:
8286 if (!type || !AGGREGATE_TYPE_P (type))
8288 cum->mmx_words += words;
8289 cum->mmx_nregs -= 1;
8290 cum->mmx_regno += 1;
8291 if (cum->mmx_nregs <= 0)
8293 cum->mmx_nregs = 0;
8294 cum->mmx_regno = 0;
8297 break;
8299 if (error_p)
8301 cum->float_in_sse = 0;
8302 error ("calling %qD with SSE calling convention without "
8303 "SSE/SSE2 enabled", cum->decl);
8304 sorry ("this is a GCC bug that can be worked around by adding "
8305 "attribute used to function called");
8308 return res;
8311 static int
8312 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8313 const_tree type, HOST_WIDE_INT words, bool named)
8315 int int_nregs, sse_nregs;
8317 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8318 if (!named && (VALID_AVX512F_REG_MODE (mode)
8319 || VALID_AVX256_REG_MODE (mode)))
8320 return 0;
8322 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8323 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8325 cum->nregs -= int_nregs;
8326 cum->sse_nregs -= sse_nregs;
8327 cum->regno += int_nregs;
8328 cum->sse_regno += sse_nregs;
8329 return int_nregs;
8331 else
8333 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8334 cum->words = ROUND_UP (cum->words, align);
8335 cum->words += words;
8336 return 0;
8340 static int
8341 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8342 HOST_WIDE_INT words)
8344 /* Otherwise, this should be passed indirect. */
8345 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8347 cum->words += words;
8348 if (cum->nregs > 0)
8350 cum->nregs -= 1;
8351 cum->regno += 1;
8352 return 1;
8354 return 0;
8357 /* Update the data in CUM to advance over an argument of mode MODE and
8358 data type TYPE. (TYPE is null for libcalls where that information
8359 may not be available.) */
8361 static void
8362 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8363 const_tree type, bool named)
8365 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8366 HOST_WIDE_INT bytes, words;
8367 int nregs;
8369 /* The argument of an interrupt handler is a special case and is
8370 handled in ix86_function_arg. */
8371 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8372 return;
8374 if (mode == BLKmode)
8375 bytes = int_size_in_bytes (type);
8376 else
8377 bytes = GET_MODE_SIZE (mode);
8378 words = CEIL (bytes, UNITS_PER_WORD);
8380 if (type)
8381 mode = type_natural_mode (type, NULL, false);
8383 if ((type && POINTER_BOUNDS_TYPE_P (type))
8384 || POINTER_BOUNDS_MODE_P (mode))
8386 /* If we pass bounds in BT then just update remained bounds count. */
8387 if (cum->bnds_in_bt)
8389 cum->bnds_in_bt--;
8390 return;
8393 /* Update remained number of bounds to force. */
8394 if (cum->force_bnd_pass)
8395 cum->force_bnd_pass--;
8397 cum->bnd_regno++;
8399 return;
8402 /* The first arg not going to Bounds Tables resets this counter. */
8403 cum->bnds_in_bt = 0;
8404 /* For unnamed args we always pass bounds, to avoid a bounds mismatch when
8405 the passed and received types do not match. If bounds do not follow an
8406 unnamed arg, still pretend the required number of bounds were passed. */
8407 if (cum->force_bnd_pass)
8409 cum->bnd_regno += cum->force_bnd_pass;
8410 cum->force_bnd_pass = 0;
8413 if (TARGET_64BIT)
8415 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8417 if (call_abi == MS_ABI)
8418 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8419 else
8420 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8422 else
8423 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8425 /* For stdarg we expect bounds to be passed for each value passed
8426 in register. */
8427 if (cum->stdarg)
8428 cum->force_bnd_pass = nregs;
8429 /* For pointers passed in memory we expect bounds passed in Bounds
8430 Table. */
8431 if (!nregs)
8433 /* Track if there are outgoing arguments on stack. */
8434 if (cum->caller)
8435 cfun->machine->outgoing_args_on_stack = true;
8437 cum->bnds_in_bt = chkp_type_bounds_count (type);
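/* Usage sketch (illustration only, MPX bookkeeping omitted): for a 64-bit
   SysV prototype such as

     void f (int a, double x, __int128 b);

   successive advances consume one integer register for A, one SSE register
   for X and two integer registers for B, leaving cum->regno == 3 and
   cum->sse_regno == 1.  Once either register pool is exhausted, further
   arguments only bump cum->words and are passed on the stack.  */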
8441 /* Define where to put the arguments to a function.
8442 Value is zero to push the argument on the stack,
8443 or a hard register in which to store the argument.
8445 MODE is the argument's machine mode.
8446 TYPE is the data type of the argument (as a tree).
8447 This is null for libcalls where that information may
8448 not be available.
8449 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8450 the preceding args and about the function being called.
8451 NAMED is nonzero if this argument is a named parameter
8452 (otherwise it is an extra parameter matching an ellipsis). */
8454 static rtx
8455 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8456 machine_mode orig_mode, const_tree type,
8457 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8459 bool error_p = false;
8461 /* Avoid the AL settings for the Unix64 ABI. */
8462 if (mode == VOIDmode)
8463 return constm1_rtx;
8465 if (TARGET_IAMCU)
8467 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8468 bytes in registers. */
8469 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8470 goto pass_in_reg;
8471 return NULL_RTX;
8474 switch (mode)
8476 default:
8477 break;
8479 case E_BLKmode:
8480 if (bytes < 0)
8481 break;
8482 /* FALLTHRU */
8483 case E_DImode:
8484 case E_SImode:
8485 case E_HImode:
8486 case E_QImode:
8487 pass_in_reg:
8488 if (words <= cum->nregs)
8490 int regno = cum->regno;
8492 /* Fastcall allocates the first two DWORD (SImode) or
8493 smaller arguments to ECX and EDX if the argument isn't
8494 an aggregate type. */
8495 if (cum->fastcall)
8497 if (mode == BLKmode
8498 || mode == DImode
8499 || (type && AGGREGATE_TYPE_P (type)))
8500 break;
8502 /* ECX not EAX is the first allocated register. */
8503 if (regno == AX_REG)
8504 regno = CX_REG;
8506 return gen_rtx_REG (mode, regno);
8508 break;
8510 case E_DFmode:
8511 if (cum->float_in_sse == -1)
8512 error_p = true;
8513 if (cum->float_in_sse < 2)
8514 break;
8515 /* FALLTHRU */
8516 case E_SFmode:
8517 if (cum->float_in_sse == -1)
8518 error_p = true;
8519 if (cum->float_in_sse < 1)
8520 break;
8521 /* FALLTHRU */
8522 case E_TImode:
8523 /* In 32bit, we pass TImode in xmm registers. */
8524 case E_V16QImode:
8525 case E_V8HImode:
8526 case E_V4SImode:
8527 case E_V2DImode:
8528 case E_V4SFmode:
8529 case E_V2DFmode:
8530 if (!type || !AGGREGATE_TYPE_P (type))
8532 if (cum->sse_nregs)
8533 return gen_reg_or_parallel (mode, orig_mode,
8534 cum->sse_regno + FIRST_SSE_REG);
8536 break;
8538 case E_OImode:
8539 case E_XImode:
8540 /* OImode and XImode shouldn't be used directly. */
8541 gcc_unreachable ();
8543 case E_V64QImode:
8544 case E_V32HImode:
8545 case E_V16SImode:
8546 case E_V8DImode:
8547 case E_V16SFmode:
8548 case E_V8DFmode:
8549 case E_V8SFmode:
8550 case E_V8SImode:
8551 case E_V32QImode:
8552 case E_V16HImode:
8553 case E_V4DFmode:
8554 case E_V4DImode:
8555 if (!type || !AGGREGATE_TYPE_P (type))
8557 if (cum->sse_nregs)
8558 return gen_reg_or_parallel (mode, orig_mode,
8559 cum->sse_regno + FIRST_SSE_REG);
8561 break;
8563 case E_V8QImode:
8564 case E_V4HImode:
8565 case E_V2SImode:
8566 case E_V2SFmode:
8567 case E_V1TImode:
8568 case E_V1DImode:
8569 if (!type || !AGGREGATE_TYPE_P (type))
8571 if (cum->mmx_nregs)
8572 return gen_reg_or_parallel (mode, orig_mode,
8573 cum->mmx_regno + FIRST_MMX_REG);
8575 break;
8577 if (error_p)
8579 cum->float_in_sse = 0;
8580 error ("calling %qD with SSE calling convention without "
8581 "SSE/SSE2 enabled", cum->decl);
8582 sorry ("this is a GCC bug that can be worked around by adding "
8583 "attribute used to function called");
8586 return NULL_RTX;
8589 static rtx
8590 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8591 machine_mode orig_mode, const_tree type, bool named)
8593 /* Handle a hidden AL argument containing number of registers
8594 for varargs x86-64 functions. */
8595 if (mode == VOIDmode)
8596 return GEN_INT (cum->maybe_vaarg
8597 ? (cum->sse_nregs < 0
8598 ? X86_64_SSE_REGPARM_MAX
8599 : cum->sse_regno)
8600 : -1);
8602 switch (mode)
8604 default:
8605 break;
8607 case E_V8SFmode:
8608 case E_V8SImode:
8609 case E_V32QImode:
8610 case E_V16HImode:
8611 case E_V4DFmode:
8612 case E_V4DImode:
8613 case E_V16SFmode:
8614 case E_V16SImode:
8615 case E_V64QImode:
8616 case E_V32HImode:
8617 case E_V8DFmode:
8618 case E_V8DImode:
8619 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8620 if (!named)
8621 return NULL;
8622 break;
8625 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8626 cum->sse_nregs,
8627 &x86_64_int_parameter_registers [cum->regno],
8628 cum->sse_regno);
8631 static rtx
8632 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8633 machine_mode orig_mode, bool named,
8634 HOST_WIDE_INT bytes)
8636 unsigned int regno;
8638 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8639 We use the value -2 to specify that the current function call is MS ABI. */
8640 if (mode == VOIDmode)
8641 return GEN_INT (-2);
8643 /* If we've run out of registers, it goes on the stack. */
8644 if (cum->nregs == 0)
8645 return NULL_RTX;
8647 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8649 /* Only floating point modes are passed in anything but integer regs. */
8650 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8652 if (named)
8653 regno = cum->regno + FIRST_SSE_REG;
8654 else
8656 rtx t1, t2;
8658 /* Unnamed floating parameters are passed in both the
8659 SSE and integer registers. */
8660 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8661 t2 = gen_rtx_REG (mode, regno);
8662 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8663 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8664 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8667 /* Handle aggregate types passed in a register. */
8668 if (orig_mode == BLKmode)
8670 if (bytes > 0 && bytes <= 8)
8671 mode = (bytes > 4 ? DImode : SImode);
8672 if (mode == BLKmode)
8673 mode = DImode;
8676 return gen_reg_or_parallel (mode, orig_mode, regno);
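/* MS-ABI illustration (a sketch, not normative):

     void w (double a, ...);
     void call (void) { w (1.0, 2.0); }

   the named 1.0 goes in %xmm0 only, while the unnamed 2.0 is described by
   the two-element PARALLEL built above, making its bits available both in
   %xmm1 and in the matching integer register (%rdx), as the Windows x64
   convention requires for varargs.  */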
8679 /* Return where to put the arguments to a function.
8680 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8682 MODE is the argument's machine mode. TYPE is the data type of the
8683 argument. It is null for libcalls where that information may not be
8684 available. CUM gives information about the preceding args and about
8685 the function being called. NAMED is nonzero if this argument is a
8686 named parameter (otherwise it is an extra parameter matching an
8687 ellipsis). */
8689 static rtx
8690 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8691 const_tree type, bool named)
8693 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8694 machine_mode mode = omode;
8695 HOST_WIDE_INT bytes, words;
8696 rtx arg;
8698 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8700 gcc_assert (type != NULL_TREE);
8701 if (POINTER_TYPE_P (type))
8703 /* This is the pointer argument. */
8704 gcc_assert (TYPE_MODE (type) == Pmode);
8705 /* It is at -WORD(AP) in the current frame in interrupt and
8706 exception handlers. */
8707 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8709 else
8711 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8712 && TREE_CODE (type) == INTEGER_TYPE
8713 && TYPE_MODE (type) == word_mode);
8714 /* The error code is the word-mode integer argument at
8715 -2 * WORD(AP) in the current frame of the exception
8716 handler. */
8717 arg = gen_rtx_MEM (word_mode,
8718 plus_constant (Pmode,
8719 arg_pointer_rtx,
8720 -2 * UNITS_PER_WORD));
8722 return arg;
8725 /* All pointer bounds arguments are handled separately here. */
8726 if ((type && POINTER_BOUNDS_TYPE_P (type))
8727 || POINTER_BOUNDS_MODE_P (mode))
8729 /* Return NULL if bounds are forced to go in Bounds Table. */
8730 if (cum->bnds_in_bt)
8731 arg = NULL;
8732 /* Return the next available bound reg if any. */
8733 else if (cum->bnd_regno <= LAST_BND_REG)
8734 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8735 /* Return the next special slot number otherwise. */
8736 else
8737 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8739 return arg;
8742 if (mode == BLKmode)
8743 bytes = int_size_in_bytes (type);
8744 else
8745 bytes = GET_MODE_SIZE (mode);
8746 words = CEIL (bytes, UNITS_PER_WORD);
8748 /* To simplify the code below, represent vector types with a vector mode
8749 even if MMX/SSE are not active. */
8750 if (type && TREE_CODE (type) == VECTOR_TYPE)
8751 mode = type_natural_mode (type, cum, false);
8753 if (TARGET_64BIT)
8755 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8757 if (call_abi == MS_ABI)
8758 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8759 else
8760 arg = function_arg_64 (cum, mode, omode, type, named);
8762 else
8763 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8765 /* Track if there are outgoing arguments on stack. */
8766 if (arg == NULL_RTX && cum->caller)
8767 cfun->machine->outgoing_args_on_stack = true;
8769 return arg;
8772 /* A C expression that indicates when an argument must be passed by
8773 reference. If nonzero for an argument, a copy of that argument is
8774 made in memory and a pointer to the argument is passed instead of
8775 the argument itself. The pointer is passed in whatever way is
8776 appropriate for passing a pointer to that type. */
8778 static bool
8779 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8780 const_tree type, bool)
8782 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8784 /* Bounds are never passed by reference. */
8785 if ((type && POINTER_BOUNDS_TYPE_P (type))
8786 || POINTER_BOUNDS_MODE_P (mode))
8787 return false;
8789 if (TARGET_64BIT)
8791 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8793 /* See Windows x64 Software Convention. */
8794 if (call_abi == MS_ABI)
8796 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8798 if (type)
8800 /* Arrays are passed by reference. */
8801 if (TREE_CODE (type) == ARRAY_TYPE)
8802 return true;
8804 if (RECORD_OR_UNION_TYPE_P (type))
8806 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8807 are passed by reference. */
8808 msize = int_size_in_bytes (type);
8812 /* __m128 is passed by reference. */
8813 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8815 else if (type && int_size_in_bytes (type) == -1)
8816 return true;
8819 return false;
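/* Example of the rule above (illustration only).  Under the MS ABI:

     struct s3 { char c[3]; };    size 3  -> passed by reference
     struct s8 { double d; };     size 8  -> passed by value in a register
     __m128                       size 16 -> passed by reference

   Under the 64-bit SysV ABI none of these are passed by reference; large
   SysV aggregates end up in memory via the classification code rather than
   through an implicit reference.  */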
8822 /* Return true when TYPE should be 128bit aligned for 32bit argument
8823 passing ABI. XXX: This function is obsolete and is only used for
8824 checking psABI compatibility with previous versions of GCC. */
8826 static bool
8827 ix86_compat_aligned_value_p (const_tree type)
8829 machine_mode mode = TYPE_MODE (type);
8830 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8831 || mode == TDmode
8832 || mode == TFmode
8833 || mode == TCmode)
8834 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8835 return true;
8836 if (TYPE_ALIGN (type) < 128)
8837 return false;
8839 if (AGGREGATE_TYPE_P (type))
8841 /* Walk the aggregates recursively. */
8842 switch (TREE_CODE (type))
8844 case RECORD_TYPE:
8845 case UNION_TYPE:
8846 case QUAL_UNION_TYPE:
8848 tree field;
8850 /* Walk all the structure fields. */
8851 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8853 if (TREE_CODE (field) == FIELD_DECL
8854 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8855 return true;
8857 break;
8860 case ARRAY_TYPE:
8861 /* Just for use if some languages pass arrays by value. */
8862 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8863 return true;
8864 break;
8866 default:
8867 gcc_unreachable ();
8870 return false;
8873 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8874 XXX: This function is obsolete and is only used for checking psABI
8875 compatibility with previous versions of GCC. */
8877 static unsigned int
8878 ix86_compat_function_arg_boundary (machine_mode mode,
8879 const_tree type, unsigned int align)
8881 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8882 natural boundaries. */
8883 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8885 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8886 make an exception for SSE modes since these require 128bit
8887 alignment.
8889 The handling here differs from field_alignment. ICC aligns MMX
8890 arguments to 4 byte boundaries, while structure fields are aligned
8891 to 8 byte boundaries. */
8892 if (!type)
8894 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8895 align = PARM_BOUNDARY;
8897 else
8899 if (!ix86_compat_aligned_value_p (type))
8900 align = PARM_BOUNDARY;
8903 if (align > BIGGEST_ALIGNMENT)
8904 align = BIGGEST_ALIGNMENT;
8905 return align;
8908 /* Return true when TYPE should be 128bit aligned for 32bit argument
8909 passing ABI. */
8911 static bool
8912 ix86_contains_aligned_value_p (const_tree type)
8914 machine_mode mode = TYPE_MODE (type);
8916 if (mode == XFmode || mode == XCmode)
8917 return false;
8919 if (TYPE_ALIGN (type) < 128)
8920 return false;
8922 if (AGGREGATE_TYPE_P (type))
8924 /* Walk the aggregates recursively. */
8925 switch (TREE_CODE (type))
8927 case RECORD_TYPE:
8928 case UNION_TYPE:
8929 case QUAL_UNION_TYPE:
8931 tree field;
8933 /* Walk all the structure fields. */
8934 for (field = TYPE_FIELDS (type);
8935 field;
8936 field = DECL_CHAIN (field))
8938 if (TREE_CODE (field) == FIELD_DECL
8939 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8940 return true;
8942 break;
8945 case ARRAY_TYPE:
8946 /* Just for use if some languages pass arrays by value. */
8947 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8948 return true;
8949 break;
8951 default:
8952 gcc_unreachable ();
8955 else
8956 return TYPE_ALIGN (type) >= 128;
8958 return false;
8961 /* Gives the alignment boundary, in bits, of an argument with the
8962 specified mode and type. */
8964 static unsigned int
8965 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8967 unsigned int align;
8968 if (type)
8970 /* Since the main variant type is used for the call, we convert TYPE
8971 to its main variant. */
8972 type = TYPE_MAIN_VARIANT (type);
8973 align = TYPE_ALIGN (type);
8975 else
8976 align = GET_MODE_ALIGNMENT (mode);
8977 if (align < PARM_BOUNDARY)
8978 align = PARM_BOUNDARY;
8979 else
8981 static bool warned;
8982 unsigned int saved_align = align;
8984 if (!TARGET_64BIT)
8986 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8987 if (!type)
8989 if (mode == XFmode || mode == XCmode)
8990 align = PARM_BOUNDARY;
8992 else if (!ix86_contains_aligned_value_p (type))
8993 align = PARM_BOUNDARY;
8995 if (align < 128)
8996 align = PARM_BOUNDARY;
8999 if (warn_psabi
9000 && !warned
9001 && align != ix86_compat_function_arg_boundary (mode, type,
9002 saved_align))
9004 warned = true;
9005 inform (input_location,
9006 "The ABI for passing parameters with %d-byte"
9007 " alignment has changed in GCC 4.6",
9008 align / BITS_PER_UNIT);
9012 return align;
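/* Alignment sketch (illustrative): with -m32 a double argument gets
   PARM_BOUNDARY (32 bits) even though the type itself is 64-bit aligned,
   whereas

     typedef float v4sf __attribute__ ((vector_size (16)));

   keeps its 128-bit boundary.  Whenever the result differs from what the
   pre-GCC-4.6 rule in ix86_compat_function_arg_boundary would have given,
   the -Wpsabi note above is emitted once per compilation.  */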
9015 /* Return true if N is a possible register number of function value. */
9017 static bool
9018 ix86_function_value_regno_p (const unsigned int regno)
9020 switch (regno)
9022 case AX_REG:
9023 return true;
9024 case DX_REG:
9025 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9026 case DI_REG:
9027 case SI_REG:
9028 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9030 case BND0_REG:
9031 case BND1_REG:
9032 return chkp_function_instrumented_p (current_function_decl);
9034 /* Complex values are returned in %st(0)/%st(1) pair. */
9035 case ST0_REG:
9036 case ST1_REG:
9037 /* TODO: The function should depend on current function ABI but
9038 builtins.c would need updating then. Therefore we use the
9039 default ABI. */
9040 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9041 return false;
9042 return TARGET_FLOAT_RETURNS_IN_80387;
9044 /* Complex values are returned in %xmm0/%xmm1 pair. */
9045 case XMM0_REG:
9046 case XMM1_REG:
9047 return TARGET_SSE;
9049 case MM0_REG:
9050 if (TARGET_MACHO || TARGET_64BIT)
9051 return false;
9052 return TARGET_MMX;
9055 return false;
9058 /* Define how to find the value returned by a function.
9059 VALTYPE is the data type of the value (as a tree).
9060 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9061 otherwise, FUNC is 0. */
9063 static rtx
9064 function_value_32 (machine_mode orig_mode, machine_mode mode,
9065 const_tree fntype, const_tree fn)
9067 unsigned int regno;
9069 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9070 we normally prevent this case when mmx is not available. However
9071 some ABIs may require the result to be returned like DImode. */
9072 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9073 regno = FIRST_MMX_REG;
9075 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9076 we prevent this case when sse is not available. However some ABIs
9077 may require the result to be returned like integer TImode. */
9078 else if (mode == TImode
9079 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9080 regno = FIRST_SSE_REG;
9082 /* 32-byte vector modes in %ymm0. */
9083 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9084 regno = FIRST_SSE_REG;
9086 /* 64-byte vector modes in %zmm0. */
9087 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9088 regno = FIRST_SSE_REG;
9090 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9091 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9092 regno = FIRST_FLOAT_REG;
9093 else
9094 /* Most things go in %eax. */
9095 regno = AX_REG;
9097 /* Override FP return register with %xmm0 for local functions when
9098 SSE math is enabled or for functions with sseregparm attribute. */
9099 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9101 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9102 if (sse_level == -1)
9104 error ("calling %qD with SSE calling convention without "
9105 "SSE/SSE2 enabled", fn);
9106 sorry ("this is a GCC bug that can be worked around by adding "
9107 "attribute used to function called");
9109 else if ((sse_level >= 1 && mode == SFmode)
9110 || (sse_level == 2 && mode == DFmode))
9111 regno = FIRST_SSE_REG;
9114 /* OImode shouldn't be used directly. */
9115 gcc_assert (mode != OImode);
9117 return gen_rtx_REG (orig_mode, regno);
9120 static rtx
9121 function_value_64 (machine_mode orig_mode, machine_mode mode,
9122 const_tree valtype)
9124 rtx ret;
9126 /* Handle libcalls, which don't provide a type node. */
9127 if (valtype == NULL)
9129 unsigned int regno;
9131 switch (mode)
9133 case E_SFmode:
9134 case E_SCmode:
9135 case E_DFmode:
9136 case E_DCmode:
9137 case E_TFmode:
9138 case E_SDmode:
9139 case E_DDmode:
9140 case E_TDmode:
9141 regno = FIRST_SSE_REG;
9142 break;
9143 case E_XFmode:
9144 case E_XCmode:
9145 regno = FIRST_FLOAT_REG;
9146 break;
9147 case E_TCmode:
9148 return NULL;
9149 default:
9150 regno = AX_REG;
9153 return gen_rtx_REG (mode, regno);
9155 else if (POINTER_TYPE_P (valtype))
9157 /* Pointers are always returned in word_mode. */
9158 mode = word_mode;
9161 ret = construct_container (mode, orig_mode, valtype, 1,
9162 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9163 x86_64_int_return_registers, 0);
9165 /* For zero sized structures, construct_container returns NULL, but we
9166 need to keep the rest of the compiler happy by returning a meaningful value. */
9167 if (!ret)
9168 ret = gen_rtx_REG (orig_mode, AX_REG);
9170 return ret;
9173 static rtx
9174 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9175 const_tree valtype)
9177 unsigned int regno = AX_REG;
9179 if (TARGET_SSE)
9181 switch (GET_MODE_SIZE (mode))
9183 case 16:
9184 if (valtype != NULL_TREE
9185 && !VECTOR_INTEGER_TYPE_P (valtype)
9187 && !INTEGRAL_TYPE_P (valtype)
9188 && !VECTOR_FLOAT_TYPE_P (valtype))
9189 break;
9190 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9191 && !COMPLEX_MODE_P (mode))
9192 regno = FIRST_SSE_REG;
9193 break;
9194 case 8:
9195 case 4:
9196 if (mode == SFmode || mode == DFmode)
9197 regno = FIRST_SSE_REG;
9198 break;
9199 default:
9200 break;
9203 return gen_rtx_REG (orig_mode, regno);
9206 static rtx
9207 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9208 machine_mode orig_mode, machine_mode mode)
9210 const_tree fn, fntype;
9212 fn = NULL_TREE;
9213 if (fntype_or_decl && DECL_P (fntype_or_decl))
9214 fn = fntype_or_decl;
9215 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9217 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9218 || POINTER_BOUNDS_MODE_P (mode))
9219 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9220 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9221 return function_value_ms_64 (orig_mode, mode, valtype);
9222 else if (TARGET_64BIT)
9223 return function_value_64 (orig_mode, mode, valtype);
9224 else
9225 return function_value_32 (orig_mode, mode, fntype, fn);
9228 static rtx
9229 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9231 machine_mode mode, orig_mode;
9233 orig_mode = TYPE_MODE (valtype);
9234 mode = type_natural_mode (valtype, NULL, true);
9235 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9238 /* Return an RTX representing a place where a function returns
9239 or receives pointer bounds, or NULL if no bounds are returned.
9241 VALTYPE is a data type of a value returned by the function.
9243 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9244 or FUNCTION_TYPE of the function.
9246 If OUTGOING is false, return a place in which the caller will
9247 see the return value. Otherwise, return a place where a
9248 function returns a value. */
9250 static rtx
9251 ix86_function_value_bounds (const_tree valtype,
9252 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9253 bool outgoing ATTRIBUTE_UNUSED)
9255 rtx res = NULL_RTX;
9257 if (BOUNDED_TYPE_P (valtype))
9258 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9259 else if (chkp_type_has_pointer (valtype))
9261 bitmap slots;
9262 rtx bounds[2];
9263 bitmap_iterator bi;
9264 unsigned i, bnd_no = 0;
9266 bitmap_obstack_initialize (NULL);
9267 slots = BITMAP_ALLOC (NULL);
9268 chkp_find_bound_slots (valtype, slots);
9270 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9272 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9273 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9274 gcc_assert (bnd_no < 2);
9275 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9278 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9280 BITMAP_FREE (slots);
9281 bitmap_obstack_release (NULL);
9283 else
9284 res = NULL_RTX;
9286 return res;
9289 /* Pointer function arguments and return values are promoted to
9290 word_mode for normal functions. */
9292 static machine_mode
9293 ix86_promote_function_mode (const_tree type, machine_mode mode,
9294 int *punsignedp, const_tree fntype,
9295 int for_return)
9297 if (cfun->machine->func_type == TYPE_NORMAL
9298 && type != NULL_TREE
9299 && POINTER_TYPE_P (type))
9301 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9302 return word_mode;
9304 return default_promote_function_mode (type, mode, punsignedp, fntype,
9305 for_return);
9308 /* Return true if a structure, union or array with MODE containing FIELD
9309 should be accessed using BLKmode. */
9311 static bool
9312 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9314 /* Union with XFmode must be in BLKmode. */
9315 return (mode == XFmode
9316 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9317 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9320 rtx
9321 ix86_libcall_value (machine_mode mode)
9323 return ix86_function_value_1 (NULL, NULL, mode, mode);
9326 /* Return true iff type is returned in memory. */
9328 static bool
9329 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9331 #ifdef SUBTARGET_RETURN_IN_MEMORY
9332 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9333 #else
9334 const machine_mode mode = type_natural_mode (type, NULL, true);
9335 HOST_WIDE_INT size;
9337 if (POINTER_BOUNDS_TYPE_P (type))
9338 return false;
9340 if (TARGET_64BIT)
9342 if (ix86_function_type_abi (fntype) == MS_ABI)
9344 size = int_size_in_bytes (type);
9346 /* __m128 is returned in xmm0. */
9347 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9348 || INTEGRAL_TYPE_P (type)
9349 || VECTOR_FLOAT_TYPE_P (type))
9350 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9351 && !COMPLEX_MODE_P (mode)
9352 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9353 return false;
9355 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9356 return size != 1 && size != 2 && size != 4 && size != 8;
9358 else
9360 int needed_intregs, needed_sseregs;
9362 return examine_argument (mode, type, 1,
9363 &needed_intregs, &needed_sseregs);
9366 else
9368 size = int_size_in_bytes (type);
9370 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9371 bytes in registers. */
9372 if (TARGET_IAMCU)
9373 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9375 if (mode == BLKmode)
9376 return true;
9378 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9379 return false;
9381 if (VECTOR_MODE_P (mode) || mode == TImode)
9383 /* User-created vectors small enough to fit in EAX. */
9384 if (size < 8)
9385 return false;
9387 /* Unless the ABI prescribes otherwise,
9388 MMX/3dNow! values are returned in MM0 if available. */
9390 if (size == 8)
9391 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9393 /* SSE values are returned in XMM0 if available. */
9394 if (size == 16)
9395 return !TARGET_SSE;
9397 /* AVX values are returned in YMM0 if available. */
9398 if (size == 32)
9399 return !TARGET_AVX;
9401 /* AVX512F values are returned in ZMM0 if available. */
9402 if (size == 64)
9403 return !TARGET_AVX512F;
9406 if (mode == XFmode)
9407 return false;
9409 if (size > 12)
9410 return true;
9412 /* OImode shouldn't be used directly. */
9413 gcc_assert (mode != OImode);
9415 return false;
9417 #endif
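/* Return-in-memory illustration (a sketch for the 64-bit SysV case):

     struct small { long a, b; };    returned in %rax:%rdx
     struct big { long a, b, c; };   returned in memory (24 bytes)
     __m128                          returned in %xmm0 (with SSE enabled)

   The 64-bit decision reduces to whether examine_argument can find enough
   return registers; the 32-bit path uses the explicit size/mode checks
   above instead.  */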
9421 /* Create the va_list data type. */
9423 static tree
9424 ix86_build_builtin_va_list_64 (void)
9426 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9428 record = lang_hooks.types.make_type (RECORD_TYPE);
9429 type_decl = build_decl (BUILTINS_LOCATION,
9430 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9432 f_gpr = build_decl (BUILTINS_LOCATION,
9433 FIELD_DECL, get_identifier ("gp_offset"),
9434 unsigned_type_node);
9435 f_fpr = build_decl (BUILTINS_LOCATION,
9436 FIELD_DECL, get_identifier ("fp_offset"),
9437 unsigned_type_node);
9438 f_ovf = build_decl (BUILTINS_LOCATION,
9439 FIELD_DECL, get_identifier ("overflow_arg_area"),
9440 ptr_type_node);
9441 f_sav = build_decl (BUILTINS_LOCATION,
9442 FIELD_DECL, get_identifier ("reg_save_area"),
9443 ptr_type_node);
9445 va_list_gpr_counter_field = f_gpr;
9446 va_list_fpr_counter_field = f_fpr;
9448 DECL_FIELD_CONTEXT (f_gpr) = record;
9449 DECL_FIELD_CONTEXT (f_fpr) = record;
9450 DECL_FIELD_CONTEXT (f_ovf) = record;
9451 DECL_FIELD_CONTEXT (f_sav) = record;
9453 TYPE_STUB_DECL (record) = type_decl;
9454 TYPE_NAME (record) = type_decl;
9455 TYPE_FIELDS (record) = f_gpr;
9456 DECL_CHAIN (f_gpr) = f_fpr;
9457 DECL_CHAIN (f_fpr) = f_ovf;
9458 DECL_CHAIN (f_ovf) = f_sav;
9460 layout_type (record);
9462 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9463 NULL_TREE, TYPE_ATTRIBUTES (record));
9465 /* The correct type is an array type of one element. */
9466 return build_array_type (record, build_index_type (size_zero_node));
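/* For reference, the record built above corresponds to the familiar SysV
   AMD64 va_list layout (sketch only; the real type is constructed through
   the tree machinery above):

     typedef struct __va_list_tag {
       unsigned int gp_offset;     // offset into reg_save_area for GPR args
       unsigned int fp_offset;     // offset into reg_save_area for SSE args
       void *overflow_arg_area;    // next stack (memory) argument
       void *reg_save_area;        // start of the register save area
     } __builtin_va_list[1];       // array of one element, as returned here  */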
9469 /* Set up the builtin va_list data type and, for 64-bit, the additional
9470 calling-convention-specific va_list data types. */
9472 static tree
9473 ix86_build_builtin_va_list (void)
9475 if (TARGET_64BIT)
9477 /* Initialize ABI specific va_list builtin types.
9479 In lto1, we can encounter two va_list types:
9480 - one as a result of the type-merge across TUs, and
9481 - the one constructed here.
9482 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9483 a type identity check in canonical_va_list_type based on
9484 TYPE_MAIN_VARIANT (which we used to have) will not work.
9485 Instead, we tag each va_list_type_node with its unique attribute, and
9486 look for the attribute in the type identity check in
9487 canonical_va_list_type.
9489 Tagging sysv_va_list_type_node directly with the attribute is
9490 problematic since it's an array of one record, which will degrade into a
9491 pointer to record when used as a parameter (see build_va_arg comments for
9492 an example), dropping the attribute in the process. So we tag the
9493 record instead. */
9495 /* For SYSV_ABI we use an array of one record. */
9496 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9498 /* For MS_ABI we use plain pointer to argument area. */
9499 tree char_ptr_type = build_pointer_type (char_type_node);
9500 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9501 TYPE_ATTRIBUTES (char_ptr_type));
9502 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9504 return ((ix86_abi == MS_ABI)
9505 ? ms_va_list_type_node
9506 : sysv_va_list_type_node);
9508 else
9510 /* For i386 we use plain pointer to argument area. */
9511 return build_pointer_type (char_type_node);
9515 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9517 static void
9518 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9520 rtx save_area, mem;
9521 alias_set_type set;
9522 int i, max;
9524 /* GPR size of varargs save area. */
9525 if (cfun->va_list_gpr_size)
9526 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9527 else
9528 ix86_varargs_gpr_size = 0;
9530 /* FPR size of varargs save area. We don't need it if we don't pass
9531 anything in SSE registers. */
9532 if (TARGET_SSE && cfun->va_list_fpr_size)
9533 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9534 else
9535 ix86_varargs_fpr_size = 0;
9537 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9538 return;
9540 save_area = frame_pointer_rtx;
9541 set = get_varargs_alias_set ();
9543 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9544 if (max > X86_64_REGPARM_MAX)
9545 max = X86_64_REGPARM_MAX;
9547 for (i = cum->regno; i < max; i++)
9549 mem = gen_rtx_MEM (word_mode,
9550 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9551 MEM_NOTRAP_P (mem) = 1;
9552 set_mem_alias_set (mem, set);
9553 emit_move_insn (mem,
9554 gen_rtx_REG (word_mode,
9555 x86_64_int_parameter_registers[i]));
9558 if (ix86_varargs_fpr_size)
9560 machine_mode smode;
9561 rtx_code_label *label;
9562 rtx test;
9564 /* Now emit code to save SSE registers. The AX parameter contains number
9565 of SSE parameter registers used to call this function, though all we
9566 actually check here is the zero/non-zero status. */
9568 label = gen_label_rtx ();
9569 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9570 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9571 label));
9573 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9574 we used movdqa (i.e. TImode) instead? Perhaps even better would
9575 be if we could determine the real mode of the data, via a hook
9576 into pass_stdarg. Ignore all that for now. */
9577 smode = V4SFmode;
9578 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9579 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9581 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9582 if (max > X86_64_SSE_REGPARM_MAX)
9583 max = X86_64_SSE_REGPARM_MAX;
9585 for (i = cum->sse_regno; i < max; ++i)
9587 mem = plus_constant (Pmode, save_area,
9588 i * 16 + ix86_varargs_gpr_size);
9589 mem = gen_rtx_MEM (smode, mem);
9590 MEM_NOTRAP_P (mem) = 1;
9591 set_mem_alias_set (mem, set);
9592 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9594 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9597 emit_label (label);
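/* Resulting register save area layout (illustrative; offsets are relative
   to the save area laid out by the code above, assuming all of it is
   needed):

     [  0 ..  47]  rdi, rsi, rdx, rcx, r8, r9   (6 * 8 bytes, GPR args)
     [ 48 .. 175]  xmm0 .. xmm7                 (8 * 16 bytes, if SSE)

   gp_offset/fp_offset in the va_list index into this block.  */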
9601 static void
9602 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9604 alias_set_type set = get_varargs_alias_set ();
9605 int i;
9607 /* Reset to zero, as there might be a sysv va_arg used
9608 before. */
9609 ix86_varargs_gpr_size = 0;
9610 ix86_varargs_fpr_size = 0;
9612 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9614 rtx reg, mem;
9616 mem = gen_rtx_MEM (Pmode,
9617 plus_constant (Pmode, virtual_incoming_args_rtx,
9618 i * UNITS_PER_WORD));
9619 MEM_NOTRAP_P (mem) = 1;
9620 set_mem_alias_set (mem, set);
9622 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9623 emit_move_insn (mem, reg);
9627 static void
9628 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9629 tree type, int *, int no_rtl)
9631 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9632 CUMULATIVE_ARGS next_cum;
9633 tree fntype;
9635 /* This argument doesn't appear to be used anymore. Which is good,
9636 because the old code here didn't suppress rtl generation. */
9637 gcc_assert (!no_rtl);
9639 if (!TARGET_64BIT)
9640 return;
9642 fntype = TREE_TYPE (current_function_decl);
9644 /* For varargs, we do not want to skip the dummy va_dcl argument.
9645 For stdargs, we do want to skip the last named argument. */
9646 next_cum = *cum;
9647 if (stdarg_p (fntype))
9648 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9649 true);
9651 if (cum->call_abi == MS_ABI)
9652 setup_incoming_varargs_ms_64 (&next_cum);
9653 else
9654 setup_incoming_varargs_64 (&next_cum);
9657 static void
9658 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9659 machine_mode mode,
9660 tree type,
9661 int *pretend_size ATTRIBUTE_UNUSED,
9662 int no_rtl)
9664 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9665 CUMULATIVE_ARGS next_cum;
9666 tree fntype;
9667 rtx save_area;
9668 int bnd_reg, i, max;
9670 gcc_assert (!no_rtl);
9672 /* Do nothing if we use plain pointer to argument area. */
9673 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9674 return;
9676 fntype = TREE_TYPE (current_function_decl);
9678 /* For varargs, we do not want to skip the dummy va_dcl argument.
9679 For stdargs, we do want to skip the last named argument. */
9680 next_cum = *cum;
9681 if (stdarg_p (fntype))
9682 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9683 true);
9684 save_area = frame_pointer_rtx;
9686 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9687 if (max > X86_64_REGPARM_MAX)
9688 max = X86_64_REGPARM_MAX;
9690 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9691 if (chkp_function_instrumented_p (current_function_decl))
9692 for (i = cum->regno; i < max; i++)
9694 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9695 rtx ptr = gen_rtx_REG (Pmode,
9696 x86_64_int_parameter_registers[i]);
9697 rtx bounds;
9699 if (bnd_reg <= LAST_BND_REG)
9700 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9701 else
9703 rtx ldx_addr =
9704 plus_constant (Pmode, arg_pointer_rtx,
9705 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9706 bounds = gen_reg_rtx (BNDmode);
9707 emit_insn (BNDmode == BND64mode
9708 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9709 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9712 emit_insn (BNDmode == BND64mode
9713 ? gen_bnd64_stx (addr, ptr, bounds)
9714 : gen_bnd32_stx (addr, ptr, bounds));
9716 bnd_reg++;
9721 /* Check whether TYPE is the plain char * kind of va_list. */
9723 static bool
9724 is_va_list_char_pointer (tree type)
9726 tree canonic;
9728 /* For 32-bit it is always true. */
9729 if (!TARGET_64BIT)
9730 return true;
9731 canonic = ix86_canonical_va_list_type (type);
9732 return (canonic == ms_va_list_type_node
9733 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9736 /* Implement va_start. */
9738 static void
9739 ix86_va_start (tree valist, rtx nextarg)
9741 HOST_WIDE_INT words, n_gpr, n_fpr;
9742 tree f_gpr, f_fpr, f_ovf, f_sav;
9743 tree gpr, fpr, ovf, sav, t;
9744 tree type;
9745 rtx ovf_rtx;
9747 if (flag_split_stack
9748 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9750 unsigned int scratch_regno;
9752 /* When we are splitting the stack, we can't refer to the stack
9753 arguments using internal_arg_pointer, because they may be on
9754 the old stack. The split stack prologue will arrange to
9755 leave a pointer to the old stack arguments in a scratch
9756 register, which we here copy to a pseudo-register. The split
9757 stack prologue can't set the pseudo-register directly because
9758 it (the prologue) runs before any registers have been saved. */
9760 scratch_regno = split_stack_prologue_scratch_regno ();
9761 if (scratch_regno != INVALID_REGNUM)
9763 rtx reg;
9764 rtx_insn *seq;
9766 reg = gen_reg_rtx (Pmode);
9767 cfun->machine->split_stack_varargs_pointer = reg;
9769 start_sequence ();
9770 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9771 seq = get_insns ();
9772 end_sequence ();
9774 push_topmost_sequence ();
9775 emit_insn_after (seq, entry_of_function ());
9776 pop_topmost_sequence ();
9780 /* Only 64bit target needs something special. */
9781 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9783 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9784 std_expand_builtin_va_start (valist, nextarg);
9785 else
9787 rtx va_r, next;
9789 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9790 next = expand_binop (ptr_mode, add_optab,
9791 cfun->machine->split_stack_varargs_pointer,
9792 crtl->args.arg_offset_rtx,
9793 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9794 convert_move (va_r, next, 0);
9796 /* Store zero bounds for va_list. */
9797 if (chkp_function_instrumented_p (current_function_decl))
9798 chkp_expand_bounds_reset_for_mem (valist,
9799 make_tree (TREE_TYPE (valist),
9800 next));
9803 return;
9806 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9807 f_fpr = DECL_CHAIN (f_gpr);
9808 f_ovf = DECL_CHAIN (f_fpr);
9809 f_sav = DECL_CHAIN (f_ovf);
9811 valist = build_simple_mem_ref (valist);
9812 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9813 /* The following should be folded into the MEM_REF offset. */
9814 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9815 f_gpr, NULL_TREE);
9816 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9817 f_fpr, NULL_TREE);
9818 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9819 f_ovf, NULL_TREE);
9820 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9821 f_sav, NULL_TREE);
9823 /* Count number of gp and fp argument registers used. */
9824 words = crtl->args.info.words;
9825 n_gpr = crtl->args.info.regno;
9826 n_fpr = crtl->args.info.sse_regno;
9828 if (cfun->va_list_gpr_size)
9830 type = TREE_TYPE (gpr);
9831 t = build2 (MODIFY_EXPR, type,
9832 gpr, build_int_cst (type, n_gpr * 8));
9833 TREE_SIDE_EFFECTS (t) = 1;
9834 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9837 if (TARGET_SSE && cfun->va_list_fpr_size)
9839 type = TREE_TYPE (fpr);
9840 t = build2 (MODIFY_EXPR, type, fpr,
9841 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9842 TREE_SIDE_EFFECTS (t) = 1;
9843 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9846 /* Find the overflow area. */
9847 type = TREE_TYPE (ovf);
9848 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9849 ovf_rtx = crtl->args.internal_arg_pointer;
9850 else
9851 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9852 t = make_tree (type, ovf_rtx);
9853 if (words != 0)
9854 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9856 /* Store zero bounds for overflow area pointer. */
9857 if (chkp_function_instrumented_p (current_function_decl))
9858 chkp_expand_bounds_reset_for_mem (ovf, t);
9860 t = build2 (MODIFY_EXPR, type, ovf, t);
9861 TREE_SIDE_EFFECTS (t) = 1;
9862 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9864 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9866 /* Find the register save area.
9867 The function prologue saves it right above the stack frame. */
9868 type = TREE_TYPE (sav);
9869 t = make_tree (type, frame_pointer_rtx);
9870 if (!ix86_varargs_gpr_size)
9871 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9873 /* Store zero bounds for save area pointer. */
9874 if (chkp_function_instrumented_p (current_function_decl))
9875 chkp_expand_bounds_reset_for_mem (sav, t);
9877 t = build2 (MODIFY_EXPR, type, sav, t);
9878 TREE_SIDE_EFFECTS (t) = 1;
9879 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
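/* Net effect of the expansion above, written as pseudo C (sketch only):

     va->gp_offset = n_named_gp_regs * 8;          // if va_list_gpr_size
     va->fp_offset = 48 + n_named_sse_regs * 16;   // if TARGET_SSE
     va->overflow_arg_area = incoming arg pointer + named stack words * 8;
     va->reg_save_area = register save area set up in the prologue;  */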
9883 /* Implement va_arg. */
9885 static tree
9886 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9887 gimple_seq *post_p)
9889 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9890 tree f_gpr, f_fpr, f_ovf, f_sav;
9891 tree gpr, fpr, ovf, sav, t;
9892 int size, rsize;
9893 tree lab_false, lab_over = NULL_TREE;
9894 tree addr, t2;
9895 rtx container;
9896 int indirect_p = 0;
9897 tree ptrtype;
9898 machine_mode nat_mode;
9899 unsigned int arg_boundary;
9901 /* Only 64bit target needs something special. */
9902 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9903 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9905 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9906 f_fpr = DECL_CHAIN (f_gpr);
9907 f_ovf = DECL_CHAIN (f_fpr);
9908 f_sav = DECL_CHAIN (f_ovf);
9910 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9911 valist, f_gpr, NULL_TREE);
9913 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9914 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9915 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9917 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9918 if (indirect_p)
9919 type = build_pointer_type (type);
9920 size = arg_int_size_in_bytes (type);
9921 rsize = CEIL (size, UNITS_PER_WORD);
9923 nat_mode = type_natural_mode (type, NULL, false);
9924 switch (nat_mode)
9926 case E_V8SFmode:
9927 case E_V8SImode:
9928 case E_V32QImode:
9929 case E_V16HImode:
9930 case E_V4DFmode:
9931 case E_V4DImode:
9932 case E_V16SFmode:
9933 case E_V16SImode:
9934 case E_V64QImode:
9935 case E_V32HImode:
9936 case E_V8DFmode:
9937 case E_V8DImode:
9938 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9939 if (!TARGET_64BIT_MS_ABI)
9941 container = NULL;
9942 break;
9944 /* FALLTHRU */
9946 default:
9947 container = construct_container (nat_mode, TYPE_MODE (type),
9948 type, 0, X86_64_REGPARM_MAX,
9949 X86_64_SSE_REGPARM_MAX, intreg,
9951 break;
9954 /* Pull the value out of the saved registers. */
9956 addr = create_tmp_var (ptr_type_node, "addr");
9958 if (container)
9960 int needed_intregs, needed_sseregs;
9961 bool need_temp;
9962 tree int_addr, sse_addr;
9964 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9965 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9967 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9969 need_temp = (!REG_P (container)
9970 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9971 || TYPE_ALIGN (type) > 128));
9973 /* In case we are passing a structure, verify that it is a consecutive block
9974 on the register save area. If not, we need to do moves. */
9975 if (!need_temp && !REG_P (container))
9977 /* Verify that all registers are strictly consecutive. */
9978 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9980 int i;
9982 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9984 rtx slot = XVECEXP (container, 0, i);
9985 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9986 || INTVAL (XEXP (slot, 1)) != i * 16)
9987 need_temp = true;
9990 else
9992 int i;
9994 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9996 rtx slot = XVECEXP (container, 0, i);
9997 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
9998 || INTVAL (XEXP (slot, 1)) != i * 8)
9999 need_temp = true;
10003 if (!need_temp)
10005 int_addr = addr;
10006 sse_addr = addr;
10008 else
10010 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10011 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10014 /* First ensure that we fit completely in registers. */
10015 if (needed_intregs)
10017 t = build_int_cst (TREE_TYPE (gpr),
10018 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10019 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10020 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10021 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10022 gimplify_and_add (t, pre_p);
10024 if (needed_sseregs)
10026 t = build_int_cst (TREE_TYPE (fpr),
10027 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10028 + X86_64_REGPARM_MAX * 8);
10029 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10030 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10031 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10032 gimplify_and_add (t, pre_p);
10035 /* Compute index to start of area used for integer regs. */
10036 if (needed_intregs)
10038 /* int_addr = gpr + sav; */
10039 t = fold_build_pointer_plus (sav, gpr);
10040 gimplify_assign (int_addr, t, pre_p);
10042 if (needed_sseregs)
10044 /* sse_addr = fpr + sav; */
10045 t = fold_build_pointer_plus (sav, fpr);
10046 gimplify_assign (sse_addr, t, pre_p);
10048 if (need_temp)
10050 int i, prev_size = 0;
10051 tree temp = create_tmp_var (type, "va_arg_tmp");
10053 /* addr = &temp; */
10054 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10055 gimplify_assign (addr, t, pre_p);
10057 for (i = 0; i < XVECLEN (container, 0); i++)
10059 rtx slot = XVECEXP (container, 0, i);
10060 rtx reg = XEXP (slot, 0);
10061 machine_mode mode = GET_MODE (reg);
10062 tree piece_type;
10063 tree addr_type;
10064 tree daddr_type;
10065 tree src_addr, src;
10066 int src_offset;
10067 tree dest_addr, dest;
10068 int cur_size = GET_MODE_SIZE (mode);
10070 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10071 prev_size = INTVAL (XEXP (slot, 1));
10072 if (prev_size + cur_size > size)
10074 cur_size = size - prev_size;
10075 unsigned int nbits = cur_size * BITS_PER_UNIT;
10076 if (!int_mode_for_size (nbits, 1).exists (&mode))
10077 mode = QImode;
10079 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10080 if (mode == GET_MODE (reg))
10081 addr_type = build_pointer_type (piece_type);
10082 else
10083 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10084 true);
10085 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10086 true);
10088 if (SSE_REGNO_P (REGNO (reg)))
10090 src_addr = sse_addr;
10091 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10093 else
10095 src_addr = int_addr;
10096 src_offset = REGNO (reg) * 8;
10098 src_addr = fold_convert (addr_type, src_addr);
10099 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10101 dest_addr = fold_convert (daddr_type, addr);
10102 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10103 if (cur_size == GET_MODE_SIZE (mode))
10105 src = build_va_arg_indirect_ref (src_addr);
10106 dest = build_va_arg_indirect_ref (dest_addr);
10108 gimplify_assign (dest, src, pre_p);
10110 else
10112 tree copy
10113 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10114 3, dest_addr, src_addr,
10115 size_int (cur_size));
10116 gimplify_and_add (copy, pre_p);
10118 prev_size += cur_size;
10122 if (needed_intregs)
10124 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10125 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10126 gimplify_assign (gpr, t, pre_p);
10129 if (needed_sseregs)
10131 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10132 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10133 gimplify_assign (unshare_expr (fpr), t, pre_p);
10136 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10138 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10141 /* ... otherwise out of the overflow area. */
10143 /* When we align a parameter on the stack for the caller, if the parameter
10144 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10145 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
10146 here with the caller. */
10147 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10148 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10149 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10151 /* Care for on-stack alignment if needed. */
10152 if (arg_boundary <= 64 || size == 0)
10153 t = ovf;
10154 else
10156 HOST_WIDE_INT align = arg_boundary / 8;
10157 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10158 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10159 build_int_cst (TREE_TYPE (t), -align));
10162 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10163 gimplify_assign (addr, t, pre_p);
10165 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10166 gimplify_assign (unshare_expr (ovf), t, pre_p);
10168 if (container)
10169 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10171 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10172 addr = fold_convert (ptrtype, addr);
10174 if (indirect_p)
10175 addr = build_va_arg_indirect_ref (addr);
10176 return build_va_arg_indirect_ref (addr);
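/* Rough shape of the GIMPLE emitted above for a simple integer argument
   (a sketch, assuming a single GPR is needed):

     if (va->gp_offset >= 6 * 8) goto overflow;   // no register slot left
     addr = va->reg_save_area + va->gp_offset;
     va->gp_offset += 8;
     goto done;
   overflow:
     addr = align (va->overflow_arg_area, boundary);
     va->overflow_arg_area = addr + rsize * 8;
   done:
     result = *(type *) addr;  */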
10179 /* Return true if OPNUM's MEM should be matched
10180 in movabs* patterns. */
10182 bool
10183 ix86_check_movabs (rtx insn, int opnum)
10185 rtx set, mem;
10187 set = PATTERN (insn);
10188 if (GET_CODE (set) == PARALLEL)
10189 set = XVECEXP (set, 0, 0);
10190 gcc_assert (GET_CODE (set) == SET);
10191 mem = XEXP (set, opnum);
10192 while (SUBREG_P (mem))
10193 mem = SUBREG_REG (mem);
10194 gcc_assert (MEM_P (mem));
10195 return volatile_ok || !MEM_VOLATILE_P (mem);
10198 /* Return false if INSN contains a MEM with a non-default address space. */
10199 bool
10200 ix86_check_no_addr_space (rtx insn)
10202 subrtx_var_iterator::array_type array;
10203 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10205 rtx x = *iter;
10206 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10207 return false;
10209 return true;
10212 /* Initialize the table of extra 80387 mathematical constants. */
10214 static void
10215 init_ext_80387_constants (void)
10217 static const char * cst[5] =
10219 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10220 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10221 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10222 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10223 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10225 int i;
10227 for (i = 0; i < 5; i++)
10229 real_from_string (&ext_80387_constants_table[i], cst[i]);
10230 /* Ensure each constant is rounded to XFmode precision. */
10231 real_convert (&ext_80387_constants_table[i],
10232 XFmode, &ext_80387_constants_table[i]);
10235 ext_80387_constants_init = 1;
10238 /* Return non-zero if the constant is something that
10239 can be loaded with a special instruction. */
10242 standard_80387_constant_p (rtx x)
10244 machine_mode mode = GET_MODE (x);
10246 const REAL_VALUE_TYPE *r;
10248 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10249 return -1;
10251 if (x == CONST0_RTX (mode))
10252 return 1;
10253 if (x == CONST1_RTX (mode))
10254 return 2;
10256 r = CONST_DOUBLE_REAL_VALUE (x);
10258 /* For XFmode constants, try to find a special 80387 instruction when
10259 optimizing for size or on those CPUs that benefit from them. */
10260 if (mode == XFmode
10261 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10263 int i;
10265 if (! ext_80387_constants_init)
10266 init_ext_80387_constants ();
10268 for (i = 0; i < 5; i++)
10269 if (real_identical (r, &ext_80387_constants_table[i]))
10270 return i + 3;
10273 /* A load of the constant -0.0 or -1.0 will be split into an
10274 fldz;fchs or fld1;fchs sequence. */
10275 if (real_isnegzero (r))
10276 return 8;
10277 if (real_identical (r, &dconstm1))
10278 return 9;
10280 return 0;
10283 /* Return the opcode of the special instruction to be used to load
10284 the constant X. */
10286 const char *
10287 standard_80387_constant_opcode (rtx x)
10289 switch (standard_80387_constant_p (x))
10291 case 1:
10292 return "fldz";
10293 case 2:
10294 return "fld1";
10295 case 3:
10296 return "fldlg2";
10297 case 4:
10298 return "fldln2";
10299 case 5:
10300 return "fldl2e";
10301 case 6:
10302 return "fldl2t";
10303 case 7:
10304 return "fldpi";
10305 case 8:
10306 case 9:
10307 return "#";
10308 default:
10309 gcc_unreachable ();
10313 /* Return the CONST_DOUBLE representing the 80387 constant that is
10314 loaded by the specified special instruction. The argument IDX
10315 matches the return value from standard_80387_constant_p. */
10318 standard_80387_constant_rtx (int idx)
10320 int i;
10322 if (! ext_80387_constants_init)
10323 init_ext_80387_constants ();
10325 switch (idx)
10327 case 3:
10328 case 4:
10329 case 5:
10330 case 6:
10331 case 7:
10332 i = idx - 3;
10333 break;
10335 default:
10336 gcc_unreachable ();
10339 return const_double_from_real_value (ext_80387_constants_table[i],
10340 XFmode);
10343 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
10344 in supported SSE/AVX vector mode. */
10347 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10349 machine_mode mode;
10351 if (!TARGET_SSE)
10352 return 0;
10354 mode = GET_MODE (x);
10356 if (x == const0_rtx || const0_operand (x, mode))
10357 return 1;
10359 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10361 /* VOIDmode integer constant, get mode from the predicate. */
10362 if (mode == VOIDmode)
10363 mode = pred_mode;
10365 switch (GET_MODE_SIZE (mode))
10367 case 64:
10368 if (TARGET_AVX512F)
10369 return 2;
10370 break;
10371 case 32:
10372 if (TARGET_AVX2)
10373 return 2;
10374 break;
10375 case 16:
10376 if (TARGET_SSE2)
10377 return 2;
10378 break;
10379 case 0:
10380 /* VOIDmode */
10381 gcc_unreachable ();
10382 default:
10383 break;
10387 return 0;
10390 /* Return the opcode of the special instruction to be used to load
10391 the constant operands[1] into operands[0]. */
10393 const char *
10394 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10396 machine_mode mode;
10397 rtx x = operands[1];
10399 gcc_assert (TARGET_SSE);
10401 mode = GET_MODE (x);
10403 if (x == const0_rtx || const0_operand (x, mode))
10405 switch (get_attr_mode (insn))
10407 case MODE_TI:
10408 if (!EXT_REX_SSE_REG_P (operands[0]))
10409 return "%vpxor\t%0, %d0";
10410 /* FALLTHRU */
10411 case MODE_XI:
10412 case MODE_OI:
10413 if (EXT_REX_SSE_REG_P (operands[0]))
10414 return (TARGET_AVX512VL
10415 ? "vpxord\t%x0, %x0, %x0"
10416 : "vpxord\t%g0, %g0, %g0");
10417 return "vpxor\t%x0, %x0, %x0";
10419 case MODE_V2DF:
10420 if (!EXT_REX_SSE_REG_P (operands[0]))
10421 return "%vxorpd\t%0, %d0";
10422 /* FALLTHRU */
10423 case MODE_V8DF:
10424 case MODE_V4DF:
10425 if (!EXT_REX_SSE_REG_P (operands[0]))
10426 return "vxorpd\t%x0, %x0, %x0";
10427 else if (TARGET_AVX512DQ)
10428 return (TARGET_AVX512VL
10429 ? "vxorpd\t%x0, %x0, %x0"
10430 : "vxorpd\t%g0, %g0, %g0");
10431 else
10432 return (TARGET_AVX512VL
10433 ? "vpxorq\t%x0, %x0, %x0"
10434 : "vpxorq\t%g0, %g0, %g0");
10436 case MODE_V4SF:
10437 if (!EXT_REX_SSE_REG_P (operands[0]))
10438 return "%vxorps\t%0, %d0";
10439 /* FALLTHRU */
10440 case MODE_V16SF:
10441 case MODE_V8SF:
10442 if (!EXT_REX_SSE_REG_P (operands[0]))
10443 return "vxorps\t%x0, %x0, %x0";
10444 else if (TARGET_AVX512DQ)
10445 return (TARGET_AVX512VL
10446 ? "vxorps\t%x0, %x0, %x0"
10447 : "vxorps\t%g0, %g0, %g0");
10448 else
10449 return (TARGET_AVX512VL
10450 ? "vpxord\t%x0, %x0, %x0"
10451 : "vpxord\t%g0, %g0, %g0");
10453 default:
10454 gcc_unreachable ();
10457 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10459 enum attr_mode insn_mode = get_attr_mode (insn);
10461 switch (insn_mode)
10463 case MODE_XI:
10464 case MODE_V8DF:
10465 case MODE_V16SF:
10466 gcc_assert (TARGET_AVX512F);
10467 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10469 case MODE_OI:
10470 case MODE_V4DF:
10471 case MODE_V8SF:
10472 gcc_assert (TARGET_AVX2);
10473 /* FALLTHRU */
10474 case MODE_TI:
10475 case MODE_V2DF:
10476 case MODE_V4SF:
10477 gcc_assert (TARGET_SSE2);
10478 if (!EXT_REX_SSE_REG_P (operands[0]))
10479 return (TARGET_AVX
10480 ? "vpcmpeqd\t%0, %0, %0"
10481 : "pcmpeqd\t%0, %0");
10482 else if (TARGET_AVX512VL)
10483 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10484 else
10485 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10487 default:
10488 gcc_unreachable ();
10492 gcc_unreachable ();
10495 /* Returns true if INSN can be transformed from a memory load
10496 to a supported FP constant load. */
10498 bool
10499 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10501 rtx src = find_constant_src (insn);
10503 gcc_assert (REG_P (dst));
10505 if (src == NULL
10506 || (SSE_REGNO_P (REGNO (dst))
10507 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10508 || (STACK_REGNO_P (REGNO (dst))
10509 && standard_80387_constant_p (src) < 1))
10510 return false;
10512 return true;
10515 /* Returns true if OP contains a symbol reference. */
10517 bool
10518 symbolic_reference_mentioned_p (rtx op)
10520 const char *fmt;
10521 int i;
10523 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10524 return true;
10526 fmt = GET_RTX_FORMAT (GET_CODE (op));
10527 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10529 if (fmt[i] == 'E')
10531 int j;
10533 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10534 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10535 return true;
10538 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10539 return true;
10542 return false;
10545 /* Return true if it is appropriate to emit `ret' instructions in the
10546 body of a function. Do this only if the epilogue is simple, needing a
10547 couple of insns. Prior to reloading, we can't tell how many registers
10548 must be saved, so return false then. Return false if there is no frame
10549 marker to de-allocate. */
10551 bool
10552 ix86_can_use_return_insn_p (void)
10554 if (ix86_function_naked (current_function_decl))
10555 return false;
10557 /* Don't use `ret' instruction in interrupt handler. */
10558 if (! reload_completed
10559 || frame_pointer_needed
10560 || cfun->machine->func_type != TYPE_NORMAL)
10561 return 0;
10563 /* Don't allow more than 32k pop, since that's all we can do
10564 with one instruction. */
10565 if (crtl->args.pops_args && crtl->args.size >= 32768)
10566 return 0;
10568 struct ix86_frame &frame = cfun->machine->frame;
10569 return (frame.stack_pointer_offset == UNITS_PER_WORD
10570 && (frame.nregs + frame.nsseregs) == 0);
10573 /* Value should be nonzero if functions must have frame pointers.
10574 Zero means the frame pointer need not be set up (and parms may
10575 be accessed via the stack pointer) in functions that seem suitable. */
10577 static bool
10578 ix86_frame_pointer_required (void)
10580 /* If we accessed previous frames, then the generated code expects
10581 to be able to access the saved ebp value in our frame. */
10582 if (cfun->machine->accesses_prev_frame)
10583 return true;
10585 /* Several x86 OSes need a frame pointer for other reasons,
10586 usually pertaining to setjmp. */
10587 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10588 return true;
10590 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10591 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10592 return true;
10594 /* Win64 SEH: very large frames need a frame pointer, as the maximum stack
10595 allocation is 4GB. */
10596 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10597 return true;
10599 /* SSE saves require a frame pointer when the stack is misaligned. */
10600 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10601 return true;
10603 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10604 turns off the frame pointer by default. Turn it back on now if
10605 we've not got a leaf function. */
10606 if (TARGET_OMIT_LEAF_FRAME_POINTER
10607 && (!crtl->is_leaf
10608 || ix86_current_function_calls_tls_descriptor))
10609 return true;
10611 if (crtl->profile && !flag_fentry)
10612 return true;
10614 return false;
10617 /* Record that the current function accesses previous call frames. */
10619 void
10620 ix86_setup_frame_addresses (void)
10622 cfun->machine->accesses_prev_frame = 1;
10625 #ifndef USE_HIDDEN_LINKONCE
10626 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10627 # define USE_HIDDEN_LINKONCE 1
10628 # else
10629 # define USE_HIDDEN_LINKONCE 0
10630 # endif
10631 #endif
10633 static int pic_labels_used;
10635 /* Fills in the label name that should be used for a pc thunk for
10636 the given register. */
10638 static void
10639 get_pc_thunk_name (char name[32], unsigned int regno)
10641 gcc_assert (!TARGET_64BIT);
10643 if (USE_HIDDEN_LINKONCE)
10644 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10645 else
10646 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
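/* E.g. for the %ebx thunk this yields "__x86.get_pc_thunk.bx" when hidden
   COMDAT sections are usable, and an internal LPR<regno> label otherwise
   (illustrative; the exact spelling comes from reg_names[] above).  */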
10650 /* This function generates code for -fpic that loads %ebx with
10651 the return address of the caller and then returns. */
10653 static void
10654 ix86_code_end (void)
10656 rtx xops[2];
10657 int regno;
10659 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10661 char name[32];
10662 tree decl;
10664 if (!(pic_labels_used & (1 << regno)))
10665 continue;
10667 get_pc_thunk_name (name, regno);
10669 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10670 get_identifier (name),
10671 build_function_type_list (void_type_node, NULL_TREE));
10672 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10673 NULL_TREE, void_type_node);
10674 TREE_PUBLIC (decl) = 1;
10675 TREE_STATIC (decl) = 1;
10676 DECL_IGNORED_P (decl) = 1;
10678 #if TARGET_MACHO
10679 if (TARGET_MACHO)
10681 switch_to_section (darwin_sections[picbase_thunk_section]);
10682 fputs ("\t.weak_definition\t", asm_out_file);
10683 assemble_name (asm_out_file, name);
10684 fputs ("\n\t.private_extern\t", asm_out_file);
10685 assemble_name (asm_out_file, name);
10686 putc ('\n', asm_out_file);
10687 ASM_OUTPUT_LABEL (asm_out_file, name);
10688 DECL_WEAK (decl) = 1;
10690 else
10691 #endif
10692 if (USE_HIDDEN_LINKONCE)
10694 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10696 targetm.asm_out.unique_section (decl, 0);
10697 switch_to_section (get_named_section (decl, NULL, 0));
10699 targetm.asm_out.globalize_label (asm_out_file, name);
10700 fputs ("\t.hidden\t", asm_out_file);
10701 assemble_name (asm_out_file, name);
10702 putc ('\n', asm_out_file);
10703 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10705 else
10707 switch_to_section (text_section);
10708 ASM_OUTPUT_LABEL (asm_out_file, name);
10711 DECL_INITIAL (decl) = make_node (BLOCK);
10712 current_function_decl = decl;
10713 allocate_struct_function (decl, false);
10714 init_function_start (decl);
10715 /* We're about to hide the function body from callees of final_* by
10716 emitting it directly; tell them we're a thunk, if they care. */
10717 cfun->is_thunk = true;
10718 first_function_block_is_cold = false;
10719 /* Make sure unwind info is emitted for the thunk if needed. */
10720 final_start_function (emit_barrier (), asm_out_file, 1);
10722 /* Pad stack IP move with 4 instructions (two NOPs count
10723 as one instruction). */
10724 if (TARGET_PAD_SHORT_FUNCTION)
10726 int i = 8;
10728 while (i--)
10729 fputs ("\tnop\n", asm_out_file);
10732 xops[0] = gen_rtx_REG (Pmode, regno);
10733 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10734 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10735 output_asm_insn ("%!ret", NULL);
10736 final_end_function ();
10737 init_insn_lengths ();
10738 free_after_compilation (cfun);
10739 set_cfun (NULL);
10740 current_function_decl = NULL;
10743 if (flag_split_stack)
10744 file_end_indicate_split_stack ();
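/* Each emitted thunk body is just (illustrative, 32-bit AT&T syntax):

     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx    # return address == address after call
             ret

   optionally preceded by nop padding when TARGET_PAD_SHORT_FUNCTION.  */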
10747 /* Emit code for the SET_GOT patterns. */
10749 const char *
10750 output_set_got (rtx dest, rtx label)
10752 rtx xops[3];
10754 xops[0] = dest;
10756 if (TARGET_VXWORKS_RTP && flag_pic)
10758 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10759 xops[2] = gen_rtx_MEM (Pmode,
10760 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10761 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10763 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10764 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10765 an unadorned address. */
10766 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10767 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10768 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10769 return "";
10772 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10774 if (flag_pic)
10776 char name[32];
10777 get_pc_thunk_name (name, REGNO (dest));
10778 pic_labels_used |= 1 << REGNO (dest);
10780 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10781 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10782 output_asm_insn ("%!call\t%X2", xops);
10784 #if TARGET_MACHO
10785 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10786 This is what will be referenced by the Mach-O PIC subsystem. */
10787 if (machopic_should_output_picbase_label () || !label)
10788 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10790 /* When we are restoring the pic base at the site of a nonlocal label,
10791 and we decided to emit the pic base above, we will still output a
10792 local label used for calculating the correction offset (even though
10793 the offset will be 0 in that case). */
10794 if (label)
10795 targetm.asm_out.internal_label (asm_out_file, "L",
10796 CODE_LABEL_NUMBER (label));
10797 #endif
10799 else
10801 if (TARGET_MACHO)
10802 /* We don't need a pic base, we're not producing pic. */
10803 gcc_unreachable ();
10805 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10806 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10807 targetm.asm_out.internal_label (asm_out_file, "L",
10808 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10811 if (!TARGET_MACHO)
10812 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10814 return "";
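/* Typical 32-bit PIC sequence produced by the templates above (a sketch):

     call    __x86.get_pc_thunk.bx
     addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   For non-PIC (and non-Mach-O) the thunk call is replaced by a mov from a
   local label, followed by the same add of _GLOBAL_OFFSET_TABLE_.  */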
10817 /* Generate a "push" pattern for input ARG. */
10819 static rtx
10820 gen_push (rtx arg)
10822 struct machine_function *m = cfun->machine;
10824 if (m->fs.cfa_reg == stack_pointer_rtx)
10825 m->fs.cfa_offset += UNITS_PER_WORD;
10826 m->fs.sp_offset += UNITS_PER_WORD;
10828 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10829 arg = gen_rtx_REG (word_mode, REGNO (arg));
10831 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10832 gen_rtx_PRE_DEC (Pmode,
10833 stack_pointer_rtx)),
10834 arg);
10837 /* Generate a "pop" pattern for input ARG. */
10839 static rtx
10840 gen_pop (rtx arg)
10842 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10843 arg = gen_rtx_REG (word_mode, REGNO (arg));
10845 return gen_rtx_SET (arg,
10846 gen_rtx_MEM (word_mode,
10847 gen_rtx_POST_INC (Pmode,
10848 stack_pointer_rtx)));
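/* The two helpers above build the usual stack RTL (illustrative):

     push:  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))
     pop:   (set (reg:DI arg) (mem:DI (post_inc:DI (reg:DI sp))))

   with DImode standing in for word_mode on 64-bit targets.  */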
10851 /* Return >= 0 if there is an unused call-clobbered register available
10852 for the entire function. */
10854 static unsigned int
10855 ix86_select_alt_pic_regnum (void)
10857 if (ix86_use_pseudo_pic_reg ())
10858 return INVALID_REGNUM;
10860 if (crtl->is_leaf
10861 && !crtl->profile
10862 && !ix86_current_function_calls_tls_descriptor)
10864 int i, drap;
10865 /* Can't use the same register for both PIC and DRAP. */
10866 if (crtl->drap_reg)
10867 drap = REGNO (crtl->drap_reg);
10868 else
10869 drap = -1;
10870 for (i = 2; i >= 0; --i)
10871 if (i != drap && !df_regs_ever_live_p (i))
10872 return i;
10875 return INVALID_REGNUM;
10878 /* Return true if REGNO is used by the epilogue. */
10880 bool
10881 ix86_epilogue_uses (int regno)
10883 /* If there are no caller-saved registers, we preserve all registers,
10884 except for MMX and x87 registers which aren't supported when saving
10885 and restoring registers. Don't explicitly save SP register since
10886 it is always preserved. */
10887 return (epilogue_completed
10888 && cfun->machine->no_caller_saved_registers
10889 && !fixed_regs[regno]
10890 && !STACK_REGNO_P (regno)
10891 && !MMX_REGNO_P (regno));
10894 /* Return nonzero if register REGNO can be used as a scratch register
10895 in peephole2. */
10897 static bool
10898 ix86_hard_regno_scratch_ok (unsigned int regno)
10900 /* If there are no caller-saved registers, we can't use any register
10901 as a scratch register after epilogue and use REGNO as scratch
10902 register only if it has been used before to avoid saving and
10903 restoring it. */
10904 return (!cfun->machine->no_caller_saved_registers
10905 || (!epilogue_completed
10906 && df_regs_ever_live_p (regno)));
10909 /* Return true if register class CL should be an additional allocno
10910 class. */
10912 static bool
10913 ix86_additional_allocno_class_p (reg_class_t cl)
10915 return cl == MOD4_SSE_REGS;
10918 /* Return TRUE if we need to save REGNO. */
10920 static bool
10921 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10923 /* If there are no caller-saved registers, we preserve all registers,
10924 except for MMX and x87 registers which aren't supported when saving
10925 and restoring registers. Don't explicitly save SP register since
10926 it is always preserved. */
10927 if (cfun->machine->no_caller_saved_registers)
10929 /* Don't preserve registers used for function return value. */
10930 rtx reg = crtl->return_rtx;
10931 if (reg)
10933 unsigned int i = REGNO (reg);
10934 unsigned int nregs = REG_NREGS (reg);
10935 while (nregs-- > 0)
10936 if ((i + nregs) == regno)
10937 return false;
10939 reg = crtl->return_bnd;
10940 if (reg)
10942 i = REGNO (reg);
10943 nregs = REG_NREGS (reg);
10944 while (nregs-- > 0)
10945 if ((i + nregs) == regno)
10946 return false;
10950 return (df_regs_ever_live_p (regno)
10951 && !fixed_regs[regno]
10952 && !STACK_REGNO_P (regno)
10953 && !MMX_REGNO_P (regno)
10954 && (regno != HARD_FRAME_POINTER_REGNUM
10955 || !frame_pointer_needed));
10958 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10959 && pic_offset_table_rtx)
10961 if (ix86_use_pseudo_pic_reg ())
10963 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10964 _mcount in prologue. */
10965 if (!TARGET_64BIT && flag_pic && crtl->profile)
10966 return true;
10968 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10969 || crtl->profile
10970 || crtl->calls_eh_return
10971 || crtl->uses_const_pool
10972 || cfun->has_nonlocal_label)
10973 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10976 if (crtl->calls_eh_return && maybe_eh_return)
10978 unsigned i;
10979 for (i = 0; ; i++)
10981 unsigned test = EH_RETURN_DATA_REGNO (i);
10982 if (test == INVALID_REGNUM)
10983 break;
10984 if (test == regno)
10985 return true;
10989 if (ignore_outlined && cfun->machine->call_ms2sysv)
10991 unsigned count = cfun->machine->call_ms2sysv_extra_regs
10992 + xlogue_layout::MIN_REGS;
10993 if (xlogue_layout::is_stub_managed_reg (regno, count))
10994 return false;
10997 if (crtl->drap_reg
10998 && regno == REGNO (crtl->drap_reg)
10999 && !cfun->machine->no_drap_save_restore)
11000 return true;
11002 return (df_regs_ever_live_p (regno)
11003 && !call_used_regs[regno]
11004 && !fixed_regs[regno]
11005 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11008 /* Return the number of saved general-purpose registers. */
11010 static int
11011 ix86_nsaved_regs (void)
11013 int nregs = 0;
11014 int regno;
11016 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11017 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11018 nregs ++;
11019 return nregs;
11022 /* Return number of saved SSE registers. */
11024 static int
11025 ix86_nsaved_sseregs (void)
11027 int nregs = 0;
11028 int regno;
11030 if (!TARGET_64BIT_MS_ABI)
11031 return 0;
11032 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11033 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11034 nregs ++;
11035 return nregs;
11038 /* Given FROM and TO register numbers, say whether this elimination is
11039 allowed. If stack alignment is needed, we can only replace argument
11040 pointer with hard frame pointer, or replace frame pointer with stack
11041 pointer. Otherwise, frame pointer elimination is automatically
11042 handled and all other eliminations are valid. */
11044 static bool
11045 ix86_can_eliminate (const int from, const int to)
11047 if (stack_realign_fp)
11048 return ((from == ARG_POINTER_REGNUM
11049 && to == HARD_FRAME_POINTER_REGNUM)
11050 || (from == FRAME_POINTER_REGNUM
11051 && to == STACK_POINTER_REGNUM));
11052 else
11053 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11056 /* Return the offset between two registers, one to be eliminated, and the other
11057 its replacement, at the start of a routine. */
11059 HOST_WIDE_INT
11060 ix86_initial_elimination_offset (int from, int to)
11062 struct ix86_frame &frame = cfun->machine->frame;
11064 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11065 return frame.hard_frame_pointer_offset;
11066 else if (from == FRAME_POINTER_REGNUM
11067 && to == HARD_FRAME_POINTER_REGNUM)
11068 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11069 else
11071 gcc_assert (to == STACK_POINTER_REGNUM);
11073 if (from == ARG_POINTER_REGNUM)
11074 return frame.stack_pointer_offset;
11076 gcc_assert (from == FRAME_POINTER_REGNUM);
11077 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11081 /* In a dynamically-aligned function, we can't know the offset from
11082 stack pointer to frame pointer, so we must ensure that setjmp
11083 eliminates fp against the hard fp (%ebp) rather than trying to
11084 index from %esp up to the top of the frame across a gap that is
11085 of unknown (at compile-time) size. */
11086 static rtx
11087 ix86_builtin_setjmp_frame_value (void)
11089 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11092 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11093 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11095 static bool warned_once = false;
11096 if (!warned_once)
11098 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11099 feature);
11100 warned_once = true;
11104 /* When using -fsplit-stack, the allocation routines set a field in
11105 the TCB to the bottom of the stack plus this much space, measured
11106 in bytes. */
11108 #define SPLIT_STACK_AVAILABLE 256
11110 /* Fill the ix86_frame structure describing the frame of the current function. */
11112 static void
11113 ix86_compute_frame_layout (void)
11115 struct ix86_frame *frame = &cfun->machine->frame;
11116 struct machine_function *m = cfun->machine;
11117 unsigned HOST_WIDE_INT stack_alignment_needed;
11118 HOST_WIDE_INT offset;
11119 unsigned HOST_WIDE_INT preferred_alignment;
11120 HOST_WIDE_INT size = get_frame_size ();
11121 HOST_WIDE_INT to_allocate;
11123 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11124 * ms_abi functions that call a sysv function. We now need to prune away
11125 * cases where it should be disabled. */
11126 if (TARGET_64BIT && m->call_ms2sysv)
11128 gcc_assert (TARGET_64BIT_MS_ABI);
11129 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11130 gcc_assert (!TARGET_SEH);
11131 gcc_assert (TARGET_SSE);
11132 gcc_assert (!ix86_using_red_zone ());
11134 if (crtl->calls_eh_return)
11136 gcc_assert (!reload_completed);
11137 m->call_ms2sysv = false;
11138 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11141 else if (ix86_static_chain_on_stack)
11143 gcc_assert (!reload_completed);
11144 m->call_ms2sysv = false;
11145 warn_once_call_ms2sysv_xlogues ("static call chains");
11148 /* Finally, compute which registers the stub will manage. */
11149 else
11151 unsigned count = xlogue_layout::count_stub_managed_regs ();
11152 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11153 m->call_ms2sysv_pad_in = 0;
11157 frame->nregs = ix86_nsaved_regs ();
11158 frame->nsseregs = ix86_nsaved_sseregs ();
11160 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11161 except for function prologues, leaf functions, and when the default
11162 incoming stack boundary is overridden at the command line or via the
11163 force_align_arg_pointer attribute. */
11164 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11165 && (!crtl->is_leaf || cfun->calls_alloca != 0
11166 || ix86_current_function_calls_tls_descriptor
11167 || ix86_incoming_stack_boundary < 128))
11169 crtl->preferred_stack_boundary = 128;
11170 crtl->stack_alignment_needed = 128;
11173 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11174 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11176 gcc_assert (!size || stack_alignment_needed);
11177 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11178 gcc_assert (preferred_alignment <= stack_alignment_needed);
11180 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11181 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11182 if (TARGET_64BIT && m->call_ms2sysv)
11184 gcc_assert (stack_alignment_needed >= 16);
11185 gcc_assert (!frame->nsseregs);
11188 /* For SEH we have to limit the amount of code movement into the prologue.
11189 At present we do this via a BLOCKAGE, at which point there's very little
11190 scheduling that can be done, which means that there's very little point
11191 in doing anything except PUSHs. */
11192 if (TARGET_SEH)
11193 m->use_fast_prologue_epilogue = false;
11194 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11196 int count = frame->nregs;
11197 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11199 /* The fast prologue uses move instead of push to save registers. This
11200 is significantly longer, but also executes faster as modern hardware
11201 can execute the moves in parallel, but can't do that for push/pop.
11203 Be careful about choosing what prologue to emit: When function takes
11204 many instructions to execute we may use slow version as well as in
11205 case function is known to be outside hot spot (this is known with
11206 feedback only). Weight the size of function by number of registers
11207 to save as it is cheap to use one or two push instructions but very
11208 slow to use many of them. */
11209 if (count)
11210 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11211 if (node->frequency < NODE_FREQUENCY_NORMAL
11212 || (flag_branch_probabilities
11213 && node->frequency < NODE_FREQUENCY_HOT))
11214 m->use_fast_prologue_epilogue = false;
11215 else
11216 m->use_fast_prologue_epilogue
11217 = !expensive_function_p (count);
11220 frame->save_regs_using_mov
11221 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11222 /* If static stack checking is enabled and done with probes,
11223 the registers need to be saved before allocating the frame. */
11224 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11226 /* Skip return address and error code in exception handler. */
11227 offset = INCOMING_FRAME_SP_OFFSET;
11229 /* Skip pushed static chain. */
11230 if (ix86_static_chain_on_stack)
11231 offset += UNITS_PER_WORD;
11233 /* Skip saved base pointer. */
11234 if (frame_pointer_needed)
11235 offset += UNITS_PER_WORD;
11236 frame->hfp_save_offset = offset;
11238 /* The traditional frame pointer location is at the top of the frame. */
11239 frame->hard_frame_pointer_offset = offset;
11241 /* Register save area */
11242 offset += frame->nregs * UNITS_PER_WORD;
11243 frame->reg_save_offset = offset;
11245 /* On SEH target, registers are pushed just before the frame pointer
11246 location. */
11247 if (TARGET_SEH)
11248 frame->hard_frame_pointer_offset = offset;
11250 /* Calculate the size of the va-arg area (not including padding, if any). */
11251 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11253 if (stack_realign_fp)
11255 /* We may need a 16-byte aligned stack for the remainder of the
11256 register save area, but the stack frame for the local function
11257 may require a greater alignment if using AVX/2/512. In order
11258 to avoid wasting space, we first calculate the space needed for
11259 the rest of the register saves, add that to the stack pointer,
11260 and then realign the stack to the boundary of the start of the
11261 frame for the local function. */
11262 HOST_WIDE_INT space_needed = 0;
11263 HOST_WIDE_INT sse_reg_space_needed = 0;
11265 if (TARGET_64BIT)
11267 if (m->call_ms2sysv)
11269 m->call_ms2sysv_pad_in = 0;
11270 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11273 else if (frame->nsseregs)
11274 /* The only ABI that has saved SSE registers (Win64) also has a
11275 16-byte aligned default stack. However, many programs violate
11276 the ABI, and Wine64 forces stack realignment to compensate. */
11277 space_needed = frame->nsseregs * 16;
11279 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11281 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11282 rounding to be pedantic. */
11283 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11285 else
11286 space_needed = frame->va_arg_size;
11288 /* Record the allocation size required prior to the realignment AND. */
11289 frame->stack_realign_allocate = space_needed;
11291 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11292 before this point are not directly comparable with values below
11293 this point. Use sp_valid_at to determine if the stack pointer is
11294 valid for a given offset, fp_valid_at for the frame pointer, or
11295 choose_baseaddr to have a base register chosen for you.
11297 Note that the result of (frame->stack_realign_offset
11298 & (stack_alignment_needed - 1)) may not equal zero. */
11299 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11300 frame->stack_realign_offset = offset - space_needed;
11301 frame->sse_reg_save_offset = frame->stack_realign_offset
11302 + sse_reg_space_needed;
11304 else
11306 frame->stack_realign_offset = offset;
11308 if (TARGET_64BIT && m->call_ms2sysv)
11310 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11311 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11314 /* Align and set SSE register save area. */
11315 else if (frame->nsseregs)
11317 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11318 required and the DRAP re-alignment boundary is at least 16 bytes,
11319 then we want the SSE register save area properly aligned. */
11320 if (ix86_incoming_stack_boundary >= 128
11321 || (stack_realign_drap && stack_alignment_needed >= 16))
11322 offset = ROUND_UP (offset, 16);
11323 offset += frame->nsseregs * 16;
11325 frame->sse_reg_save_offset = offset;
11326 offset += frame->va_arg_size;
11329 /* Align start of frame for local function. */
11330 if (m->call_ms2sysv
11331 || frame->va_arg_size != 0
11332 || size != 0
11333 || !crtl->is_leaf
11334 || cfun->calls_alloca
11335 || ix86_current_function_calls_tls_descriptor)
11336 offset = ROUND_UP (offset, stack_alignment_needed);
11338 /* Frame pointer points here. */
11339 frame->frame_pointer_offset = offset;
11341 offset += size;
11343 /* Add outgoing arguments area. Can be skipped if we eliminated
11344 all the function calls as dead code.
11345 Skipping is however impossible when function calls alloca. Alloca
11346 expander assumes that last crtl->outgoing_args_size
11347 of stack frame are unused. */
11348 if (ACCUMULATE_OUTGOING_ARGS
11349 && (!crtl->is_leaf || cfun->calls_alloca
11350 || ix86_current_function_calls_tls_descriptor))
11352 offset += crtl->outgoing_args_size;
11353 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11355 else
11356 frame->outgoing_arguments_size = 0;
11358 /* Align stack boundary. Only needed if we're calling another function
11359 or using alloca. */
11360 if (!crtl->is_leaf || cfun->calls_alloca
11361 || ix86_current_function_calls_tls_descriptor)
11362 offset = ROUND_UP (offset, preferred_alignment);
11364 /* We've reached end of stack frame. */
11365 frame->stack_pointer_offset = offset;
11367 /* Size prologue needs to allocate. */
11368 to_allocate = offset - frame->sse_reg_save_offset;
11370 if ((!to_allocate && frame->nregs <= 1)
11371 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11372 frame->save_regs_using_mov = false;
11374 if (ix86_using_red_zone ()
11375 && crtl->sp_is_unchanging
11376 && crtl->is_leaf
11377 && !ix86_pc_thunk_call_expanded
11378 && !ix86_current_function_calls_tls_descriptor)
11380 frame->red_zone_size = to_allocate;
11381 if (frame->save_regs_using_mov)
11382 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11383 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11384 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11386 else
11387 frame->red_zone_size = 0;
11388 frame->stack_pointer_offset -= frame->red_zone_size;
11390 /* The SEH frame pointer location is near the bottom of the frame.
11391 This is enforced by the fact that the difference between the
11392 stack pointer and the frame pointer is limited to 240 bytes in
11393 the unwind data structure. */
11394 if (TARGET_SEH)
11396 HOST_WIDE_INT diff;
11398 /* If we can leave the frame pointer where it is, do so. This also
11399 returns the establisher frame for __builtin_frame_address (0). */
11400 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11401 if (diff <= SEH_MAX_FRAME_SIZE
11402 && (diff > 240 || (diff & 15) != 0)
11403 && !crtl->accesses_prior_frames)
11405 /* Ideally we'd determine what portion of the local stack frame
11406 (within the constraint of the lowest 240) is most heavily used.
11407 But without that complication, simply bias the frame pointer
11408 by 128 bytes so as to maximize the amount of the local stack
11409 frame that is addressable with 8-bit offsets. */
11410 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
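/* As an illustration of the SEH adjustment above (all numbers are
   hypothetical): suppose the layout gave stack_pointer_offset = 840 and
   hard_frame_pointer_offset = 240.  The difference of 600 bytes fits in
   SEH_MAX_FRAME_SIZE but exceeds the 240-byte limit of the unwind data,
   so the frame pointer is rebased to 840 - 128 = 712.  Signed 8-bit
   displacements from the rebased frame pointer then reach the 256 bytes
   of the local frame closest to the stack pointer.  */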
11415 /* This is semi-inlined memory_address_length, but simplified
11416 since we know that we're always dealing with reg+offset, and
11417 to avoid having to create and discard all that rtl. */
11419 static inline int
11420 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11422 int len = 4;
11424 if (offset == 0)
11426 /* EBP and R13 cannot be encoded without an offset. */
11427 len = (regno == BP_REG || regno == R13_REG);
11429 else if (IN_RANGE (offset, -128, 127))
11430 len = 1;
11432 /* ESP and R12 must be encoded with a SIB byte. */
11433 if (regno == SP_REG || regno == R12_REG)
11434 len++;
11436 return len;
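/* A few illustrative values (this is just the usual x86 addressing-mode
   encoding rule): a zero offset from %rbp or %r13 still needs a disp8,
   so the length is 1, while a zero offset from most other base registers
   is 0; an offset of 40 fits in a signed byte and costs 1, or 2 from
   %rsp/%r12 because of the extra SIB byte; an offset of 4096 needs a
   4-byte displacement, so it costs 4 (5 from %rsp or %r12).  */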
11439 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11440 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11442 static bool
11443 sp_valid_at (HOST_WIDE_INT cfa_offset)
11445 const struct machine_frame_state &fs = cfun->machine->fs;
11446 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11448 /* Validate that the cfa_offset isn't in a "no-man's land". */
11449 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11450 return false;
11452 return fs.sp_valid;
11455 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11456 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11458 static inline bool
11459 fp_valid_at (HOST_WIDE_INT cfa_offset)
11461 const struct machine_frame_state &fs = cfun->machine->fs;
11462 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11464 /* Validate that the cfa_offset isn't in a "no-man's land". */
11465 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11466 return false;
11468 return fs.fp_valid;
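/* Informally, with a realigned stack (stack_realign_fp) the frame is split
   in two: CFA offsets up to sp_realigned_fp_last can only be addressed
   from the frame pointer, offsets beyond sp_realigned_offset only from
   the realigned stack pointer, and any gap between the two is the
   "no-man's land" that the assertions above guard against.  */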
11471 /* Choose a base register based upon alignment requested, speed and/or
11472 size. */
11474 static void
11475 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11476 HOST_WIDE_INT &base_offset,
11477 unsigned int align_requested, unsigned int *align)
11479 const struct machine_function *m = cfun->machine;
11480 unsigned int hfp_align;
11481 unsigned int drap_align;
11482 unsigned int sp_align;
11483 bool hfp_ok = fp_valid_at (cfa_offset);
11484 bool drap_ok = m->fs.drap_valid;
11485 bool sp_ok = sp_valid_at (cfa_offset);
11487 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11489 /* Filter out any registers that don't meet the requested alignment
11490 criteria. */
11491 if (align_requested)
11493 if (m->fs.realigned)
11494 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11495 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11496 notes (which we would need to use a realigned stack pointer),
11497 so disable on SEH targets. */
11498 else if (m->fs.sp_realigned)
11499 sp_align = crtl->stack_alignment_needed;
11501 hfp_ok = hfp_ok && hfp_align >= align_requested;
11502 drap_ok = drap_ok && drap_align >= align_requested;
11503 sp_ok = sp_ok && sp_align >= align_requested;
11506 if (m->use_fast_prologue_epilogue)
11508 /* Choose the base register most likely to allow the most scheduling
11509 opportunities. Generally FP is valid throughout the function,
11510 while DRAP must be reloaded within the epilogue. But choose either
11511 over the SP due to increased encoding size. */
11513 if (hfp_ok)
11515 base_reg = hard_frame_pointer_rtx;
11516 base_offset = m->fs.fp_offset - cfa_offset;
11518 else if (drap_ok)
11520 base_reg = crtl->drap_reg;
11521 base_offset = 0 - cfa_offset;
11523 else if (sp_ok)
11525 base_reg = stack_pointer_rtx;
11526 base_offset = m->fs.sp_offset - cfa_offset;
11529 else
11531 HOST_WIDE_INT toffset;
11532 int len = 16, tlen;
11534 /* Choose the base register with the smallest address encoding.
11535 With a tie, choose FP > DRAP > SP. */
11536 if (sp_ok)
11538 base_reg = stack_pointer_rtx;
11539 base_offset = m->fs.sp_offset - cfa_offset;
11540 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11542 if (drap_ok)
11544 toffset = 0 - cfa_offset;
11545 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11546 if (tlen <= len)
11548 base_reg = crtl->drap_reg;
11549 base_offset = toffset;
11550 len = tlen;
11553 if (hfp_ok)
11555 toffset = m->fs.fp_offset - cfa_offset;
11556 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11557 if (tlen <= len)
11559 base_reg = hard_frame_pointer_rtx;
11560 base_offset = toffset;
11561 len = tlen;
11566 /* Set the align return value. */
11567 if (align)
11569 if (base_reg == stack_pointer_rtx)
11570 *align = sp_align;
11571 else if (base_reg == crtl->drap_reg)
11572 *align = drap_align;
11573 else if (base_reg == hard_frame_pointer_rtx)
11574 *align = hfp_align;
11578 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11579 the alignment of address. If ALIGN is non-null, it should point to
11580 an alignment value (in bits) that is preferred or zero and will
11581 receive the alignment of the base register that was selected,
11582 irrespective of whether or not CFA_OFFSET is a multiple of that
11583 alignment value. If it is possible for the base register offset to be
11584 non-immediate then SCRATCH_REGNO should specify a scratch register to
11585 use.
11587 The valid base registers are taken from CFUN->MACHINE->FS. */
11589 static rtx
11590 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11591 unsigned int scratch_regno = INVALID_REGNUM)
11593 rtx base_reg = NULL;
11594 HOST_WIDE_INT base_offset = 0;
11596 /* If a specific alignment is requested, try to get a base register
11597 with that alignment first. */
11598 if (align && *align)
11599 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11601 if (!base_reg)
11602 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11604 gcc_assert (base_reg != NULL);
11606 rtx base_offset_rtx = GEN_INT (base_offset);
11608 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11610 gcc_assert (scratch_regno != INVALID_REGNUM);
11612 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11613 emit_move_insn (scratch_reg, base_offset_rtx);
11615 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11618 return plus_constant (Pmode, base_reg, base_offset);
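/* A sketch of typical use: the register-save code below simply calls
   choose_baseaddr (cfa_offset, &align) and lets it pick whichever of the
   frame pointer, DRAP or stack pointer is both valid and cheapest to
   encode.  The SCRATCH_REGNO path only matters for enormous frames: if
   base_offset does not fit in a signed 32-bit immediate, the offset is
   first loaded into the scratch register and the result is
   base_reg + scratch_reg rather than base_reg + constant.  */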
11621 /* Emit code to save registers in the prologue. */
11623 static void
11624 ix86_emit_save_regs (void)
11626 unsigned int regno;
11627 rtx_insn *insn;
11629 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11630 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11632 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11633 RTX_FRAME_RELATED_P (insn) = 1;
11637 /* Emit a single register save at CFA - CFA_OFFSET. */
11639 static void
11640 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11641 HOST_WIDE_INT cfa_offset)
11643 struct machine_function *m = cfun->machine;
11644 rtx reg = gen_rtx_REG (mode, regno);
11645 rtx mem, addr, base, insn;
11646 unsigned int align = GET_MODE_ALIGNMENT (mode);
11648 addr = choose_baseaddr (cfa_offset, &align);
11649 mem = gen_frame_mem (mode, addr);
11651 /* The location alignment depends upon the base register. */
11652 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11653 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11654 set_mem_align (mem, align);
11656 insn = emit_insn (gen_rtx_SET (mem, reg));
11657 RTX_FRAME_RELATED_P (insn) = 1;
11659 base = addr;
11660 if (GET_CODE (base) == PLUS)
11661 base = XEXP (base, 0);
11662 gcc_checking_assert (REG_P (base));
11664 /* When saving registers into a re-aligned local stack frame, avoid
11665 any tricky guessing by dwarf2out. */
11666 if (m->fs.realigned)
11668 gcc_checking_assert (stack_realign_drap);
11670 if (regno == REGNO (crtl->drap_reg))
11672 /* A bit of a hack. We force the DRAP register to be saved in
11673 the re-aligned stack frame, which provides us with a copy
11674 of the CFA that will last past the prologue. Install it. */
11675 gcc_checking_assert (cfun->machine->fs.fp_valid);
11676 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11677 cfun->machine->fs.fp_offset - cfa_offset);
11678 mem = gen_rtx_MEM (mode, addr);
11679 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11681 else
11683 /* The frame pointer is a stable reference within the
11684 aligned frame. Use it. */
11685 gcc_checking_assert (cfun->machine->fs.fp_valid);
11686 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11687 cfun->machine->fs.fp_offset - cfa_offset);
11688 mem = gen_rtx_MEM (mode, addr);
11689 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11693 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11694 && cfa_offset >= m->fs.sp_realigned_offset)
11696 gcc_checking_assert (stack_realign_fp);
11697 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11700 /* The memory may not be relative to the current CFA register,
11701 which means that we may need to generate a new pattern for
11702 use by the unwind info. */
11703 else if (base != m->fs.cfa_reg)
11705 addr = plus_constant (Pmode, m->fs.cfa_reg,
11706 m->fs.cfa_offset - cfa_offset);
11707 mem = gen_rtx_MEM (mode, addr);
11708 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11712 /* Emit code to save registers using MOV insns.
11713 First register is stored at CFA - CFA_OFFSET. */
11714 static void
11715 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11717 unsigned int regno;
11719 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11720 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11722 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11723 cfa_offset -= UNITS_PER_WORD;
11727 /* Emit code to save SSE registers using MOV insns.
11728 First register is stored at CFA - CFA_OFFSET. */
11729 static void
11730 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11732 unsigned int regno;
11734 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11735 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11737 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11738 cfa_offset -= GET_MODE_SIZE (V4SFmode);
11742 static GTY(()) rtx queued_cfa_restores;
11744 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
11745 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11746 Don't add the note if the previously saved value will be left untouched
11747 within stack red-zone till return, as unwinders can find the same value
11748 in the register and on the stack. */
11750 static void
11751 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11753 if (!crtl->shrink_wrapped
11754 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11755 return;
11757 if (insn)
11759 add_reg_note (insn, REG_CFA_RESTORE, reg);
11760 RTX_FRAME_RELATED_P (insn) = 1;
11762 else
11763 queued_cfa_restores
11764 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11767 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11769 static void
11770 ix86_add_queued_cfa_restore_notes (rtx insn)
11772 rtx last;
11773 if (!queued_cfa_restores)
11774 return;
11775 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11777 XEXP (last, 1) = REG_NOTES (insn);
11778 REG_NOTES (insn) = queued_cfa_restores;
11779 queued_cfa_restores = NULL_RTX;
11780 RTX_FRAME_RELATED_P (insn) = 1;
11783 /* Expand prologue or epilogue stack adjustment.
11784 The pattern exists to put a dependency on all ebp-based memory accesses.
11785 STYLE should be negative if instructions should be marked as frame related,
11786 zero if %r11 register is live and cannot be freely used and positive
11787 otherwise. */
11789 static rtx
11790 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11791 int style, bool set_cfa)
11793 struct machine_function *m = cfun->machine;
11794 rtx insn;
11795 bool add_frame_related_expr = false;
11797 if (Pmode == SImode)
11798 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11799 else if (x86_64_immediate_operand (offset, DImode))
11800 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11801 else
11803 rtx tmp;
11804 /* r11 is used by indirect sibcall return as well, set before the
11805 epilogue and used after the epilogue. */
11806 if (style)
11807 tmp = gen_rtx_REG (DImode, R11_REG);
11808 else
11810 gcc_assert (src != hard_frame_pointer_rtx
11811 && dest != hard_frame_pointer_rtx);
11812 tmp = hard_frame_pointer_rtx;
11814 insn = emit_insn (gen_rtx_SET (tmp, offset));
11815 if (style < 0)
11816 add_frame_related_expr = true;
11818 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11821 insn = emit_insn (insn);
11822 if (style >= 0)
11823 ix86_add_queued_cfa_restore_notes (insn);
11825 if (set_cfa)
11827 rtx r;
11829 gcc_assert (m->fs.cfa_reg == src);
11830 m->fs.cfa_offset += INTVAL (offset);
11831 m->fs.cfa_reg = dest;
11833 r = gen_rtx_PLUS (Pmode, src, offset);
11834 r = gen_rtx_SET (dest, r);
11835 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11836 RTX_FRAME_RELATED_P (insn) = 1;
11838 else if (style < 0)
11840 RTX_FRAME_RELATED_P (insn) = 1;
11841 if (add_frame_related_expr)
11843 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11844 r = gen_rtx_SET (dest, r);
11845 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11849 if (dest == stack_pointer_rtx)
11851 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11852 bool valid = m->fs.sp_valid;
11853 bool realigned = m->fs.sp_realigned;
11855 if (src == hard_frame_pointer_rtx)
11857 valid = m->fs.fp_valid;
11858 realigned = false;
11859 ooffset = m->fs.fp_offset;
11861 else if (src == crtl->drap_reg)
11863 valid = m->fs.drap_valid;
11864 realigned = false;
11865 ooffset = 0;
11867 else
11869 /* Else there are two possibilities: SP itself, which we set
11870 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11871 taken care of by hand along the eh_return path. */
11872 gcc_checking_assert (src == stack_pointer_rtx
11873 || offset == const0_rtx);
11876 m->fs.sp_offset = ooffset - INTVAL (offset);
11877 m->fs.sp_valid = valid;
11878 m->fs.sp_realigned = realigned;
11880 return insn;
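/* To illustrate the STYLE convention (this mirrors calls made later in
   this file): the prologue allocates the frame with

	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				   GEN_INT (-allocate), -1,
				   m->fs.cfa_reg == stack_pointer_rtx);

   i.e. a negative STYLE so the insn is marked frame related, while the
   epilogue normally passes a positive STYLE.  A STYLE of 0 says %r11 is
   live (e.g. around an indirect sibcall return), so the large-immediate
   fallback above must use the frame pointer instead of %r11.  */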
11883 /* Find an available register to be used as dynamic realign argument
11884 pointer register. Such a register will be written in the prologue and
11885 used at the beginning of the function body, so it must not be
11886 1. parameter passing register.
11887 2. GOT pointer.
11888 We reuse static-chain register if it is available. Otherwise, we
11889 use DI for i386 and R13 for x86-64. We chose R13 since it has
11890 shorter encoding.
11892 Return: the regno of chosen register. */
11894 static unsigned int
11895 find_drap_reg (void)
11897 tree decl = cfun->decl;
11899 /* Always use callee-saved register if there are no caller-saved
11900 registers. */
11901 if (TARGET_64BIT)
11903 /* Use R13 for a nested function or a function that needs the static
11904 chain. Since a function with a tail call may use any caller-saved
11905 register in the epilogue, DRAP must not use a caller-saved
11906 register in such a case. */
11907 if (DECL_STATIC_CHAIN (decl)
11908 || cfun->machine->no_caller_saved_registers
11909 || crtl->tail_call_emit)
11910 return R13_REG;
11912 return R10_REG;
11914 else
11916 /* Use DI for a nested function or a function that needs the static
11917 chain. Since a function with a tail call may use any caller-saved
11918 register in the epilogue, DRAP must not use a caller-saved
11919 register in such a case. */
11920 if (DECL_STATIC_CHAIN (decl)
11921 || cfun->machine->no_caller_saved_registers
11922 || crtl->tail_call_emit)
11923 return DI_REG;
11925 /* Reuse static chain register if it isn't used for parameter
11926 passing. */
11927 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11929 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11930 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11931 return CX_REG;
11933 return DI_REG;
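/* For example: a plain 64-bit function gets %r10 as the DRAP register,
   while a nested 64-bit function, which needs %r10 for the static chain,
   gets %r13; a 32-bit cdecl function with at most two register parameters
   can reuse %ecx (the 32-bit static-chain register), and otherwise %edi
   is used.  */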
11937 /* Handle a "force_align_arg_pointer" attribute. */
11939 static tree
11940 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11941 tree, int, bool *no_add_attrs)
11943 if (TREE_CODE (*node) != FUNCTION_TYPE
11944 && TREE_CODE (*node) != METHOD_TYPE
11945 && TREE_CODE (*node) != FIELD_DECL
11946 && TREE_CODE (*node) != TYPE_DECL)
11948 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11949 name);
11950 *no_add_attrs = true;
11953 return NULL_TREE;
11956 /* Return minimum incoming stack alignment. */
11958 static unsigned int
11959 ix86_minimum_incoming_stack_boundary (bool sibcall)
11961 unsigned int incoming_stack_boundary;
11963 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
11964 if (cfun->machine->func_type != TYPE_NORMAL)
11965 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11966 /* Prefer the one specified at command line. */
11967 else if (ix86_user_incoming_stack_boundary)
11968 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11969 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11970 if -mstackrealign is used, this is not a sibcall check, and the
11971 estimated stack alignment is 128 bits. */
11972 else if (!sibcall
11973 && ix86_force_align_arg_pointer
11974 && crtl->stack_alignment_estimated == 128)
11975 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11976 else
11977 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11979 /* Incoming stack alignment can be changed on individual functions
11980 via force_align_arg_pointer attribute. We use the smallest
11981 incoming stack boundary. */
11982 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11983 && lookup_attribute (ix86_force_align_arg_pointer_string,
11984 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11985 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11987 /* The incoming stack frame has to be aligned at least at
11988 parm_stack_boundary. */
11989 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11990 incoming_stack_boundary = crtl->parm_stack_boundary;
11992 /* Stack at entrance of main is aligned by runtime. We use the
11993 smallest incoming stack boundary. */
11994 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
11995 && DECL_NAME (current_function_decl)
11996 && MAIN_NAME_P (DECL_NAME (current_function_decl))
11997 && DECL_FILE_SCOPE_P (current_function_decl))
11998 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12000 return incoming_stack_boundary;
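/* A couple of illustrative outcomes of the rules above: a normal 64-bit
   function ends up with the 128-bit (16-byte) default; an interrupt
   handler gets 128 bits in 64-bit mode but only MIN_STACK_BOUNDARY in
   32-bit mode; and main () is never assumed to be aligned beyond
   MAIN_STACK_BOUNDARY, whatever the default would otherwise be.  */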
12003 /* Update incoming stack boundary and estimated stack alignment. */
12005 static void
12006 ix86_update_stack_boundary (void)
12008 ix86_incoming_stack_boundary
12009 = ix86_minimum_incoming_stack_boundary (false);
12011 /* x86_64 vararg needs 16byte stack alignment for register save
12012 area. */
12013 if (TARGET_64BIT
12014 && cfun->stdarg
12015 && crtl->stack_alignment_estimated < 128)
12016 crtl->stack_alignment_estimated = 128;
12018 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12019 if (ix86_tls_descriptor_calls_expanded_in_cfun
12020 && crtl->preferred_stack_boundary < 128)
12021 crtl->preferred_stack_boundary = 128;
12024 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12025 needed or an rtx for DRAP otherwise. */
12027 static rtx
12028 ix86_get_drap_rtx (void)
12030 /* We must use DRAP if there are outgoing arguments on stack and
12031 ACCUMULATE_OUTGOING_ARGS is false. */
12032 if (ix86_force_drap
12033 || (cfun->machine->outgoing_args_on_stack
12034 && !ACCUMULATE_OUTGOING_ARGS))
12035 crtl->need_drap = true;
12037 if (stack_realign_drap)
12039 /* Assign DRAP to vDRAP and return vDRAP. */
12040 unsigned int regno = find_drap_reg ();
12041 rtx drap_vreg;
12042 rtx arg_ptr;
12043 rtx_insn *seq, *insn;
12045 arg_ptr = gen_rtx_REG (Pmode, regno);
12046 crtl->drap_reg = arg_ptr;
12048 start_sequence ();
12049 drap_vreg = copy_to_reg (arg_ptr);
12050 seq = get_insns ();
12051 end_sequence ();
12053 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12054 if (!optimize)
12056 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12057 RTX_FRAME_RELATED_P (insn) = 1;
12059 return drap_vreg;
12061 else
12062 return NULL;
12065 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12067 static rtx
12068 ix86_internal_arg_pointer (void)
12070 return virtual_incoming_args_rtx;
12073 struct scratch_reg {
12074 rtx reg;
12075 bool saved;
12078 /* Return a short-lived scratch register for use on function entry.
12079 In 32-bit mode, it is valid only after the registers are saved
12080 in the prologue. This register must be released by means of
12081 release_scratch_register_on_entry once it is dead. */
12083 static void
12084 get_scratch_register_on_entry (struct scratch_reg *sr)
12086 int regno;
12088 sr->saved = false;
12090 if (TARGET_64BIT)
12092 /* We always use R11 in 64-bit mode. */
12093 regno = R11_REG;
12095 else
12097 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12098 bool fastcall_p
12099 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12100 bool thiscall_p
12101 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12102 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12103 int regparm = ix86_function_regparm (fntype, decl);
12104 int drap_regno
12105 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12107 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12108 for the static chain register. */
12109 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12110 && drap_regno != AX_REG)
12111 regno = AX_REG;
12112 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12113 for the static chain register. */
12114 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12115 regno = AX_REG;
12116 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12117 regno = DX_REG;
12118 /* ecx is the static chain register. */
12119 else if (regparm < 3 && !fastcall_p && !thiscall_p
12120 && !static_chain_p
12121 && drap_regno != CX_REG)
12122 regno = CX_REG;
12123 else if (ix86_save_reg (BX_REG, true, false))
12124 regno = BX_REG;
12125 /* esi is the static chain register. */
12126 else if (!(regparm == 3 && static_chain_p)
12127 && ix86_save_reg (SI_REG, true, false))
12128 regno = SI_REG;
12129 else if (ix86_save_reg (DI_REG, true, false))
12130 regno = DI_REG;
12131 else
12133 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12134 sr->saved = true;
12138 sr->reg = gen_rtx_REG (Pmode, regno);
12139 if (sr->saved)
12141 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12142 RTX_FRAME_RELATED_P (insn) = 1;
12146 /* Release a scratch register obtained from the preceding function. */
12148 static void
12149 release_scratch_register_on_entry (struct scratch_reg *sr)
12151 if (sr->saved)
12153 struct machine_function *m = cfun->machine;
12154 rtx x, insn = emit_insn (gen_pop (sr->reg));
12156 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12157 RTX_FRAME_RELATED_P (insn) = 1;
12158 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12159 x = gen_rtx_SET (stack_pointer_rtx, x);
12160 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12161 m->fs.sp_offset -= UNITS_PER_WORD;
12165 /* Return the probing interval for -fstack-clash-protection. */
12167 static HOST_WIDE_INT
12168 get_probe_interval (void)
12170 if (flag_stack_clash_protection)
12171 return (HOST_WIDE_INT_1U
12172 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12173 else
12174 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
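/* With the usual defaults both expressions evaluate to 1 << 12, i.e. one
   probe per 4 KiB page; --param stack-clash-protection-probe-interval can
   change that when -fstack-clash-protection is enabled.  */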
12177 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12179 This differs from the next routine in that it tries hard to prevent
12180 attacks that jump the stack guard. Thus it is never allowed to allocate
12181 more than PROBE_INTERVAL bytes of stack space without a suitable
12182 probe. */
12184 static void
12185 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12187 struct machine_function *m = cfun->machine;
12189 /* If this function does not statically allocate stack space, then
12190 no probes are needed. */
12191 if (!size)
12193 /* However, the allocation of space via pushes for register
12194 saves could be viewed as allocating space, but without the
12195 need to probe. */
12196 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12197 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12198 else
12199 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12200 return;
12203 /* If we are a noreturn function, then we have to consider the
12204 possibility that we're called via a jump rather than a call.
12206 Thus we don't have the implicit probe generated by saving the
12207 return address into the stack at the call. Thus, the stack
12208 pointer could be anywhere in the guard page. The safe thing
12209 to do is emit a probe now.
12211 ?!? This should be revamped to work like aarch64 and s390 where
12212 we track the offset from the most recent probe. Normally that
12213 offset would be zero. For a noreturn function we would reset
12214 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12215 we just probe when we cross PROBE_INTERVAL. */
12216 if (TREE_THIS_VOLATILE (cfun->decl))
12218 /* We can safely use any register here since we're just going to push
12219 its value and immediately pop it back. But we do try and avoid
12220 argument passing registers so as not to introduce dependencies in
12221 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12222 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12223 rtx_insn *insn = emit_insn (gen_push (dummy_reg));
12224 RTX_FRAME_RELATED_P (insn) = 1;
12225 ix86_emit_restore_reg_using_pop (dummy_reg);
12226 emit_insn (gen_blockage ());
12229 /* If we allocate less than the size of the guard statically,
12230 then no probing is necessary, but we do need to allocate
12231 the stack. */
12232 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12234 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12235 GEN_INT (-size), -1,
12236 m->fs.cfa_reg == stack_pointer_rtx);
12237 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12238 return;
12241 /* We're allocating a large enough stack frame that we need to
12242 emit probes. Either emit them inline or in a loop depending
12243 on the size. */
12244 HOST_WIDE_INT probe_interval = get_probe_interval ();
12245 if (size <= 4 * probe_interval)
12247 HOST_WIDE_INT i;
12248 for (i = probe_interval; i <= size; i += probe_interval)
12250 /* Allocate PROBE_INTERVAL bytes. */
12251 rtx insn
12252 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12253 GEN_INT (-probe_interval), -1,
12254 m->fs.cfa_reg == stack_pointer_rtx);
12255 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12257 /* And probe at *sp. */
12258 emit_stack_probe (stack_pointer_rtx);
12259 emit_insn (gen_blockage ());
12262 /* We need to allocate space for the residual, but we do not need
12263 to probe the residual. */
12264 HOST_WIDE_INT residual = (i - probe_interval - size);
12265 if (residual)
12266 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12267 GEN_INT (residual), -1,
12268 m->fs.cfa_reg == stack_pointer_rtx);
12269 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12271 else
12273 struct scratch_reg sr;
12274 get_scratch_register_on_entry (&sr);
12276 /* Step 1: round SIZE down to a multiple of the interval. */
12277 HOST_WIDE_INT rounded_size = size & -probe_interval;
12279 /* Step 2: compute final value of the loop counter. Use lea if
12280 possible. */
12281 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12282 rtx insn;
12283 if (address_no_seg_operand (addr, Pmode))
12284 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12285 else
12287 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12288 insn = emit_insn (gen_rtx_SET (sr.reg,
12289 gen_rtx_PLUS (Pmode, sr.reg,
12290 stack_pointer_rtx)));
12292 if (m->fs.cfa_reg == stack_pointer_rtx)
12294 add_reg_note (insn, REG_CFA_DEF_CFA,
12295 plus_constant (Pmode, sr.reg,
12296 m->fs.cfa_offset + rounded_size));
12297 RTX_FRAME_RELATED_P (insn) = 1;
12300 /* Step 3: the loop. */
12301 rtx size_rtx = GEN_INT (rounded_size);
12302 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12303 size_rtx));
12304 if (m->fs.cfa_reg == stack_pointer_rtx)
12306 m->fs.cfa_offset += rounded_size;
12307 add_reg_note (insn, REG_CFA_DEF_CFA,
12308 plus_constant (Pmode, stack_pointer_rtx,
12309 m->fs.cfa_offset));
12310 RTX_FRAME_RELATED_P (insn) = 1;
12312 m->fs.sp_offset += rounded_size;
12313 emit_insn (gen_blockage ());
12315 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12316 is equal to ROUNDED_SIZE. */
12318 if (size != rounded_size)
12319 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12320 GEN_INT (rounded_size - size), -1,
12321 m->fs.cfa_reg == stack_pointer_rtx);
12322 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12324 release_scratch_register_on_entry (&sr);
12327 /* Make sure nothing is scheduled before we are done. */
12328 emit_insn (gen_blockage ());
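/* A rough worked example of the size classes above, assuming the default
   4096-byte probe interval and guard size: a 2 KiB frame is below the
   guard and is allocated without any probe; a 10000-byte frame takes the
   inline path, emitting two probed 4096-byte allocations followed by an
   unprobed residual allocation of 1808 bytes; anything larger than
   4 * 4096 bytes switches to the probing loop driven by a scratch
   register.  */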
12331 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12333 static void
12334 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12336 /* We skip the probe for the first interval + a small dope of 4 words and
12337 probe that many bytes past the specified size to maintain a protection
12338 area at the bottom of the stack. */
12339 const int dope = 4 * UNITS_PER_WORD;
12340 rtx size_rtx = GEN_INT (size), last;
12342 /* See if we have a constant small number of probes to generate. If so,
12343 that's the easy case. The run-time loop is made up of 9 insns in the
12344 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12345 for n # of intervals. */
12346 if (size <= 4 * get_probe_interval ())
12348 HOST_WIDE_INT i, adjust;
12349 bool first_probe = true;
12351 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12352 values of N from 1 until it exceeds SIZE. If only one probe is
12353 needed, this will not generate any code. Then adjust and probe
12354 to PROBE_INTERVAL + SIZE. */
12355 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12357 if (first_probe)
12359 adjust = 2 * get_probe_interval () + dope;
12360 first_probe = false;
12362 else
12363 adjust = get_probe_interval ();
12365 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12366 plus_constant (Pmode, stack_pointer_rtx,
12367 -adjust)));
12368 emit_stack_probe (stack_pointer_rtx);
12371 if (first_probe)
12372 adjust = size + get_probe_interval () + dope;
12373 else
12374 adjust = size + get_probe_interval () - i;
12376 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12377 plus_constant (Pmode, stack_pointer_rtx,
12378 -adjust)));
12379 emit_stack_probe (stack_pointer_rtx);
12381 /* Adjust back to account for the additional first interval. */
12382 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12383 plus_constant (Pmode, stack_pointer_rtx,
12384 (get_probe_interval ()
12385 + dope))));
12388 /* Otherwise, do the same as above, but in a loop. Note that we must be
12389 extra careful with variables wrapping around because we might be at
12390 the very top (or the very bottom) of the address space and we have
12391 to be able to handle this case properly; in particular, we use an
12392 equality test for the loop condition. */
12393 else
12395 HOST_WIDE_INT rounded_size;
12396 struct scratch_reg sr;
12398 get_scratch_register_on_entry (&sr);
12401 /* Step 1: round SIZE to the previous multiple of the interval. */
12403 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12406 /* Step 2: compute initial and final value of the loop counter. */
12408 /* SP = SP_0 + PROBE_INTERVAL. */
12409 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12410 plus_constant (Pmode, stack_pointer_rtx,
12411 - (get_probe_interval () + dope))));
12413 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12414 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12415 emit_insn (gen_rtx_SET (sr.reg,
12416 plus_constant (Pmode, stack_pointer_rtx,
12417 -rounded_size)));
12418 else
12420 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12421 emit_insn (gen_rtx_SET (sr.reg,
12422 gen_rtx_PLUS (Pmode, sr.reg,
12423 stack_pointer_rtx)));
12427 /* Step 3: the loop
12431 SP = SP + PROBE_INTERVAL
12432 probe at SP
12434 while (SP != LAST_ADDR)
12436 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12437 values of N from 1 until it is equal to ROUNDED_SIZE. */
12439 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12442 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12443 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12445 if (size != rounded_size)
12447 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12448 plus_constant (Pmode, stack_pointer_rtx,
12449 rounded_size - size)));
12450 emit_stack_probe (stack_pointer_rtx);
12453 /* Adjust back to account for the additional first interval. */
12454 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12455 plus_constant (Pmode, stack_pointer_rtx,
12456 (get_probe_interval ()
12457 + dope))));
12459 release_scratch_register_on_entry (&sr);
12462 /* Even if the stack pointer isn't the CFA register, we need to correctly
12463 describe the adjustments made to it, in particular differentiate the
12464 frame-related ones from the frame-unrelated ones. */
12465 if (size > 0)
12467 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12468 XVECEXP (expr, 0, 0)
12469 = gen_rtx_SET (stack_pointer_rtx,
12470 plus_constant (Pmode, stack_pointer_rtx, -size));
12471 XVECEXP (expr, 0, 1)
12472 = gen_rtx_SET (stack_pointer_rtx,
12473 plus_constant (Pmode, stack_pointer_rtx,
12474 get_probe_interval () + dope + size));
12475 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12476 RTX_FRAME_RELATED_P (last) = 1;
12478 cfun->machine->fs.sp_offset += size;
12481 /* Make sure nothing is scheduled before we are done. */
12482 emit_insn (gen_blockage ());
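/* Worked example for the inline case, assuming 64-bit code (dope = 32
   bytes) and a 4096-byte probe interval: for SIZE = 6000 the first
   adjustment is 2 * 4096 + 32 = 8224 bytes followed by a probe, the
   second is 6000 + 4096 - 8192 = 1904 bytes followed by a probe, and the
   final re-adjustment gives back 4096 + 32 = 4128 bytes, leaving the net
   6000 bytes allocated.  */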
12485 /* Adjust the stack pointer up to REG while probing it. */
12487 const char *
12488 output_adjust_stack_and_probe (rtx reg)
12490 static int labelno = 0;
12491 char loop_lab[32];
12492 rtx xops[2];
12494 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12496 /* Loop. */
12497 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12499 /* SP = SP + PROBE_INTERVAL. */
12500 xops[0] = stack_pointer_rtx;
12501 xops[1] = GEN_INT (get_probe_interval ());
12502 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12504 /* Probe at SP. */
12505 xops[1] = const0_rtx;
12506 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12508 /* Test if SP == LAST_ADDR. */
12509 xops[0] = stack_pointer_rtx;
12510 xops[1] = reg;
12511 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12513 /* Branch. */
12514 fputs ("\tjne\t", asm_out_file);
12515 assemble_name_raw (asm_out_file, loop_lab);
12516 fputc ('\n', asm_out_file);
12518 return "";
12521 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12522 inclusive. These are offsets from the current stack pointer. */
12524 static void
12525 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12527 /* See if we have a constant small number of probes to generate. If so,
12528 that's the easy case. The run-time loop is made up of 6 insns in the
12529 generic case while the compile-time loop is made up of n insns for n #
12530 of intervals. */
12531 if (size <= 6 * get_probe_interval ())
12533 HOST_WIDE_INT i;
12535 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12536 it exceeds SIZE. If only one probe is needed, this will not
12537 generate any code. Then probe at FIRST + SIZE. */
12538 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12539 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12540 -(first + i)));
12542 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12543 -(first + size)));
12546 /* Otherwise, do the same as above, but in a loop. Note that we must be
12547 extra careful with variables wrapping around because we might be at
12548 the very top (or the very bottom) of the address space and we have
12549 to be able to handle this case properly; in particular, we use an
12550 equality test for the loop condition. */
12551 else
12553 HOST_WIDE_INT rounded_size, last;
12554 struct scratch_reg sr;
12556 get_scratch_register_on_entry (&sr);
12559 /* Step 1: round SIZE to the previous multiple of the interval. */
12561 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12564 /* Step 2: compute initial and final value of the loop counter. */
12566 /* TEST_OFFSET = FIRST. */
12567 emit_move_insn (sr.reg, GEN_INT (-first));
12569 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12570 last = first + rounded_size;
12573 /* Step 3: the loop
12577 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12578 probe at TEST_ADDR
12580 while (TEST_ADDR != LAST_ADDR)
12582 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12583 until it is equal to ROUNDED_SIZE. */
12585 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12588 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12589 that SIZE is equal to ROUNDED_SIZE. */
12591 if (size != rounded_size)
12592 emit_stack_probe (plus_constant (Pmode,
12593 gen_rtx_PLUS (Pmode,
12594 stack_pointer_rtx,
12595 sr.reg),
12596 rounded_size - size));
12598 release_scratch_register_on_entry (&sr);
12601 /* Make sure nothing is scheduled before we are done. */
12602 emit_insn (gen_blockage ());
12605 /* Probe a range of stack addresses from REG to END, inclusive. These are
12606 offsets from the current stack pointer. */
12608 const char *
12609 output_probe_stack_range (rtx reg, rtx end)
12611 static int labelno = 0;
12612 char loop_lab[32];
12613 rtx xops[3];
12615 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12617 /* Loop. */
12618 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12620 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12621 xops[0] = reg;
12622 xops[1] = GEN_INT (get_probe_interval ());
12623 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12625 /* Probe at TEST_ADDR. */
12626 xops[0] = stack_pointer_rtx;
12627 xops[1] = reg;
12628 xops[2] = const0_rtx;
12629 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12631 /* Test if TEST_ADDR == LAST_ADDR. */
12632 xops[0] = reg;
12633 xops[1] = end;
12634 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12636 /* Branch. */
12637 fputs ("\tjne\t", asm_out_file);
12638 assemble_name_raw (asm_out_file, loop_lab);
12639 fputc ('\n', asm_out_file);
12641 return "";
12644 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12645 will guide prologue/epilogue to be generated in correct form. */
12647 static void
12648 ix86_finalize_stack_frame_flags (void)
12650 /* Check if stack realignment is really needed after reload, and
12651 store the result in cfun. */
12652 unsigned int incoming_stack_boundary
12653 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12654 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12655 unsigned int stack_alignment
12656 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12657 ? crtl->max_used_stack_slot_alignment
12658 : crtl->stack_alignment_needed);
12659 unsigned int stack_realign
12660 = (incoming_stack_boundary < stack_alignment);
12661 bool recompute_frame_layout_p = false;
12663 if (crtl->stack_realign_finalized)
12665 /* After stack_realign_needed is finalized, we can no longer
12666 change it. */
12667 gcc_assert (crtl->stack_realign_needed == stack_realign);
12668 return;
12671 /* If the only reason for frame_pointer_needed is that we conservatively
12672 assumed stack realignment might be needed or -fno-omit-frame-pointer
12673 is used, but in the end nothing that needed the stack alignment had
12674 been spilled and no stack access was needed, clear frame_pointer_needed
12675 and say we don't need stack realignment. */
12676 if ((stack_realign || !flag_omit_frame_pointer)
12677 && frame_pointer_needed
12678 && crtl->is_leaf
12679 && crtl->sp_is_unchanging
12680 && !ix86_current_function_calls_tls_descriptor
12681 && !crtl->accesses_prior_frames
12682 && !cfun->calls_alloca
12683 && !crtl->calls_eh_return
12684 /* See ira_setup_eliminable_regset for the rationale. */
12685 && !(STACK_CHECK_MOVING_SP
12686 && flag_stack_check
12687 && flag_exceptions
12688 && cfun->can_throw_non_call_exceptions)
12689 && !ix86_frame_pointer_required ()
12690 && get_frame_size () == 0
12691 && ix86_nsaved_sseregs () == 0
12692 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12694 HARD_REG_SET set_up_by_prologue, prologue_used;
12695 basic_block bb;
12697 CLEAR_HARD_REG_SET (prologue_used);
12698 CLEAR_HARD_REG_SET (set_up_by_prologue);
12699 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12700 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12701 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12702 HARD_FRAME_POINTER_REGNUM);
12704 /* The preferred stack alignment is the minimum stack alignment. */
12705 if (stack_alignment > crtl->preferred_stack_boundary)
12706 stack_alignment = crtl->preferred_stack_boundary;
12708 bool require_stack_frame = false;
12710 FOR_EACH_BB_FN (bb, cfun)
12712 rtx_insn *insn;
12713 FOR_BB_INSNS (bb, insn)
12714 if (NONDEBUG_INSN_P (insn)
12715 && requires_stack_frame_p (insn, prologue_used,
12716 set_up_by_prologue))
12718 require_stack_frame = true;
12720 if (stack_realign)
12722 /* Find the maximum stack alignment. */
12723 subrtx_iterator::array_type array;
12724 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12725 if (MEM_P (*iter)
12726 && (reg_mentioned_p (stack_pointer_rtx,
12727 *iter)
12728 || reg_mentioned_p (frame_pointer_rtx,
12729 *iter)))
12731 unsigned int alignment = MEM_ALIGN (*iter);
12732 if (alignment > stack_alignment)
12733 stack_alignment = alignment;
12739 if (require_stack_frame)
12741 /* Stack frame is required. If stack alignment needed is less
12742 than incoming stack boundary, don't realign stack. */
12743 stack_realign = incoming_stack_boundary < stack_alignment;
12744 if (!stack_realign)
12746 crtl->max_used_stack_slot_alignment
12747 = incoming_stack_boundary;
12748 crtl->stack_alignment_needed
12749 = incoming_stack_boundary;
12750 /* Also update preferred_stack_boundary for leaf
12751 functions. */
12752 crtl->preferred_stack_boundary
12753 = incoming_stack_boundary;
12756 else
12758 /* If drap has been set, but it actually isn't live at the
12759 start of the function, there is no reason to set it up. */
12760 if (crtl->drap_reg)
12762 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12763 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12764 REGNO (crtl->drap_reg)))
12766 crtl->drap_reg = NULL_RTX;
12767 crtl->need_drap = false;
12770 else
12771 cfun->machine->no_drap_save_restore = true;
12773 frame_pointer_needed = false;
12774 stack_realign = false;
12775 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12776 crtl->stack_alignment_needed = incoming_stack_boundary;
12777 crtl->stack_alignment_estimated = incoming_stack_boundary;
12778 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12779 crtl->preferred_stack_boundary = incoming_stack_boundary;
12780 df_finish_pass (true);
12781 df_scan_alloc (NULL);
12782 df_scan_blocks ();
12783 df_compute_regs_ever_live (true);
12784 df_analyze ();
12786 if (flag_var_tracking)
12788 /* Since frame pointer is no longer available, replace it with
12789 stack pointer - UNITS_PER_WORD in debug insns. */
12790 df_ref ref, next;
12791 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12792 ref; ref = next)
12794 next = DF_REF_NEXT_REG (ref);
12795 if (!DF_REF_INSN_INFO (ref))
12796 continue;
12798 /* Make sure the next ref is for a different instruction,
12799 so that we're not affected by the rescan. */
12800 rtx_insn *insn = DF_REF_INSN (ref);
12801 while (next && DF_REF_INSN (next) == insn)
12802 next = DF_REF_NEXT_REG (next);
12804 if (DEBUG_INSN_P (insn))
12806 bool changed = false;
12807 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12809 rtx *loc = DF_REF_LOC (ref);
12810 if (*loc == hard_frame_pointer_rtx)
12812 *loc = plus_constant (Pmode,
12813 stack_pointer_rtx,
12814 -UNITS_PER_WORD);
12815 changed = true;
12818 if (changed)
12819 df_insn_rescan (insn);
12824 recompute_frame_layout_p = true;
12828 if (crtl->stack_realign_needed != stack_realign)
12829 recompute_frame_layout_p = true;
12830 crtl->stack_realign_needed = stack_realign;
12831 crtl->stack_realign_finalized = true;
12832 if (recompute_frame_layout_p)
12833 ix86_compute_frame_layout ();
12836 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12838 static void
12839 ix86_elim_entry_set_got (rtx reg)
12841 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12842 rtx_insn *c_insn = BB_HEAD (bb);
12843 if (!NONDEBUG_INSN_P (c_insn))
12844 c_insn = next_nonnote_nondebug_insn (c_insn);
12845 if (c_insn && NONJUMP_INSN_P (c_insn))
12847 rtx pat = PATTERN (c_insn);
12848 if (GET_CODE (pat) == PARALLEL)
12850 rtx vec = XVECEXP (pat, 0, 0);
12851 if (GET_CODE (vec) == SET
12852 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12853 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12854 delete_insn (c_insn);
12859 static rtx
12860 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12862 rtx addr, mem;
12864 if (offset)
12865 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12866 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12867 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12870 static inline rtx
12871 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12873 return gen_frame_set (reg, frame_reg, offset, false);
12876 static inline rtx
12877 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12879 return gen_frame_set (reg, frame_reg, offset, true);
12882 static void
12883 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12885 struct machine_function *m = cfun->machine;
12886 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12887 + m->call_ms2sysv_extra_regs;
12888 rtvec v = rtvec_alloc (ncregs + 1);
12889 unsigned int align, i, vi = 0;
12890 rtx_insn *insn;
12891 rtx sym, addr;
12892 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12893 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12895 /* AL should only be live with sysv_abi. */
12896 gcc_assert (!ix86_eax_live_at_start_p ());
12897 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12899 /* Setup RAX as the stub's base pointer. We use stack_realign_offset
12900 whether we've actually realigned the stack or not. */
12901 align = GET_MODE_ALIGNMENT (V4SFmode);
12902 addr = choose_baseaddr (frame.stack_realign_offset
12903 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12904 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12906 emit_insn (gen_rtx_SET (rax, addr));
12908 /* Get the stub symbol. */
12909 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12910 : XLOGUE_STUB_SAVE);
12911 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12913 for (i = 0; i < ncregs; ++i)
12915 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12916 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12917 r.regno);
12918 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12921 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12923 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12924 RTX_FRAME_RELATED_P (insn) = true;
12927 /* Expand the prologue into a bunch of separate insns. */
12929 void
12930 ix86_expand_prologue (void)
12932 struct machine_function *m = cfun->machine;
12933 rtx insn, t;
12934 struct ix86_frame frame;
12935 HOST_WIDE_INT allocate;
12936 bool int_registers_saved;
12937 bool sse_registers_saved;
12938 bool save_stub_call_needed;
12939 rtx static_chain = NULL_RTX;
12941 if (ix86_function_naked (current_function_decl))
12942 return;
12944 ix86_finalize_stack_frame_flags ();
12946 /* DRAP should not coexist with stack_realign_fp */
12947 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12949 memset (&m->fs, 0, sizeof (m->fs));
12951 /* Initialize CFA state for before the prologue. */
12952 m->fs.cfa_reg = stack_pointer_rtx;
12953 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12955 /* Track SP offset to the CFA. We continue tracking this after we've
12956 swapped the CFA register away from SP. In the case of re-alignment
12957 this is fudged; we're interested in offsets within the local frame. */
12958 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12959 m->fs.sp_valid = true;
12960 m->fs.sp_realigned = false;
12962 frame = m->frame;
12964 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12966 /* We should have already generated an error for any use of
12967 ms_hook on a nested function. */
12968 gcc_checking_assert (!ix86_static_chain_on_stack);
12970 /* Check if profiling is active and we shall use the profiling-before-
12971 prologue variant. If so, sorry. */
12972 if (crtl->profile && flag_fentry != 0)
12973 sorry ("ms_hook_prologue attribute isn%'t compatible "
12974 "with -mfentry for 32-bit");
12976 /* In ix86_asm_output_function_label we emitted:
12977 8b ff movl.s %edi,%edi
12978 55 push %ebp
12979 8b ec movl.s %esp,%ebp
12981 This matches the hookable function prologue in Win32 API
12982 functions in Microsoft Windows XP Service Pack 2 and newer.
12983 Wine uses this to enable Windows apps to hook the Win32 API
12984 functions provided by Wine.
12986 What that means is that we've already set up the frame pointer. */
12988 if (frame_pointer_needed
12989 && !(crtl->drap_reg && crtl->stack_realign_needed))
12991 rtx push, mov;
12993 /* We've decided to use the frame pointer already set up.
12994 Describe this to the unwinder by pretending that both
12995 push and mov insns happen right here.
12997 Putting the unwind info here at the end of the ms_hook
12998 is done so that we can make absolutely certain we get
12999 the required byte sequence at the start of the function,
13000 rather than relying on an assembler that can produce
13001 the exact encoding required.
13003 However it does mean (in the unpatched case) that we have
13004 a 1 insn window where the asynchronous unwind info is
13005 incorrect. However, if we placed the unwind info at
13006 its correct location we would have incorrect unwind info
13007 in the patched case. Which is probably all moot since
13008 I don't expect Wine generates dwarf2 unwind info for the
13009 system libraries that use this feature. */
13011 insn = emit_insn (gen_blockage ());
13013 push = gen_push (hard_frame_pointer_rtx);
13014 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13015 stack_pointer_rtx);
13016 RTX_FRAME_RELATED_P (push) = 1;
13017 RTX_FRAME_RELATED_P (mov) = 1;
13019 RTX_FRAME_RELATED_P (insn) = 1;
13020 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13021 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13023 /* Note that gen_push incremented m->fs.cfa_offset, even
13024 though we didn't emit the push insn here. */
13025 m->fs.cfa_reg = hard_frame_pointer_rtx;
13026 m->fs.fp_offset = m->fs.cfa_offset;
13027 m->fs.fp_valid = true;
13029 else
13031 /* The frame pointer is not needed so pop %ebp again.
13032 This leaves us with a pristine state. */
13033 emit_insn (gen_pop (hard_frame_pointer_rtx));
13037 /* The first insn of a function that accepts its static chain on the
13038 stack is to push the register that would be filled in by a direct
13039 call. This insn will be skipped by the trampoline. */
13040 else if (ix86_static_chain_on_stack)
13042 static_chain = ix86_static_chain (cfun->decl, false);
13043 insn = emit_insn (gen_push (static_chain));
13044 emit_insn (gen_blockage ());
13046 /* We don't want to interpret this push insn as a register save,
13047 only as a stack adjustment. The real copy of the register as
13048 a save will be done later, if needed. */
13049 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13050 t = gen_rtx_SET (stack_pointer_rtx, t);
13051 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13052 RTX_FRAME_RELATED_P (insn) = 1;
13055 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
13056 DRAP is needed and stack realignment is really needed after reload. */
13057 if (stack_realign_drap)
13059 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13061 /* Can't use DRAP in interrupt function. */
13062 if (cfun->machine->func_type != TYPE_NORMAL)
13063 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13064 "in interrupt service routine. This may be worked "
13065 "around by avoiding functions with aggregate return.");
13067 /* Only need to push parameter pointer reg if it is caller saved. */
13068 if (!call_used_regs[REGNO (crtl->drap_reg)])
13070 /* Push arg pointer reg */
13071 insn = emit_insn (gen_push (crtl->drap_reg));
13072 RTX_FRAME_RELATED_P (insn) = 1;
13075 /* Grab the argument pointer. */
13076 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13077 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13078 RTX_FRAME_RELATED_P (insn) = 1;
13079 m->fs.cfa_reg = crtl->drap_reg;
13080 m->fs.cfa_offset = 0;
13082 /* Align the stack. */
13083 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13084 stack_pointer_rtx,
13085 GEN_INT (-align_bytes)));
13086 RTX_FRAME_RELATED_P (insn) = 1;
13088 /* Replicate the return address on the stack so that return
13089 address can be reached via (argp - 1) slot. This is needed
13090 to implement macro RETURN_ADDR_RTX and intrinsic function
13091 expand_builtin_return_addr etc. */
13092 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13093 t = gen_frame_mem (word_mode, t);
13094 insn = emit_insn (gen_push (t));
13095 RTX_FRAME_RELATED_P (insn) = 1;
13097 /* For the purposes of frame and register save area addressing,
13098 we've started over with a new frame. */
13099 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13100 m->fs.realigned = true;
13102 if (static_chain)
13104 /* Replicate static chain on the stack so that static chain
13105 can be reached via (argp - 2) slot. This is needed for
13106 nested function with stack realignment. */
13107 insn = emit_insn (gen_push (static_chain));
13108 RTX_FRAME_RELATED_P (insn) = 1;
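 /* As a rough illustration only: with %ecx chosen as the DRAP register
    and a 16-byte alignment requirement, the sequence built above usually
    expands to something like

	leal	4(%esp), %ecx		; grab the incoming argument pointer
	andl	$-16, %esp		; realign the stack
	pushl	-4(%ecx)		; replicate the return address

    so the return address stays reachable at -4(%ecx) and, for a nested
    function, the static chain at -8(%ecx).  The exact register and
    alignment depend on the function being compiled.  */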
13112 int_registers_saved = (frame.nregs == 0);
13113 sse_registers_saved = (frame.nsseregs == 0);
13114 save_stub_call_needed = (m->call_ms2sysv);
13115 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13117 if (frame_pointer_needed && !m->fs.fp_valid)
13119 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13120 slower on all targets. Also sdb didn't like it. */
13121 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13122 RTX_FRAME_RELATED_P (insn) = 1;
13124 /* Push registers now, before setting the frame pointer
13125 on SEH target. */
13126 if (!int_registers_saved
13127 && TARGET_SEH
13128 && !frame.save_regs_using_mov)
13130 ix86_emit_save_regs ();
13131 int_registers_saved = true;
13132 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13135 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13137 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13138 RTX_FRAME_RELATED_P (insn) = 1;
13140 if (m->fs.cfa_reg == stack_pointer_rtx)
13141 m->fs.cfa_reg = hard_frame_pointer_rtx;
13142 m->fs.fp_offset = m->fs.sp_offset;
13143 m->fs.fp_valid = true;
13147 if (!int_registers_saved)
13149 /* If saving registers via PUSH, do so now. */
13150 if (!frame.save_regs_using_mov)
13152 ix86_emit_save_regs ();
13153 int_registers_saved = true;
13154 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13157 /* When using red zone we may start register saving before allocating
13158 the stack frame saving one cycle of the prologue. However, avoid
13159 doing this if we have to probe the stack; at least on x86_64 the
13160 stack probe can turn into a call that clobbers a red zone location. */
13161 else if (ix86_using_red_zone ()
13162 && (! TARGET_STACK_PROBE
13163 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13165 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13166 int_registers_saved = true;
13170 if (stack_realign_fp)
13172 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13173 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13175 /* Record last valid frame pointer offset. */
13176 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13178 /* The computation of the size of the re-aligned stack frame means
13179 that we must allocate the size of the register save area before
13180 performing the actual alignment. Otherwise we cannot guarantee
13181 that there's enough storage above the realignment point. */
13182 allocate = frame.reg_save_offset - m->fs.sp_offset
13183 + frame.stack_realign_allocate;
13184 if (allocate)
13185 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13186 GEN_INT (-allocate), -1, false);
13188 /* Align the stack. */
13189 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13190 stack_pointer_rtx,
13191 GEN_INT (-align_bytes)));
13192 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13193 m->fs.sp_realigned_offset = m->fs.sp_offset
13194 - frame.stack_realign_allocate;
13195 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13196 Beyond this point, stack access should be done via choose_baseaddr or
13197 by using sp_valid_at and fp_valid_at to determine the correct base
13198 register. Henceforth, any CFA offset should be thought of as logical
13199 and not physical. */
13200 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13201 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13202 m->fs.sp_realigned = true;
13204 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13205 is needed to describe where a register is saved using a realigned
13206 stack pointer, so we need to invalidate the stack pointer for that
13207 target. */
13208 if (TARGET_SEH)
13209 m->fs.sp_valid = false;
13211 /* If SP offset is non-immediate after allocation of the stack frame,
13212 then emit SSE saves or stub call prior to allocating the rest of the
13213 stack frame. This is less efficient for the out-of-line stub because
13214 we can't combine allocations across the call barrier, but it's better
13215 than using a scratch register. */
13216 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13217 - m->fs.sp_realigned_offset),
13218 Pmode))
13220 if (!sse_registers_saved)
13222 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13223 sse_registers_saved = true;
13225 else if (save_stub_call_needed)
13227 ix86_emit_outlined_ms2sysv_save (frame);
13228 save_stub_call_needed = false;
13233 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13235 if (flag_stack_usage_info)
13237 /* We start to count from ARG_POINTER. */
13238 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13240 /* If it was realigned, take into account the fake frame. */
13241 if (stack_realign_drap)
13243 if (ix86_static_chain_on_stack)
13244 stack_size += UNITS_PER_WORD;
13246 if (!call_used_regs[REGNO (crtl->drap_reg)])
13247 stack_size += UNITS_PER_WORD;
13249 /* This over-estimates by 1 minimal-stack-alignment-unit but
13250 mitigates that by counting in the new return address slot. */
13251 current_function_dynamic_stack_size
13252 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13255 current_function_static_stack_size = stack_size;
13258 /* On SEH target with very large frame size, allocate an area to save
13259 SSE registers (as the very large allocation won't be described). */
13260 if (TARGET_SEH
13261 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13262 && !sse_registers_saved)
13264 HOST_WIDE_INT sse_size =
13265 frame.sse_reg_save_offset - frame.reg_save_offset;
13267 gcc_assert (int_registers_saved);
13269 /* No need to do stack checking as the area will be immediately
13270 written. */
13271 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13272 GEN_INT (-sse_size), -1,
13273 m->fs.cfa_reg == stack_pointer_rtx);
13274 allocate -= sse_size;
13275 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13276 sse_registers_saved = true;
13279 /* The stack has already been decremented by the instruction calling us
13280 so probe if the size is non-negative to preserve the protection area. */
13281 if (allocate >= 0
13282 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13283 || flag_stack_clash_protection))
13285 /* This assert wants to verify that integer registers were saved
13286 prior to probing. This is necessary when probing may be implemented
13287 as a function call (Windows). It is not necessary for stack clash
13288 protection probing. */
13289 if (!flag_stack_clash_protection)
13290 gcc_assert (int_registers_saved);
13292 if (flag_stack_clash_protection)
13294 ix86_adjust_stack_and_probe_stack_clash (allocate);
13295 allocate = 0;
13297 else if (STACK_CHECK_MOVING_SP)
13299 if (!(crtl->is_leaf && !cfun->calls_alloca
13300 && allocate <= get_probe_interval ()))
13302 ix86_adjust_stack_and_probe (allocate);
13303 allocate = 0;
13306 else
13308 HOST_WIDE_INT size = allocate;
13310 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13311 size = 0x80000000 - get_stack_check_protect () - 1;
13313 if (TARGET_STACK_PROBE)
13315 if (crtl->is_leaf && !cfun->calls_alloca)
13317 if (size > get_probe_interval ())
13318 ix86_emit_probe_stack_range (0, size);
13320 else
13321 ix86_emit_probe_stack_range (0,
13322 size + get_stack_check_protect ());
13324 else
13326 if (crtl->is_leaf && !cfun->calls_alloca)
13328 if (size > get_probe_interval ()
13329 && size > get_stack_check_protect ())
13330 ix86_emit_probe_stack_range (get_stack_check_protect (),
13331 size - get_stack_check_protect ());
13333 else
13334 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
13339 if (allocate == 0)
13341 else if (!ix86_target_stack_probe ()
13342 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13344 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13345 GEN_INT (-allocate), -1,
13346 m->fs.cfa_reg == stack_pointer_rtx);
13348 else
13350 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13351 rtx r10 = NULL;
13352 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13353 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13354 bool eax_live = ix86_eax_live_at_start_p ();
13355 bool r10_live = false;
13357 if (TARGET_64BIT)
13358 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13360 if (eax_live)
13362 insn = emit_insn (gen_push (eax));
13363 allocate -= UNITS_PER_WORD;
13364 /* Note that SEH directives need to continue tracking the stack
13365 pointer even after the frame pointer has been set up. */
13366 if (sp_is_cfa_reg || TARGET_SEH)
13368 if (sp_is_cfa_reg)
13369 m->fs.cfa_offset += UNITS_PER_WORD;
13370 RTX_FRAME_RELATED_P (insn) = 1;
13371 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13372 gen_rtx_SET (stack_pointer_rtx,
13373 plus_constant (Pmode, stack_pointer_rtx,
13374 -UNITS_PER_WORD)));
13378 if (r10_live)
13380 r10 = gen_rtx_REG (Pmode, R10_REG);
13381 insn = emit_insn (gen_push (r10));
13382 allocate -= UNITS_PER_WORD;
13383 if (sp_is_cfa_reg || TARGET_SEH)
13385 if (sp_is_cfa_reg)
13386 m->fs.cfa_offset += UNITS_PER_WORD;
13387 RTX_FRAME_RELATED_P (insn) = 1;
13388 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13389 gen_rtx_SET (stack_pointer_rtx,
13390 plus_constant (Pmode, stack_pointer_rtx,
13391 -UNITS_PER_WORD)));
13395 emit_move_insn (eax, GEN_INT (allocate));
13396 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13398 /* Use the fact that AX still contains ALLOCATE. */
13399 adjust_stack_insn = (Pmode == DImode
13400 ? gen_pro_epilogue_adjust_stack_di_sub
13401 : gen_pro_epilogue_adjust_stack_si_sub);
13403 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13404 stack_pointer_rtx, eax));
13406 if (sp_is_cfa_reg || TARGET_SEH)
13408 if (sp_is_cfa_reg)
13409 m->fs.cfa_offset += allocate;
13410 RTX_FRAME_RELATED_P (insn) = 1;
13411 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13412 gen_rtx_SET (stack_pointer_rtx,
13413 plus_constant (Pmode, stack_pointer_rtx,
13414 -allocate)));
13416 m->fs.sp_offset += allocate;
13418 /* Use stack_pointer_rtx for relative addressing so that code
13419 works for realigned stack, too. */
13420 if (r10_live && eax_live)
13422 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13423 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13424 gen_frame_mem (word_mode, t));
13425 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13426 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13427 gen_frame_mem (word_mode, t));
13429 else if (eax_live || r10_live)
13431 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13432 emit_move_insn (gen_rtx_REG (word_mode,
13433 (eax_live ? AX_REG : R10_REG)),
13434 gen_frame_mem (word_mode, t));
13437 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
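 /* Illustrative sketch of the worker path above, assuming an x86_64
    mingw-style target whose probing worker is ___chkstk_ms; the large
    allocation then comes out roughly as

	movl	$<allocate>, %eax
	call	___chkstk_ms		; probe the new pages, %rsp untouched
	subq	%rax, %rsp		; the actual allocation

    with %rax (and %r10 when a static chain is live) pushed beforehand
    and reloaded from the new frame afterwards, as coded above.  */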
13439 /* If we haven't already set up the frame pointer, do so now. */
13440 if (frame_pointer_needed && !m->fs.fp_valid)
13442 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13443 GEN_INT (frame.stack_pointer_offset
13444 - frame.hard_frame_pointer_offset));
13445 insn = emit_insn (insn);
13446 RTX_FRAME_RELATED_P (insn) = 1;
13447 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13449 if (m->fs.cfa_reg == stack_pointer_rtx)
13450 m->fs.cfa_reg = hard_frame_pointer_rtx;
13451 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13452 m->fs.fp_valid = true;
13455 if (!int_registers_saved)
13456 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13457 if (!sse_registers_saved)
13458 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13459 else if (save_stub_call_needed)
13460 ix86_emit_outlined_ms2sysv_save (frame);
13462 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
13463 in PROLOGUE. */
13464 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13466 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13467 insn = emit_insn (gen_set_got (pic));
13468 RTX_FRAME_RELATED_P (insn) = 1;
13469 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13470 emit_insn (gen_prologue_use (pic));
13471 /* Delete an already emitted SET_GOT if it exists and was allocated to
13472 REAL_PIC_OFFSET_TABLE_REGNUM. */
13473 ix86_elim_entry_set_got (pic);
13476 if (crtl->drap_reg && !crtl->stack_realign_needed)
13478 /* vDRAP is set up, but after reload it turns out that stack
13479 realignment isn't necessary; here we emit the prologue to set
13480 up DRAP without the stack realignment adjustment. */
13481 t = choose_baseaddr (0, NULL);
13482 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13485 /* Prevent instructions from being scheduled into register save push
13486 sequence when access to the redzone area is done through frame pointer.
13487 The offset between the frame pointer and the stack pointer is calculated
13488 relative to the value of the stack pointer at the end of the function
13489 prologue, and moving instructions that access redzone area via frame
13490 pointer inside push sequence violates this assumption. */
13491 if (frame_pointer_needed && frame.red_zone_size)
13492 emit_insn (gen_memory_blockage ());
13494 /* SEH requires that the prologue end within 256 bytes of the start of
13495 the function. Prevent instruction schedules that would extend that.
13496 Further, prevent alloca modifications to the stack pointer from being
13497 combined with prologue modifications. */
13498 if (TARGET_SEH)
13499 emit_insn (gen_prologue_use (stack_pointer_rtx));
13502 /* Emit code to restore REG using a POP insn. */
13504 static void
13505 ix86_emit_restore_reg_using_pop (rtx reg)
13507 struct machine_function *m = cfun->machine;
13508 rtx_insn *insn = emit_insn (gen_pop (reg));
13510 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13511 m->fs.sp_offset -= UNITS_PER_WORD;
13513 if (m->fs.cfa_reg == crtl->drap_reg
13514 && REGNO (reg) == REGNO (crtl->drap_reg))
13516 /* Previously we'd represented the CFA as an expression
13517 like *(%ebp - 8). We've just popped that value from
13518 the stack, which means we need to reset the CFA to
13519 the drap register. This will remain until we restore
13520 the stack pointer. */
13521 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13522 RTX_FRAME_RELATED_P (insn) = 1;
13524 /* This means that the DRAP register is valid for addressing too. */
13525 m->fs.drap_valid = true;
13526 return;
13529 if (m->fs.cfa_reg == stack_pointer_rtx)
13531 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13532 x = gen_rtx_SET (stack_pointer_rtx, x);
13533 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13534 RTX_FRAME_RELATED_P (insn) = 1;
13536 m->fs.cfa_offset -= UNITS_PER_WORD;
13539 /* When the frame pointer is the CFA, and we pop it, we are
13540 swapping back to the stack pointer as the CFA. This happens
13541 for stack frames that don't allocate other data, so we assume
13542 the stack pointer is now pointing at the return address, i.e.
13543 the function entry state, which makes the offset be 1 word. */
13544 if (reg == hard_frame_pointer_rtx)
13546 m->fs.fp_valid = false;
13547 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13549 m->fs.cfa_reg = stack_pointer_rtx;
13550 m->fs.cfa_offset -= UNITS_PER_WORD;
13552 add_reg_note (insn, REG_CFA_DEF_CFA,
13553 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13554 GEN_INT (m->fs.cfa_offset)));
13555 RTX_FRAME_RELATED_P (insn) = 1;
13560 /* Emit code to restore saved registers using POP insns. */
13562 static void
13563 ix86_emit_restore_regs_using_pop (void)
13565 unsigned int regno;
13567 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13568 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13569 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13572 /* Emit code and notes for the LEAVE instruction. If INSN is non-null,
13573 skip the emit and only attach the notes to INSN. */
13575 static void
13576 ix86_emit_leave (rtx_insn *insn)
13578 struct machine_function *m = cfun->machine;
13579 if (!insn)
13580 insn = emit_insn (ix86_gen_leave ());
13582 ix86_add_queued_cfa_restore_notes (insn);
13584 gcc_assert (m->fs.fp_valid);
13585 m->fs.sp_valid = true;
13586 m->fs.sp_realigned = false;
13587 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13588 m->fs.fp_valid = false;
13590 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13592 m->fs.cfa_reg = stack_pointer_rtx;
13593 m->fs.cfa_offset = m->fs.sp_offset;
13595 add_reg_note (insn, REG_CFA_DEF_CFA,
13596 plus_constant (Pmode, stack_pointer_rtx,
13597 m->fs.sp_offset));
13598 RTX_FRAME_RELATED_P (insn) = 1;
13600 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13601 m->fs.fp_offset);
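 /* For reference, the emitted "leave" is architecturally equivalent to

	movq	%rbp, %rsp
	popq	%rbp

    (or the 32-bit forms), which is why the stack pointer becomes valid
    again at fp_offset - UNITS_PER_WORD while the frame pointer stops
    being a usable base from here on.  */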
13604 /* Emit code to restore saved registers using MOV insns.
13605 First register is restored from CFA - CFA_OFFSET. */
13606 static void
13607 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13608 bool maybe_eh_return)
13610 struct machine_function *m = cfun->machine;
13611 unsigned int regno;
13613 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13614 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13616 rtx reg = gen_rtx_REG (word_mode, regno);
13617 rtx mem;
13618 rtx_insn *insn;
13620 mem = choose_baseaddr (cfa_offset, NULL);
13621 mem = gen_frame_mem (word_mode, mem);
13622 insn = emit_move_insn (reg, mem);
13624 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13626 /* Previously we'd represented the CFA as an expression
13627 like *(%ebp - 8). We've just restored that value from
13628 the stack, which means we need to reset the CFA to
13629 the drap register. This will remain until we restore
13630 the stack pointer. */
13631 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13632 RTX_FRAME_RELATED_P (insn) = 1;
13634 /* This means that the DRAP register is valid for addressing. */
13635 m->fs.drap_valid = true;
13637 else
13638 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13640 cfa_offset -= UNITS_PER_WORD;
13644 /* Emit code to restore saved SSE registers using MOV insns.
13645 First register is restored from CFA - CFA_OFFSET. */
13646 static void
13647 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13648 bool maybe_eh_return)
13650 unsigned int regno;
13652 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13653 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13655 rtx reg = gen_rtx_REG (V4SFmode, regno);
13656 rtx mem;
13657 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13659 mem = choose_baseaddr (cfa_offset, &align);
13660 mem = gen_rtx_MEM (V4SFmode, mem);
13663 /* The location alignment depends upon the base register. */
13663 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13664 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13665 set_mem_align (mem, align);
13666 emit_insn (gen_rtx_SET (reg, mem));
13668 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13670 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13674 static void
13675 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13676 bool use_call, int style)
13678 struct machine_function *m = cfun->machine;
13679 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13680 + m->call_ms2sysv_extra_regs;
13681 rtvec v;
13682 unsigned int elems_needed, align, i, vi = 0;
13683 rtx_insn *insn;
13684 rtx sym, tmp;
13685 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13686 rtx r10 = NULL_RTX;
13687 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13688 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13689 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13690 rtx rsi_frame_load = NULL_RTX;
13691 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13692 enum xlogue_stub stub;
13694 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13696 /* If using a realigned stack, we should never start with padding. */
13697 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13699 /* Setup RSI as the stub's base pointer. */
13700 align = GET_MODE_ALIGNMENT (V4SFmode);
13701 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13702 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13704 emit_insn (gen_rtx_SET (rsi, tmp));
13706 /* Get a symbol for the stub. */
13707 if (frame_pointer_needed)
13708 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13709 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13710 else
13711 stub = use_call ? XLOGUE_STUB_RESTORE
13712 : XLOGUE_STUB_RESTORE_TAIL;
13713 sym = xlogue.get_stub_rtx (stub);
13715 elems_needed = ncregs;
13716 if (use_call)
13717 elems_needed += 1;
13718 else
13719 elems_needed += frame_pointer_needed ? 5 : 3;
13720 v = rtvec_alloc (elems_needed);
13722 /* We call the epilogue stub when we need to pop incoming args or we are
13723 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
13724 epilogue stub and it is the tail-call. */
13725 if (use_call)
13726 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13727 else
13729 RTVEC_ELT (v, vi++) = ret_rtx;
13730 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13731 if (frame_pointer_needed)
13733 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13734 gcc_assert (m->fs.fp_valid);
13735 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13737 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13738 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13739 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13740 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13741 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13743 else
13745 /* If no hard frame pointer, we set R10 to the SP restore value. */
13746 gcc_assert (!m->fs.fp_valid);
13747 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13748 gcc_assert (m->fs.sp_valid);
13750 r10 = gen_rtx_REG (DImode, R10_REG);
13751 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13752 emit_insn (gen_rtx_SET (r10, tmp));
13754 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13758 /* Generate frame load insns and restore notes. */
13759 for (i = 0; i < ncregs; ++i)
13761 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13762 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13763 rtx reg, frame_load;
13765 reg = gen_rtx_REG (mode, r.regno);
13766 frame_load = gen_frame_load (reg, rsi, r.offset);
13768 /* Save RSI frame load insn & note to add last. */
13769 if (r.regno == SI_REG)
13771 gcc_assert (!rsi_frame_load);
13772 rsi_frame_load = frame_load;
13773 rsi_restore_offset = r.offset;
13775 else
13777 RTVEC_ELT (v, vi++) = frame_load;
13778 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13782 /* Add RSI frame load & restore note at the end. */
13783 gcc_assert (rsi_frame_load);
13784 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13785 RTVEC_ELT (v, vi++) = rsi_frame_load;
13786 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13787 rsi_restore_offset);
13789 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13790 if (!use_call && !frame_pointer_needed)
13792 gcc_assert (m->fs.sp_valid);
13793 gcc_assert (!m->fs.sp_realigned);
13795 /* At this point, R10 should point to frame.stack_realign_offset. */
13796 if (m->fs.cfa_reg == stack_pointer_rtx)
13797 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13798 m->fs.sp_offset = frame.stack_realign_offset;
13801 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13802 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13803 if (use_call)
13804 insn = emit_insn (tmp);
13805 else
13807 insn = emit_jump_insn (tmp);
13808 JUMP_LABEL (insn) = ret_rtx;
13810 if (frame_pointer_needed)
13811 ix86_emit_leave (insn);
13812 else
13814 /* Need CFA adjust note. */
13815 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13816 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13820 RTX_FRAME_RELATED_P (insn) = true;
13821 ix86_add_queued_cfa_restore_notes (insn);
13823 /* If we're not doing a tail-call, we need to adjust the stack. */
13824 if (use_call && m->fs.sp_valid)
13826 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13827 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13828 GEN_INT (dealloc), style,
13829 m->fs.cfa_reg == stack_pointer_rtx);
13833 /* Restore function stack, frame, and registers. */
13835 void
13836 ix86_expand_epilogue (int style)
13838 struct machine_function *m = cfun->machine;
13839 struct machine_frame_state frame_state_save = m->fs;
13840 struct ix86_frame frame;
13841 bool restore_regs_via_mov;
13842 bool using_drap;
13843 bool restore_stub_is_tail = false;
13845 if (ix86_function_naked (current_function_decl))
13847 /* The program should not reach this point. */
13848 emit_insn (gen_ud2 ());
13849 return;
13852 ix86_finalize_stack_frame_flags ();
13853 frame = m->frame;
13855 m->fs.sp_realigned = stack_realign_fp;
13856 m->fs.sp_valid = stack_realign_fp
13857 || !frame_pointer_needed
13858 || crtl->sp_is_unchanging;
13859 gcc_assert (!m->fs.sp_valid
13860 || m->fs.sp_offset == frame.stack_pointer_offset);
13862 /* The FP must be valid if the frame pointer is present. */
13863 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13864 gcc_assert (!m->fs.fp_valid
13865 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13867 /* We must have *some* valid pointer to the stack frame. */
13868 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13870 /* The DRAP is never valid at this point. */
13871 gcc_assert (!m->fs.drap_valid);
13873 /* See the comment about red zone and frame
13874 pointer usage in ix86_expand_prologue. */
13875 if (frame_pointer_needed && frame.red_zone_size)
13876 emit_insn (gen_memory_blockage ());
13878 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13879 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13881 /* Determine the CFA offset of the end of the red-zone. */
13882 m->fs.red_zone_offset = 0;
13883 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13885 /* The red-zone begins below the return address (and below the
13886 error code in an exception handler). */
13887 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13889 /* When the register save area is in the aligned portion of
13890 the stack, determine the maximum runtime displacement that
13891 matches up with the aligned frame. */
13892 if (stack_realign_drap)
13893 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13894 + UNITS_PER_WORD);
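 /* As a concrete example of the arithmetic: on x86-64 RED_ZONE_SIZE is
    128 and INCOMING_FRAME_SP_OFFSET is one word (8), so for an ordinary
    function the red zone is taken to end 136 bytes below the CFA; with
    DRAP the realignment padding computed above is subtracted as well.  */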
13897 /* Special care must be taken for the normal return case of a function
13898 using eh_return: the eax and edx registers are marked as saved, but
13899 not restored along this path. Adjust the save location to match. */
13900 if (crtl->calls_eh_return && style != 2)
13901 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13903 /* EH_RETURN requires the use of moves to function properly. */
13904 if (crtl->calls_eh_return)
13905 restore_regs_via_mov = true;
13906 /* SEH requires the use of pops to identify the epilogue. */
13907 else if (TARGET_SEH)
13908 restore_regs_via_mov = false;
13909 /* If we're only restoring one register and sp cannot be used, then
13910 use a move instruction to restore the register, since it's
13911 less work than reloading sp and popping the register. */
13912 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13913 restore_regs_via_mov = true;
13914 else if (TARGET_EPILOGUE_USING_MOVE
13915 && cfun->machine->use_fast_prologue_epilogue
13916 && (frame.nregs > 1
13917 || m->fs.sp_offset != frame.reg_save_offset))
13918 restore_regs_via_mov = true;
13919 else if (frame_pointer_needed
13920 && !frame.nregs
13921 && m->fs.sp_offset != frame.reg_save_offset)
13922 restore_regs_via_mov = true;
13923 else if (frame_pointer_needed
13924 && TARGET_USE_LEAVE
13925 && cfun->machine->use_fast_prologue_epilogue
13926 && frame.nregs == 1)
13927 restore_regs_via_mov = true;
13928 else
13929 restore_regs_via_mov = false;
13931 if (restore_regs_via_mov || frame.nsseregs)
13933 /* Ensure that the entire register save area is addressable via
13934 the stack pointer, if we will restore SSE regs via sp. */
13935 if (TARGET_64BIT
13936 && m->fs.sp_offset > 0x7fffffff
13937 && sp_valid_at (frame.stack_realign_offset + 1)
13938 && (frame.nsseregs + frame.nregs) != 0)
13940 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13941 GEN_INT (m->fs.sp_offset
13942 - frame.sse_reg_save_offset),
13943 style,
13944 m->fs.cfa_reg == stack_pointer_rtx);
13948 /* If there are any SSE registers to restore, then we have to do it
13949 via moves, since there's obviously no pop for SSE regs. */
13950 if (frame.nsseregs)
13951 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13952 style == 2);
13954 if (m->call_ms2sysv)
13956 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13958 /* We cannot use a tail-call for the stub if:
13959 1. We have to pop incoming args,
13960 2. We have additional int regs to restore, or
13961 3. A sibling call will be the tail-call, or
13962 4. We are emitting an eh_return_internal epilogue.
13964 TODO: Item 4 has not yet been tested!
13966 If any of the above are true, we will call the stub rather than
13967 jump to it. */
13968 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13969 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13972 /* If using an out-of-line stub that is a tail call, then... */
13973 if (m->call_ms2sysv && restore_stub_is_tail)
13975 /* TODO: paranoid tests. (remove eventually) */
13976 gcc_assert (m->fs.sp_valid);
13977 gcc_assert (!m->fs.sp_realigned);
13978 gcc_assert (!m->fs.fp_valid);
13979 gcc_assert (!m->fs.realigned);
13980 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13981 gcc_assert (!crtl->drap_reg);
13982 gcc_assert (!frame.nregs);
13984 else if (restore_regs_via_mov)
13986 rtx t;
13988 if (frame.nregs)
13989 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
13991 /* eh_return epilogues need %ecx added to the stack pointer. */
13992 if (style == 2)
13994 rtx sa = EH_RETURN_STACKADJ_RTX;
13995 rtx_insn *insn;
13997 /* %ecx can't be used for both DRAP register and eh_return. */
13998 if (crtl->drap_reg)
13999 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14001 /* regparm nested functions don't work with eh_return. */
14002 gcc_assert (!ix86_static_chain_on_stack);
14004 if (frame_pointer_needed)
14006 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14007 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14008 emit_insn (gen_rtx_SET (sa, t));
14010 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14011 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14013 /* Note that we use SA as a temporary CFA, as the return
14014 address is at the proper place relative to it. We
14015 pretend this happens at the FP restore insn because
14016 prior to this insn the FP would be stored at the wrong
14017 offset relative to SA, and after this insn we have no
14018 other reasonable register to use for the CFA. We don't
14019 bother resetting the CFA to the SP for the duration of
14020 the return insn, unless the control flow instrumentation
14021 is done. In this case the SP is used later and we have
14022 to reset CFA to SP. */
14023 add_reg_note (insn, REG_CFA_DEF_CFA,
14024 plus_constant (Pmode, sa, UNITS_PER_WORD));
14025 ix86_add_queued_cfa_restore_notes (insn);
14026 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14027 RTX_FRAME_RELATED_P (insn) = 1;
14029 m->fs.cfa_reg = sa;
14030 m->fs.cfa_offset = UNITS_PER_WORD;
14031 m->fs.fp_valid = false;
14033 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14034 const0_rtx, style,
14035 flag_cf_protection);
14037 else
14039 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14040 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14041 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14042 ix86_add_queued_cfa_restore_notes (insn);
14044 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14045 if (m->fs.cfa_offset != UNITS_PER_WORD)
14047 m->fs.cfa_offset = UNITS_PER_WORD;
14048 add_reg_note (insn, REG_CFA_DEF_CFA,
14049 plus_constant (Pmode, stack_pointer_rtx,
14050 UNITS_PER_WORD));
14051 RTX_FRAME_RELATED_P (insn) = 1;
14054 m->fs.sp_offset = UNITS_PER_WORD;
14055 m->fs.sp_valid = true;
14056 m->fs.sp_realigned = false;
14059 else
14061 /* SEH requires that the function end with (1) a stack adjustment
14062 if necessary, (2) a sequence of pops, and (3) a return or
14063 jump instruction. Prevent insns from the function body from
14064 being scheduled into this sequence. */
14065 if (TARGET_SEH)
14067 /* Prevent a catch region from being adjacent to the standard
14068 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14069 several other flags that would be interesting to test are
14070 set up yet. */
14071 if (flag_non_call_exceptions)
14072 emit_insn (gen_nops (const1_rtx));
14073 else
14074 emit_insn (gen_blockage ());
14077 /* First step is to deallocate the stack frame so that we can
14078 pop the registers. If the stack pointer was realigned, it needs
14079 to be restored now. Also do it on SEH target for very large
14080 frame as the emitted instructions aren't allowed by the ABI
14081 in epilogues. */
14082 if (!m->fs.sp_valid || m->fs.sp_realigned
14083 || (TARGET_SEH
14084 && (m->fs.sp_offset - frame.reg_save_offset
14085 >= SEH_MAX_FRAME_SIZE)))
14087 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14088 GEN_INT (m->fs.fp_offset
14089 - frame.reg_save_offset),
14090 style, false);
14092 else if (m->fs.sp_offset != frame.reg_save_offset)
14094 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14095 GEN_INT (m->fs.sp_offset
14096 - frame.reg_save_offset),
14097 style,
14098 m->fs.cfa_reg == stack_pointer_rtx);
14101 ix86_emit_restore_regs_using_pop ();
14104 /* If we used a frame pointer and haven't already got rid of it,
14105 then do so now. */
14106 if (m->fs.fp_valid)
14108 /* If the stack pointer is valid and pointing at the frame
14109 pointer store address, then we only need a pop. */
14110 if (sp_valid_at (frame.hfp_save_offset)
14111 && m->fs.sp_offset == frame.hfp_save_offset)
14112 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14113 /* Leave results in shorter dependency chains on CPUs that are
14114 able to grok it fast. */
14115 else if (TARGET_USE_LEAVE
14116 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14117 || !cfun->machine->use_fast_prologue_epilogue)
14118 ix86_emit_leave (NULL);
14119 else
14121 pro_epilogue_adjust_stack (stack_pointer_rtx,
14122 hard_frame_pointer_rtx,
14123 const0_rtx, style, !using_drap);
14124 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14128 if (using_drap)
14130 int param_ptr_offset = UNITS_PER_WORD;
14131 rtx_insn *insn;
14133 gcc_assert (stack_realign_drap);
14135 if (ix86_static_chain_on_stack)
14136 param_ptr_offset += UNITS_PER_WORD;
14137 if (!call_used_regs[REGNO (crtl->drap_reg)])
14138 param_ptr_offset += UNITS_PER_WORD;
14140 insn = emit_insn (gen_rtx_SET
14141 (stack_pointer_rtx,
14142 gen_rtx_PLUS (Pmode,
14143 crtl->drap_reg,
14144 GEN_INT (-param_ptr_offset))));
14145 m->fs.cfa_reg = stack_pointer_rtx;
14146 m->fs.cfa_offset = param_ptr_offset;
14147 m->fs.sp_offset = param_ptr_offset;
14148 m->fs.realigned = false;
14150 add_reg_note (insn, REG_CFA_DEF_CFA,
14151 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14152 GEN_INT (param_ptr_offset)));
14153 RTX_FRAME_RELATED_P (insn) = 1;
14155 if (!call_used_regs[REGNO (crtl->drap_reg)])
14156 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14159 /* At this point the stack pointer must be valid, and we must have
14160 restored all of the registers. We may not have deallocated the
14161 entire stack frame. We've delayed this until now because it may
14162 be possible to merge the local stack deallocation with the
14163 deallocation forced by ix86_static_chain_on_stack. */
14164 gcc_assert (m->fs.sp_valid);
14165 gcc_assert (!m->fs.sp_realigned);
14166 gcc_assert (!m->fs.fp_valid);
14167 gcc_assert (!m->fs.realigned);
14168 if (m->fs.sp_offset != UNITS_PER_WORD)
14170 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14171 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14172 style, true);
14174 else
14175 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14177 /* Sibcall epilogues don't want a return instruction. */
14178 if (style == 0)
14180 m->fs = frame_state_save;
14181 return;
14184 if (cfun->machine->func_type != TYPE_NORMAL)
14185 emit_jump_insn (gen_interrupt_return ());
14186 else if (crtl->args.pops_args && crtl->args.size)
14188 rtx popc = GEN_INT (crtl->args.pops_args);
14190 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14191 address, do explicit add, and jump indirectly to the caller. */
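 /* Illustrative output: the normal popping return is simply "ret $N",
    but the immediate of "ret imm16" is only 16 bits wide, so for
    N >= 64K the code below instead produces roughly

	popl	%ecx
	addl	$N, %esp
	jmp	*%ecx

    which is why %ecx is described to the unwinder as holding the return
    address via the REG_CFA_REGISTER note.  */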
14193 if (crtl->args.pops_args >= 65536)
14195 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14196 rtx_insn *insn;
14198 /* There is no "pascal" calling convention in any 64bit ABI. */
14199 gcc_assert (!TARGET_64BIT);
14201 insn = emit_insn (gen_pop (ecx));
14202 m->fs.cfa_offset -= UNITS_PER_WORD;
14203 m->fs.sp_offset -= UNITS_PER_WORD;
14205 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14206 x = gen_rtx_SET (stack_pointer_rtx, x);
14207 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14208 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14209 RTX_FRAME_RELATED_P (insn) = 1;
14211 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14212 popc, -1, true);
14213 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14215 else
14216 emit_jump_insn (gen_simple_return_pop_internal (popc));
14218 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14220 /* In case of return from EH a simple return cannot be used
14221 as a return address will be compared with a shadow stack
14222 return address. Use indirect jump instead. */
14223 if (style == 2 && flag_cf_protection)
14225 /* Register used in indirect jump must be in word_mode. But
14226 Pmode may not be the same as word_mode for x32. */
14227 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14228 rtx_insn *insn;
14230 insn = emit_insn (gen_pop (ecx));
14231 m->fs.cfa_offset -= UNITS_PER_WORD;
14232 m->fs.sp_offset -= UNITS_PER_WORD;
14234 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14235 x = gen_rtx_SET (stack_pointer_rtx, x);
14236 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14237 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14238 RTX_FRAME_RELATED_P (insn) = 1;
14240 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14242 else
14243 emit_jump_insn (gen_simple_return_internal ());
14246 /* Restore the state back to the state from the prologue,
14247 so that it's correct for the next epilogue. */
14248 m->fs = frame_state_save;
14251 /* Reset from the function's potential modifications. */
14253 static void
14254 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14256 if (pic_offset_table_rtx
14257 && !ix86_use_pseudo_pic_reg ())
14258 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14260 if (TARGET_MACHO)
14262 rtx_insn *insn = get_last_insn ();
14263 rtx_insn *deleted_debug_label = NULL;
14265 /* Mach-O doesn't support labels at the end of objects, so if
14266 it looks like we might want one, take special action.
14267 First, collect any sequence of deleted debug labels. */
14268 while (insn
14269 && NOTE_P (insn)
14270 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14272 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14273 notes; only set their CODE_LABEL_NUMBER to -1,
14274 otherwise there would be code generation differences
14275 between -g and -g0. */
14276 if (NOTE_P (insn) && NOTE_KIND (insn)
14277 == NOTE_INSN_DELETED_DEBUG_LABEL)
14278 deleted_debug_label = insn;
14279 insn = PREV_INSN (insn);
14282 /* If we have:
14283 label:
14284 barrier
14285 then this needs to be detected, so skip past the barrier. */
14287 if (insn && BARRIER_P (insn))
14288 insn = PREV_INSN (insn);
14290 /* Up to now we've only seen notes or barriers. */
14291 if (insn)
14293 if (LABEL_P (insn)
14294 || (NOTE_P (insn)
14295 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14296 /* Trailing label. */
14297 fputs ("\tnop\n", file);
14298 else if (cfun && ! cfun->is_thunk)
14300 /* See if we have a completely empty function body, skipping
14301 the special case of the picbase thunk emitted as asm. */
14302 while (insn && ! INSN_P (insn))
14303 insn = PREV_INSN (insn);
14304 /* If we don't find any insns, we've got an empty function body;
14305 i.e. completely empty, without a return or branch. This is
14306 taken as the case where a function body has been removed
14307 because it contains an inline __builtin_unreachable(). GCC
14308 declares that reaching __builtin_unreachable() means UB so
14309 we're not obliged to do anything special; however, we want
14310 non-zero-sized function bodies. To meet this, and help the
14311 user out, let's trap the case. */
14312 if (insn == NULL)
14313 fputs ("\tud2\n", file);
14316 else if (deleted_debug_label)
14317 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14318 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14319 CODE_LABEL_NUMBER (insn) = -1;
14323 /* Return a scratch register to use in the split stack prologue. The
14324 split stack prologue is used for -fsplit-stack. It is the first
14325 instructions in the function, even before the regular prologue.
14326 The scratch register can be any caller-saved register which is not
14327 used for parameters or for the static chain. */
14329 static unsigned int
14330 split_stack_prologue_scratch_regno (void)
14332 if (TARGET_64BIT)
14333 return R11_REG;
14334 else
14336 bool is_fastcall, is_thiscall;
14337 int regparm;
14339 is_fastcall = (lookup_attribute ("fastcall",
14340 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14341 != NULL);
14342 is_thiscall = (lookup_attribute ("thiscall",
14343 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14344 != NULL);
14345 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14347 if (is_fastcall)
14349 if (DECL_STATIC_CHAIN (cfun->decl))
14351 sorry ("-fsplit-stack does not support fastcall with "
14352 "nested function");
14353 return INVALID_REGNUM;
14355 return AX_REG;
14357 else if (is_thiscall)
14359 if (!DECL_STATIC_CHAIN (cfun->decl))
14360 return DX_REG;
14361 return AX_REG;
14363 else if (regparm < 3)
14365 if (!DECL_STATIC_CHAIN (cfun->decl))
14366 return CX_REG;
14367 else
14369 if (regparm >= 2)
14371 sorry ("-fsplit-stack does not support 2 register "
14372 "parameters for a nested function");
14373 return INVALID_REGNUM;
14375 return DX_REG;
14378 else
14380 /* FIXME: We could make this work by pushing a register
14381 around the addition and comparison. */
14382 sorry ("-fsplit-stack does not support 3 register parameters");
14383 return INVALID_REGNUM;
14388 /* A SYMBOL_REF for the function which allocates new stack space for
14389 -fsplit-stack. */
14391 static GTY(()) rtx split_stack_fn;
14393 /* A SYMBOL_REF for the variant of __morestack used with the large
14394 code model. */
14396 static GTY(()) rtx split_stack_fn_large;
14398 /* Return location of the stack guard value in the TLS block. */
14401 ix86_split_stack_guard (void)
14403 int offset;
14404 addr_space_t as = DEFAULT_TLS_SEG_REG;
14405 rtx r;
14407 gcc_assert (flag_split_stack);
14409 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14410 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14411 #else
14412 gcc_unreachable ();
14413 #endif
14415 r = GEN_INT (offset);
14416 r = gen_const_mem (Pmode, r);
14417 set_mem_addr_space (r, as);
14419 return r;
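/* For example, with glibc on x86-64 TARGET_THREAD_SPLIT_STACK_OFFSET is
   0x70, so the guard is read as %fs:0x70 (a %gs-relative slot is used in
   32-bit mode).  Targets that do not define the offset cannot support
   -fsplit-stack at all, hence the gcc_unreachable above.  */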
14422 /* Handle -fsplit-stack. These are the first instructions in the
14423 function, even before the regular prologue. */
14425 void
14426 ix86_expand_split_stack_prologue (void)
14428 HOST_WIDE_INT allocate;
14429 unsigned HOST_WIDE_INT args_size;
14430 rtx_code_label *label;
14431 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14432 rtx scratch_reg = NULL_RTX;
14433 rtx_code_label *varargs_label = NULL;
14434 rtx fn;
14436 gcc_assert (flag_split_stack && reload_completed);
14438 ix86_finalize_stack_frame_flags ();
14439 struct ix86_frame &frame = cfun->machine->frame;
14440 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14442 /* This is the label we will branch to if we have enough stack
14443 space. We expect the basic block reordering pass to reverse this
14444 branch if optimizing, so that we branch in the unlikely case. */
14445 label = gen_label_rtx ();
14447 /* We need to compare the stack pointer minus the frame size with
14448 the stack boundary in the TCB. The stack boundary always gives
14449 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14450 can compare directly. Otherwise we need to do an addition. */
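 /* A rough sketch of the 64-bit code this expands to for a small frame
    (registers and the exact guard slot vary by target and code model):

	cmpq	%fs:0x70, %rsp
	jae	.Lstack_ok
	movl	$<frame size>, %r10d
	movl	$<arg size>, %r11d
	call	__morestack
	ret
    .Lstack_ok:
	...				; normal prologue continues here
  */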
14452 limit = ix86_split_stack_guard ();
14454 if (allocate < SPLIT_STACK_AVAILABLE)
14455 current = stack_pointer_rtx;
14456 else
14458 unsigned int scratch_regno;
14459 rtx offset;
14461 /* We need a scratch register to hold the stack pointer minus
14462 the required frame size. Since this is the very start of the
14463 function, the scratch register can be any caller-saved
14464 register which is not used for parameters. */
14465 offset = GEN_INT (- allocate);
14466 scratch_regno = split_stack_prologue_scratch_regno ();
14467 if (scratch_regno == INVALID_REGNUM)
14468 return;
14469 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14470 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14472 /* We don't use ix86_gen_add3 in this case because it will
14473 want to split to lea, but when not optimizing the insn
14474 will not be split after this point. */
14475 emit_insn (gen_rtx_SET (scratch_reg,
14476 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14477 offset)));
14479 else
14481 emit_move_insn (scratch_reg, offset);
14482 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14483 stack_pointer_rtx));
14485 current = scratch_reg;
14488 ix86_expand_branch (GEU, current, limit, label);
14489 rtx_insn *jump_insn = get_last_insn ();
14490 JUMP_LABEL (jump_insn) = label;
14492 /* Mark the jump as very likely to be taken. */
14493 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14495 if (split_stack_fn == NULL_RTX)
14497 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14498 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14500 fn = split_stack_fn;
14502 /* Get more stack space. We pass in the desired stack space and the
14503 size of the arguments to copy to the new stack. In 32-bit mode
14504 we push the parameters; __morestack will return on a new stack
14505 anyhow. In 64-bit mode we pass the parameters in r10 and
14506 r11. */
14507 allocate_rtx = GEN_INT (allocate);
14508 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14509 call_fusage = NULL_RTX;
14510 rtx pop = NULL_RTX;
14511 if (TARGET_64BIT)
14513 rtx reg10, reg11;
14515 reg10 = gen_rtx_REG (Pmode, R10_REG);
14516 reg11 = gen_rtx_REG (Pmode, R11_REG);
14518 /* If this function uses a static chain, it will be in %r10.
14519 Preserve it across the call to __morestack. */
14520 if (DECL_STATIC_CHAIN (cfun->decl))
14522 rtx rax;
14524 rax = gen_rtx_REG (word_mode, AX_REG);
14525 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14526 use_reg (&call_fusage, rax);
14529 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14530 && !TARGET_PECOFF)
14532 HOST_WIDE_INT argval;
14534 gcc_assert (Pmode == DImode);
14535 /* When using the large model we need to load the address
14536 into a register, and we've run out of registers. So we
14537 switch to a different calling convention, and we call a
14538 different function: __morestack_large. We pass the
14539 argument size in the upper 32 bits of r10 and pass the
14540 frame size in the lower 32 bits. */
14541 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14542 gcc_assert ((args_size & 0xffffffff) == args_size);
14544 if (split_stack_fn_large == NULL_RTX)
14546 split_stack_fn_large =
14547 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14548 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14550 if (ix86_cmodel == CM_LARGE_PIC)
14552 rtx_code_label *label;
14553 rtx x;
14555 label = gen_label_rtx ();
14556 emit_label (label);
14557 LABEL_PRESERVE_P (label) = 1;
14558 emit_insn (gen_set_rip_rex64 (reg10, label));
14559 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14560 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14561 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14562 UNSPEC_GOT);
14563 x = gen_rtx_CONST (Pmode, x);
14564 emit_move_insn (reg11, x);
14565 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14566 x = gen_const_mem (Pmode, x);
14567 emit_move_insn (reg11, x);
14569 else
14570 emit_move_insn (reg11, split_stack_fn_large);
14572 fn = reg11;
14574 argval = ((args_size << 16) << 16) + allocate;
14575 emit_move_insn (reg10, GEN_INT (argval));
14577 else
14579 emit_move_insn (reg10, allocate_rtx);
14580 emit_move_insn (reg11, GEN_INT (args_size));
14581 use_reg (&call_fusage, reg11);
14584 use_reg (&call_fusage, reg10);
14586 else
14588 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14589 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14590 insn = emit_insn (gen_push (allocate_rtx));
14591 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14592 pop = GEN_INT (2 * UNITS_PER_WORD);
14594 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14595 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14596 pop, false);
14597 add_function_usage_to (call_insn, call_fusage);
14598 if (!TARGET_64BIT)
14599 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14600 /* Indicate that this function can't jump to non-local gotos. */
14601 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14603 /* In order to make call/return prediction work right, we now need
14604 to execute a return instruction. See
14605 libgcc/config/i386/morestack.S for the details on how this works.
14607 For flow purposes gcc must not see this as a return
14608 instruction--we need control flow to continue at the subsequent
14609 label. Therefore, we use an unspec. */
14610 gcc_assert (crtl->args.pops_args < 65536);
14611 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14613 /* If we are in 64-bit mode and this function uses a static chain,
14614 we saved %r10 in %rax before calling __morestack. */
14615 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14616 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14617 gen_rtx_REG (word_mode, AX_REG));
14619 /* If this function calls va_start, we need to store a pointer to
14620 the arguments on the old stack, because they may not have been
14621 all copied to the new stack. At this point the old stack can be
14622 found at the frame pointer value used by __morestack, because
14623 __morestack has set that up before calling back to us. Here we
14624 store that pointer in a scratch register, and in
14625 ix86_expand_prologue we store the scratch register in a stack
14626 slot. */
14627 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14629 unsigned int scratch_regno;
14630 rtx frame_reg;
14631 int words;
14633 scratch_regno = split_stack_prologue_scratch_regno ();
14634 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14635 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14637 /* 64-bit:
14638 fp -> old fp value
14639 return address within this function
14640 return address of caller of this function
14641 stack arguments
14642 So we add three words to get to the stack arguments.
14644 32-bit:
14645 fp -> old fp value
14646 return address within this function
14647 first argument to __morestack
14648 second argument to __morestack
14649 return address of caller of this function
14650 stack arguments
14651 So we add five words to get to the stack arguments. */
14653 words = TARGET_64BIT ? 3 : 5;
14654 emit_insn (gen_rtx_SET (scratch_reg,
14655 gen_rtx_PLUS (Pmode, frame_reg,
14656 GEN_INT (words * UNITS_PER_WORD))));
14658 varargs_label = gen_label_rtx ();
14659 emit_jump_insn (gen_jump (varargs_label));
14660 JUMP_LABEL (get_last_insn ()) = varargs_label;
14662 emit_barrier ();
14665 emit_label (label);
14666 LABEL_NUSES (label) = 1;
14668 /* If this function calls va_start, we now have to set the scratch
14669 register for the case where we do not call __morestack. In this
14670 case we need to set it based on the stack pointer. */
14671 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14673 emit_insn (gen_rtx_SET (scratch_reg,
14674 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14675 GEN_INT (UNITS_PER_WORD))));
14677 emit_label (varargs_label);
14678 LABEL_NUSES (varargs_label) = 1;
14682 /* We may have to tell the dataflow pass that the split stack prologue
14683 is initializing a scratch register. */
14685 static void
14686 ix86_live_on_entry (bitmap regs)
14688 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14690 gcc_assert (flag_split_stack);
14691 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14695 /* Extract the parts of an RTL expression that is a valid memory address
14696 for an instruction. Return 0 if the structure of the address is
14697 grossly off. Return -1 if the address contains ASHIFT, so it is not
14698 strictly valid, but still used for computing length of lea instruction. */
14701 ix86_decompose_address (rtx addr, struct ix86_address *out)
14703 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14704 rtx base_reg, index_reg;
14705 HOST_WIDE_INT scale = 1;
14706 rtx scale_rtx = NULL_RTX;
14707 rtx tmp;
14708 int retval = 1;
14709 addr_space_t seg = ADDR_SPACE_GENERIC;
14711 /* Allow zero-extended SImode addresses,
14712 they will be emitted with addr32 prefix. */
14713 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14715 if (GET_CODE (addr) == ZERO_EXTEND
14716 && GET_MODE (XEXP (addr, 0)) == SImode)
14718 addr = XEXP (addr, 0);
14719 if (CONST_INT_P (addr))
14720 return 0;
14722 else if (GET_CODE (addr) == AND
14723 && const_32bit_mask (XEXP (addr, 1), DImode))
14725 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14726 if (addr == NULL_RTX)
14727 return 0;
14729 if (CONST_INT_P (addr))
14730 return 0;
14734 /* Allow SImode subregs of DImode addresses,
14735 they will be emitted with addr32 prefix. */
14736 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14738 if (SUBREG_P (addr)
14739 && GET_MODE (SUBREG_REG (addr)) == DImode)
14741 addr = SUBREG_REG (addr);
14742 if (CONST_INT_P (addr))
14743 return 0;
14747 if (REG_P (addr))
14748 base = addr;
14749 else if (SUBREG_P (addr))
14751 if (REG_P (SUBREG_REG (addr)))
14752 base = addr;
14753 else
14754 return 0;
14756 else if (GET_CODE (addr) == PLUS)
14758 rtx addends[4], op;
14759 int n = 0, i;
14761 op = addr;
14764 if (n >= 4)
14765 return 0;
14766 addends[n++] = XEXP (op, 1);
14767 op = XEXP (op, 0);
14769 while (GET_CODE (op) == PLUS);
14770 if (n >= 4)
14771 return 0;
14772 addends[n] = op;
14774 for (i = n; i >= 0; --i)
14776 op = addends[i];
14777 switch (GET_CODE (op))
14779 case MULT:
14780 if (index)
14781 return 0;
14782 index = XEXP (op, 0);
14783 scale_rtx = XEXP (op, 1);
14784 break;
14786 case ASHIFT:
14787 if (index)
14788 return 0;
14789 index = XEXP (op, 0);
14790 tmp = XEXP (op, 1);
14791 if (!CONST_INT_P (tmp))
14792 return 0;
14793 scale = INTVAL (tmp);
14794 if ((unsigned HOST_WIDE_INT) scale > 3)
14795 return 0;
14796 scale = 1 << scale;
14797 break;
14799 case ZERO_EXTEND:
14800 op = XEXP (op, 0);
14801 if (GET_CODE (op) != UNSPEC)
14802 return 0;
14803 /* FALLTHRU */
14805 case UNSPEC:
14806 if (XINT (op, 1) == UNSPEC_TP
14807 && TARGET_TLS_DIRECT_SEG_REFS
14808 && seg == ADDR_SPACE_GENERIC)
14809 seg = DEFAULT_TLS_SEG_REG;
14810 else
14811 return 0;
14812 break;
14814 case SUBREG:
14815 if (!REG_P (SUBREG_REG (op)))
14816 return 0;
14817 /* FALLTHRU */
14819 case REG:
14820 if (!base)
14821 base = op;
14822 else if (!index)
14823 index = op;
14824 else
14825 return 0;
14826 break;
14828 case CONST:
14829 case CONST_INT:
14830 case SYMBOL_REF:
14831 case LABEL_REF:
14832 if (disp)
14833 return 0;
14834 disp = op;
14835 break;
14837 default:
14838 return 0;
14842 else if (GET_CODE (addr) == MULT)
14844 index = XEXP (addr, 0); /* index*scale */
14845 scale_rtx = XEXP (addr, 1);
14847 else if (GET_CODE (addr) == ASHIFT)
14849 /* We're called for lea too, which implements ashift on occasion. */
14850 index = XEXP (addr, 0);
14851 tmp = XEXP (addr, 1);
14852 if (!CONST_INT_P (tmp))
14853 return 0;
14854 scale = INTVAL (tmp);
14855 if ((unsigned HOST_WIDE_INT) scale > 3)
14856 return 0;
14857 scale = 1 << scale;
14858 retval = -1;
14860 else
14861 disp = addr; /* displacement */
14863 if (index)
14865 if (REG_P (index))
14867 else if (SUBREG_P (index)
14868 && REG_P (SUBREG_REG (index)))
14870 else
14871 return 0;
14874 /* Extract the integral value of scale. */
14875 if (scale_rtx)
14877 if (!CONST_INT_P (scale_rtx))
14878 return 0;
14879 scale = INTVAL (scale_rtx);
14882 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14883 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14885 /* Avoid useless 0 displacement. */
14886 if (disp == const0_rtx && (base || index))
14887 disp = NULL_RTX;
14889 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
14890 if (base_reg && index_reg && scale == 1
14891 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14892 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14893 || REGNO (index_reg) == SP_REG))
14895 std::swap (base, index);
14896 std::swap (base_reg, index_reg);
14899 /* Special case: %ebp cannot be encoded as a base without a displacement.
14900 Similarly %r13. */
14901 if (!disp && base_reg
14902 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14903 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14904 || REGNO (base_reg) == BP_REG
14905 || REGNO (base_reg) == R13_REG))
14906 disp = const0_rtx;
14908 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
14909 Avoid this by transforming to [%esi+0].
14910 Reload calls address legitimization without cfun defined, so we need
14911 to test cfun for being non-NULL. */
14912 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14913 && base_reg && !index_reg && !disp
14914 && REGNO (base_reg) == SI_REG)
14915 disp = const0_rtx;
14917 /* Special case: encode reg+reg instead of reg*2. */
14918 if (!base && index && scale == 2)
14919 base = index, base_reg = index_reg, scale = 1;
14921 /* Special case: scaling cannot be encoded without base or displacement. */
14922 if (!base && !disp && index && scale != 1)
14923 disp = const0_rtx;
14925 out->base = base;
14926 out->index = index;
14927 out->disp = disp;
14928 out->scale = scale;
14929 out->seg = seg;
14931 return retval;
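/* Illustrative sketch only (hypothetical example, not a real entry point):
   what ix86_decompose_address extracts for a typical
   base + index*scale + disp address.  Assumes it runs inside the compiler
   after target initialization.  */
#if 0
static void
example_decompose_address (void)
{
  /* Build (plus (plus (reg %rbx) (mult (reg %rcx) 4)) 8).  */
  rtx addr
    = gen_rtx_PLUS (Pmode,
                    gen_rtx_PLUS (Pmode,
                                  gen_rtx_REG (Pmode, BX_REG),
                                  gen_rtx_MULT (Pmode,
                                                gen_rtx_REG (Pmode, CX_REG),
                                                GEN_INT (4))),
                    GEN_INT (8));
  struct ix86_address parts;
  int ok = ix86_decompose_address (addr, &parts);

  /* Expect base == %rbx, index == %rcx, scale == 4, disp == (const_int 8)
     and a return value of 1.  A bare (ashift (reg) (const_int 2)) address
     would also be decomposed, as index with scale 4, but would return -1,
     which is only used for lea length computation.  */
  gcc_assert (ok == 1 && parts.scale == 4 && CONST_INT_P (parts.disp));
}
#endif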
14934 /* Return cost of the memory address x.
14935 For i386, it is better to use a complex address than let gcc copy
14936 the address into a reg and make a new pseudo. But not if the address
14937 requires two regs - that would mean more pseudos with longer
14938 lifetimes. */
14939 static int
14940 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14942 struct ix86_address parts;
14943 int cost = 1;
14944 int ok = ix86_decompose_address (x, &parts);
14946 gcc_assert (ok);
14948 if (parts.base && SUBREG_P (parts.base))
14949 parts.base = SUBREG_REG (parts.base);
14950 if (parts.index && SUBREG_P (parts.index))
14951 parts.index = SUBREG_REG (parts.index);
14953 /* Attempt to minimize the number of registers in the address by increasing
14954 the address cost for each register used. We don't increase the address cost
14955 for "pic_offset_table_rtx". When a memory operand with "pic_offset_table_rtx"
14956 is not invariant itself, it most likely means that the base or index is not
14957 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
14958 which is not profitable for x86. */
14959 if (parts.base
14960 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14961 && (current_pass->type == GIMPLE_PASS
14962 || !pic_offset_table_rtx
14963 || !REG_P (parts.base)
14964 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14965 cost++;
14967 if (parts.index
14968 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14969 && (current_pass->type == GIMPLE_PASS
14970 || !pic_offset_table_rtx
14971 || !REG_P (parts.index)
14972 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14973 cost++;
14975 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14976 since its predecode logic can't detect the length of instructions
14977 and decoding degenerates to vector decoding. Increase the cost of such
14978 addresses here. The penalty is at least 2 cycles. It may be worthwhile
14979 to split such addresses or even refuse such addresses at all.
14981 The following addressing modes are affected:
14982 [base+scale*index]
14983 [scale*index+disp]
14984 [base+index]
14986 The first and last cases may be avoidable by explicitly coding a zero
14987 displacement in the memory address, but I don't have an AMD-K6 machine
14988 handy to check this theory. */
14990 if (TARGET_K6
14991 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
14992 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
14993 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
14994 cost += 10;
14996 return cost;
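/* Illustrative note (hypothetical worked example): with the scheme above,
   an address consisting of one pseudo register typically costs 2 (the base
   cost of 1 plus one charged register), while
   (plus (reg) (mult (reg) (const_int 4))) built from two pseudos costs 3.
   Hard registers below FIRST_PSEUDO_REGISTER are not charged, the PIC
   register is exempted during RTL passes, and the AMD-K6 penalty can add
   a further 10.  */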
14999 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15000 this is used to form addresses to local data when -fPIC is in
15001 use. */
15003 static bool
15004 darwin_local_data_pic (rtx disp)
15006 return (GET_CODE (disp) == UNSPEC
15007 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15010 /* True if operand X should be loaded from GOT. */
15012 bool
15013 ix86_force_load_from_GOT_p (rtx x)
15015 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15016 && !TARGET_PECOFF && !TARGET_MACHO
15017 && !flag_plt && !flag_pic
15018 && ix86_cmodel != CM_LARGE
15019 && GET_CODE (x) == SYMBOL_REF
15020 && SYMBOL_REF_FUNCTION_P (x)
15021 && !SYMBOL_REF_LOCAL_P (x));
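/* Illustrative note (hypothetical example): this matches, for instance, a
   call to a non-local function foo compiled with -fno-plt and without
   -fpic on x86-64 (or on ia32 with a GOT32X-capable assembler), in which
   case the call is emitted through the GOT, roughly as
   "call *foo@GOTPCREL(%rip)" rather than through the PLT.  */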
15024 /* Determine if a given RTX is a valid constant. We already know this
15025 satisfies CONSTANT_P. */
15027 static bool
15028 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15030 /* Pointer bounds constants are not valid. */
15031 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15032 return false;
15034 switch (GET_CODE (x))
15036 case CONST:
15037 x = XEXP (x, 0);
15039 if (GET_CODE (x) == PLUS)
15041 if (!CONST_INT_P (XEXP (x, 1)))
15042 return false;
15043 x = XEXP (x, 0);
15046 if (TARGET_MACHO && darwin_local_data_pic (x))
15047 return true;
15049 /* Only some unspecs are valid as "constants". */
15050 if (GET_CODE (x) == UNSPEC)
15051 switch (XINT (x, 1))
15053 case UNSPEC_GOT:
15054 case UNSPEC_GOTOFF:
15055 case UNSPEC_PLTOFF:
15056 return TARGET_64BIT;
15057 case UNSPEC_TPOFF:
15058 case UNSPEC_NTPOFF:
15059 x = XVECEXP (x, 0, 0);
15060 return (GET_CODE (x) == SYMBOL_REF
15061 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15062 case UNSPEC_DTPOFF:
15063 x = XVECEXP (x, 0, 0);
15064 return (GET_CODE (x) == SYMBOL_REF
15065 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15066 default:
15067 return false;
15070 /* We must have drilled down to a symbol. */
15071 if (GET_CODE (x) == LABEL_REF)
15072 return true;
15073 if (GET_CODE (x) != SYMBOL_REF)
15074 return false;
15075 /* FALLTHRU */
15077 case SYMBOL_REF:
15078 /* TLS symbols are never valid. */
15079 if (SYMBOL_REF_TLS_MODEL (x))
15080 return false;
15082 /* DLLIMPORT symbols are never valid. */
15083 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15084 && SYMBOL_REF_DLLIMPORT_P (x))
15085 return false;
15087 #if TARGET_MACHO
15088 /* mdynamic-no-pic */
15089 if (MACHO_DYNAMIC_NO_PIC_P)
15090 return machopic_symbol_defined_p (x);
15091 #endif
15093 /* External function address should be loaded
15094 via the GOT slot to avoid PLT. */
15095 if (ix86_force_load_from_GOT_p (x))
15096 return false;
15098 break;
15100 CASE_CONST_SCALAR_INT:
15101 switch (mode)
15103 case E_TImode:
15104 if (TARGET_64BIT)
15105 return true;
15106 /* FALLTHRU */
15107 case E_OImode:
15108 case E_XImode:
15109 if (!standard_sse_constant_p (x, mode))
15110 return false;
15111 default:
15112 break;
15114 break;
15116 case CONST_VECTOR:
15117 if (!standard_sse_constant_p (x, mode))
15118 return false;
15120 default:
15121 break;
15124 /* Otherwise we handle everything else in the move patterns. */
15125 return true;
15128 /* Determine if it's legal to put X into the constant pool. This
15129 is not possible for the address of thread-local symbols, which
15130 is checked above. */
15132 static bool
15133 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15135 /* We can put any immediate constant in memory. */
15136 switch (GET_CODE (x))
15138 CASE_CONST_ANY:
15139 return false;
15141 default:
15142 break;
15145 return !ix86_legitimate_constant_p (mode, x);
15148 /* Return true if the symbol is marked as dllimport, or as a stub variable,
15149 false otherwise. */
15151 static bool
15152 is_imported_p (rtx x)
15154 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15155 || GET_CODE (x) != SYMBOL_REF)
15156 return false;
15158 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15162 /* Nonzero if the constant value X is a legitimate general operand
15163 when generating PIC code. It is given that flag_pic is on and
15164 that X satisfies CONSTANT_P. */
15166 bool
15167 legitimate_pic_operand_p (rtx x)
15169 rtx inner;
15171 switch (GET_CODE (x))
15173 case CONST:
15174 inner = XEXP (x, 0);
15175 if (GET_CODE (inner) == PLUS
15176 && CONST_INT_P (XEXP (inner, 1)))
15177 inner = XEXP (inner, 0);
15179 /* Only some unspecs are valid as "constants". */
15180 if (GET_CODE (inner) == UNSPEC)
15181 switch (XINT (inner, 1))
15183 case UNSPEC_GOT:
15184 case UNSPEC_GOTOFF:
15185 case UNSPEC_PLTOFF:
15186 return TARGET_64BIT;
15187 case UNSPEC_TPOFF:
15188 x = XVECEXP (inner, 0, 0);
15189 return (GET_CODE (x) == SYMBOL_REF
15190 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15191 case UNSPEC_MACHOPIC_OFFSET:
15192 return legitimate_pic_address_disp_p (x);
15193 default:
15194 return false;
15196 /* FALLTHRU */
15198 case SYMBOL_REF:
15199 case LABEL_REF:
15200 return legitimate_pic_address_disp_p (x);
15202 default:
15203 return true;
15207 /* Determine if a given CONST RTX is a valid memory displacement
15208 in PIC mode. */
15210 bool
15211 legitimate_pic_address_disp_p (rtx disp)
15213 bool saw_plus;
15215 /* In 64bit mode we can allow direct addresses of symbols and labels
15216 when they are not dynamic symbols. */
15217 if (TARGET_64BIT)
15219 rtx op0 = disp, op1;
15221 switch (GET_CODE (disp))
15223 case LABEL_REF:
15224 return true;
15226 case CONST:
15227 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15228 break;
15229 op0 = XEXP (XEXP (disp, 0), 0);
15230 op1 = XEXP (XEXP (disp, 0), 1);
15231 if (!CONST_INT_P (op1))
15232 break;
15233 if (GET_CODE (op0) == UNSPEC
15234 && (XINT (op0, 1) == UNSPEC_DTPOFF
15235 || XINT (op0, 1) == UNSPEC_NTPOFF)
15236 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15237 return true;
15238 if (INTVAL (op1) >= 16*1024*1024
15239 || INTVAL (op1) < -16*1024*1024)
15240 break;
15241 if (GET_CODE (op0) == LABEL_REF)
15242 return true;
15243 if (GET_CODE (op0) == CONST
15244 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15245 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15246 return true;
15247 if (GET_CODE (op0) == UNSPEC
15248 && XINT (op0, 1) == UNSPEC_PCREL)
15249 return true;
15250 if (GET_CODE (op0) != SYMBOL_REF)
15251 break;
15252 /* FALLTHRU */
15254 case SYMBOL_REF:
15255 /* TLS references should always be enclosed in UNSPEC.
15256 A dllimported symbol always needs to be resolved. */
15257 if (SYMBOL_REF_TLS_MODEL (op0)
15258 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15259 return false;
15261 if (TARGET_PECOFF)
15263 if (is_imported_p (op0))
15264 return true;
15266 if (SYMBOL_REF_FAR_ADDR_P (op0)
15267 || !SYMBOL_REF_LOCAL_P (op0))
15268 break;
15270 /* Function symbols need to be resolved only for
15271 the large model.
15272 For the small model we don't need to resolve anything
15273 here. */
15274 if ((ix86_cmodel != CM_LARGE_PIC
15275 && SYMBOL_REF_FUNCTION_P (op0))
15276 || ix86_cmodel == CM_SMALL_PIC)
15277 return true;
15278 /* Non-external symbols don't need to be resolved for
15279 the large and medium models. */
15280 if ((ix86_cmodel == CM_LARGE_PIC
15281 || ix86_cmodel == CM_MEDIUM_PIC)
15282 && !SYMBOL_REF_EXTERNAL_P (op0))
15283 return true;
15285 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15286 && (SYMBOL_REF_LOCAL_P (op0)
15287 || (HAVE_LD_PIE_COPYRELOC
15288 && flag_pie
15289 && !SYMBOL_REF_WEAK (op0)
15290 && !SYMBOL_REF_FUNCTION_P (op0)))
15291 && ix86_cmodel != CM_LARGE_PIC)
15292 return true;
15293 break;
15295 default:
15296 break;
15299 if (GET_CODE (disp) != CONST)
15300 return false;
15301 disp = XEXP (disp, 0);
15303 if (TARGET_64BIT)
15305 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
15306 of GOT tables. We should not need these anyway. */
15307 if (GET_CODE (disp) != UNSPEC
15308 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15309 && XINT (disp, 1) != UNSPEC_GOTOFF
15310 && XINT (disp, 1) != UNSPEC_PCREL
15311 && XINT (disp, 1) != UNSPEC_PLTOFF))
15312 return false;
15314 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15315 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15316 return false;
15317 return true;
15320 saw_plus = false;
15321 if (GET_CODE (disp) == PLUS)
15323 if (!CONST_INT_P (XEXP (disp, 1)))
15324 return false;
15325 disp = XEXP (disp, 0);
15326 saw_plus = true;
15329 if (TARGET_MACHO && darwin_local_data_pic (disp))
15330 return true;
15332 if (GET_CODE (disp) != UNSPEC)
15333 return false;
15335 switch (XINT (disp, 1))
15337 case UNSPEC_GOT:
15338 if (saw_plus)
15339 return false;
15340 /* We need to check for both symbols and labels because VxWorks loads
15341 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15342 details. */
15343 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15344 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15345 case UNSPEC_GOTOFF:
15346 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15347 While the ABI also specifies a 32bit relocation, we don't produce it in
15348 the small PIC model at all. */
15349 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15350 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15351 && !TARGET_64BIT)
15352 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15353 return false;
15354 case UNSPEC_GOTTPOFF:
15355 case UNSPEC_GOTNTPOFF:
15356 case UNSPEC_INDNTPOFF:
15357 if (saw_plus)
15358 return false;
15359 disp = XVECEXP (disp, 0, 0);
15360 return (GET_CODE (disp) == SYMBOL_REF
15361 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15362 case UNSPEC_NTPOFF:
15363 disp = XVECEXP (disp, 0, 0);
15364 return (GET_CODE (disp) == SYMBOL_REF
15365 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15366 case UNSPEC_DTPOFF:
15367 disp = XVECEXP (disp, 0, 0);
15368 return (GET_CODE (disp) == SYMBOL_REF
15369 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15372 return false;
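/* Illustrative note (hypothetical example): in 64bit mode the CONST case
   above accepts "sym" and "sym + offset" only while the offset stays
   within +-16MB (and fits in SImode for the @DTPOFF and @NTPOFF forms),
   so a displacement such as
   (const (plus (symbol_ref "x") (const_int 256))) can be legitimate,
   subject to the symbol checks that follow, whereas one with an offset of
   0x2000000 is not.  */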
15375 /* Determine if op is suitable RTX for an address register.
15376 Return naked register if a register or a register subreg is
15377 found, otherwise return NULL_RTX. */
15379 static rtx
15380 ix86_validate_address_register (rtx op)
15382 machine_mode mode = GET_MODE (op);
15384 /* Only SImode or DImode registers can form the address. */
15385 if (mode != SImode && mode != DImode)
15386 return NULL_RTX;
15388 if (REG_P (op))
15389 return op;
15390 else if (SUBREG_P (op))
15392 rtx reg = SUBREG_REG (op);
15394 if (!REG_P (reg))
15395 return NULL_RTX;
15397 mode = GET_MODE (reg);
15399 /* Don't allow SUBREGs that span more than a word. It can
15400 lead to spill failures when the register is one word out
15401 of a two word structure. */
15402 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15403 return NULL_RTX;
15405 /* Allow only SUBREGs of non-eliminable hard registers. */
15406 if (register_no_elim_operand (reg, mode))
15407 return reg;
15410 /* Op is not a register. */
15411 return NULL_RTX;
15414 /* Recognizes RTL expressions that are valid memory addresses for an
15415 instruction. The MODE argument is the machine mode for the MEM
15416 expression that wants to use this address.
15418 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15419 convert common non-canonical forms to canonical form so that they will
15420 be recognized. */
15422 static bool
15423 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15425 struct ix86_address parts;
15426 rtx base, index, disp;
15427 HOST_WIDE_INT scale;
15428 addr_space_t seg;
15430 if (ix86_decompose_address (addr, &parts) <= 0)
15431 /* Decomposition failed. */
15432 return false;
15434 base = parts.base;
15435 index = parts.index;
15436 disp = parts.disp;
15437 scale = parts.scale;
15438 seg = parts.seg;
15440 /* Validate base register. */
15441 if (base)
15443 rtx reg = ix86_validate_address_register (base);
15445 if (reg == NULL_RTX)
15446 return false;
15448 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15449 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15450 /* Base is not valid. */
15451 return false;
15454 /* Validate index register. */
15455 if (index)
15457 rtx reg = ix86_validate_address_register (index);
15459 if (reg == NULL_RTX)
15460 return false;
15462 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15463 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15464 /* Index is not valid. */
15465 return false;
15468 /* Index and base should have the same mode. */
15469 if (base && index
15470 && GET_MODE (base) != GET_MODE (index))
15471 return false;
15473 /* Address override works only on the (%reg) part of %fs:(%reg). */
15474 if (seg != ADDR_SPACE_GENERIC
15475 && ((base && GET_MODE (base) != word_mode)
15476 || (index && GET_MODE (index) != word_mode)))
15477 return false;
15479 /* Validate scale factor. */
15480 if (scale != 1)
15482 if (!index)
15483 /* Scale without index. */
15484 return false;
15486 if (scale != 2 && scale != 4 && scale != 8)
15487 /* Scale is not a valid multiplier. */
15488 return false;
15491 /* Validate displacement. */
15492 if (disp)
15494 if (GET_CODE (disp) == CONST
15495 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15496 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15497 switch (XINT (XEXP (disp, 0), 1))
15499 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15500 when used. While the ABI also specifies 32bit relocations, we
15501 don't produce them at all and use IP-relative addressing instead.
15502 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15503 should be loaded via the GOT. */
15504 case UNSPEC_GOT:
15505 if (!TARGET_64BIT
15506 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15507 goto is_legitimate_pic;
15508 /* FALLTHRU */
15509 case UNSPEC_GOTOFF:
15510 gcc_assert (flag_pic);
15511 if (!TARGET_64BIT)
15512 goto is_legitimate_pic;
15514 /* 64bit address unspec. */
15515 return false;
15517 case UNSPEC_GOTPCREL:
15518 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15519 goto is_legitimate_pic;
15520 /* FALLTHRU */
15521 case UNSPEC_PCREL:
15522 gcc_assert (flag_pic);
15523 goto is_legitimate_pic;
15525 case UNSPEC_GOTTPOFF:
15526 case UNSPEC_GOTNTPOFF:
15527 case UNSPEC_INDNTPOFF:
15528 case UNSPEC_NTPOFF:
15529 case UNSPEC_DTPOFF:
15530 break;
15532 default:
15533 /* Invalid address unspec. */
15534 return false;
15537 else if (SYMBOLIC_CONST (disp)
15538 && (flag_pic
15539 || (TARGET_MACHO
15540 #if TARGET_MACHO
15541 && MACHOPIC_INDIRECT
15542 && !machopic_operand_p (disp)
15543 #endif
15547 is_legitimate_pic:
15548 if (TARGET_64BIT && (index || base))
15550 /* foo@dtpoff(%rX) is ok. */
15551 if (GET_CODE (disp) != CONST
15552 || GET_CODE (XEXP (disp, 0)) != PLUS
15553 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15554 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15555 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15556 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15557 /* Non-constant pic memory reference. */
15558 return false;
15560 else if ((!TARGET_MACHO || flag_pic)
15561 && ! legitimate_pic_address_disp_p (disp))
15562 /* Displacement is an invalid pic construct. */
15563 return false;
15564 #if TARGET_MACHO
15565 else if (MACHO_DYNAMIC_NO_PIC_P
15566 && !ix86_legitimate_constant_p (Pmode, disp))
15567 /* displacement must be referenced via non_lazy_pointer */
15568 return false;
15569 #endif
15571 /* This code used to verify that a symbolic pic displacement
15572 includes the pic_offset_table_rtx register.
15574 While this is a good idea, unfortunately these constructs may
15575 be created by the "adds using lea" optimization for incorrect
15576 code like:
15578 int a;
15579 int foo(int i)
15581 return *(&a+i);
15584 This code is nonsensical, but results in addressing the
15585 GOT table with a pic_offset_table_rtx base. We can't
15586 just refuse it easily, since it gets matched by the
15587 "addsi3" pattern, which later gets split to lea when the
15588 output register differs from the input. While this
15589 could be handled by a separate addsi pattern for this case
15590 that never results in lea, disabling this test seems to be
15591 the easier and correct fix for the crash. */
15593 else if (GET_CODE (disp) != LABEL_REF
15594 && !CONST_INT_P (disp)
15595 && (GET_CODE (disp) != CONST
15596 || !ix86_legitimate_constant_p (Pmode, disp))
15597 && (GET_CODE (disp) != SYMBOL_REF
15598 || !ix86_legitimate_constant_p (Pmode, disp)))
15599 /* Displacement is not constant. */
15600 return false;
15601 else if (TARGET_64BIT
15602 && !x86_64_immediate_operand (disp, VOIDmode))
15603 /* Displacement is out of range. */
15604 return false;
15605 /* In x32 mode, constant addresses are sign extended to 64bit, so
15606 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15607 else if (TARGET_X32 && !(index || base)
15608 && CONST_INT_P (disp)
15609 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15610 return false;
15613 /* Everything looks valid. */
15614 return true;
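/* Illustrative sketch only (hypothetical example, not a real entry point):
   scale factors other than 1, 2, 4 and 8 are rejected, all else being
   equal.  */
#if 0
static void
example_legitimate_address (void)
{
  rtx index = gen_rtx_REG (Pmode, CX_REG);
  rtx base = gen_rtx_REG (Pmode, BX_REG);
  rtx ok = gen_rtx_PLUS (Pmode,
                         gen_rtx_MULT (Pmode, index, GEN_INT (4)), base);
  rtx bad = gen_rtx_PLUS (Pmode,
                          gen_rtx_MULT (Pmode, index, GEN_INT (3)), base);

  gcc_assert (ix86_legitimate_address_p (word_mode, ok, false));
  gcc_assert (!ix86_legitimate_address_p (word_mode, bad, false));
}
#endif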
15617 /* Determine if a given RTX is a valid constant address. */
15619 bool
15620 constant_address_p (rtx x)
15622 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15625 /* Return a unique alias set for the GOT. */
15627 static alias_set_type
15628 ix86_GOT_alias_set (void)
15630 static alias_set_type set = -1;
15631 if (set == -1)
15632 set = new_alias_set ();
15633 return set;
15636 /* Return a legitimate reference for ORIG (an address) using the
15637 register REG. If REG is 0, a new pseudo is generated.
15639 There are two types of references that must be handled:
15641 1. Global data references must load the address from the GOT, via
15642 the PIC reg. An insn is emitted to do this load, and the reg is
15643 returned.
15645 2. Static data references, constant pool addresses, and code labels
15646 compute the address as an offset from the GOT, whose base is in
15647 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15648 differentiate them from global data objects. The returned
15649 address is the PIC reg + an unspec constant.
15651 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15652 reg also appears in the address. */
15654 static rtx
15655 legitimize_pic_address (rtx orig, rtx reg)
15657 rtx addr = orig;
15658 rtx new_rtx = orig;
15660 #if TARGET_MACHO
15661 if (TARGET_MACHO && !TARGET_64BIT)
15663 if (reg == 0)
15664 reg = gen_reg_rtx (Pmode);
15665 /* Use the generic Mach-O PIC machinery. */
15666 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15668 #endif
15670 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15672 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15673 if (tmp)
15674 return tmp;
15677 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15678 new_rtx = addr;
15679 else if ((!TARGET_64BIT
15680 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15681 && !TARGET_PECOFF
15682 && gotoff_operand (addr, Pmode))
15684 /* This symbol may be referenced via a displacement
15685 from the PIC base address (@GOTOFF). */
15686 if (GET_CODE (addr) == CONST)
15687 addr = XEXP (addr, 0);
15689 if (GET_CODE (addr) == PLUS)
15691 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15692 UNSPEC_GOTOFF);
15693 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15695 else
15696 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15698 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15700 if (TARGET_64BIT)
15701 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15703 if (reg != 0)
15705 gcc_assert (REG_P (reg));
15706 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15707 new_rtx, reg, 1, OPTAB_DIRECT);
15709 else
15710 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15712 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15713 /* We can't use @GOTOFF for text labels
15714 on VxWorks, see gotoff_operand. */
15715 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15717 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15718 if (tmp)
15719 return tmp;
15721 /* For x64 PE-COFF there is no GOT table,
15722 so we use the address directly. */
15723 if (TARGET_64BIT && TARGET_PECOFF)
15725 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15726 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15728 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15730 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15731 UNSPEC_GOTPCREL);
15732 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15733 new_rtx = gen_const_mem (Pmode, new_rtx);
15734 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15736 else
15738 /* This symbol must be referenced via a load
15739 from the Global Offset Table (@GOT). */
15740 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15741 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15742 if (TARGET_64BIT)
15743 new_rtx = force_reg (Pmode, new_rtx);
15744 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15745 new_rtx = gen_const_mem (Pmode, new_rtx);
15746 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15749 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15751 else
15753 if (CONST_INT_P (addr)
15754 && !x86_64_immediate_operand (addr, VOIDmode))
15755 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15756 else if (GET_CODE (addr) == CONST)
15758 addr = XEXP (addr, 0);
15760 /* We must match stuff we generate before. Assume the only
15761 unspecs that can get here are ours. Not that we could do
15762 anything with them anyway.... */
15763 if (GET_CODE (addr) == UNSPEC
15764 || (GET_CODE (addr) == PLUS
15765 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15766 return orig;
15767 gcc_assert (GET_CODE (addr) == PLUS);
15770 if (GET_CODE (addr) == PLUS)
15772 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15774 /* Check first to see if this is a constant
15775 offset from a @GOTOFF symbol reference. */
15776 if (!TARGET_PECOFF
15777 && gotoff_operand (op0, Pmode)
15778 && CONST_INT_P (op1))
15780 if (!TARGET_64BIT)
15782 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15783 UNSPEC_GOTOFF);
15784 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15785 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15787 if (reg != 0)
15789 gcc_assert (REG_P (reg));
15790 new_rtx = expand_simple_binop (Pmode, PLUS,
15791 pic_offset_table_rtx,
15792 new_rtx, reg, 1,
15793 OPTAB_DIRECT);
15795 else
15796 new_rtx
15797 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15799 else
15801 if (INTVAL (op1) < -16*1024*1024
15802 || INTVAL (op1) >= 16*1024*1024)
15804 if (!x86_64_immediate_operand (op1, Pmode))
15805 op1 = force_reg (Pmode, op1);
15807 new_rtx
15808 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15812 else
15814 rtx base = legitimize_pic_address (op0, reg);
15815 machine_mode mode = GET_MODE (base);
15816 new_rtx
15817 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15819 if (CONST_INT_P (new_rtx))
15821 if (INTVAL (new_rtx) < -16*1024*1024
15822 || INTVAL (new_rtx) >= 16*1024*1024)
15824 if (!x86_64_immediate_operand (new_rtx, mode))
15825 new_rtx = force_reg (mode, new_rtx);
15827 new_rtx
15828 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15830 else
15831 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15833 else
15835 /* For %rip addressing, we have to use
15836 just a disp32, with neither base nor index. */
15837 if (TARGET_64BIT
15838 && (GET_CODE (base) == SYMBOL_REF
15839 || GET_CODE (base) == LABEL_REF))
15840 base = force_reg (mode, base);
15841 if (GET_CODE (new_rtx) == PLUS
15842 && CONSTANT_P (XEXP (new_rtx, 1)))
15844 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15845 new_rtx = XEXP (new_rtx, 1);
15847 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15852 return new_rtx;
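/* Illustrative note (hypothetical example): for 32bit PIC a local symbol
   "l" typically becomes
     (plus pic_offset_table_rtx (const (unspec [l] UNSPEC_GOTOFF)))
   i.e. l@GOTOFF(%ebx), while a preemptible global "g" becomes the load
     (mem (plus pic_offset_table_rtx (const (unspec [g] UNSPEC_GOT))))
   i.e. g@GOT(%ebx); for x86-64 small PIC the latter is instead
     (mem (const (unspec [g] UNSPEC_GOTPCREL))), i.e. g@GOTPCREL(%rip).  */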
15855 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15857 static rtx
15858 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15860 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15862 if (GET_MODE (tp) != tp_mode)
15864 gcc_assert (GET_MODE (tp) == SImode);
15865 gcc_assert (tp_mode == DImode);
15867 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15870 if (to_reg)
15871 tp = copy_to_mode_reg (tp_mode, tp);
15873 return tp;
15876 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15878 static GTY(()) rtx ix86_tls_symbol;
15880 static rtx
15881 ix86_tls_get_addr (void)
15883 if (!ix86_tls_symbol)
15885 const char *sym
15886 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15887 ? "___tls_get_addr" : "__tls_get_addr");
15889 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15892 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15894 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15895 UNSPEC_PLTOFF);
15896 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15897 gen_rtx_CONST (Pmode, unspec));
15900 return ix86_tls_symbol;
15903 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15905 static GTY(()) rtx ix86_tls_module_base_symbol;
15908 ix86_tls_module_base (void)
15910 if (!ix86_tls_module_base_symbol)
15912 ix86_tls_module_base_symbol
15913 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15915 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15916 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15919 return ix86_tls_module_base_symbol;
15922 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15923 false if we expect this to be used for a memory address and true if
15924 we expect to load the address into a register. */
15926 static rtx
15927 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15929 rtx dest, base, off;
15930 rtx pic = NULL_RTX, tp = NULL_RTX;
15931 machine_mode tp_mode = Pmode;
15932 int type;
15934 /* Fall back to the global dynamic model if the toolchain cannot support local
15935 dynamic. */
15936 if (TARGET_SUN_TLS && !TARGET_64BIT
15937 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15938 && model == TLS_MODEL_LOCAL_DYNAMIC)
15939 model = TLS_MODEL_GLOBAL_DYNAMIC;
15941 switch (model)
15943 case TLS_MODEL_GLOBAL_DYNAMIC:
15944 dest = gen_reg_rtx (Pmode);
15946 if (!TARGET_64BIT)
15948 if (flag_pic && !TARGET_PECOFF)
15949 pic = pic_offset_table_rtx;
15950 else
15952 pic = gen_reg_rtx (Pmode);
15953 emit_insn (gen_set_got (pic));
15957 if (TARGET_GNU2_TLS)
15959 if (TARGET_64BIT)
15960 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15961 else
15962 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15964 tp = get_thread_pointer (Pmode, true);
15965 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15967 if (GET_MODE (x) != Pmode)
15968 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15970 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15972 else
15974 rtx caddr = ix86_tls_get_addr ();
15976 if (TARGET_64BIT)
15978 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15979 rtx_insn *insns;
15981 start_sequence ();
15982 emit_call_insn
15983 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15984 insns = get_insns ();
15985 end_sequence ();
15987 if (GET_MODE (x) != Pmode)
15988 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15990 RTL_CONST_CALL_P (insns) = 1;
15991 emit_libcall_block (insns, dest, rax, x);
15993 else
15994 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
15996 break;
15998 case TLS_MODEL_LOCAL_DYNAMIC:
15999 base = gen_reg_rtx (Pmode);
16001 if (!TARGET_64BIT)
16003 if (flag_pic)
16004 pic = pic_offset_table_rtx;
16005 else
16007 pic = gen_reg_rtx (Pmode);
16008 emit_insn (gen_set_got (pic));
16012 if (TARGET_GNU2_TLS)
16014 rtx tmp = ix86_tls_module_base ();
16016 if (TARGET_64BIT)
16017 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16018 else
16019 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16021 tp = get_thread_pointer (Pmode, true);
16022 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16023 gen_rtx_MINUS (Pmode, tmp, tp));
16025 else
16027 rtx caddr = ix86_tls_get_addr ();
16029 if (TARGET_64BIT)
16031 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16032 rtx_insn *insns;
16033 rtx eqv;
16035 start_sequence ();
16036 emit_call_insn
16037 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16038 insns = get_insns ();
16039 end_sequence ();
16041 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16042 share the LD_BASE result with other LD model accesses. */
16043 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16044 UNSPEC_TLS_LD_BASE);
16046 RTL_CONST_CALL_P (insns) = 1;
16047 emit_libcall_block (insns, base, rax, eqv);
16049 else
16050 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16053 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16054 off = gen_rtx_CONST (Pmode, off);
16056 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16058 if (TARGET_GNU2_TLS)
16060 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16062 if (GET_MODE (x) != Pmode)
16063 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16065 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16067 break;
16069 case TLS_MODEL_INITIAL_EXEC:
16070 if (TARGET_64BIT)
16072 if (TARGET_SUN_TLS && !TARGET_X32)
16074 /* The Sun linker took the AMD64 TLS spec literally
16075 and can only handle %rax as the destination of the
16076 initial-exec code sequence. */
16078 dest = gen_reg_rtx (DImode);
16079 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16080 return dest;
16083 /* Generate DImode references to avoid %fs:(%reg32)
16084 problems and the linker IE->LE relaxation bug. */
16085 tp_mode = DImode;
16086 pic = NULL;
16087 type = UNSPEC_GOTNTPOFF;
16089 else if (flag_pic)
16091 pic = pic_offset_table_rtx;
16092 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16094 else if (!TARGET_ANY_GNU_TLS)
16096 pic = gen_reg_rtx (Pmode);
16097 emit_insn (gen_set_got (pic));
16098 type = UNSPEC_GOTTPOFF;
16100 else
16102 pic = NULL;
16103 type = UNSPEC_INDNTPOFF;
16106 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16107 off = gen_rtx_CONST (tp_mode, off);
16108 if (pic)
16109 off = gen_rtx_PLUS (tp_mode, pic, off);
16110 off = gen_const_mem (tp_mode, off);
16111 set_mem_alias_set (off, ix86_GOT_alias_set ());
16113 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16115 base = get_thread_pointer (tp_mode,
16116 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16117 off = force_reg (tp_mode, off);
16118 dest = gen_rtx_PLUS (tp_mode, base, off);
16119 if (tp_mode != Pmode)
16120 dest = convert_to_mode (Pmode, dest, 1);
16122 else
16124 base = get_thread_pointer (Pmode, true);
16125 dest = gen_reg_rtx (Pmode);
16126 emit_insn (ix86_gen_sub3 (dest, base, off));
16128 break;
16130 case TLS_MODEL_LOCAL_EXEC:
16131 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16132 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16133 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16134 off = gen_rtx_CONST (Pmode, off);
16136 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16138 base = get_thread_pointer (Pmode,
16139 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16140 return gen_rtx_PLUS (Pmode, base, off);
16142 else
16144 base = get_thread_pointer (Pmode, true);
16145 dest = gen_reg_rtx (Pmode);
16146 emit_insn (ix86_gen_sub3 (dest, base, off));
16148 break;
16150 default:
16151 gcc_unreachable ();
16154 return dest;
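/* Illustrative note (hypothetical example): for TLS_MODEL_LOCAL_EXEC under
   GNU TLS the result has the form
     (plus <thread pointer> (const (unspec [x] UNSPEC_NTPOFF)))
   which on x86-64 prints the offset as x@tpoff relative to the %fs base
   and may later be folded into a %fs:x@tpoff memory reference when
   TARGET_TLS_DIRECT_SEG_REFS permits; the initial-exec model instead goes
   through a GOT load of the offset (@gottpoff).  */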
16157 /* Return true if OP refers to a TLS address. */
16158 bool
16159 ix86_tls_address_pattern_p (rtx op)
16161 subrtx_var_iterator::array_type array;
16162 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16164 rtx op = *iter;
16165 if (MEM_P (op))
16167 rtx *x = &XEXP (op, 0);
16168 while (GET_CODE (*x) == PLUS)
16170 int i;
16171 for (i = 0; i < 2; i++)
16173 rtx u = XEXP (*x, i);
16174 if (GET_CODE (u) == ZERO_EXTEND)
16175 u = XEXP (u, 0);
16176 if (GET_CODE (u) == UNSPEC
16177 && XINT (u, 1) == UNSPEC_TP)
16178 return true;
16180 x = &XEXP (*x, 0);
16183 iter.skip_subrtxes ();
16187 return false;
16190 /* Rewrite *LOC so that it refers to the default TLS address space. */
16191 void
16192 ix86_rewrite_tls_address_1 (rtx *loc)
16194 subrtx_ptr_iterator::array_type array;
16195 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16197 rtx *loc = *iter;
16198 if (MEM_P (*loc))
16200 rtx addr = XEXP (*loc, 0);
16201 rtx *x = &addr;
16202 while (GET_CODE (*x) == PLUS)
16204 int i;
16205 for (i = 0; i < 2; i++)
16207 rtx u = XEXP (*x, i);
16208 if (GET_CODE (u) == ZERO_EXTEND)
16209 u = XEXP (u, 0);
16210 if (GET_CODE (u) == UNSPEC
16211 && XINT (u, 1) == UNSPEC_TP)
16213 addr_space_t as = DEFAULT_TLS_SEG_REG;
16215 *x = XEXP (*x, 1 - i);
16217 *loc = replace_equiv_address_nv (*loc, addr, true);
16218 set_mem_addr_space (*loc, as);
16219 return;
16222 x = &XEXP (*x, 0);
16225 iter.skip_subrtxes ();
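/* Illustrative note (hypothetical example): the walk above takes a memory
   reference whose address contains the thread pointer, e.g.
     (mem (plus (unspec [(const_int 0)] UNSPEC_TP) (reg R)))
   drops the UNSPEC_TP term and rewrites the MEM as (mem (reg R)) in the
   DEFAULT_TLS_SEG_REG address space, so the %fs/%gs segment override is
   carried by the address space rather than by the explicit unspec.  */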
16230 /* Rewrite an instruction pattern involving a TLS address
16231 so that it refers to the default TLS address space. */
16233 ix86_rewrite_tls_address (rtx pattern)
16235 pattern = copy_insn (pattern);
16236 ix86_rewrite_tls_address_1 (&pattern);
16237 return pattern;
16240 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16241 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16242 unique refptr-DECL symbol corresponding to symbol DECL. */
16244 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16246 static inline hashval_t hash (tree_map *m) { return m->hash; }
16247 static inline bool
16248 equal (tree_map *a, tree_map *b)
16250 return a->base.from == b->base.from;
16253 static int
16254 keep_cache_entry (tree_map *&m)
16256 return ggc_marked_p (m->base.from);
16260 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16262 static tree
16263 get_dllimport_decl (tree decl, bool beimport)
16265 struct tree_map *h, in;
16266 const char *name;
16267 const char *prefix;
16268 size_t namelen, prefixlen;
16269 char *imp_name;
16270 tree to;
16271 rtx rtl;
16273 if (!dllimport_map)
16274 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16276 in.hash = htab_hash_pointer (decl);
16277 in.base.from = decl;
16278 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16279 h = *loc;
16280 if (h)
16281 return h->to;
16283 *loc = h = ggc_alloc<tree_map> ();
16284 h->hash = in.hash;
16285 h->base.from = decl;
16286 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16287 VAR_DECL, NULL, ptr_type_node);
16288 DECL_ARTIFICIAL (to) = 1;
16289 DECL_IGNORED_P (to) = 1;
16290 DECL_EXTERNAL (to) = 1;
16291 TREE_READONLY (to) = 1;
16293 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16294 name = targetm.strip_name_encoding (name);
16295 if (beimport)
16296 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16297 ? "*__imp_" : "*__imp__";
16298 else
16299 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16300 namelen = strlen (name);
16301 prefixlen = strlen (prefix);
16302 imp_name = (char *) alloca (namelen + prefixlen + 1);
16303 memcpy (imp_name, prefix, prefixlen);
16304 memcpy (imp_name + prefixlen, name, namelen + 1);
16306 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16307 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16308 SET_SYMBOL_REF_DECL (rtl, to);
16309 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16310 if (!beimport)
16312 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16313 #ifdef SUB_TARGET_RECORD_STUB
16314 SUB_TARGET_RECORD_STUB (name);
16315 #endif
16318 rtl = gen_const_mem (Pmode, rtl);
16319 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16321 SET_DECL_RTL (to, rtl);
16322 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16324 return to;
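/* Illustrative note (hypothetical example): for a dllimported function
   "foo" the decl built above gets the assembler name "*__imp_foo" (or
   "*__imp__foo" when a user label prefix is in effect and the name is not
   fastcall-prefixed); in the refptr case the name is "*.refptr.foo" or
   "*refptr.foo" depending on the label prefix.  The actual access is then
   a load through that pointer.  */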
16327 /* Expand SYMBOL into its corresponding far-address symbol.
16328 WANT_REG is true if we require the result be a register. */
16330 static rtx
16331 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16333 tree imp_decl;
16334 rtx x;
16336 gcc_assert (SYMBOL_REF_DECL (symbol));
16337 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16339 x = DECL_RTL (imp_decl);
16340 if (want_reg)
16341 x = force_reg (Pmode, x);
16342 return x;
16345 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16346 true if we require the result be a register. */
16348 static rtx
16349 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16351 tree imp_decl;
16352 rtx x;
16354 gcc_assert (SYMBOL_REF_DECL (symbol));
16355 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16357 x = DECL_RTL (imp_decl);
16358 if (want_reg)
16359 x = force_reg (Pmode, x);
16360 return x;
16363 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16364 is true if we require the result be a register. */
16366 static rtx
16367 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16369 if (!TARGET_PECOFF)
16370 return NULL_RTX;
16372 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16374 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16375 return legitimize_dllimport_symbol (addr, inreg);
16376 if (GET_CODE (addr) == CONST
16377 && GET_CODE (XEXP (addr, 0)) == PLUS
16378 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16379 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16381 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16382 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16386 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16387 return NULL_RTX;
16388 if (GET_CODE (addr) == SYMBOL_REF
16389 && !is_imported_p (addr)
16390 && SYMBOL_REF_EXTERNAL_P (addr)
16391 && SYMBOL_REF_DECL (addr))
16392 return legitimize_pe_coff_extern_decl (addr, inreg);
16394 if (GET_CODE (addr) == CONST
16395 && GET_CODE (XEXP (addr, 0)) == PLUS
16396 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16397 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16398 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16399 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16401 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16402 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16404 return NULL_RTX;
16407 /* Try machine-dependent ways of modifying an illegitimate address
16408 to be legitimate. If we find one, return the new, valid address.
16409 This macro is used in only one place: `memory_address' in explow.c.
16411 OLDX is the address as it was before break_out_memory_refs was called.
16412 In some cases it is useful to look at this to decide what needs to be done.
16414 It is always safe for this macro to do nothing. It exists to recognize
16415 opportunities to optimize the output.
16417 For the 80386, we handle X+REG by loading X into a register R and
16418 using R+REG. R will go in a general reg and indexing will be used.
16419 However, if REG is a broken-out memory address or multiplication,
16420 nothing needs to be done because REG can certainly go in a general reg.
16422 When -fpic is used, special handling is needed for symbolic references.
16423 See comments by legitimize_pic_address in i386.c for details. */
16425 static rtx
16426 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16428 bool changed = false;
16429 unsigned log;
16431 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16432 if (log)
16433 return legitimize_tls_address (x, (enum tls_model) log, false);
16434 if (GET_CODE (x) == CONST
16435 && GET_CODE (XEXP (x, 0)) == PLUS
16436 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16437 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16439 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16440 (enum tls_model) log, false);
16441 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16444 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16446 rtx tmp = legitimize_pe_coff_symbol (x, true);
16447 if (tmp)
16448 return tmp;
16451 if (flag_pic && SYMBOLIC_CONST (x))
16452 return legitimize_pic_address (x, 0);
16454 #if TARGET_MACHO
16455 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16456 return machopic_indirect_data_reference (x, 0);
16457 #endif
16459 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16460 if (GET_CODE (x) == ASHIFT
16461 && CONST_INT_P (XEXP (x, 1))
16462 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16464 changed = true;
16465 log = INTVAL (XEXP (x, 1));
16466 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16467 GEN_INT (1 << log));
16470 if (GET_CODE (x) == PLUS)
16472 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16474 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16475 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16476 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16478 changed = true;
16479 log = INTVAL (XEXP (XEXP (x, 0), 1));
16480 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16481 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16482 GEN_INT (1 << log));
16485 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16486 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16487 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16489 changed = true;
16490 log = INTVAL (XEXP (XEXP (x, 1), 1));
16491 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16492 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16493 GEN_INT (1 << log));
16496 /* Put multiply first if it isn't already. */
16497 if (GET_CODE (XEXP (x, 1)) == MULT)
16499 std::swap (XEXP (x, 0), XEXP (x, 1));
16500 changed = true;
16503 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16504 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16505 created by virtual register instantiation, register elimination, and
16506 similar optimizations. */
16507 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16509 changed = true;
16510 x = gen_rtx_PLUS (Pmode,
16511 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16512 XEXP (XEXP (x, 1), 0)),
16513 XEXP (XEXP (x, 1), 1));
16516 /* Canonicalize
16517 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16518 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16519 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16520 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16521 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16522 && CONSTANT_P (XEXP (x, 1)))
16524 rtx constant;
16525 rtx other = NULL_RTX;
16527 if (CONST_INT_P (XEXP (x, 1)))
16529 constant = XEXP (x, 1);
16530 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16532 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16534 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16535 other = XEXP (x, 1);
16537 else
16538 constant = 0;
16540 if (constant)
16542 changed = true;
16543 x = gen_rtx_PLUS (Pmode,
16544 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16545 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16546 plus_constant (Pmode, other,
16547 INTVAL (constant)));
16551 if (changed && ix86_legitimate_address_p (mode, x, false))
16552 return x;
16554 if (GET_CODE (XEXP (x, 0)) == MULT)
16556 changed = true;
16557 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16560 if (GET_CODE (XEXP (x, 1)) == MULT)
16562 changed = true;
16563 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16566 if (changed
16567 && REG_P (XEXP (x, 1))
16568 && REG_P (XEXP (x, 0)))
16569 return x;
16571 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16573 changed = true;
16574 x = legitimize_pic_address (x, 0);
16577 if (changed && ix86_legitimate_address_p (mode, x, false))
16578 return x;
16580 if (REG_P (XEXP (x, 0)))
16582 rtx temp = gen_reg_rtx (Pmode);
16583 rtx val = force_operand (XEXP (x, 1), temp);
16584 if (val != temp)
16586 val = convert_to_mode (Pmode, val, 1);
16587 emit_move_insn (temp, val);
16590 XEXP (x, 1) = temp;
16591 return x;
16594 else if (REG_P (XEXP (x, 1)))
16596 rtx temp = gen_reg_rtx (Pmode);
16597 rtx val = force_operand (XEXP (x, 0), temp);
16598 if (val != temp)
16600 val = convert_to_mode (Pmode, val, 1);
16601 emit_move_insn (temp, val);
16604 XEXP (x, 0) = temp;
16605 return x;
16609 return x;
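/* Illustrative note (hypothetical example): an address such as
     (plus (reg B) (ashift (reg I) (const_int 3)))
   is canonicalized by the code above into the base + index*scale shape
     (plus (mult (reg I') (const_int 8)) (reg B))
   with the shifted operand forced into a register, after which it
   satisfies ix86_legitimate_address_p.  */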
16612 /* Print an integer constant expression in assembler syntax. Addition
16613 and subtraction are the only arithmetic that may appear in these
16614 expressions. FILE is the stdio stream to write to, X is the rtx, and
16615 CODE is the operand print code from the output string. */
16617 static void
16618 output_pic_addr_const (FILE *file, rtx x, int code)
16620 char buf[256];
16622 switch (GET_CODE (x))
16624 case PC:
16625 gcc_assert (flag_pic);
16626 putc ('.', file);
16627 break;
16629 case SYMBOL_REF:
16630 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16631 output_addr_const (file, x);
16632 else
16634 const char *name = XSTR (x, 0);
16636 /* Mark the decl as referenced so that cgraph will
16637 output the function. */
16638 if (SYMBOL_REF_DECL (x))
16639 mark_decl_referenced (SYMBOL_REF_DECL (x));
16641 #if TARGET_MACHO
16642 if (MACHOPIC_INDIRECT
16643 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16644 name = machopic_indirection_name (x, /*stub_p=*/true);
16645 #endif
16646 assemble_name (file, name);
16648 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16649 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16650 fputs ("@PLT", file);
16651 break;
16653 case LABEL_REF:
16654 x = XEXP (x, 0);
16655 /* FALLTHRU */
16656 case CODE_LABEL:
16657 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16658 assemble_name (asm_out_file, buf);
16659 break;
16661 case CONST_INT:
16662 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16663 break;
16665 case CONST:
16666 /* This used to output parentheses around the expression,
16667 but that does not work on the 386 (either ATT or BSD assembler). */
16668 output_pic_addr_const (file, XEXP (x, 0), code);
16669 break;
16671 case CONST_DOUBLE:
16672 /* We can't handle floating point constants;
16673 TARGET_PRINT_OPERAND must handle them. */
16674 output_operand_lossage ("floating constant misused");
16675 break;
16677 case PLUS:
16678 /* Some assemblers need integer constants to appear first. */
16679 if (CONST_INT_P (XEXP (x, 0)))
16681 output_pic_addr_const (file, XEXP (x, 0), code);
16682 putc ('+', file);
16683 output_pic_addr_const (file, XEXP (x, 1), code);
16685 else
16687 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16688 output_pic_addr_const (file, XEXP (x, 1), code);
16689 putc ('+', file);
16690 output_pic_addr_const (file, XEXP (x, 0), code);
16692 break;
16694 case MINUS:
16695 if (!TARGET_MACHO)
16696 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16697 output_pic_addr_const (file, XEXP (x, 0), code);
16698 putc ('-', file);
16699 output_pic_addr_const (file, XEXP (x, 1), code);
16700 if (!TARGET_MACHO)
16701 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16702 break;
16704 case UNSPEC:
16705 gcc_assert (XVECLEN (x, 0) == 1);
16706 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16707 switch (XINT (x, 1))
16709 case UNSPEC_GOT:
16710 fputs ("@GOT", file);
16711 break;
16712 case UNSPEC_GOTOFF:
16713 fputs ("@GOTOFF", file);
16714 break;
16715 case UNSPEC_PLTOFF:
16716 fputs ("@PLTOFF", file);
16717 break;
16718 case UNSPEC_PCREL:
16719 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16720 "(%rip)" : "[rip]", file);
16721 break;
16722 case UNSPEC_GOTPCREL:
16723 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16724 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16725 break;
16726 case UNSPEC_GOTTPOFF:
16727 /* FIXME: This might be @TPOFF in Sun ld too. */
16728 fputs ("@gottpoff", file);
16729 break;
16730 case UNSPEC_TPOFF:
16731 fputs ("@tpoff", file);
16732 break;
16733 case UNSPEC_NTPOFF:
16734 if (TARGET_64BIT)
16735 fputs ("@tpoff", file);
16736 else
16737 fputs ("@ntpoff", file);
16738 break;
16739 case UNSPEC_DTPOFF:
16740 fputs ("@dtpoff", file);
16741 break;
16742 case UNSPEC_GOTNTPOFF:
16743 if (TARGET_64BIT)
16744 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16745 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16746 else
16747 fputs ("@gotntpoff", file);
16748 break;
16749 case UNSPEC_INDNTPOFF:
16750 fputs ("@indntpoff", file);
16751 break;
16752 #if TARGET_MACHO
16753 case UNSPEC_MACHOPIC_OFFSET:
16754 putc ('-', file);
16755 machopic_output_function_base_name (file);
16756 break;
16757 #endif
16758 default:
16759 output_operand_lossage ("invalid UNSPEC as operand");
16760 break;
16762 break;
16764 default:
16765 output_operand_lossage ("invalid expression as operand");
16769 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16770 We need to emit DTP-relative relocations. */
16772 static void ATTRIBUTE_UNUSED
16773 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16775 fputs (ASM_LONG, file);
16776 output_addr_const (file, x);
16777 fputs ("@dtpoff", file);
16778 switch (size)
16780 case 4:
16781 break;
16782 case 8:
16783 fputs (", 0", file);
16784 break;
16785 default:
16786 gcc_unreachable ();
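/* Illustrative note (hypothetical example): for a DTP-relative reference
   to a symbol "x" this emits roughly
     .long x@dtpoff
   for SIZE 4 and
     .long x@dtpoff, 0
   for SIZE 8 (ASM_LONG normally being the .long pseudo-op); the DWARF
   writer uses this when describing the location of TLS variables.  */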
16790 /* Return true if X is a representation of the PIC register. This copes
16791 with calls from ix86_find_base_term, where the register might have
16792 been replaced by a cselib value. */
16794 static bool
16795 ix86_pic_register_p (rtx x)
16797 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16798 return (pic_offset_table_rtx
16799 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16800 else if (!REG_P (x))
16801 return false;
16802 else if (pic_offset_table_rtx)
16804 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16805 return true;
16806 if (HARD_REGISTER_P (x)
16807 && !HARD_REGISTER_P (pic_offset_table_rtx)
16808 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16809 return true;
16810 return false;
16812 else
16813 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16816 /* Helper function for ix86_delegitimize_address.
16817 Attempt to delegitimize TLS local-exec accesses. */
16819 static rtx
16820 ix86_delegitimize_tls_address (rtx orig_x)
16822 rtx x = orig_x, unspec;
16823 struct ix86_address addr;
16825 if (!TARGET_TLS_DIRECT_SEG_REFS)
16826 return orig_x;
16827 if (MEM_P (x))
16828 x = XEXP (x, 0);
16829 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16830 return orig_x;
16831 if (ix86_decompose_address (x, &addr) == 0
16832 || addr.seg != DEFAULT_TLS_SEG_REG
16833 || addr.disp == NULL_RTX
16834 || GET_CODE (addr.disp) != CONST)
16835 return orig_x;
16836 unspec = XEXP (addr.disp, 0);
16837 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16838 unspec = XEXP (unspec, 0);
16839 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16840 return orig_x;
16841 x = XVECEXP (unspec, 0, 0);
16842 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16843 if (unspec != XEXP (addr.disp, 0))
16844 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16845 if (addr.index)
16847 rtx idx = addr.index;
16848 if (addr.scale != 1)
16849 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16850 x = gen_rtx_PLUS (Pmode, idx, x);
16852 if (addr.base)
16853 x = gen_rtx_PLUS (Pmode, addr.base, x);
16854 if (MEM_P (orig_x))
16855 x = replace_equiv_address_nv (orig_x, x);
16856 return x;
16859 /* In the name of slightly smaller debug output, and to cater to
16860 general assembler lossage, recognize PIC+GOTOFF and turn it back
16861 into a direct symbol reference.
16863 On Darwin, this is necessary to avoid a crash, because Darwin
16864 has a different PIC label for each routine but the DWARF debugging
16865 information is not associated with any particular routine, so it's
16866 necessary to remove references to the PIC label from RTL stored by
16867 the DWARF output code.
16869 This helper is used in the normal ix86_delegitimize_address
16870 entrypoint (e.g. used in the target delegitimization hook) and
16871 in ix86_find_base_term. As a compile-time memory optimization, we
16872 avoid allocating rtxes that will not change anything in the outcome
16873 of the callers (find_base_value and find_base_term). */
16875 static inline rtx
16876 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16878 rtx orig_x = delegitimize_mem_from_attrs (x);
16879 /* addend is NULL or some rtx if x is something+GOTOFF where
16880 something doesn't include the PIC register. */
16881 rtx addend = NULL_RTX;
16882 /* reg_addend is NULL or a multiple of some register. */
16883 rtx reg_addend = NULL_RTX;
16884 /* const_addend is NULL or a const_int. */
16885 rtx const_addend = NULL_RTX;
16886 /* This is the result, or NULL. */
16887 rtx result = NULL_RTX;
16889 x = orig_x;
16891 if (MEM_P (x))
16892 x = XEXP (x, 0);
16894 if (TARGET_64BIT)
16896 if (GET_CODE (x) == CONST
16897 && GET_CODE (XEXP (x, 0)) == PLUS
16898 && GET_MODE (XEXP (x, 0)) == Pmode
16899 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16900 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16901 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16903 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16904 base. A CONST can't be arg_pointer_rtx based. */
16905 if (base_term_p && MEM_P (orig_x))
16906 return orig_x;
16907 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16908 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16909 if (MEM_P (orig_x))
16910 x = replace_equiv_address_nv (orig_x, x);
16911 return x;
16914 if (GET_CODE (x) == CONST
16915 && GET_CODE (XEXP (x, 0)) == UNSPEC
16916 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16917 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16918 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16920 x = XVECEXP (XEXP (x, 0), 0, 0);
16921 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16923 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16924 if (x == NULL_RTX)
16925 return orig_x;
16927 return x;
16930 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16931 return ix86_delegitimize_tls_address (orig_x);
16933 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16934 and -mcmodel=medium -fpic. */
16937 if (GET_CODE (x) != PLUS
16938 || GET_CODE (XEXP (x, 1)) != CONST)
16939 return ix86_delegitimize_tls_address (orig_x);
16941 if (ix86_pic_register_p (XEXP (x, 0)))
16942 /* %ebx + GOT/GOTOFF */
16944 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16946 /* %ebx + %reg * scale + GOT/GOTOFF */
16947 reg_addend = XEXP (x, 0);
16948 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16949 reg_addend = XEXP (reg_addend, 1);
16950 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16951 reg_addend = XEXP (reg_addend, 0);
16952 else
16954 reg_addend = NULL_RTX;
16955 addend = XEXP (x, 0);
16958 else
16959 addend = XEXP (x, 0);
16961 x = XEXP (XEXP (x, 1), 0);
16962 if (GET_CODE (x) == PLUS
16963 && CONST_INT_P (XEXP (x, 1)))
16965 const_addend = XEXP (x, 1);
16966 x = XEXP (x, 0);
16969 if (GET_CODE (x) == UNSPEC
16970 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16971 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16972 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16973 && !MEM_P (orig_x) && !addend)))
16974 result = XVECEXP (x, 0, 0);
16976 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16977 && !MEM_P (orig_x))
16978 result = XVECEXP (x, 0, 0);
16980 if (! result)
16981 return ix86_delegitimize_tls_address (orig_x);
16983 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16984 recurse on the first operand. */
16985 if (const_addend && !base_term_p)
16986 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16987 if (reg_addend)
16988 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16989 if (addend)
16991 /* If the rest of original X doesn't involve the PIC register, add
16992 addend and subtract pic_offset_table_rtx. This can happen e.g.
16993 for code like:
16994 leal (%ebx, %ecx, 4), %ecx
16996 movl foo@GOTOFF(%ecx), %edx
16997 in which case we return (%ecx - %ebx) + foo
16998 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
16999 and reload has completed. Don't do the latter for debug,
17000 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17001 if (pic_offset_table_rtx
17002 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17003 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17004 pic_offset_table_rtx),
17005 result);
17006 else if (base_term_p
17007 && pic_offset_table_rtx
17008 && !TARGET_MACHO
17009 && !TARGET_VXWORKS_RTP)
17011 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17012 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17013 result = gen_rtx_PLUS (Pmode, tmp, result);
17015 else
17016 return orig_x;
17018 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17020 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17021 if (result == NULL_RTX)
17022 return orig_x;
17024 return result;
17027 /* The normal instantiation of the above template. */
17029 static rtx
17030 ix86_delegitimize_address (rtx x)
17032 return ix86_delegitimize_address_1 (x, false);
17035 /* If X is a machine specific address (i.e. a symbol or label being
17036 referenced as a displacement from the GOT implemented using an
17037 UNSPEC), then return the base term. Otherwise return X. */
17040 ix86_find_base_term (rtx x)
17042 rtx term;
17044 if (TARGET_64BIT)
17046 if (GET_CODE (x) != CONST)
17047 return x;
17048 term = XEXP (x, 0);
17049 if (GET_CODE (term) == PLUS
17050 && CONST_INT_P (XEXP (term, 1)))
17051 term = XEXP (term, 0);
17052 if (GET_CODE (term) != UNSPEC
17053 || (XINT (term, 1) != UNSPEC_GOTPCREL
17054 && XINT (term, 1) != UNSPEC_PCREL))
17055 return x;
17057 return XVECEXP (term, 0, 0);
17060 return ix86_delegitimize_address_1 (x, true);
17063 /* Return true if X shouldn't be emitted into the debug info.
17064 Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
17065 symbol easily into the .debug_info section, so we need not
17066 delegitimize it, but instead assemble it as @gotoff.
17067 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17068 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17070 static bool
17071 ix86_const_not_ok_for_debug_p (rtx x)
17073 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17074 return true;
17076 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17077 return true;
17079 return false;
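/* Print to FILE the condition-code suffix (e.g. "e", "ne", "g", "b") for
   comparison CODE in mode MODE. REVERSE reverses the condition; FP
   selects the fcmov-style spelling where it differs from the integer
   one (e.g. "nbe" instead of "a"). */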
17082 static void
17083 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17084 bool fp, FILE *file)
17086 const char *suffix;
17088 if (mode == CCFPmode)
17090 code = ix86_fp_compare_code_to_integer (code);
17091 mode = CCmode;
17093 if (reverse)
17094 code = reverse_condition (code);
17096 switch (code)
17098 case EQ:
17099 gcc_assert (mode != CCGZmode);
17100 switch (mode)
17102 case E_CCAmode:
17103 suffix = "a";
17104 break;
17105 case E_CCCmode:
17106 suffix = "c";
17107 break;
17108 case E_CCOmode:
17109 suffix = "o";
17110 break;
17111 case E_CCPmode:
17112 suffix = "p";
17113 break;
17114 case E_CCSmode:
17115 suffix = "s";
17116 break;
17117 default:
17118 suffix = "e";
17119 break;
17121 break;
17122 case NE:
17123 gcc_assert (mode != CCGZmode);
17124 switch (mode)
17126 case E_CCAmode:
17127 suffix = "na";
17128 break;
17129 case E_CCCmode:
17130 suffix = "nc";
17131 break;
17132 case E_CCOmode:
17133 suffix = "no";
17134 break;
17135 case E_CCPmode:
17136 suffix = "np";
17137 break;
17138 case E_CCSmode:
17139 suffix = "ns";
17140 break;
17141 default:
17142 suffix = "ne";
17143 break;
17145 break;
17146 case GT:
17147 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17148 suffix = "g";
17149 break;
17150 case GTU:
17151 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17152 Those same assemblers have the same but opposite lossage on cmov. */
17153 if (mode == CCmode)
17154 suffix = fp ? "nbe" : "a";
17155 else
17156 gcc_unreachable ();
17157 break;
17158 case LT:
17159 switch (mode)
17161 case E_CCNOmode:
17162 case E_CCGOCmode:
17163 suffix = "s";
17164 break;
17166 case E_CCmode:
17167 case E_CCGCmode:
17168 case E_CCGZmode:
17169 suffix = "l";
17170 break;
17172 default:
17173 gcc_unreachable ();
17175 break;
17176 case LTU:
17177 if (mode == CCmode || mode == CCGZmode)
17178 suffix = "b";
17179 else if (mode == CCCmode)
17180 suffix = fp ? "b" : "c";
17181 else
17182 gcc_unreachable ();
17183 break;
17184 case GE:
17185 switch (mode)
17187 case E_CCNOmode:
17188 case E_CCGOCmode:
17189 suffix = "ns";
17190 break;
17192 case E_CCmode:
17193 case E_CCGCmode:
17194 case E_CCGZmode:
17195 suffix = "ge";
17196 break;
17198 default:
17199 gcc_unreachable ();
17201 break;
17202 case GEU:
17203 if (mode == CCmode || mode == CCGZmode)
17204 suffix = "nb";
17205 else if (mode == CCCmode)
17206 suffix = fp ? "nb" : "nc";
17207 else
17208 gcc_unreachable ();
17209 break;
17210 case LE:
17211 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17212 suffix = "le";
17213 break;
17214 case LEU:
17215 if (mode == CCmode)
17216 suffix = "be";
17217 else
17218 gcc_unreachable ();
17219 break;
17220 case UNORDERED:
17221 suffix = fp ? "u" : "p";
17222 break;
17223 case ORDERED:
17224 suffix = fp ? "nu" : "np";
17225 break;
17226 default:
17227 gcc_unreachable ();
17229 fputs (suffix, file);
17232 /* Print the name of register X to FILE based on its machine mode and number.
17233 If CODE is 'w', pretend the mode is HImode.
17234 If CODE is 'b', pretend the mode is QImode.
17235 If CODE is 'k', pretend the mode is SImode.
17236 If CODE is 'q', pretend the mode is DImode.
17237 If CODE is 'x', pretend the mode is V4SFmode.
17238 If CODE is 't', pretend the mode is V8SFmode.
17239 If CODE is 'g', pretend the mode is V16SFmode.
17240 If CODE is 'h', pretend the reg is the 'high' byte register.
17241 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17242 If CODE is 'd', duplicate the operand for AVX instruction.
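/* For example (illustrative): with CODE 'k' a DImode register operand
   such as %rax is printed using its 32-bit name, i.e. %eax in AT&T
   syntax. */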
17245 void
17246 print_reg (rtx x, int code, FILE *file)
17248 const char *reg;
17249 int msize;
17250 unsigned int regno;
17251 bool duplicated;
17253 if (ASSEMBLER_DIALECT == ASM_ATT)
17254 putc ('%', file);
17256 if (x == pc_rtx)
17258 gcc_assert (TARGET_64BIT);
17259 fputs ("rip", file);
17260 return;
17263 if (code == 'y' && STACK_TOP_P (x))
17265 fputs ("st(0)", file);
17266 return;
17269 if (code == 'w')
17270 msize = 2;
17271 else if (code == 'b')
17272 msize = 1;
17273 else if (code == 'k')
17274 msize = 4;
17275 else if (code == 'q')
17276 msize = 8;
17277 else if (code == 'h')
17278 msize = 0;
17279 else if (code == 'x')
17280 msize = 16;
17281 else if (code == 't')
17282 msize = 32;
17283 else if (code == 'g')
17284 msize = 64;
17285 else
17286 msize = GET_MODE_SIZE (GET_MODE (x));
17288 regno = REGNO (x);
17290 if (regno == ARG_POINTER_REGNUM
17291 || regno == FRAME_POINTER_REGNUM
17292 || regno == FPSR_REG
17293 || regno == FPCR_REG)
17295 output_operand_lossage
17296 ("invalid use of register '%s'", reg_names[regno]);
17297 return;
17299 else if (regno == FLAGS_REG)
17301 output_operand_lossage ("invalid use of asm flag output");
17302 return;
17305 duplicated = code == 'd' && TARGET_AVX;
17307 switch (msize)
17309 case 16:
17310 case 12:
17311 case 8:
17312 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17313 warning (0, "unsupported size for integer register");
17314 /* FALLTHRU */
17315 case 4:
17316 if (LEGACY_INT_REGNO_P (regno))
17317 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17318 /* FALLTHRU */
17319 case 2:
17320 normal:
17321 reg = hi_reg_name[regno];
17322 break;
17323 case 1:
17324 if (regno >= ARRAY_SIZE (qi_reg_name))
17325 goto normal;
17326 if (!ANY_QI_REGNO_P (regno))
17327 error ("unsupported size for integer register");
17328 reg = qi_reg_name[regno];
17329 break;
17330 case 0:
17331 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17332 goto normal;
17333 reg = qi_high_reg_name[regno];
17334 break;
17335 case 32:
17336 case 64:
17337 if (SSE_REGNO_P (regno))
17339 gcc_assert (!duplicated);
17340 putc (msize == 32 ? 'y' : 'z', file);
17341 reg = hi_reg_name[regno] + 1;
17342 break;
17344 goto normal;
17345 default:
17346 gcc_unreachable ();
17349 fputs (reg, file);
17351 /* Irritatingly, AMD extended registers use
17352 a different naming convention: "r%d[bwd]". */
17353 if (REX_INT_REGNO_P (regno))
17355 gcc_assert (TARGET_64BIT);
17356 switch (msize)
17358 case 0:
17359 error ("extended registers have no high halves");
17360 break;
17361 case 1:
17362 putc ('b', file);
17363 break;
17364 case 2:
17365 putc ('w', file);
17366 break;
17367 case 4:
17368 putc ('d', file);
17369 break;
17370 case 8:
17371 /* no suffix */
17372 break;
17373 default:
17374 error ("unsupported operand size for extended register");
17375 break;
17377 return;
17380 if (duplicated)
17382 if (ASSEMBLER_DIALECT == ASM_ATT)
17383 fprintf (file, ", %%%s", reg);
17384 else
17385 fprintf (file, ", %s", reg);
17389 /* Meaning of CODE:
17390 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17391 C -- print opcode suffix for set/cmov insn.
17392 c -- like C, but print reversed condition
17393 F,f -- likewise, but for floating-point.
17394 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17395 otherwise nothing
17396 R -- print embedded rounding and sae.
17397 r -- print only sae.
17398 z -- print the opcode suffix for the size of the current operand.
17399 Z -- likewise, with special suffixes for x87 instructions.
17400 * -- print a star (in certain assembler syntax)
17401 A -- print an absolute memory reference.
17402 E -- print address with DImode register names if TARGET_64BIT.
17403 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17404 s -- print a shift double count, followed by the assembler's argument
17405 delimiter.
17406 b -- print the QImode name of the register for the indicated operand.
17407 %b0 would print %al if operands[0] is reg 0.
17408 w -- likewise, print the HImode name of the register.
17409 k -- likewise, print the SImode name of the register.
17410 q -- likewise, print the DImode name of the register.
17411 x -- likewise, print the V4SFmode name of the register.
17412 t -- likewise, print the V8SFmode name of the register.
17413 g -- likewise, print the V16SFmode name of the register.
17414 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17415 y -- print "st(0)" instead of "st" as a register.
17416 d -- print duplicated register operand for AVX instruction.
17417 D -- print condition for SSE cmp instruction.
17418 P -- if PIC, print an @PLT suffix.
17419 p -- print raw symbol name.
17420 X -- don't print any sort of PIC '@' suffix for a symbol.
17421 & -- print some in-use local-dynamic symbol name.
17422 H -- print a memory address offset by 8; used for sse high-parts
17423 Y -- print condition for XOP pcom* instruction.
17424 + -- print a branch hint as 'cs' or 'ds' prefix
17425 ; -- print a semicolon (after prefixes due to a bug in older gas).
17426 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17427 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17428 ! -- print MPX or NOTRACK prefix for jxx/call/ret instructions if required.
17431 void
17432 ix86_print_operand (FILE *file, rtx x, int code)
17434 if (code)
17436 switch (code)
17438 case 'A':
17439 switch (ASSEMBLER_DIALECT)
17441 case ASM_ATT:
17442 putc ('*', file);
17443 break;
17445 case ASM_INTEL:
17446 /* Intel syntax. For absolute addresses, registers should not
17447 be surrounded by brackets. */
17448 if (!REG_P (x))
17450 putc ('[', file);
17451 ix86_print_operand (file, x, 0);
17452 putc (']', file);
17453 return;
17455 break;
17457 default:
17458 gcc_unreachable ();
17461 ix86_print_operand (file, x, 0);
17462 return;
17464 case 'E':
17465 /* Wrap address in an UNSPEC to declare special handling. */
17466 if (TARGET_64BIT)
17467 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17469 output_address (VOIDmode, x);
17470 return;
17472 case 'L':
17473 if (ASSEMBLER_DIALECT == ASM_ATT)
17474 putc ('l', file);
17475 return;
17477 case 'W':
17478 if (ASSEMBLER_DIALECT == ASM_ATT)
17479 putc ('w', file);
17480 return;
17482 case 'B':
17483 if (ASSEMBLER_DIALECT == ASM_ATT)
17484 putc ('b', file);
17485 return;
17487 case 'Q':
17488 if (ASSEMBLER_DIALECT == ASM_ATT)
17489 putc ('l', file);
17490 return;
17492 case 'S':
17493 if (ASSEMBLER_DIALECT == ASM_ATT)
17494 putc ('s', file);
17495 return;
17497 case 'T':
17498 if (ASSEMBLER_DIALECT == ASM_ATT)
17499 putc ('t', file);
17500 return;
17502 case 'O':
17503 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17504 if (ASSEMBLER_DIALECT != ASM_ATT)
17505 return;
17507 switch (GET_MODE_SIZE (GET_MODE (x)))
17509 case 2:
17510 putc ('w', file);
17511 break;
17513 case 4:
17514 putc ('l', file);
17515 break;
17517 case 8:
17518 putc ('q', file);
17519 break;
17521 default:
17522 output_operand_lossage ("invalid operand size for operand "
17523 "code 'O'");
17524 return;
17527 putc ('.', file);
17528 #endif
17529 return;
17531 case 'z':
17532 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17534 /* Opcodes don't get size suffixes when using Intel syntax. */
17535 if (ASSEMBLER_DIALECT == ASM_INTEL)
17536 return;
17538 switch (GET_MODE_SIZE (GET_MODE (x)))
17540 case 1:
17541 putc ('b', file);
17542 return;
17544 case 2:
17545 putc ('w', file);
17546 return;
17548 case 4:
17549 putc ('l', file);
17550 return;
17552 case 8:
17553 putc ('q', file);
17554 return;
17556 default:
17557 output_operand_lossage ("invalid operand size for operand "
17558 "code 'z'");
17559 return;
17563 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17564 warning (0, "non-integer operand used with operand code 'z'");
17565 /* FALLTHRU */
17567 case 'Z':
17568 /* 387 opcodes don't get size suffixes when using Intel syntax. */
17569 if (ASSEMBLER_DIALECT == ASM_INTEL)
17570 return;
17572 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17574 switch (GET_MODE_SIZE (GET_MODE (x)))
17576 case 2:
17577 #ifdef HAVE_AS_IX86_FILDS
17578 putc ('s', file);
17579 #endif
17580 return;
17582 case 4:
17583 putc ('l', file);
17584 return;
17586 case 8:
17587 #ifdef HAVE_AS_IX86_FILDQ
17588 putc ('q', file);
17589 #else
17590 fputs ("ll", file);
17591 #endif
17592 return;
17594 default:
17595 break;
17598 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17600 /* 387 opcodes don't get size suffixes
17601 if the operands are registers. */
17602 if (STACK_REG_P (x))
17603 return;
17605 switch (GET_MODE_SIZE (GET_MODE (x)))
17607 case 4:
17608 putc ('s', file);
17609 return;
17611 case 8:
17612 putc ('l', file);
17613 return;
17615 case 12:
17616 case 16:
17617 putc ('t', file);
17618 return;
17620 default:
17621 break;
17624 else
17626 output_operand_lossage ("invalid operand type used with "
17627 "operand code 'Z'");
17628 return;
17631 output_operand_lossage ("invalid operand size for operand code 'Z'");
17632 return;
17634 case 'd':
17635 case 'b':
17636 case 'w':
17637 case 'k':
17638 case 'q':
17639 case 'h':
17640 case 't':
17641 case 'g':
17642 case 'y':
17643 case 'x':
17644 case 'X':
17645 case 'P':
17646 case 'p':
17647 break;
17649 case 's':
17650 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17652 ix86_print_operand (file, x, 0);
17653 fputs (", ", file);
17655 return;
17657 case 'Y':
17658 switch (GET_CODE (x))
17660 case NE:
17661 fputs ("neq", file);
17662 break;
17663 case EQ:
17664 fputs ("eq", file);
17665 break;
17666 case GE:
17667 case GEU:
17668 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17669 break;
17670 case GT:
17671 case GTU:
17672 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17673 break;
17674 case LE:
17675 case LEU:
17676 fputs ("le", file);
17677 break;
17678 case LT:
17679 case LTU:
17680 fputs ("lt", file);
17681 break;
17682 case UNORDERED:
17683 fputs ("unord", file);
17684 break;
17685 case ORDERED:
17686 fputs ("ord", file);
17687 break;
17688 case UNEQ:
17689 fputs ("ueq", file);
17690 break;
17691 case UNGE:
17692 fputs ("nlt", file);
17693 break;
17694 case UNGT:
17695 fputs ("nle", file);
17696 break;
17697 case UNLE:
17698 fputs ("ule", file);
17699 break;
17700 case UNLT:
17701 fputs ("ult", file);
17702 break;
17703 case LTGT:
17704 fputs ("une", file);
17705 break;
17706 default:
17707 output_operand_lossage ("operand is not a condition code, "
17708 "invalid operand code 'Y'");
17709 return;
17711 return;
17713 case 'D':
17714 /* A little bit of braindamage here. The SSE compare instructions
17715 use completely different names for the comparisons than the
17716 fp conditional moves do. */
17717 switch (GET_CODE (x))
17719 case UNEQ:
17720 if (TARGET_AVX)
17722 fputs ("eq_us", file);
17723 break;
17725 /* FALLTHRU */
17726 case EQ:
17727 fputs ("eq", file);
17728 break;
17729 case UNLT:
17730 if (TARGET_AVX)
17732 fputs ("nge", file);
17733 break;
17735 /* FALLTHRU */
17736 case LT:
17737 fputs ("lt", file);
17738 break;
17739 case UNLE:
17740 if (TARGET_AVX)
17742 fputs ("ngt", file);
17743 break;
17745 /* FALLTHRU */
17746 case LE:
17747 fputs ("le", file);
17748 break;
17749 case UNORDERED:
17750 fputs ("unord", file);
17751 break;
17752 case LTGT:
17753 if (TARGET_AVX)
17755 fputs ("neq_oq", file);
17756 break;
17758 /* FALLTHRU */
17759 case NE:
17760 fputs ("neq", file);
17761 break;
17762 case GE:
17763 if (TARGET_AVX)
17765 fputs ("ge", file);
17766 break;
17768 /* FALLTHRU */
17769 case UNGE:
17770 fputs ("nlt", file);
17771 break;
17772 case GT:
17773 if (TARGET_AVX)
17775 fputs ("gt", file);
17776 break;
17778 /* FALLTHRU */
17779 case UNGT:
17780 fputs ("nle", file);
17781 break;
17782 case ORDERED:
17783 fputs ("ord", file);
17784 break;
17785 default:
17786 output_operand_lossage ("operand is not a condition code, "
17787 "invalid operand code 'D'");
17788 return;
17790 return;
17792 case 'F':
17793 case 'f':
17794 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17795 if (ASSEMBLER_DIALECT == ASM_ATT)
17796 putc ('.', file);
17797 gcc_fallthrough ();
17798 #endif
17800 case 'C':
17801 case 'c':
17802 if (!COMPARISON_P (x))
17804 output_operand_lossage ("operand is not a condition code, "
17805 "invalid operand code '%c'", code);
17806 return;
17808 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17809 code == 'c' || code == 'f',
17810 code == 'F' || code == 'f',
17811 file);
17812 return;
17814 case 'H':
17815 if (!offsettable_memref_p (x))
17817 output_operand_lossage ("operand is not an offsettable memory "
17818 "reference, invalid operand code 'H'");
17819 return;
17821 /* It doesn't actually matter what mode we use here, as we're
17822 only going to use this for printing. */
17823 x = adjust_address_nv (x, DImode, 8);
17824 /* Output 'qword ptr' for intel assembler dialect. */
17825 if (ASSEMBLER_DIALECT == ASM_INTEL)
17826 code = 'q';
17827 break;
17829 case 'K':
17830 if (!CONST_INT_P (x))
17832 output_operand_lossage ("operand is not an integer, invalid "
17833 "operand code 'K'");
17834 return;
17837 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17838 #ifdef HAVE_AS_IX86_HLE
17839 fputs ("xacquire ", file);
17840 #else
17841 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17842 #endif
17843 else if (INTVAL (x) & IX86_HLE_RELEASE)
17844 #ifdef HAVE_AS_IX86_HLE
17845 fputs ("xrelease ", file);
17846 #else
17847 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17848 #endif
17849 /* We do not want to print the value of the operand. */
17850 return;
17852 case 'N':
17853 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17854 fputs ("{z}", file);
17855 return;
17857 case 'r':
17858 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17860 output_operand_lossage ("operand is not a specific integer, "
17861 "invalid operand code 'r'");
17862 return;
17865 if (ASSEMBLER_DIALECT == ASM_INTEL)
17866 fputs (", ", file);
17868 fputs ("{sae}", file);
17870 if (ASSEMBLER_DIALECT == ASM_ATT)
17871 fputs (", ", file);
17873 return;
17875 case 'R':
17876 if (!CONST_INT_P (x))
17878 output_operand_lossage ("operand is not an integer, invalid "
17879 "operand code 'R'");
17880 return;
17883 if (ASSEMBLER_DIALECT == ASM_INTEL)
17884 fputs (", ", file);
17886 switch (INTVAL (x))
17888 case ROUND_NEAREST_INT | ROUND_SAE:
17889 fputs ("{rn-sae}", file);
17890 break;
17891 case ROUND_NEG_INF | ROUND_SAE:
17892 fputs ("{rd-sae}", file);
17893 break;
17894 case ROUND_POS_INF | ROUND_SAE:
17895 fputs ("{ru-sae}", file);
17896 break;
17897 case ROUND_ZERO | ROUND_SAE:
17898 fputs ("{rz-sae}", file);
17899 break;
17900 default:
17901 output_operand_lossage ("operand is not a specific integer, "
17902 "invalid operand code 'R'");
17905 if (ASSEMBLER_DIALECT == ASM_ATT)
17906 fputs (", ", file);
17908 return;
17910 case '*':
17911 if (ASSEMBLER_DIALECT == ASM_ATT)
17912 putc ('*', file);
17913 return;
17915 case '&':
17917 const char *name = get_some_local_dynamic_name ();
17918 if (name == NULL)
17919 output_operand_lossage ("'%%&' used without any "
17920 "local dynamic TLS references");
17921 else
17922 assemble_name (file, name);
17923 return;
17926 case '+':
17928 rtx x;
17930 if (!optimize
17931 || optimize_function_for_size_p (cfun)
17932 || !TARGET_BRANCH_PREDICTION_HINTS)
17933 return;
17935 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17936 if (x)
17938 int pred_val = profile_probability::from_reg_br_prob_note
17939 (XINT (x, 0)).to_reg_br_prob_base ();
17941 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17942 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17944 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17945 bool cputaken
17946 = final_forward_branch_p (current_output_insn) == 0;
17948 /* Emit hints only where the default branch prediction
17949 heuristics would fail. */
17950 if (taken != cputaken)
17952 /* We use 3e (DS) prefix for taken branches and
17953 2e (CS) prefix for not taken branches. */
17954 if (taken)
17955 fputs ("ds ; ", file);
17956 else
17957 fputs ("cs ; ", file);
17961 return;
17964 case ';':
17965 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17966 putc (';', file);
17967 #endif
17968 return;
17970 case '~':
17971 putc (TARGET_AVX2 ? 'i' : 'f', file);
17972 return;
17974 case '^':
17975 if (TARGET_64BIT && Pmode != word_mode)
17976 fputs ("addr32 ", file);
17977 return;
17979 case '!':
17980 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17981 fputs ("bnd ", file);
17982 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17983 fputs ("notrack ", file);
17984 return;
17986 default:
17987 output_operand_lossage ("invalid operand code '%c'", code);
17991 if (REG_P (x))
17992 print_reg (x, code, file);
17994 else if (MEM_P (x))
17996 rtx addr = XEXP (x, 0);
17998 /* No `byte ptr' prefix for call instructions ... */
17999 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18001 machine_mode mode = GET_MODE (x);
18002 const char *size;
18004 /* Check for explicit size override codes. */
18005 if (code == 'b')
18006 size = "BYTE";
18007 else if (code == 'w')
18008 size = "WORD";
18009 else if (code == 'k')
18010 size = "DWORD";
18011 else if (code == 'q')
18012 size = "QWORD";
18013 else if (code == 'x')
18014 size = "XMMWORD";
18015 else if (code == 't')
18016 size = "YMMWORD";
18017 else if (code == 'g')
18018 size = "ZMMWORD";
18019 else if (mode == BLKmode)
18020 /* ... or BLKmode operands, when not overridden. */
18021 size = NULL;
18022 else
18023 switch (GET_MODE_SIZE (mode))
18025 case 1: size = "BYTE"; break;
18026 case 2: size = "WORD"; break;
18027 case 4: size = "DWORD"; break;
18028 case 8: size = "QWORD"; break;
18029 case 12: size = "TBYTE"; break;
18030 case 16:
18031 if (mode == XFmode)
18032 size = "TBYTE";
18033 else
18034 size = "XMMWORD";
18035 break;
18036 case 32: size = "YMMWORD"; break;
18037 case 64: size = "ZMMWORD"; break;
18038 default:
18039 gcc_unreachable ();
18041 if (size)
18043 fputs (size, file);
18044 fputs (" PTR ", file);
18048 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18049 output_operand_lossage ("invalid constraints for operand");
18050 else
18051 ix86_print_operand_address_as
18052 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18055 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18057 long l;
18059 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18061 if (ASSEMBLER_DIALECT == ASM_ATT)
18062 putc ('$', file);
18063 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18064 if (code == 'q')
18065 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18066 (unsigned long long) (int) l);
18067 else
18068 fprintf (file, "0x%08x", (unsigned int) l);
18071 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18073 long l[2];
18075 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18077 if (ASSEMBLER_DIALECT == ASM_ATT)
18078 putc ('$', file);
18079 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18082 /* These float cases don't actually occur as immediate operands. */
18083 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18085 char dstr[30];
18087 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18088 fputs (dstr, file);
18091 else
18093 /* We have patterns that allow zero sets of memory, for instance.
18094 In 64-bit mode, we should probably support all 8-byte vectors,
18095 since we can in fact encode that into an immediate. */
18096 if (GET_CODE (x) == CONST_VECTOR)
18098 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18099 x = const0_rtx;
18102 if (code != 'P' && code != 'p')
18104 if (CONST_INT_P (x))
18106 if (ASSEMBLER_DIALECT == ASM_ATT)
18107 putc ('$', file);
18109 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18110 || GET_CODE (x) == LABEL_REF)
18112 if (ASSEMBLER_DIALECT == ASM_ATT)
18113 putc ('$', file);
18114 else
18115 fputs ("OFFSET FLAT:", file);
18118 if (CONST_INT_P (x))
18119 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18120 else if (flag_pic || MACHOPIC_INDIRECT)
18121 output_pic_addr_const (file, x, code);
18122 else
18123 output_addr_const (file, x);
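/* Return true if CODE is one of the punctuation operand codes
   ('*', '+', '&', ';', '~', '^', '!') documented in the table above
   ix86_print_operand. */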
18127 static bool
18128 ix86_print_operand_punct_valid_p (unsigned char code)
18130 return (code == '*' || code == '+' || code == '&' || code == ';'
18131 || code == '~' || code == '^' || code == '!');
18134 /* Print a memory operand whose address is ADDR. */
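/* Roughly, the AT&T form emitted below is "disp(base,index,scale)" and
   the Intel form is "[base+disp+index*scale]"; an fs:/gs: segment
   override, when present, is printed first. */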
18136 static void
18137 ix86_print_operand_address_as (FILE *file, rtx addr,
18138 addr_space_t as, bool no_rip)
18140 struct ix86_address parts;
18141 rtx base, index, disp;
18142 int scale;
18143 int ok;
18144 bool vsib = false;
18145 int code = 0;
18147 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18149 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18150 gcc_assert (parts.index == NULL_RTX);
18151 parts.index = XVECEXP (addr, 0, 1);
18152 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18153 addr = XVECEXP (addr, 0, 0);
18154 vsib = true;
18156 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18158 gcc_assert (TARGET_64BIT);
18159 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18160 code = 'q';
18162 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18164 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18165 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18166 if (parts.base != NULL_RTX)
18168 parts.index = parts.base;
18169 parts.scale = 1;
18171 parts.base = XVECEXP (addr, 0, 0);
18172 addr = XVECEXP (addr, 0, 0);
18174 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18176 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18177 gcc_assert (parts.index == NULL_RTX);
18178 parts.index = XVECEXP (addr, 0, 1);
18179 addr = XVECEXP (addr, 0, 0);
18181 else
18182 ok = ix86_decompose_address (addr, &parts);
18184 gcc_assert (ok);
18186 base = parts.base;
18187 index = parts.index;
18188 disp = parts.disp;
18189 scale = parts.scale;
18191 if (ADDR_SPACE_GENERIC_P (as))
18192 as = parts.seg;
18193 else
18194 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18196 if (!ADDR_SPACE_GENERIC_P (as))
18198 const char *string;
18200 if (as == ADDR_SPACE_SEG_FS)
18201 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18202 else if (as == ADDR_SPACE_SEG_GS)
18203 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18204 else
18205 gcc_unreachable ();
18206 fputs (string, file);
18209 /* Use one byte shorter RIP relative addressing for 64bit mode. */
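/* (Encoding background: in 64-bit mode an absolute disp32 address needs
   an extra SIB byte, while the RIP-relative form encodes the same
   displacement without one, hence the one byte saving.) */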
18210 if (TARGET_64BIT && !base && !index && !no_rip)
18212 rtx symbol = disp;
18214 if (GET_CODE (disp) == CONST
18215 && GET_CODE (XEXP (disp, 0)) == PLUS
18216 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18217 symbol = XEXP (XEXP (disp, 0), 0);
18219 if (GET_CODE (symbol) == LABEL_REF
18220 || (GET_CODE (symbol) == SYMBOL_REF
18221 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18222 base = pc_rtx;
18225 if (!base && !index)
18227 /* A displacement-only address requires special attention. */
18228 if (CONST_INT_P (disp))
18230 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18231 fputs ("ds:", file);
18232 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18234 /* Load the external function address via the GOT slot to avoid PLT. */
18235 else if (GET_CODE (disp) == CONST
18236 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18237 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18238 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18239 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18240 output_pic_addr_const (file, disp, 0);
18241 else if (flag_pic)
18242 output_pic_addr_const (file, disp, 0);
18243 else
18244 output_addr_const (file, disp);
18246 else
18248 /* Print SImode register names to force addr32 prefix. */
18249 if (SImode_address_operand (addr, VOIDmode))
18251 if (flag_checking)
18253 gcc_assert (TARGET_64BIT);
18254 switch (GET_CODE (addr))
18256 case SUBREG:
18257 gcc_assert (GET_MODE (addr) == SImode);
18258 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18259 break;
18260 case ZERO_EXTEND:
18261 case AND:
18262 gcc_assert (GET_MODE (addr) == DImode);
18263 break;
18264 default:
18265 gcc_unreachable ();
18268 gcc_assert (!code);
18269 code = 'k';
18271 else if (code == 0
18272 && TARGET_X32
18273 && disp
18274 && CONST_INT_P (disp)
18275 && INTVAL (disp) < -16*1024*1024)
18277 /* X32 runs in 64-bit mode, where displacement, DISP, in
18278 address DISP(%r64), is encoded as 32-bit immediate sign-
18279 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18280 address is %r64 + 0xffffffffbffffd00. When %r64 <
18281 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18282 which is invalid for x32. The correct address is %r64
18283 - 0x40000300 == 0xf7ffdd64. To properly encode
18284 -0x40000300(%r64) for x32, we zero-extend negative
18285 displacement by forcing addr32 prefix which truncates
18286 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18287 zero-extend all negative displacements, including -1(%rsp).
18288 However, for small negative displacements, sign-extension
18289 won't cause overflow. We only zero-extend negative
18290 displacements if they are < -16*1024*1024, which is also used
18291 to check legitimate address displacements for PIC. */
18292 code = 'k';
18295 /* Since the upper 32 bits of RSP are always zero for x32,
18296 we can encode %esp as %rsp to avoid 0x67 prefix if
18297 there is no index register. */
18298 if (TARGET_X32 && Pmode == SImode
18299 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18300 code = 'q';
18302 if (ASSEMBLER_DIALECT == ASM_ATT)
18304 if (disp)
18306 if (flag_pic)
18307 output_pic_addr_const (file, disp, 0);
18308 else if (GET_CODE (disp) == LABEL_REF)
18309 output_asm_label (disp);
18310 else
18311 output_addr_const (file, disp);
18314 putc ('(', file);
18315 if (base)
18316 print_reg (base, code, file);
18317 if (index)
18319 putc (',', file);
18320 print_reg (index, vsib ? 0 : code, file);
18321 if (scale != 1 || vsib)
18322 fprintf (file, ",%d", scale);
18324 putc (')', file);
18326 else
18328 rtx offset = NULL_RTX;
18330 if (disp)
18332 /* Pull out the offset of a symbol; print any symbol itself. */
18333 if (GET_CODE (disp) == CONST
18334 && GET_CODE (XEXP (disp, 0)) == PLUS
18335 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18337 offset = XEXP (XEXP (disp, 0), 1);
18338 disp = gen_rtx_CONST (VOIDmode,
18339 XEXP (XEXP (disp, 0), 0));
18342 if (flag_pic)
18343 output_pic_addr_const (file, disp, 0);
18344 else if (GET_CODE (disp) == LABEL_REF)
18345 output_asm_label (disp);
18346 else if (CONST_INT_P (disp))
18347 offset = disp;
18348 else
18349 output_addr_const (file, disp);
18352 putc ('[', file);
18353 if (base)
18355 print_reg (base, code, file);
18356 if (offset)
18358 if (INTVAL (offset) >= 0)
18359 putc ('+', file);
18360 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18363 else if (offset)
18364 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18365 else
18366 putc ('0', file);
18368 if (index)
18370 putc ('+', file);
18371 print_reg (index, vsib ? 0 : code, file);
18372 if (scale != 1 || vsib)
18373 fprintf (file, "*%d", scale);
18375 putc (']', file);
18380 static void
18381 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18383 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18386 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
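/* For example, (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF) is printed as
   "foo@gotoff", and the TLS unspecs below get the corresponding
   @tpoff/@ntpoff/@dtpoff/@gottpoff style suffixes. */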
18388 static bool
18389 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18391 rtx op;
18393 if (GET_CODE (x) != UNSPEC)
18394 return false;
18396 op = XVECEXP (x, 0, 0);
18397 switch (XINT (x, 1))
18399 case UNSPEC_GOTOFF:
18400 output_addr_const (file, op);
18401 fputs ("@gotoff", file);
18402 break;
18403 case UNSPEC_GOTTPOFF:
18404 output_addr_const (file, op);
18405 /* FIXME: This might be @TPOFF in Sun ld. */
18406 fputs ("@gottpoff", file);
18407 break;
18408 case UNSPEC_TPOFF:
18409 output_addr_const (file, op);
18410 fputs ("@tpoff", file);
18411 break;
18412 case UNSPEC_NTPOFF:
18413 output_addr_const (file, op);
18414 if (TARGET_64BIT)
18415 fputs ("@tpoff", file);
18416 else
18417 fputs ("@ntpoff", file);
18418 break;
18419 case UNSPEC_DTPOFF:
18420 output_addr_const (file, op);
18421 fputs ("@dtpoff", file);
18422 break;
18423 case UNSPEC_GOTNTPOFF:
18424 output_addr_const (file, op);
18425 if (TARGET_64BIT)
18426 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18427 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18428 else
18429 fputs ("@gotntpoff", file);
18430 break;
18431 case UNSPEC_INDNTPOFF:
18432 output_addr_const (file, op);
18433 fputs ("@indntpoff", file);
18434 break;
18435 #if TARGET_MACHO
18436 case UNSPEC_MACHOPIC_OFFSET:
18437 output_addr_const (file, op);
18438 putc ('-', file);
18439 machopic_output_function_base_name (file);
18440 break;
18441 #endif
18443 default:
18444 return false;
18447 return true;
18450 /* Split one or more double-mode RTL references into pairs of half-mode
18451 references. The RTL can be REG, offsettable MEM, integer constant, or
18452 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18453 split and "num" is its length. lo_half and hi_half are output arrays
18454 that parallel "operands". */
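/* A minimal illustration: splitting a TImode operand yields two DImode
   halves, the low half at byte offset 0 and the high half at byte offset
   GET_MODE_SIZE (DImode) == 8 (little-endian layout). */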
18456 void
18457 split_double_mode (machine_mode mode, rtx operands[],
18458 int num, rtx lo_half[], rtx hi_half[])
18460 machine_mode half_mode;
18461 unsigned int byte;
18463 switch (mode)
18465 case E_TImode:
18466 half_mode = DImode;
18467 break;
18468 case E_DImode:
18469 half_mode = SImode;
18470 break;
18471 default:
18472 gcc_unreachable ();
18475 byte = GET_MODE_SIZE (half_mode);
18477 while (num--)
18479 rtx op = operands[num];
18481 /* simplify_subreg refuses to split volatile memory addresses,
18482 but we still have to handle it. */
18483 if (MEM_P (op))
18485 lo_half[num] = adjust_address (op, half_mode, 0);
18486 hi_half[num] = adjust_address (op, half_mode, byte);
18488 else
18490 lo_half[num] = simplify_gen_subreg (half_mode, op,
18491 GET_MODE (op) == VOIDmode
18492 ? mode : GET_MODE (op), 0);
18493 hi_half[num] = simplify_gen_subreg (half_mode, op,
18494 GET_MODE (op) == VOIDmode
18495 ? mode : GET_MODE (op), byte);
18500 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18501 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18502 is the expression of the binary operation. The output may either be
18503 emitted here, or returned to the caller, like all output_* functions.
18505 There is no guarantee that the operands are the same mode, as they
18506 might be within FLOAT or FLOAT_EXTEND expressions. */
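/* For instance (a rough illustration): for operands[3] == (plus:DF ...)
   with operands[0] == operands[1] == %st(0) and a DFmode memory
   operands[2], the code below builds the template "fadd%Z2\t%2", which
   %Z2 turns into "faddl" for that memory operand. */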
18508 #ifndef SYSV386_COMPAT
18509 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18510 wants to fix the assemblers because that causes incompatibility
18511 with gcc. No-one wants to fix gcc because that causes
18512 incompatibility with assemblers... You can use the option of
18513 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18514 #define SYSV386_COMPAT 1
18515 #endif
18517 const char *
18518 output_387_binary_op (rtx_insn *insn, rtx *operands)
18520 static char buf[40];
18521 const char *p;
18522 bool is_sse
18523 = (SSE_REG_P (operands[0])
18524 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18526 if (is_sse)
18527 p = "%v";
18528 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18529 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18530 p = "fi";
18531 else
18532 p = "f";
18534 strcpy (buf, p);
18536 switch (GET_CODE (operands[3]))
18538 case PLUS:
18539 p = "add"; break;
18540 case MINUS:
18541 p = "sub"; break;
18542 case MULT:
18543 p = "mul"; break;
18544 case DIV:
18545 p = "div"; break;
18546 default:
18547 gcc_unreachable ();
18550 strcat (buf, p);
18552 if (is_sse)
18554 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18555 strcat (buf, p);
18557 if (TARGET_AVX)
18558 p = "\t{%2, %1, %0|%0, %1, %2}";
18559 else
18560 p = "\t{%2, %0|%0, %2}";
18562 strcat (buf, p);
18563 return buf;
18566 /* Even if we do not want to check the inputs, this documents the input
18567 constraints, which helps in understanding the following code. */
18568 if (flag_checking)
18570 if (STACK_REG_P (operands[0])
18571 && ((REG_P (operands[1])
18572 && REGNO (operands[0]) == REGNO (operands[1])
18573 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18574 || (REG_P (operands[2])
18575 && REGNO (operands[0]) == REGNO (operands[2])
18576 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18577 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18578 ; /* ok */
18579 else
18580 gcc_unreachable ();
18583 switch (GET_CODE (operands[3]))
18585 case MULT:
18586 case PLUS:
18587 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18588 std::swap (operands[1], operands[2]);
18590 /* Now we know that operands[0] == operands[1]. */
18592 if (MEM_P (operands[2]))
18594 p = "%Z2\t%2";
18595 break;
18598 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18600 if (STACK_TOP_P (operands[0]))
18601 /* How is it that we are storing to a dead operand[2]?
18602 Well, presumably operands[1] is dead too. We can't
18603 store the result to st(0) as st(0) gets popped on this
18604 instruction. Instead store to operands[2] (which I
18605 think has to be st(1)). st(1) will be popped later.
18606 gcc <= 2.8.1 didn't have this check and generated
18607 assembly code that the Unixware assembler rejected. */
18608 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18609 else
18610 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18611 break;
18614 if (STACK_TOP_P (operands[0]))
18615 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18616 else
18617 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18618 break;
18620 case MINUS:
18621 case DIV:
18622 if (MEM_P (operands[1]))
18624 p = "r%Z1\t%1";
18625 break;
18628 if (MEM_P (operands[2]))
18630 p = "%Z2\t%2";
18631 break;
18634 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18636 #if SYSV386_COMPAT
18637 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18638 derived assemblers, confusingly reverse the direction of
18639 the operation for fsub{r} and fdiv{r} when the
18640 destination register is not st(0). The Intel assembler
18641 doesn't have this brain damage. Read !SYSV386_COMPAT to
18642 figure out what the hardware really does. */
18643 if (STACK_TOP_P (operands[0]))
18644 p = "{p\t%0, %2|rp\t%2, %0}";
18645 else
18646 p = "{rp\t%2, %0|p\t%0, %2}";
18647 #else
18648 if (STACK_TOP_P (operands[0]))
18649 /* As above for fmul/fadd, we can't store to st(0). */
18650 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18651 else
18652 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18653 #endif
18654 break;
18657 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18659 #if SYSV386_COMPAT
18660 if (STACK_TOP_P (operands[0]))
18661 p = "{rp\t%0, %1|p\t%1, %0}";
18662 else
18663 p = "{p\t%1, %0|rp\t%0, %1}";
18664 #else
18665 if (STACK_TOP_P (operands[0]))
18666 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18667 else
18668 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18669 #endif
18670 break;
18673 if (STACK_TOP_P (operands[0]))
18675 if (STACK_TOP_P (operands[1]))
18676 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18677 else
18678 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18679 break;
18681 else if (STACK_TOP_P (operands[1]))
18683 #if SYSV386_COMPAT
18684 p = "{\t%1, %0|r\t%0, %1}";
18685 #else
18686 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18687 #endif
18689 else
18691 #if SYSV386_COMPAT
18692 p = "{r\t%2, %0|\t%0, %2}";
18693 #else
18694 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18695 #endif
18697 break;
18699 default:
18700 gcc_unreachable ();
18703 strcat (buf, p);
18704 return buf;
18707 /* Return needed mode for entity in optimize_mode_switching pass. */
18709 static int
18710 ix86_dirflag_mode_needed (rtx_insn *insn)
18712 if (CALL_P (insn))
18714 if (cfun->machine->func_type == TYPE_NORMAL)
18715 return X86_DIRFLAG_ANY;
18716 else
18717 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18718 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18721 if (recog_memoized (insn) < 0)
18722 return X86_DIRFLAG_ANY;
18724 if (get_attr_type (insn) == TYPE_STR)
18726 /* Emit cld instruction if stringops are used in the function. */
18727 if (cfun->machine->func_type == TYPE_NORMAL)
18728 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18729 else
18730 return X86_DIRFLAG_RESET;
18733 return X86_DIRFLAG_ANY;
18736 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
18738 static bool
18739 ix86_check_avx_upper_register (const_rtx exp)
18741 if (SUBREG_P (exp))
18742 exp = SUBREG_REG (exp);
18744 return (REG_P (exp)
18745 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18746 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18749 /* Return needed mode for entity in optimize_mode_switching pass. */
18751 static int
18752 ix86_avx_u128_mode_needed (rtx_insn *insn)
18754 if (CALL_P (insn))
18756 rtx link;
18758 /* Needed mode is set to AVX_U128_CLEAN if there are
18759 no 256bit or 512bit modes used in function arguments. */
18760 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18761 link;
18762 link = XEXP (link, 1))
18764 if (GET_CODE (XEXP (link, 0)) == USE)
18766 rtx arg = XEXP (XEXP (link, 0), 0);
18768 if (ix86_check_avx_upper_register (arg))
18769 return AVX_U128_DIRTY;
18773 return AVX_U128_CLEAN;
18776 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18777 Hardware changes state only when a 256bit register is written to,
18778 but we need to prevent the compiler from moving the optimal insertion
18779 point above an eventual read from a 256bit or 512bit register. */
18780 subrtx_iterator::array_type array;
18781 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18782 if (ix86_check_avx_upper_register (*iter))
18783 return AVX_U128_DIRTY;
18785 return AVX_U128_ANY;
18788 /* Return mode that i387 must be switched into
18789 prior to the execution of insn. */
18791 static int
18792 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18794 enum attr_i387_cw mode;
18796 /* The mode UNINITIALIZED is used to store the control word after a
18797 function call or ASM pattern. The mode ANY specifies that the function
18798 has no requirements on the control word and makes no changes in the
18799 bits we are interested in. */
18801 if (CALL_P (insn)
18802 || (NONJUMP_INSN_P (insn)
18803 && (asm_noperands (PATTERN (insn)) >= 0
18804 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18805 return I387_CW_UNINITIALIZED;
18807 if (recog_memoized (insn) < 0)
18808 return I387_CW_ANY;
18810 mode = get_attr_i387_cw (insn);
18812 switch (entity)
18814 case I387_TRUNC:
18815 if (mode == I387_CW_TRUNC)
18816 return mode;
18817 break;
18819 case I387_FLOOR:
18820 if (mode == I387_CW_FLOOR)
18821 return mode;
18822 break;
18824 case I387_CEIL:
18825 if (mode == I387_CW_CEIL)
18826 return mode;
18827 break;
18829 case I387_MASK_PM:
18830 if (mode == I387_CW_MASK_PM)
18831 return mode;
18832 break;
18834 default:
18835 gcc_unreachable ();
18838 return I387_CW_ANY;
18841 /* Return mode that entity must be switched into
18842 prior to the execution of insn. */
18844 static int
18845 ix86_mode_needed (int entity, rtx_insn *insn)
18847 switch (entity)
18849 case X86_DIRFLAG:
18850 return ix86_dirflag_mode_needed (insn);
18851 case AVX_U128:
18852 return ix86_avx_u128_mode_needed (insn);
18853 case I387_TRUNC:
18854 case I387_FLOOR:
18855 case I387_CEIL:
18856 case I387_MASK_PM:
18857 return ix86_i387_mode_needed (entity, insn);
18858 default:
18859 gcc_unreachable ();
18861 return 0;
18864 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18866 static void
18867 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18869 if (ix86_check_avx_upper_register (dest))
18871 bool *used = (bool *) data;
18872 *used = true;
18876 /* Calculate mode of upper 128bit AVX registers after the insn. */
18878 static int
18879 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18881 rtx pat = PATTERN (insn);
18883 if (vzeroupper_operation (pat, VOIDmode)
18884 || vzeroall_operation (pat, VOIDmode))
18885 return AVX_U128_CLEAN;
18887 /* We know that the state is clean after a CALL insn if no 256bit
18888 or 512bit register is used for the function return value. */
18889 if (CALL_P (insn))
18891 bool avx_upper_reg_found = false;
18892 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18894 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18897 /* Otherwise, return current mode. Remember that if insn
18898 references AVX 256bit or 512bit registers, the mode was already
18899 changed to DIRTY from MODE_NEEDED. */
18900 return mode;
18903 /* Return the mode that an insn results in. */
18905 static int
18906 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18908 switch (entity)
18910 case X86_DIRFLAG:
18911 return mode;
18912 case AVX_U128:
18913 return ix86_avx_u128_mode_after (mode, insn);
18914 case I387_TRUNC:
18915 case I387_FLOOR:
18916 case I387_CEIL:
18917 case I387_MASK_PM:
18918 return mode;
18919 default:
18920 gcc_unreachable ();
18924 static int
18925 ix86_dirflag_mode_entry (void)
18927 /* For TARGET_CLD or in the interrupt handler we can't assume
18928 direction flag state at function entry. */
18929 if (TARGET_CLD
18930 || cfun->machine->func_type != TYPE_NORMAL)
18931 return X86_DIRFLAG_ANY;
18933 return X86_DIRFLAG_RESET;
18936 static int
18937 ix86_avx_u128_mode_entry (void)
18939 tree arg;
18941 /* Entry mode is set to AVX_U128_DIRTY if there are
18942 256bit or 512bit modes used in function arguments. */
18943 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18944 arg = TREE_CHAIN (arg))
18946 rtx incoming = DECL_INCOMING_RTL (arg);
18948 if (incoming && ix86_check_avx_upper_register (incoming))
18949 return AVX_U128_DIRTY;
18952 return AVX_U128_CLEAN;
18955 /* Return a mode that ENTITY is assumed to be
18956 switched to at function entry. */
18958 static int
18959 ix86_mode_entry (int entity)
18961 switch (entity)
18963 case X86_DIRFLAG:
18964 return ix86_dirflag_mode_entry ();
18965 case AVX_U128:
18966 return ix86_avx_u128_mode_entry ();
18967 case I387_TRUNC:
18968 case I387_FLOOR:
18969 case I387_CEIL:
18970 case I387_MASK_PM:
18971 return I387_CW_ANY;
18972 default:
18973 gcc_unreachable ();
18977 static int
18978 ix86_avx_u128_mode_exit (void)
18980 rtx reg = crtl->return_rtx;
18982 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
18983 or 512bit modes used in the function return register. */
18984 if (reg && ix86_check_avx_upper_register (reg))
18985 return AVX_U128_DIRTY;
18987 return AVX_U128_CLEAN;
18990 /* Return a mode that ENTITY is assumed to be
18991 switched to at function exit. */
18993 static int
18994 ix86_mode_exit (int entity)
18996 switch (entity)
18998 case X86_DIRFLAG:
18999 return X86_DIRFLAG_ANY;
19000 case AVX_U128:
19001 return ix86_avx_u128_mode_exit ();
19002 case I387_TRUNC:
19003 case I387_FLOOR:
19004 case I387_CEIL:
19005 case I387_MASK_PM:
19006 return I387_CW_ANY;
19007 default:
19008 gcc_unreachable ();
19012 static int
19013 ix86_mode_priority (int, int n)
19015 return n;
19018 /* Output code to initialize control word copies used by trunc?f?i and
19019 rounding patterns. CURRENT_MODE is set to current control word,
19020 while NEW_MODE is set to new control word. */
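/* Background for the magic constants below: the x87 control word keeps
   its rounding-control field in bits 10-11 (00 = to nearest, 01 = down,
   10 = up, 11 = toward zero/truncate) and the precision-exception mask
   in bit 5, hence the 0x0400, 0x0800, 0x0c00 and 0x0020 values. */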
19022 static void
19023 emit_i387_cw_initialization (int mode)
19025 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19026 rtx new_mode;
19028 enum ix86_stack_slot slot;
19030 rtx reg = gen_reg_rtx (HImode);
19032 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19033 emit_move_insn (reg, copy_rtx (stored_mode));
19035 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19036 || optimize_insn_for_size_p ())
19038 switch (mode)
19040 case I387_CW_TRUNC:
19041 /* round toward zero (truncate) */
19042 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19043 slot = SLOT_CW_TRUNC;
19044 break;
19046 case I387_CW_FLOOR:
19047 /* round down toward -oo */
19048 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19049 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19050 slot = SLOT_CW_FLOOR;
19051 break;
19053 case I387_CW_CEIL:
19054 /* round up toward +oo */
19055 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19056 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19057 slot = SLOT_CW_CEIL;
19058 break;
19060 case I387_CW_MASK_PM:
19061 /* mask precision exception for nearbyint() */
19062 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19063 slot = SLOT_CW_MASK_PM;
19064 break;
19066 default:
19067 gcc_unreachable ();
19070 else
19072 switch (mode)
19074 case I387_CW_TRUNC:
19075 /* round toward zero (truncate) */
19076 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19077 slot = SLOT_CW_TRUNC;
19078 break;
19080 case I387_CW_FLOOR:
19081 /* round down toward -oo */
19082 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19083 slot = SLOT_CW_FLOOR;
19084 break;
19086 case I387_CW_CEIL:
19087 /* round up toward +oo */
19088 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19089 slot = SLOT_CW_CEIL;
19090 break;
19092 case I387_CW_MASK_PM:
19093 /* mask precision exception for nearbyint() */
19094 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19095 slot = SLOT_CW_MASK_PM;
19096 break;
19098 default:
19099 gcc_unreachable ();
19103 gcc_assert (slot < MAX_386_STACK_LOCALS);
19105 new_mode = assign_386_stack_local (HImode, slot);
19106 emit_move_insn (new_mode, reg);
19109 /* Emit vzeroupper. */
19111 void
19112 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19114 int i;
19116 /* Cancel automatic vzeroupper insertion if there are
19117 live call-saved SSE registers at the insertion point. */
19119 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19120 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19121 return;
19123 if (TARGET_64BIT)
19124 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19125 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19126 return;
19128 emit_insn (gen_avx_vzeroupper ());
19133 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19134 is the set of hard registers live at the point where the insn(s)
19135 are to be inserted. */
19137 static void
19138 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19139 HARD_REG_SET regs_live)
19141 switch (entity)
19143 case X86_DIRFLAG:
19144 if (mode == X86_DIRFLAG_RESET)
19145 emit_insn (gen_cld ());
19146 break;
19147 case AVX_U128:
19148 if (mode == AVX_U128_CLEAN)
19149 ix86_avx_emit_vzeroupper (regs_live);
19150 break;
19151 case I387_TRUNC:
19152 case I387_FLOOR:
19153 case I387_CEIL:
19154 case I387_MASK_PM:
19155 if (mode != I387_CW_ANY
19156 && mode != I387_CW_UNINITIALIZED)
19157 emit_i387_cw_initialization (mode);
19158 break;
19159 default:
19160 gcc_unreachable ();
19164 /* Output code for INSN to convert a float to a signed int. OPERANDS
19165 are the insn operands. The output may be [HSD]Imode and the input
19166 operand may be [SDX]Fmode. */
19168 const char *
19169 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19171 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19172 bool dimode_p = GET_MODE (operands[0]) == DImode;
19173 int round_mode = get_attr_i387_cw (insn);
19175 static char buf[40];
19176 const char *p;
19178 /* Jump through a hoop or two for DImode, since the hardware has no
19179 non-popping instruction. We used to do this a different way, but
19180 that was somewhat fragile and broke with post-reload splitters. */
19181 if ((dimode_p || fisttp) && !stack_top_dies)
19182 output_asm_insn ("fld\t%y1", operands);
19184 gcc_assert (STACK_TOP_P (operands[1]));
19185 gcc_assert (MEM_P (operands[0]));
19186 gcc_assert (GET_MODE (operands[1]) != TFmode);
19188 if (fisttp)
19189 return "fisttp%Z0\t%0";
19191 strcpy (buf, "fist");
19193 if (round_mode != I387_CW_ANY)
19194 output_asm_insn ("fldcw\t%3", operands);
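/* Emit the popping form "fistp" when the value on top of the stack dies
   (or for DImode, which has no non-popping fist); otherwise skip the
   leading 'p' and emit the plain "fist".  */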
19196 p = "p%Z0\t%0";
19197 strcat (buf, p + !(stack_top_dies || dimode_p));
19199 output_asm_insn (buf, operands);
19201 if (round_mode != I387_CW_ANY)
19202 output_asm_insn ("fldcw\t%2", operands);
19204 return "";
19207 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19208 have the values zero or one, indicates the ffreep insn's operand
19209 from the OPERANDS array. */
19211 static const char *
19212 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19214 if (TARGET_USE_FFREEP)
19215 #ifdef HAVE_AS_IX86_FFREEP
19216 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19217 #else
19219 static char retval[32];
19220 int regno = REGNO (operands[opno]);
19222 gcc_assert (STACK_REGNO_P (regno));
19224 regno -= FIRST_STACK_REG;
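/* The assembler lacks ffreep, so emit the raw encoding instead:
   ffreep %st(N) is the two bytes 0xdf 0xc0+N, written here via ASM_SHORT
   as the little-endian 16-bit value 0xc<N>df.  */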
19226 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19227 return retval;
19229 #endif
19231 return opno ? "fstp\t%y1" : "fstp\t%y0";
19235 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19236 should be used. UNORDERED_P is true when fucom should be used. */
19238 const char *
19239 output_fp_compare (rtx_insn *insn, rtx *operands,
19240 bool eflags_p, bool unordered_p)
19242 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19243 bool stack_top_dies;
19245 static char buf[40];
19246 const char *p;
19248 gcc_assert (STACK_TOP_P (xops[0]));
19250 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19252 if (eflags_p)
19254 p = unordered_p ? "fucomi" : "fcomi";
19255 strcpy (buf, p);
19257 p = "p\t{%y1, %0|%0, %y1}";
19258 strcat (buf, p + !stack_top_dies);
19260 return buf;
19263 if (STACK_REG_P (xops[1])
19264 && stack_top_dies
19265 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19267 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19269 /* If the top of the 387 stack dies, and the other operand
19270 is also a stack register that dies, then this must be a
19271 `fcompp' float compare. */
19272 p = unordered_p ? "fucompp" : "fcompp";
19273 strcpy (buf, p);
19275 else if (const0_operand (xops[1], VOIDmode))
19277 gcc_assert (!unordered_p);
19278 strcpy (buf, "ftst");
19280 else
19282 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19284 gcc_assert (!unordered_p);
19285 p = "ficom";
19287 else
19288 p = unordered_p ? "fucom" : "fcom";
19290 strcpy (buf, p);
19292 p = "p%Z2\t%y2";
19293 strcat (buf, p + !stack_top_dies);
19296 output_asm_insn (buf, operands);
19297 return "fnstsw\t%0";
19300 void
19301 ix86_output_addr_vec_elt (FILE *file, int value)
19303 const char *directive = ASM_LONG;
19305 #ifdef ASM_QUAD
19306 if (TARGET_LP64)
19307 directive = ASM_QUAD;
19308 #else
19309 gcc_assert (!TARGET_64BIT);
19310 #endif
19312 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19315 void
19316 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19318 const char *directive = ASM_LONG;
19320 #ifdef ASM_QUAD
19321 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19322 directive = ASM_QUAD;
19323 #else
19324 gcc_assert (!TARGET_64BIT);
19325 #endif
19326 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19327 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19328 fprintf (file, "%s%s%d-%s%d\n",
19329 directive, LPREFIX, value, LPREFIX, rel);
19330 else if (HAVE_AS_GOTOFF_IN_DATA)
19331 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19332 #if TARGET_MACHO
19333 else if (TARGET_MACHO)
19335 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19336 machopic_output_function_base_name (file);
19337 putc ('\n', file);
19339 #endif
19340 else
19341 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19342 GOT_SYMBOL_NAME, LPREFIX, value);
19345 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19346 for the target. */
19348 void
19349 ix86_expand_clear (rtx dest)
19351 rtx tmp;
19353 /* We play register width games, which are only valid after reload. */
19354 gcc_assert (reload_completed);
19356 /* Avoid HImode and its attendant prefix byte. */
19357 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19358 dest = gen_rtx_REG (SImode, REGNO (dest));
19359 tmp = gen_rtx_SET (dest, const0_rtx);
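/* "xor reg, reg" clobbers the flags, so that variant must carry a
   clobber of FLAGS_REG alongside the SET.  */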
19361 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19363 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19364 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19367 emit_insn (tmp);
19370 void
19371 ix86_expand_move (machine_mode mode, rtx operands[])
19373 rtx op0, op1;
19374 rtx tmp, addend = NULL_RTX;
19375 enum tls_model model;
19377 op0 = operands[0];
19378 op1 = operands[1];
19380 switch (GET_CODE (op1))
19382 case CONST:
19383 tmp = XEXP (op1, 0);
19385 if (GET_CODE (tmp) != PLUS
19386 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19387 break;
19389 op1 = XEXP (tmp, 0);
19390 addend = XEXP (tmp, 1);
19391 /* FALLTHRU */
19393 case SYMBOL_REF:
19394 model = SYMBOL_REF_TLS_MODEL (op1);
19396 if (model)
19397 op1 = legitimize_tls_address (op1, model, true);
19398 else if (ix86_force_load_from_GOT_p (op1))
19400 /* Load the external function address via GOT slot to avoid PLT. */
19401 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19402 (TARGET_64BIT
19403 ? UNSPEC_GOTPCREL
19404 : UNSPEC_GOT));
19405 op1 = gen_rtx_CONST (Pmode, op1);
19406 op1 = gen_const_mem (Pmode, op1);
19407 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19409 else
19411 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19412 if (tmp)
19414 op1 = tmp;
19415 if (!addend)
19416 break;
19418 else
19420 op1 = operands[1];
19421 break;
19425 if (addend)
19427 op1 = force_operand (op1, NULL_RTX);
19428 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19429 op0, 1, OPTAB_DIRECT);
19431 else
19432 op1 = force_operand (op1, op0);
19434 if (op1 == op0)
19435 return;
19437 op1 = convert_to_mode (mode, op1, 1);
19439 default:
19440 break;
19443 if ((flag_pic || MACHOPIC_INDIRECT)
19444 && symbolic_operand (op1, mode))
19446 if (TARGET_MACHO && !TARGET_64BIT)
19448 #if TARGET_MACHO
19449 /* dynamic-no-pic */
19450 if (MACHOPIC_INDIRECT)
19452 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19453 ? op0 : gen_reg_rtx (Pmode);
19454 op1 = machopic_indirect_data_reference (op1, temp);
19455 if (MACHOPIC_PURE)
19456 op1 = machopic_legitimize_pic_address (op1, mode,
19457 temp == op1 ? 0 : temp);
19459 if (op0 != op1 && GET_CODE (op0) != MEM)
19461 rtx insn = gen_rtx_SET (op0, op1);
19462 emit_insn (insn);
19463 return;
19465 if (GET_CODE (op0) == MEM)
19466 op1 = force_reg (Pmode, op1);
19467 else
19469 rtx temp = op0;
19470 if (GET_CODE (temp) != REG)
19471 temp = gen_reg_rtx (Pmode);
19472 temp = legitimize_pic_address (op1, temp);
19473 if (temp == op0)
19474 return;
19475 op1 = temp;
19477 /* dynamic-no-pic */
19478 #endif
19480 else
19482 if (MEM_P (op0))
19483 op1 = force_reg (mode, op1);
19484 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19486 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19487 op1 = legitimize_pic_address (op1, reg);
19488 if (op0 == op1)
19489 return;
19490 op1 = convert_to_mode (mode, op1, 1);
19494 else
19496 if (MEM_P (op0)
19497 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19498 || !push_operand (op0, mode))
19499 && MEM_P (op1))
19500 op1 = force_reg (mode, op1);
19502 if (push_operand (op0, mode)
19503 && ! general_no_elim_operand (op1, mode))
19504 op1 = copy_to_mode_reg (mode, op1);
19506 /* Force large constants in 64-bit compilation into a register
19507 to get them CSEed. */
19508 if (can_create_pseudo_p ()
19509 && (mode == DImode) && TARGET_64BIT
19510 && immediate_operand (op1, mode)
19511 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19512 && !register_operand (op0, mode)
19513 && optimize)
19514 op1 = copy_to_mode_reg (mode, op1);
19516 if (can_create_pseudo_p ()
19517 && CONST_DOUBLE_P (op1))
19519 /* If we are loading a floating point constant to a register,
19520 force the value to memory now, since we'll get better code
19521 out of the back end. */
19523 op1 = validize_mem (force_const_mem (mode, op1));
19524 if (!register_operand (op0, mode))
19526 rtx temp = gen_reg_rtx (mode);
19527 emit_insn (gen_rtx_SET (temp, op1));
19528 emit_move_insn (op0, temp);
19529 return;
19534 emit_insn (gen_rtx_SET (op0, op1));
19537 void
19538 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19540 rtx op0 = operands[0], op1 = operands[1];
19541 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19542 psABI since the biggest alignment is 4 bytes for IA MCU psABI. */
19543 unsigned int align = (TARGET_IAMCU
19544 ? GET_MODE_BITSIZE (mode)
19545 : GET_MODE_ALIGNMENT (mode));
19547 if (push_operand (op0, VOIDmode))
19548 op0 = emit_move_resolve_push (mode, op0);
19550 /* Force constants other than zero into memory. We do not know how
19551 the instructions used to build constants modify the upper 64 bits
19552 of the register; once we have that information we may be able
19553 to handle some of them more efficiently. */
19554 if (can_create_pseudo_p ()
19555 && (CONSTANT_P (op1)
19556 || (SUBREG_P (op1)
19557 && CONSTANT_P (SUBREG_REG (op1))))
19558 && ((register_operand (op0, mode)
19559 && !standard_sse_constant_p (op1, mode))
19560 /* ix86_expand_vector_move_misalign() does not like constants. */
19561 || (SSE_REG_MODE_P (mode)
19562 && MEM_P (op0)
19563 && MEM_ALIGN (op0) < align)))
19565 if (SUBREG_P (op1))
19567 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19568 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19569 if (r)
19570 r = validize_mem (r);
19571 else
19572 r = force_reg (imode, SUBREG_REG (op1));
19573 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19575 else
19576 op1 = validize_mem (force_const_mem (mode, op1));
19579 /* We need to check memory alignment for SSE mode since attributes
19580 can make operands unaligned. */
19581 if (can_create_pseudo_p ()
19582 && SSE_REG_MODE_P (mode)
19583 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19584 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19586 rtx tmp[2];
19588 /* ix86_expand_vector_move_misalign() does not like both
19589 arguments in memory. */
19590 if (!register_operand (op0, mode)
19591 && !register_operand (op1, mode))
19592 op1 = force_reg (mode, op1);
19594 tmp[0] = op0; tmp[1] = op1;
19595 ix86_expand_vector_move_misalign (mode, tmp);
19596 return;
19599 /* Make operand1 a register if it isn't already. */
19600 if (can_create_pseudo_p ()
19601 && !register_operand (op0, mode)
19602 && !register_operand (op1, mode))
19604 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19605 return;
19608 emit_insn (gen_rtx_SET (op0, op1));
19611 /* Split 32-byte AVX unaligned load and store if needed. */
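/* As a rough sketch, an unaligned 32-byte load becomes a 16-byte load of
   the low half followed by a vinsertf128 of the high half, and an
   unaligned 32-byte store becomes two 16-byte vextractf128 stores; the
   exact instructions depend on the vector mode.  */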
19613 static void
19614 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19616 rtx m;
19617 rtx (*extract) (rtx, rtx, rtx);
19618 machine_mode mode;
19620 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19621 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19623 emit_insn (gen_rtx_SET (op0, op1));
19624 return;
19627 rtx orig_op0 = NULL_RTX;
19628 mode = GET_MODE (op0);
19629 switch (GET_MODE_CLASS (mode))
19631 case MODE_VECTOR_INT:
19632 case MODE_INT:
19633 if (mode != V32QImode)
19635 if (!MEM_P (op0))
19637 orig_op0 = op0;
19638 op0 = gen_reg_rtx (V32QImode);
19640 else
19641 op0 = gen_lowpart (V32QImode, op0);
19642 op1 = gen_lowpart (V32QImode, op1);
19643 mode = V32QImode;
19645 break;
19646 case MODE_VECTOR_FLOAT:
19647 break;
19648 default:
19649 gcc_unreachable ();
19652 switch (mode)
19654 default:
19655 gcc_unreachable ();
19656 case E_V32QImode:
19657 extract = gen_avx_vextractf128v32qi;
19658 mode = V16QImode;
19659 break;
19660 case E_V8SFmode:
19661 extract = gen_avx_vextractf128v8sf;
19662 mode = V4SFmode;
19663 break;
19664 case E_V4DFmode:
19665 extract = gen_avx_vextractf128v4df;
19666 mode = V2DFmode;
19667 break;
19670 if (MEM_P (op1))
19672 rtx r = gen_reg_rtx (mode);
19673 m = adjust_address (op1, mode, 0);
19674 emit_move_insn (r, m);
19675 m = adjust_address (op1, mode, 16);
19676 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19677 emit_move_insn (op0, r);
19679 else if (MEM_P (op0))
19681 m = adjust_address (op0, mode, 0);
19682 emit_insn (extract (m, op1, const0_rtx));
19683 m = adjust_address (op0, mode, 16);
19684 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19686 else
19687 gcc_unreachable ();
19689 if (orig_op0)
19690 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19693 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19694 straight to ix86_expand_vector_move. */
19695 /* Code generation for scalar reg-reg moves of single and double precision data:
19696 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19697 movaps reg, reg
19698 else
19699 movss reg, reg
19700 if (x86_sse_partial_reg_dependency == true)
19701 movapd reg, reg
19702 else
19703 movsd reg, reg
19705 Code generation for scalar loads of double precision data:
19706 if (x86_sse_split_regs == true)
19707 movlpd mem, reg (gas syntax)
19708 else
19709 movsd mem, reg
19711 Code generation for unaligned packed loads of single precision data
19712 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19713 if (x86_sse_unaligned_move_optimal)
19714 movups mem, reg
19716 if (x86_sse_partial_reg_dependency == true)
19718 xorps reg, reg
19719 movlps mem, reg
19720 movhps mem+8, reg
19722 else
19724 movlps mem, reg
19725 movhps mem+8, reg
19728 Code generation for unaligned packed loads of double precision data
19729 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19730 if (x86_sse_unaligned_move_optimal)
19731 movupd mem, reg
19733 if (x86_sse_split_regs == true)
19735 movlpd mem, reg
19736 movhpd mem+8, reg
19738 else
19740 movsd mem, reg
19741 movhpd mem+8, reg
19745 void
19746 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19748 rtx op0, op1, m;
19750 op0 = operands[0];
19751 op1 = operands[1];
19753 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19754 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19756 emit_insn (gen_rtx_SET (op0, op1));
19757 return;
19760 if (TARGET_AVX)
19762 if (GET_MODE_SIZE (mode) == 32)
19763 ix86_avx256_split_vector_move_misalign (op0, op1);
19764 else
19765 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19766 emit_insn (gen_rtx_SET (op0, op1));
19767 return;
19770 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19771 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19773 emit_insn (gen_rtx_SET (op0, op1));
19774 return;
19777 /* ??? If we have typed data, then it would appear that using
19778 movdqu is the only way to get unaligned data loaded with
19779 integer type. */
19780 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19782 emit_insn (gen_rtx_SET (op0, op1));
19783 return;
19786 if (MEM_P (op1))
19788 if (TARGET_SSE2 && mode == V2DFmode)
19790 rtx zero;
19792 /* When SSE registers are split into halves, we can avoid
19793 writing to the top half twice. */
19794 if (TARGET_SSE_SPLIT_REGS)
19796 emit_clobber (op0);
19797 zero = op0;
19799 else
19801 /* ??? Not sure about the best option for the Intel chips.
19802 The following would seem to satisfy; the register is
19803 entirely cleared, breaking the dependency chain. We
19804 then store to the upper half, with a dependency depth
19805 of one. A rumor has it that Intel recommends two movsd
19806 followed by an unpcklpd, but this is unconfirmed. And
19807 given that the dependency depth of the unpcklpd would
19808 still be one, I'm not sure why this would be better. */
19809 zero = CONST0_RTX (V2DFmode);
19812 m = adjust_address (op1, DFmode, 0);
19813 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19814 m = adjust_address (op1, DFmode, 8);
19815 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19817 else
19819 rtx t;
19821 if (mode != V4SFmode)
19822 t = gen_reg_rtx (V4SFmode);
19823 else
19824 t = op0;
19826 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19827 emit_move_insn (t, CONST0_RTX (V4SFmode));
19828 else
19829 emit_clobber (t);
19831 m = adjust_address (op1, V2SFmode, 0);
19832 emit_insn (gen_sse_loadlps (t, t, m));
19833 m = adjust_address (op1, V2SFmode, 8);
19834 emit_insn (gen_sse_loadhps (t, t, m));
19835 if (mode != V4SFmode)
19836 emit_move_insn (op0, gen_lowpart (mode, t));
19839 else if (MEM_P (op0))
19841 if (TARGET_SSE2 && mode == V2DFmode)
19843 m = adjust_address (op0, DFmode, 0);
19844 emit_insn (gen_sse2_storelpd (m, op1));
19845 m = adjust_address (op0, DFmode, 8);
19846 emit_insn (gen_sse2_storehpd (m, op1));
19848 else
19850 if (mode != V4SFmode)
19851 op1 = gen_lowpart (V4SFmode, op1);
19853 m = adjust_address (op0, V2SFmode, 0);
19854 emit_insn (gen_sse_storelps (m, op1));
19855 m = adjust_address (op0, V2SFmode, 8);
19856 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19859 else
19860 gcc_unreachable ();
19863 /* Helper function of ix86_fixup_binary_operands to canonicalize
19864 operand order. Returns true if the operands should be swapped. */
19866 static bool
19867 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19868 rtx operands[])
19870 rtx dst = operands[0];
19871 rtx src1 = operands[1];
19872 rtx src2 = operands[2];
19874 /* If the operation is not commutative, we can't do anything. */
19875 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19876 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19877 return false;
19879 /* Highest priority is that src1 should match dst. */
19880 if (rtx_equal_p (dst, src1))
19881 return false;
19882 if (rtx_equal_p (dst, src2))
19883 return true;
19885 /* Next highest priority is that immediate constants come second. */
19886 if (immediate_operand (src2, mode))
19887 return false;
19888 if (immediate_operand (src1, mode))
19889 return true;
19891 /* Lowest priority is that memory references should come second. */
19892 if (MEM_P (src2))
19893 return false;
19894 if (MEM_P (src1))
19895 return true;
19897 return false;
19901 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19902 destination to use for the operation. If different from the true
19903 destination in operands[0], a copy operation will be required. */
19906 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19907 rtx operands[])
19909 rtx dst = operands[0];
19910 rtx src1 = operands[1];
19911 rtx src2 = operands[2];
19913 /* Canonicalize operand order. */
19914 if (ix86_swap_binary_operands_p (code, mode, operands))
19916 /* It is invalid to swap operands of different modes. */
19917 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19919 std::swap (src1, src2);
19922 /* Both source operands cannot be in memory. */
19923 if (MEM_P (src1) && MEM_P (src2))
19925 /* Optimization: Only read from memory once. */
19926 if (rtx_equal_p (src1, src2))
19928 src2 = force_reg (mode, src2);
19929 src1 = src2;
19931 else if (rtx_equal_p (dst, src1))
19932 src2 = force_reg (mode, src2);
19933 else
19934 src1 = force_reg (mode, src1);
19937 /* If the destination is memory, and we do not have matching source
19938 operands, do things in registers. */
19939 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19940 dst = gen_reg_rtx (mode);
19942 /* Source 1 cannot be a constant. */
19943 if (CONSTANT_P (src1))
19944 src1 = force_reg (mode, src1);
19946 /* Source 1 cannot be a non-matching memory. */
19947 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19948 src1 = force_reg (mode, src1);
19950 /* Improve address combine. */
19951 if (code == PLUS
19952 && GET_MODE_CLASS (mode) == MODE_INT
19953 && MEM_P (src2))
19954 src2 = force_reg (mode, src2);
19956 operands[1] = src1;
19957 operands[2] = src2;
19958 return dst;
19961 /* Similarly, but assume that the destination has already been
19962 set up properly. */
19964 void
19965 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19966 machine_mode mode, rtx operands[])
19968 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19969 gcc_assert (dst == operands[0]);
19972 /* Attempt to expand a binary operator. Make the expansion closer to the
19973 actual machine, than just general_operand, which would allow 3 separate
19974 memory references (one output, two input) in a single insn. */
19976 void
19977 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19978 rtx operands[])
19980 rtx src1, src2, dst, op, clob;
19982 dst = ix86_fixup_binary_operands (code, mode, operands);
19983 src1 = operands[1];
19984 src2 = operands[2];
19986 /* Emit the instruction. */
19988 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19990 if (reload_completed
19991 && code == PLUS
19992 && !rtx_equal_p (dst, src1))
19994 /* This is going to be an LEA; avoid splitting it later. */
19995 emit_insn (op);
19997 else
19999 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20000 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20003 /* Fix up the destination if needed. */
20004 if (dst != operands[0])
20005 emit_move_insn (operands[0], dst);
20008 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20009 the given OPERANDS. */
20011 void
20012 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20013 rtx operands[])
20015 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20016 if (SUBREG_P (operands[1]))
20018 op1 = operands[1];
20019 op2 = operands[2];
20021 else if (SUBREG_P (operands[2]))
20023 op1 = operands[2];
20024 op2 = operands[1];
20026 /* Optimize (__m128i) d | (__m128i) e and similar code
20027 when d and e are float vectors into float vector logical
20028 insn. In C/C++ without using intrinsics there is no other way
20029 to express vector logical operation on float vectors than
20030 to cast them temporarily to integer vectors. */
20031 if (op1
20032 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20033 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20034 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20035 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20036 && SUBREG_BYTE (op1) == 0
20037 && (GET_CODE (op2) == CONST_VECTOR
20038 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20039 && SUBREG_BYTE (op2) == 0))
20040 && can_create_pseudo_p ())
20042 rtx dst;
20043 switch (GET_MODE (SUBREG_REG (op1)))
20045 case E_V4SFmode:
20046 case E_V8SFmode:
20047 case E_V16SFmode:
20048 case E_V2DFmode:
20049 case E_V4DFmode:
20050 case E_V8DFmode:
20051 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20052 if (GET_CODE (op2) == CONST_VECTOR)
20054 op2 = gen_lowpart (GET_MODE (dst), op2);
20055 op2 = force_reg (GET_MODE (dst), op2);
20057 else
20059 op1 = operands[1];
20060 op2 = SUBREG_REG (operands[2]);
20061 if (!vector_operand (op2, GET_MODE (dst)))
20062 op2 = force_reg (GET_MODE (dst), op2);
20064 op1 = SUBREG_REG (op1);
20065 if (!vector_operand (op1, GET_MODE (dst)))
20066 op1 = force_reg (GET_MODE (dst), op1);
20067 emit_insn (gen_rtx_SET (dst,
20068 gen_rtx_fmt_ee (code, GET_MODE (dst),
20069 op1, op2)));
20070 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20071 return;
20072 default:
20073 break;
20076 if (!vector_operand (operands[1], mode))
20077 operands[1] = force_reg (mode, operands[1]);
20078 if (!vector_operand (operands[2], mode))
20079 operands[2] = force_reg (mode, operands[2]);
20080 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20081 emit_insn (gen_rtx_SET (operands[0],
20082 gen_rtx_fmt_ee (code, mode, operands[1],
20083 operands[2])));
20086 /* Return TRUE or FALSE depending on whether the binary operator meets the
20087 appropriate constraints. */
20089 bool
20090 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20091 rtx operands[3])
20093 rtx dst = operands[0];
20094 rtx src1 = operands[1];
20095 rtx src2 = operands[2];
20097 /* Both source operands cannot be in memory. */
20098 if (MEM_P (src1) && MEM_P (src2))
20099 return false;
20101 /* Canonicalize operand order for commutative operators. */
20102 if (ix86_swap_binary_operands_p (code, mode, operands))
20103 std::swap (src1, src2);
20105 /* If the destination is memory, we must have a matching source operand. */
20106 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20107 return false;
20109 /* Source 1 cannot be a constant. */
20110 if (CONSTANT_P (src1))
20111 return false;
20113 /* Source 1 cannot be a non-matching memory. */
20114 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20115 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20116 return (code == AND
20117 && (mode == HImode
20118 || mode == SImode
20119 || (TARGET_64BIT && mode == DImode))
20120 && satisfies_constraint_L (src2));
20122 return true;
20125 /* Attempt to expand a unary operator. Make the expansion closer to the
20126 actual machine, than just general_operand, which would allow 2 separate
20127 memory references (one output, one input) in a single insn. */
20129 void
20130 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20131 rtx operands[])
20133 bool matching_memory = false;
20134 rtx src, dst, op, clob;
20136 dst = operands[0];
20137 src = operands[1];
20139 /* If the destination is memory, and we do not have matching source
20140 operands, do things in registers. */
20141 if (MEM_P (dst))
20143 if (rtx_equal_p (dst, src))
20144 matching_memory = true;
20145 else
20146 dst = gen_reg_rtx (mode);
20149 /* When source operand is memory, destination must match. */
20150 if (MEM_P (src) && !matching_memory)
20151 src = force_reg (mode, src);
20153 /* Emit the instruction. */
20155 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20157 if (code == NOT)
20158 emit_insn (op);
20159 else
20161 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20162 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20165 /* Fix up the destination if needed. */
20166 if (dst != operands[0])
20167 emit_move_insn (operands[0], dst);
20170 /* Split 32-bit/64-bit divmod with 8-bit unsigned divmod if dividend and
20171 divisor are within the range [0-255]. */
20173 void
20174 ix86_split_idivmod (machine_mode mode, rtx operands[],
20175 bool signed_p)
20177 rtx_code_label *end_label, *qimode_label;
20178 rtx div, mod;
20179 rtx_insn *insn;
20180 rtx scratch, tmp0, tmp1, tmp2;
20181 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20182 rtx (*gen_zero_extend) (rtx, rtx);
20183 rtx (*gen_test_ccno_1) (rtx, rtx);
20185 switch (mode)
20187 case E_SImode:
20188 if (GET_MODE (operands[0]) == SImode)
20190 if (GET_MODE (operands[1]) == SImode)
20191 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20192 else
20193 gen_divmod4_1
20194 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20195 gen_zero_extend = gen_zero_extendqisi2;
20197 else
20199 gen_divmod4_1
20200 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20201 gen_zero_extend = gen_zero_extendqidi2;
20203 gen_test_ccno_1 = gen_testsi_ccno_1;
20204 break;
20205 case E_DImode:
20206 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20207 gen_test_ccno_1 = gen_testdi_ccno_1;
20208 gen_zero_extend = gen_zero_extendqidi2;
20209 break;
20210 default:
20211 gcc_unreachable ();
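/* The emitted code is roughly (illustrative only):
	mov	op2, scratch
	or	op3, scratch
	test	$-0x100, scratch
	je	.Lqimode
	(i)div	...		; full-width divide
	jmp	.Lend
   .Lqimode:
	divb	...		; 8-bit divide: AL = quotient, AH = remainder
   .Lend:  */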
20214 end_label = gen_label_rtx ();
20215 qimode_label = gen_label_rtx ();
20217 scratch = gen_reg_rtx (mode);
20219 /* Use 8-bit unsigned divmod if dividend and divisor are within
20220 the range [0-255]. */
20221 emit_move_insn (scratch, operands[2]);
20222 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20223 scratch, 1, OPTAB_DIRECT);
20224 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20225 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20226 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20227 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20228 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20229 pc_rtx);
20230 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20231 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20232 JUMP_LABEL (insn) = qimode_label;
20234 /* Generate original signed/unsigned divmod. */
20235 div = gen_divmod4_1 (operands[0], operands[1],
20236 operands[2], operands[3]);
20237 emit_insn (div);
20239 /* Branch to the end. */
20240 emit_jump_insn (gen_jump (end_label));
20241 emit_barrier ();
20243 /* Generate 8-bit unsigned divide. */
20244 emit_label (qimode_label);
20245 /* Don't use operands[0] for the result of the 8-bit divide since not all
20246 registers support QImode ZERO_EXTRACT. */
20247 tmp0 = lowpart_subreg (HImode, scratch, mode);
20248 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20249 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20250 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20252 if (signed_p)
20254 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20255 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20257 else
20259 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20260 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20262 if (mode == SImode)
20264 if (GET_MODE (operands[0]) != SImode)
20265 div = gen_rtx_ZERO_EXTEND (DImode, div);
20266 if (GET_MODE (operands[1]) != SImode)
20267 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20270 /* Extract remainder from AH. */
20271 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20272 tmp0, GEN_INT (8), GEN_INT (8));
20273 if (REG_P (operands[1]))
20274 insn = emit_move_insn (operands[1], tmp1);
20275 else
20277 /* Need a new scratch register since the old one has the result
20278 of the 8-bit divide. */
20279 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20280 emit_move_insn (scratch, tmp1);
20281 insn = emit_move_insn (operands[1], scratch);
20283 set_unique_reg_note (insn, REG_EQUAL, mod);
20285 /* Zero extend quotient from AL. */
20286 tmp1 = gen_lowpart (QImode, tmp0);
20287 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20288 set_unique_reg_note (insn, REG_EQUAL, div);
20290 emit_label (end_label);
20293 #define LEA_MAX_STALL (3)
20294 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
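/* Distances below are counted in half-cycles, hence the search window is
   twice LEA_MAX_STALL.  */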
20296 /* Increase given DISTANCE in half-cycles according to
20297 dependencies between PREV and NEXT instructions.
20298 Add 1 half-cycle if there is no dependency and
20299 go to the next cycle if there is some dependency. */
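/* Concretely, "distance + (distance & 1) + 2" below rounds DISTANCE up to
   the next cycle boundary (an even number of half-cycles) and then adds
   one full cycle.  */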
20301 static unsigned int
20302 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20304 df_ref def, use;
20306 if (!prev || !next)
20307 return distance + (distance & 1) + 2;
20309 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20310 return distance + 1;
20312 FOR_EACH_INSN_USE (use, next)
20313 FOR_EACH_INSN_DEF (def, prev)
20314 if (!DF_REF_IS_ARTIFICIAL (def)
20315 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20316 return distance + (distance & 1) + 2;
20318 return distance + 1;
20321 /* Function checks if instruction INSN defines register number
20322 REGNO1 or REGNO2. */
20324 static bool
20325 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20326 rtx_insn *insn)
20328 df_ref def;
20330 FOR_EACH_INSN_DEF (def, insn)
20331 if (DF_REF_REG_DEF_P (def)
20332 && !DF_REF_IS_ARTIFICIAL (def)
20333 && (regno1 == DF_REF_REGNO (def)
20334 || regno2 == DF_REF_REGNO (def)))
20335 return true;
20337 return false;
20340 /* Function checks if instruction INSN uses register number
20341 REGNO as part of an address expression. */
20343 static bool
20344 insn_uses_reg_mem (unsigned int regno, rtx insn)
20346 df_ref use;
20348 FOR_EACH_INSN_USE (use, insn)
20349 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20350 return true;
20352 return false;
20355 /* Search backward for non-agu definition of register number REGNO1
20356 or register number REGNO2 in basic block starting from instruction
20357 START up to head of basic block or instruction INSN.
20359 Function puts true value into *FOUND var if definition was found
20360 and false otherwise.
20362 Distance in half-cycles between START and found instruction or head
20363 of BB is added to DISTANCE and returned. */
20365 static int
20366 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20367 rtx_insn *insn, int distance,
20368 rtx_insn *start, bool *found)
20370 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20371 rtx_insn *prev = start;
20372 rtx_insn *next = NULL;
20374 *found = false;
20376 while (prev
20377 && prev != insn
20378 && distance < LEA_SEARCH_THRESHOLD)
20380 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20382 distance = increase_distance (prev, next, distance);
20383 if (insn_defines_reg (regno1, regno2, prev))
20385 if (recog_memoized (prev) < 0
20386 || get_attr_type (prev) != TYPE_LEA)
20388 *found = true;
20389 return distance;
20393 next = prev;
20395 if (prev == BB_HEAD (bb))
20396 break;
20398 prev = PREV_INSN (prev);
20401 return distance;
20404 /* Search backward for non-agu definition of register number REGNO1
20405 or register number REGNO2 in INSN's basic block until
20406 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20407 2. Reach neighbor BBs boundary, or
20408 3. Reach agu definition.
20409 Returns the distance between the non-agu definition point and INSN.
20410 If no definition point, returns -1. */
20412 static int
20413 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20414 rtx_insn *insn)
20416 basic_block bb = BLOCK_FOR_INSN (insn);
20417 int distance = 0;
20418 bool found = false;
20420 if (insn != BB_HEAD (bb))
20421 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20422 distance, PREV_INSN (insn),
20423 &found);
20425 if (!found && distance < LEA_SEARCH_THRESHOLD)
20427 edge e;
20428 edge_iterator ei;
20429 bool simple_loop = false;
20431 FOR_EACH_EDGE (e, ei, bb->preds)
20432 if (e->src == bb)
20434 simple_loop = true;
20435 break;
20438 if (simple_loop)
20439 distance = distance_non_agu_define_in_bb (regno1, regno2,
20440 insn, distance,
20441 BB_END (bb), &found);
20442 else
20444 int shortest_dist = -1;
20445 bool found_in_bb = false;
20447 FOR_EACH_EDGE (e, ei, bb->preds)
20449 int bb_dist
20450 = distance_non_agu_define_in_bb (regno1, regno2,
20451 insn, distance,
20452 BB_END (e->src),
20453 &found_in_bb);
20454 if (found_in_bb)
20456 if (shortest_dist < 0)
20457 shortest_dist = bb_dist;
20458 else if (bb_dist > 0)
20459 shortest_dist = MIN (bb_dist, shortest_dist);
20461 found = true;
20465 distance = shortest_dist;
20469 /* get_attr_type may modify recog data. We want to make sure
20470 that recog data is valid for instruction INSN, on which
20471 distance_non_agu_define is called. INSN is unchanged here. */
20472 extract_insn_cached (insn);
20474 if (!found)
20475 return -1;
20477 return distance >> 1;
20480 /* Return the distance in half-cycles between INSN and the next
20481 insn that uses register number REGNO in a memory address, added
20482 to DISTANCE. Return -1 if REGNO is set.
20484 Put true value into *FOUND if register usage was found and
20485 false otherwise.
20486 Put true value into *REDEFINED if register redefinition was
20487 found and false otherwise. */
20489 static int
20490 distance_agu_use_in_bb (unsigned int regno,
20491 rtx_insn *insn, int distance, rtx_insn *start,
20492 bool *found, bool *redefined)
20494 basic_block bb = NULL;
20495 rtx_insn *next = start;
20496 rtx_insn *prev = NULL;
20498 *found = false;
20499 *redefined = false;
20501 if (start != NULL_RTX)
20503 bb = BLOCK_FOR_INSN (start);
20504 if (start != BB_HEAD (bb))
20505 /* If insn and start belong to the same bb, set prev to insn,
20506 so the call to increase_distance will increase the distance
20507 between insns by 1. */
20508 prev = insn;
20511 while (next
20512 && next != insn
20513 && distance < LEA_SEARCH_THRESHOLD)
20515 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20517 distance = increase_distance(prev, next, distance);
20518 if (insn_uses_reg_mem (regno, next))
20520 /* Return DISTANCE if OP0 is used in memory
20521 address in NEXT. */
20522 *found = true;
20523 return distance;
20526 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20528 /* Return -1 if OP0 is set in NEXT. */
20529 *redefined = true;
20530 return -1;
20533 prev = next;
20536 if (next == BB_END (bb))
20537 break;
20539 next = NEXT_INSN (next);
20542 return distance;
20545 /* Return the distance between INSN and the next insn that uses
20546 register number REGNO0 in a memory address. Return -1 if no such
20547 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20549 static int
20550 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20552 basic_block bb = BLOCK_FOR_INSN (insn);
20553 int distance = 0;
20554 bool found = false;
20555 bool redefined = false;
20557 if (insn != BB_END (bb))
20558 distance = distance_agu_use_in_bb (regno0, insn, distance,
20559 NEXT_INSN (insn),
20560 &found, &redefined);
20562 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20564 edge e;
20565 edge_iterator ei;
20566 bool simple_loop = false;
20568 FOR_EACH_EDGE (e, ei, bb->succs)
20569 if (e->dest == bb)
20571 simple_loop = true;
20572 break;
20575 if (simple_loop)
20576 distance = distance_agu_use_in_bb (regno0, insn,
20577 distance, BB_HEAD (bb),
20578 &found, &redefined);
20579 else
20581 int shortest_dist = -1;
20582 bool found_in_bb = false;
20583 bool redefined_in_bb = false;
20585 FOR_EACH_EDGE (e, ei, bb->succs)
20587 int bb_dist
20588 = distance_agu_use_in_bb (regno0, insn,
20589 distance, BB_HEAD (e->dest),
20590 &found_in_bb, &redefined_in_bb);
20591 if (found_in_bb)
20593 if (shortest_dist < 0)
20594 shortest_dist = bb_dist;
20595 else if (bb_dist > 0)
20596 shortest_dist = MIN (bb_dist, shortest_dist);
20598 found = true;
20602 distance = shortest_dist;
20606 if (!found || redefined)
20607 return -1;
20609 return distance >> 1;
20612 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20613 there is a choice between LEA and ADD:
20614 Negative value: ADD is preferred over LEA
20615 Zero: neutral
20616 Positive value: LEA is preferred over ADD. */
20617 #define IX86_LEA_PRIORITY 0
20619 /* Return true if using lea INSN has a performance advantage
20620 over a sequence of instructions. The instruction sequence has
20621 SPLIT_COST cycles higher latency than the lea. */
20623 static bool
20624 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20625 unsigned int regno2, int split_cost, bool has_scale)
20627 int dist_define, dist_use;
20629 /* For Silvermont, if using a 2-source or 3-source LEA for
20630 non-destructive destination purposes, or due to wanting
20631 the ability to use SCALE, the use of LEA is justified. */
20632 if (TARGET_SILVERMONT || TARGET_INTEL)
20634 if (has_scale)
20635 return true;
20636 if (split_cost < 1)
20637 return false;
20638 if (regno0 == regno1 || regno0 == regno2)
20639 return false;
20640 return true;
20643 dist_define = distance_non_agu_define (regno1, regno2, insn);
20644 dist_use = distance_agu_use (regno0, insn);
20646 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20648 /* If there is no non-AGU operand definition, no AGU
20649 operand usage and the split cost is 0, then both the lea
20650 and non-lea variants have the same priority. Currently
20651 we prefer lea for 64-bit code and non-lea for 32-bit
20652 code. */
20653 if (dist_use < 0 && split_cost == 0)
20654 return TARGET_64BIT || IX86_LEA_PRIORITY;
20655 else
20656 return true;
20659 /* With a longer definition distance, lea is preferable.
20660 Here we adjust the distance to take into account the splitting
20661 cost and the lea priority. */
20662 dist_define += split_cost + IX86_LEA_PRIORITY;
20664 /* If there is no use in a memory address then we just check
20665 that the split cost exceeds the AGU stall. */
20666 if (dist_use < 0)
20667 return dist_define > LEA_MAX_STALL;
20669 /* If this insn has both backward non-agu dependence and forward
20670 agu dependence, the one with the shorter distance takes effect. */
20671 return dist_define >= dist_use;
20674 /* Return true if it is legal to clobber flags by INSN and
20675 false otherwise. */
20677 static bool
20678 ix86_ok_to_clobber_flags (rtx_insn *insn)
20680 basic_block bb = BLOCK_FOR_INSN (insn);
20681 df_ref use;
20682 bitmap live;
20684 while (insn)
20686 if (NONDEBUG_INSN_P (insn))
20688 FOR_EACH_INSN_USE (use, insn)
20689 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20690 return false;
20692 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20693 return true;
20696 if (insn == BB_END (bb))
20697 break;
20699 insn = NEXT_INSN (insn);
20702 live = df_get_live_out(bb);
20703 return !REGNO_REG_SET_P (live, FLAGS_REG);
20706 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20707 move and add to avoid AGU stalls. */
20709 bool
20710 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20712 unsigned int regno0, regno1, regno2;
20714 /* Check if we need to optimize. */
20715 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20716 return false;
20718 /* Check it is correct to split here. */
20719 if (!ix86_ok_to_clobber_flags(insn))
20720 return false;
20722 regno0 = true_regnum (operands[0]);
20723 regno1 = true_regnum (operands[1]);
20724 regno2 = true_regnum (operands[2]);
20726 /* We need to split only adds with a non-destructive
20727 destination operand. */
20728 if (regno0 == regno1 || regno0 == regno2)
20729 return false;
20730 else
20731 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20734 /* Return true if we should emit lea instruction instead of mov
20735 instruction. */
20737 bool
20738 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20740 unsigned int regno0, regno1;
20742 /* Check if we need to optimize. */
20743 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20744 return false;
20746 /* Use lea for reg to reg moves only. */
20747 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20748 return false;
20750 regno0 = true_regnum (operands[0]);
20751 regno1 = true_regnum (operands[1]);
20753 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20756 /* Return true if we need to split lea into a sequence of
20757 instructions to avoid AGU stalls. */
20759 bool
20760 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20762 unsigned int regno0, regno1, regno2;
20763 int split_cost;
20764 struct ix86_address parts;
20765 int ok;
20767 /* Check we need to optimize. */
20768 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20769 return false;
20771 /* The "at least two components" test below might not catch simple
20772 move or zero extension insns if parts.base is non-NULL and parts.disp
20773 is const0_rtx as the only components in the address, e.g. if the
20774 register is %rbp or %r13. As this test is much cheaper and moves or
20775 zero extensions are the common case, do this check first. */
20776 if (REG_P (operands[1])
20777 || (SImode_address_operand (operands[1], VOIDmode)
20778 && REG_P (XEXP (operands[1], 0))))
20779 return false;
20781 /* Check if it is OK to split here. */
20782 if (!ix86_ok_to_clobber_flags (insn))
20783 return false;
20785 ok = ix86_decompose_address (operands[1], &parts);
20786 gcc_assert (ok);
20788 /* There should be at least two components in the address. */
20789 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20790 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20791 return false;
20793 /* We should not split into add if a non-legitimate PIC
20794 operand is used as the displacement. */
20795 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20796 return false;
20798 regno0 = true_regnum (operands[0]) ;
20799 regno1 = INVALID_REGNUM;
20800 regno2 = INVALID_REGNUM;
20802 if (parts.base)
20803 regno1 = true_regnum (parts.base);
20804 if (parts.index)
20805 regno2 = true_regnum (parts.index);
20807 split_cost = 0;
20809 /* Compute how many cycles we will add to execution time
20810 if we split the lea into a sequence of instructions. */
20811 if (parts.base || parts.index)
20813 /* Have to use a mov instruction if the non-destructive
20814 destination form is used. */
20815 if (regno1 != regno0 && regno2 != regno0)
20816 split_cost += 1;
20818 /* Have to add index to base if both exist. */
20819 if (parts.base && parts.index)
20820 split_cost += 1;
20822 /* Have to use shift and adds if scale is 2 or greater. */
20823 if (parts.scale > 1)
20825 if (regno0 != regno1)
20826 split_cost += 1;
20827 else if (regno2 == regno0)
20828 split_cost += 4;
20829 else
20830 split_cost += parts.scale;
20833 /* Have to use an add instruction with an immediate if
20834 disp is non-zero. */
20835 if (parts.disp && parts.disp != const0_rtx)
20836 split_cost += 1;
20838 /* Subtract the price of lea. */
20839 split_cost -= 1;
20842 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20843 parts.scale > 1);
20846 /* Emit x86 binary operator CODE in mode MODE, where the first operand
20847 matches the destination. The RTX includes a clobber of FLAGS_REG. */
20849 static void
20850 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20851 rtx dst, rtx src)
20853 rtx op, clob;
20855 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20856 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20858 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20861 /* Return true if regno1 def is nearest to the insn. */
20863 static bool
20864 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20866 rtx_insn *prev = insn;
20867 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20869 if (insn == start)
20870 return false;
20871 while (prev && prev != start)
20873 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20875 prev = PREV_INSN (prev);
20876 continue;
20878 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20879 return true;
20880 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20881 return false;
20882 prev = PREV_INSN (prev);
20885 /* None of the regs is defined in the bb. */
20886 return false;
20889 /* Split lea instructions into a sequence of instructions
20890 which are executed on the ALU to avoid AGU stalls.
20891 It is assumed that it is allowed to clobber the flags register
20892 at the lea position. */
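/* For example (illustrative only; the exact sequence depends on which of
   the address registers overlap the destination), a
	lea	0x4(%rbx,%rcx,4), %rax
   is typically replaced by
	mov	%rcx, %rax
	sal	$2, %rax
	add	%rbx, %rax
	add	$0x4, %rax  */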
20894 void
20895 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20897 unsigned int regno0, regno1, regno2;
20898 struct ix86_address parts;
20899 rtx target, tmp;
20900 int ok, adds;
20902 ok = ix86_decompose_address (operands[1], &parts);
20903 gcc_assert (ok);
20905 target = gen_lowpart (mode, operands[0]);
20907 regno0 = true_regnum (target);
20908 regno1 = INVALID_REGNUM;
20909 regno2 = INVALID_REGNUM;
20911 if (parts.base)
20913 parts.base = gen_lowpart (mode, parts.base);
20914 regno1 = true_regnum (parts.base);
20917 if (parts.index)
20919 parts.index = gen_lowpart (mode, parts.index);
20920 regno2 = true_regnum (parts.index);
20923 if (parts.disp)
20924 parts.disp = gen_lowpart (mode, parts.disp);
20926 if (parts.scale > 1)
20928 /* Case r1 = r1 + ... */
20929 if (regno1 == regno0)
20931 /* If we have a case r1 = r1 + C * r2 then we
20932 should use multiplication which is very
20933 expensive. Assume the cost model is wrong if we
20934 have such a case here. */
20935 gcc_assert (regno2 != regno0);
20937 for (adds = parts.scale; adds > 0; adds--)
20938 ix86_emit_binop (PLUS, mode, target, parts.index);
20940 else
20942 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20943 if (regno0 != regno2)
20944 emit_insn (gen_rtx_SET (target, parts.index));
20946 /* Use shift for scaling. */
20947 ix86_emit_binop (ASHIFT, mode, target,
20948 GEN_INT (exact_log2 (parts.scale)));
20950 if (parts.base)
20951 ix86_emit_binop (PLUS, mode, target, parts.base);
20953 if (parts.disp && parts.disp != const0_rtx)
20954 ix86_emit_binop (PLUS, mode, target, parts.disp);
20957 else if (!parts.base && !parts.index)
20959 gcc_assert(parts.disp);
20960 emit_insn (gen_rtx_SET (target, parts.disp));
20962 else
20964 if (!parts.base)
20966 if (regno0 != regno2)
20967 emit_insn (gen_rtx_SET (target, parts.index));
20969 else if (!parts.index)
20971 if (regno0 != regno1)
20972 emit_insn (gen_rtx_SET (target, parts.base));
20974 else
20976 if (regno0 == regno1)
20977 tmp = parts.index;
20978 else if (regno0 == regno2)
20979 tmp = parts.base;
20980 else
20982 rtx tmp1;
20984 /* Find better operand for SET instruction, depending
20985 on which definition is farther from the insn. */
20986 if (find_nearest_reg_def (insn, regno1, regno2))
20987 tmp = parts.index, tmp1 = parts.base;
20988 else
20989 tmp = parts.base, tmp1 = parts.index;
20991 emit_insn (gen_rtx_SET (target, tmp));
20993 if (parts.disp && parts.disp != const0_rtx)
20994 ix86_emit_binop (PLUS, mode, target, parts.disp);
20996 ix86_emit_binop (PLUS, mode, target, tmp1);
20997 return;
21000 ix86_emit_binop (PLUS, mode, target, tmp);
21003 if (parts.disp && parts.disp != const0_rtx)
21004 ix86_emit_binop (PLUS, mode, target, parts.disp);
21008 /* Return true if it is ok to optimize an ADD operation to an LEA
21009 operation to avoid flag register consumption. For most processors,
21010 ADD is faster than LEA. For processors like BONNELL, if the
21011 destination register of the LEA holds an actual address which will be
21012 used soon, LEA is better; otherwise ADD is better. */
21014 bool
21015 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21017 unsigned int regno0 = true_regnum (operands[0]);
21018 unsigned int regno1 = true_regnum (operands[1]);
21019 unsigned int regno2 = true_regnum (operands[2]);
21021 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21022 if (regno0 != regno1 && regno0 != regno2)
21023 return true;
21025 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21026 return false;
21028 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21031 /* Return true if destination reg of SET_BODY is shift count of
21032 USE_BODY. */
21034 static bool
21035 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21037 rtx set_dest;
21038 rtx shift_rtx;
21039 int i;
21041 /* Retrieve destination of SET_BODY. */
21042 switch (GET_CODE (set_body))
21044 case SET:
21045 set_dest = SET_DEST (set_body);
21046 if (!set_dest || !REG_P (set_dest))
21047 return false;
21048 break;
21049 case PARALLEL:
21050 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21051 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21052 use_body))
21053 return true;
21054 /* FALLTHROUGH */
21055 default:
21056 return false;
21059 /* Retrieve shift count of USE_BODY. */
21060 switch (GET_CODE (use_body))
21062 case SET:
21063 shift_rtx = XEXP (use_body, 1);
21064 break;
21065 case PARALLEL:
21066 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21067 if (ix86_dep_by_shift_count_body (set_body,
21068 XVECEXP (use_body, 0, i)))
21069 return true;
21070 /* FALLTHROUGH */
21071 default:
21072 return false;
21075 if (shift_rtx
21076 && (GET_CODE (shift_rtx) == ASHIFT
21077 || GET_CODE (shift_rtx) == LSHIFTRT
21078 || GET_CODE (shift_rtx) == ASHIFTRT
21079 || GET_CODE (shift_rtx) == ROTATE
21080 || GET_CODE (shift_rtx) == ROTATERT))
21082 rtx shift_count = XEXP (shift_rtx, 1);
21084 /* Return true if shift count is dest of SET_BODY. */
21085 if (REG_P (shift_count))
21087 /* Add this check since it can be invoked before register
21088 allocation by the pre-reload scheduler. */
21089 if (reload_completed
21090 && true_regnum (set_dest) == true_regnum (shift_count))
21091 return true;
21092 else if (REGNO(set_dest) == REGNO(shift_count))
21093 return true;
21097 return false;
21100 /* Return true if destination reg of SET_INSN is shift count of
21101 USE_INSN. */
21103 bool
21104 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21106 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21107 PATTERN (use_insn));
21110 /* Return TRUE or FALSE depending on whether the unary operator meets the
21111 appropriate constraints. */
21113 bool
21114 ix86_unary_operator_ok (enum rtx_code,
21115 machine_mode,
21116 rtx operands[2])
21118 /* If one of operands is memory, source and destination must match. */
21119 if ((MEM_P (operands[0])
21120 || MEM_P (operands[1]))
21121 && ! rtx_equal_p (operands[0], operands[1]))
21122 return false;
21123 return true;
21126 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21127 are ok, keeping in mind the possible movddup alternative. */
21129 bool
21130 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21132 if (MEM_P (operands[0]))
21133 return rtx_equal_p (operands[0], operands[1 + high]);
21134 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21135 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21136 return true;
21139 /* Post-reload splitter for converting an SF or DFmode value in an
21140 SSE register into an unsigned SImode. */
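/* This uses the common unsigned-fix idiom: values below 2**31 are
   converted directly, while values >= 2**31 have 2**31 subtracted before
   the signed conversion and the sign bit xor-ed back into the result.  */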
21142 void
21143 ix86_split_convert_uns_si_sse (rtx operands[])
21145 machine_mode vecmode;
21146 rtx value, large, zero_or_two31, input, two31, x;
21148 large = operands[1];
21149 zero_or_two31 = operands[2];
21150 input = operands[3];
21151 two31 = operands[4];
21152 vecmode = GET_MODE (large);
21153 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21155 /* Load up the value into the low element. We must ensure that the other
21156 elements are valid floats -- zero is the easiest such value. */
21157 if (MEM_P (input))
21159 if (vecmode == V4SFmode)
21160 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21161 else
21162 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21164 else
21166 input = gen_rtx_REG (vecmode, REGNO (input));
21167 emit_move_insn (value, CONST0_RTX (vecmode));
21168 if (vecmode == V4SFmode)
21169 emit_insn (gen_sse_movss (value, value, input));
21170 else
21171 emit_insn (gen_sse2_movsd (value, value, input));
21174 emit_move_insn (large, two31);
21175 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21177 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21178 emit_insn (gen_rtx_SET (large, x));
21180 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21181 emit_insn (gen_rtx_SET (zero_or_two31, x));
21183 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21184 emit_insn (gen_rtx_SET (value, x));
21186 large = gen_rtx_REG (V4SImode, REGNO (large));
21187 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21189 x = gen_rtx_REG (V4SImode, REGNO (value));
21190 if (vecmode == V4SFmode)
21191 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21192 else
21193 emit_insn (gen_sse2_cvttpd2dq (x, value));
21194 value = x;
21196 emit_insn (gen_xorv4si3 (value, value, large));
21199 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21200 Expects the 64-bit DImode to be supplied in a pair of integral
21201 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21202 -mfpmath=sse, !optimize_size only. */
21204 void
21205 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21207 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21208 rtx int_xmm, fp_xmm;
21209 rtx biases, exponents;
21210 rtx x;
21212 int_xmm = gen_reg_rtx (V4SImode);
21213 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21214 emit_insn (gen_movdi_to_sse (int_xmm, input));
21215 else if (TARGET_SSE_SPLIT_REGS)
21217 emit_clobber (int_xmm);
21218 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21220 else
21222 x = gen_reg_rtx (V2DImode);
21223 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21224 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21227 x = gen_rtx_CONST_VECTOR (V4SImode,
21228 gen_rtvec (4, GEN_INT (0x43300000UL),
21229 GEN_INT (0x45300000UL),
21230 const0_rtx, const0_rtx));
21231 exponents = validize_mem (force_const_mem (V4SImode, x));
21233 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21234 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21236 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21237 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21238 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21239 (0x1.0p84 + double(fp_value_hi_xmm)).
21240 Note these exponents differ by 32. */
21242 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21244 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21245 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21246 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21247 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21248 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21249 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21250 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21251 biases = validize_mem (force_const_mem (V2DFmode, biases));
21252 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21254 /* Add the upper and lower DFmode values together. */
21255 if (TARGET_SSE3)
21256 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21257 else
21259 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21260 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21261 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21264 ix86_expand_vector_extract (false, target, fp_xmm, 0);
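/* Worked example of the bias trick: for INPUT = 0x100000003 the halves
   are lo = 3 and hi = 1, so after the interleave the two doubles are
   0x1.0p52 + 3 and 0x1.0p84 + 1*2**32.  Subtracting the biases gives
   3.0 and 4294967296.0, and the final add produces 4294967299.0, the
   exact value of the unsigned input.  */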
21267 /* Not used, but eases macroization of patterns. */
21268 void
21269 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21271 gcc_unreachable ();
21274 /* Convert an unsigned SImode value into a DFmode. Only currently used
21275 for SSE, but applicable anywhere. */
21277 void
21278 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21280 REAL_VALUE_TYPE TWO31r;
21281 rtx x, fp;
21283 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21284 NULL, 1, OPTAB_DIRECT);
21286 fp = gen_reg_rtx (DFmode);
21287 emit_insn (gen_floatsidf2 (fp, x));
21289 real_ldexp (&TWO31r, &dconst1, 31);
21290 x = const_double_from_real_value (TWO31r, DFmode);
21292 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21293 if (x != target)
21294 emit_move_insn (target, x);
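/* The SImode addition of -2147483648 simply flips the sign bit, so the
   signed conversion sees INPUT - 2**31 and the final add of 0x1.0p31
   restores the unsigned value.  E.g. INPUT = 0xffffffff becomes
   2147483647.0 + 2147483648.0 = 4294967295.0.  */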
21297 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21298 32-bit mode; otherwise we have a direct convert instruction. */
21300 void
21301 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21303 REAL_VALUE_TYPE TWO32r;
21304 rtx fp_lo, fp_hi, x;
21306 fp_lo = gen_reg_rtx (DFmode);
21307 fp_hi = gen_reg_rtx (DFmode);
21309 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21311 real_ldexp (&TWO32r, &dconst1, 32);
21312 x = const_double_from_real_value (TWO32r, DFmode);
21313 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21315 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21317 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21318 0, OPTAB_DIRECT);
21319 if (x != target)
21320 emit_move_insn (target, x);
21323 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21324 For x86_32, -mfpmath=sse, !optimize_size only. */
21325 void
21326 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21328 REAL_VALUE_TYPE ONE16r;
21329 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21331 real_ldexp (&ONE16r, &dconst1, 16);
21332 x = const_double_from_real_value (ONE16r, SFmode);
21333 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21334 NULL, 0, OPTAB_DIRECT);
21335 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21336 NULL, 0, OPTAB_DIRECT);
21337 fp_hi = gen_reg_rtx (SFmode);
21338 fp_lo = gen_reg_rtx (SFmode);
21339 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21340 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21341 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21342 0, OPTAB_DIRECT);
21343 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21344 0, OPTAB_DIRECT);
21345 if (!rtx_equal_p (target, fp_hi))
21346 emit_move_insn (target, fp_hi);
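/* SFmode has only 24 mantissa bits, so the input is split into 16-bit
   halves that each convert exactly; the result is in effect
   (float) (input >> 16) * 0x1p16f + (float) (input & 0xffff), with the
   final addition doing the only rounding step.  */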
21349 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21350 a vector of unsigned ints VAL to vector of floats TARGET. */
21352 void
21353 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21355 rtx tmp[8];
21356 REAL_VALUE_TYPE TWO16r;
21357 machine_mode intmode = GET_MODE (val);
21358 machine_mode fltmode = GET_MODE (target);
21359 rtx (*cvt) (rtx, rtx);
21361 if (intmode == V4SImode)
21362 cvt = gen_floatv4siv4sf2;
21363 else
21364 cvt = gen_floatv8siv8sf2;
21365 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21366 tmp[0] = force_reg (intmode, tmp[0]);
21367 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21368 OPTAB_DIRECT);
21369 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21370 NULL_RTX, 1, OPTAB_DIRECT);
21371 tmp[3] = gen_reg_rtx (fltmode);
21372 emit_insn (cvt (tmp[3], tmp[1]));
21373 tmp[4] = gen_reg_rtx (fltmode);
21374 emit_insn (cvt (tmp[4], tmp[2]));
21375 real_ldexp (&TWO16r, &dconst1, 16);
21376 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21377 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21378 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21379 OPTAB_DIRECT);
21380 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21381 OPTAB_DIRECT);
21382 if (tmp[7] != target)
21383 emit_move_insn (target, tmp[7]);
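/* This is the lane-wise vector form of the same 16-bit split:
   target = (float) (val >> 16) * 0x1p16f + (float) (val & 0xffff)
   per element, so each half converts exactly.  */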
21386 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21387 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21388 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21389 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21391 rtx
21392 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21394 REAL_VALUE_TYPE TWO31r;
21395 rtx two31r, tmp[4];
21396 machine_mode mode = GET_MODE (val);
21397 machine_mode scalarmode = GET_MODE_INNER (mode);
21398 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21399 rtx (*cmp) (rtx, rtx, rtx, rtx);
21400 int i;
21402 for (i = 0; i < 3; i++)
21403 tmp[i] = gen_reg_rtx (mode);
21404 real_ldexp (&TWO31r, &dconst1, 31);
21405 two31r = const_double_from_real_value (TWO31r, scalarmode);
21406 two31r = ix86_build_const_vector (mode, 1, two31r);
21407 two31r = force_reg (mode, two31r);
21408 switch (mode)
21410 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21411 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21412 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21413 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21414 default: gcc_unreachable ();
21416 tmp[3] = gen_rtx_LE (mode, two31r, val);
21417 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21418 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21419 0, OPTAB_DIRECT);
21420 if (intmode == V4SImode || TARGET_AVX2)
21421 *xorp = expand_simple_binop (intmode, ASHIFT,
21422 gen_lowpart (intmode, tmp[0]),
21423 GEN_INT (31), NULL_RTX, 0,
21424 OPTAB_DIRECT);
21425 else
21427 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21428 two31 = ix86_build_const_vector (intmode, 1, two31);
21429 *xorp = expand_simple_binop (intmode, AND,
21430 gen_lowpart (intmode, tmp[0]),
21431 two31, NULL_RTX, 0,
21432 OPTAB_DIRECT);
21434 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21435 0, OPTAB_DIRECT);
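/* Example for a V4SF lane holding 3e9 (>= 0x1p31): the lane is returned
   as 3e9 - 2147483648.0 = 852516352.0; the caller's signed truncation
   gives 852516352 and xoring with the 0x80000000 lane from *XORP yields
   3000000000, the desired unsigned result.  */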
21438 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21439 then replicate the value for all elements of the vector
21440 register. */
21442 rtx
21443 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21445 int i, n_elt;
21446 rtvec v;
21447 machine_mode scalar_mode;
21449 switch (mode)
21451 case E_V64QImode:
21452 case E_V32QImode:
21453 case E_V16QImode:
21454 case E_V32HImode:
21455 case E_V16HImode:
21456 case E_V8HImode:
21457 case E_V16SImode:
21458 case E_V8SImode:
21459 case E_V4SImode:
21460 case E_V8DImode:
21461 case E_V4DImode:
21462 case E_V2DImode:
21463 gcc_assert (vect);
21464 /* FALLTHRU */
21465 case E_V16SFmode:
21466 case E_V8SFmode:
21467 case E_V4SFmode:
21468 case E_V8DFmode:
21469 case E_V4DFmode:
21470 case E_V2DFmode:
21471 n_elt = GET_MODE_NUNITS (mode);
21472 v = rtvec_alloc (n_elt);
21473 scalar_mode = GET_MODE_INNER (mode);
21475 RTVEC_ELT (v, 0) = value;
21477 for (i = 1; i < n_elt; ++i)
21478 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21480 return gen_rtx_CONST_VECTOR (mode, v);
21482 default:
21483 gcc_unreachable ();
21487 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21488 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21489 for an SSE register. If VECT is true, then replicate the mask for
21490 all elements of the vector register. If INVERT is true, then create
21491 a mask excluding the sign bit. */
21493 rtx
21494 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21496 machine_mode vec_mode, imode;
21497 wide_int w;
21498 rtx mask, v;
21500 switch (mode)
21502 case E_V16SImode:
21503 case E_V16SFmode:
21504 case E_V8SImode:
21505 case E_V4SImode:
21506 case E_V8SFmode:
21507 case E_V4SFmode:
21508 vec_mode = mode;
21509 imode = SImode;
21510 break;
21512 case E_V8DImode:
21513 case E_V4DImode:
21514 case E_V2DImode:
21515 case E_V8DFmode:
21516 case E_V4DFmode:
21517 case E_V2DFmode:
21518 vec_mode = mode;
21519 imode = DImode;
21520 break;
21522 case E_TImode:
21523 case E_TFmode:
21524 vec_mode = VOIDmode;
21525 imode = TImode;
21526 break;
21528 default:
21529 gcc_unreachable ();
21532 machine_mode inner_mode = GET_MODE_INNER (mode);
21533 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21534 GET_MODE_BITSIZE (inner_mode));
21535 if (invert)
21536 w = wi::bit_not (w);
21538 /* Force this value into the low part of a fp vector constant. */
21539 mask = immed_wide_int_const (w, imode);
21540 mask = gen_lowpart (inner_mode, mask);
21542 if (vec_mode == VOIDmode)
21543 return force_reg (inner_mode, mask);
21545 v = ix86_build_const_vector (vec_mode, vect, mask);
21546 return force_reg (vec_mode, v);
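/* For example, MODE == V4SFmode with VECT set returns a register holding
   0x80000000 in every lane (0x7fffffff in every lane when INVERT), ready
   to be used as an and/andnot/xor operand in the sign-bit tricks below.  */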
21549 /* Generate code for floating point ABS or NEG. */
21551 void
21552 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21553 rtx operands[])
21555 rtx mask, set, dst, src;
21556 bool use_sse = false;
21557 bool vector_mode = VECTOR_MODE_P (mode);
21558 machine_mode vmode = mode;
21560 if (vector_mode)
21561 use_sse = true;
21562 else if (mode == TFmode)
21563 use_sse = true;
21564 else if (TARGET_SSE_MATH)
21566 use_sse = SSE_FLOAT_MODE_P (mode);
21567 if (mode == SFmode)
21568 vmode = V4SFmode;
21569 else if (mode == DFmode)
21570 vmode = V2DFmode;
21573 /* NEG and ABS performed with SSE use bitwise mask operations.
21574 Create the appropriate mask now. */
21575 if (use_sse)
21576 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21577 else
21578 mask = NULL_RTX;
21580 dst = operands[0];
21581 src = operands[1];
21583 set = gen_rtx_fmt_e (code, mode, src);
21584 set = gen_rtx_SET (dst, set);
21586 if (mask)
21588 rtx use, clob;
21589 rtvec par;
21591 use = gen_rtx_USE (VOIDmode, mask);
21592 if (vector_mode)
21593 par = gen_rtvec (2, set, use);
21594 else
21596 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21597 par = gen_rtvec (3, set, use, clob);
21599 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21601 else
21602 emit_insn (set);
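/* The mask is chosen so that the operation can later be carried out as a
   single bitwise instruction on the SSE side: NEG as an xor with the
   sign-bit mask, ABS as an and with the inverted mask.  Without SSE the
   plain neg/abs pattern is emitted and the x87 fchs/fabs instructions
   are used.  */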
21605 /* Expand a copysign operation. Special case operand 0 being a constant. */
21607 void
21608 ix86_expand_copysign (rtx operands[])
21610 machine_mode mode, vmode;
21611 rtx dest, op0, op1, mask, nmask;
21613 dest = operands[0];
21614 op0 = operands[1];
21615 op1 = operands[2];
21617 mode = GET_MODE (dest);
21619 if (mode == SFmode)
21620 vmode = V4SFmode;
21621 else if (mode == DFmode)
21622 vmode = V2DFmode;
21623 else
21624 vmode = mode;
21626 if (CONST_DOUBLE_P (op0))
21628 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21630 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21631 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21633 if (mode == SFmode || mode == DFmode)
21635 if (op0 == CONST0_RTX (mode))
21636 op0 = CONST0_RTX (vmode);
21637 else
21639 rtx v = ix86_build_const_vector (vmode, false, op0);
21641 op0 = force_reg (vmode, v);
21644 else if (op0 != CONST0_RTX (mode))
21645 op0 = force_reg (mode, op0);
21647 mask = ix86_build_signbit_mask (vmode, 0, 0);
21649 if (mode == SFmode)
21650 copysign_insn = gen_copysignsf3_const;
21651 else if (mode == DFmode)
21652 copysign_insn = gen_copysigndf3_const;
21653 else
21654 copysign_insn = gen_copysigntf3_const;
21656 emit_insn (copysign_insn (dest, op0, op1, mask));
21658 else
21660 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21662 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21663 mask = ix86_build_signbit_mask (vmode, 0, 0);
21665 if (mode == SFmode)
21666 copysign_insn = gen_copysignsf3_var;
21667 else if (mode == DFmode)
21668 copysign_insn = gen_copysigndf3_var;
21669 else
21670 copysign_insn = gen_copysigntf3_var;
21672 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21676 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21677 be a constant, and so has already been expanded into a vector constant. */
21679 void
21680 ix86_split_copysign_const (rtx operands[])
21682 machine_mode mode, vmode;
21683 rtx dest, op0, mask, x;
21685 dest = operands[0];
21686 op0 = operands[1];
21687 mask = operands[3];
21689 mode = GET_MODE (dest);
21690 vmode = GET_MODE (mask);
21692 dest = lowpart_subreg (vmode, dest, mode);
21693 x = gen_rtx_AND (vmode, dest, mask);
21694 emit_insn (gen_rtx_SET (dest, x));
21696 if (op0 != CONST0_RTX (vmode))
21698 x = gen_rtx_IOR (vmode, dest, op0);
21699 emit_insn (gen_rtx_SET (dest, x));
21703 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21704 so we have to do two masks. */
21706 void
21707 ix86_split_copysign_var (rtx operands[])
21709 machine_mode mode, vmode;
21710 rtx dest, scratch, op0, op1, mask, nmask, x;
21712 dest = operands[0];
21713 scratch = operands[1];
21714 op0 = operands[2];
21715 op1 = operands[3];
21716 nmask = operands[4];
21717 mask = operands[5];
21719 mode = GET_MODE (dest);
21720 vmode = GET_MODE (mask);
21722 if (rtx_equal_p (op0, op1))
21724 /* Shouldn't happen often (it's useless, obviously), but when it does
21725 we'd generate incorrect code if we continue below. */
21726 emit_move_insn (dest, op0);
21727 return;
21730 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21732 gcc_assert (REGNO (op1) == REGNO (scratch));
21734 x = gen_rtx_AND (vmode, scratch, mask);
21735 emit_insn (gen_rtx_SET (scratch, x));
21737 dest = mask;
21738 op0 = lowpart_subreg (vmode, op0, mode);
21739 x = gen_rtx_NOT (vmode, dest);
21740 x = gen_rtx_AND (vmode, x, op0);
21741 emit_insn (gen_rtx_SET (dest, x));
21743 else
21745 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21747 x = gen_rtx_AND (vmode, scratch, mask);
21749 else /* alternative 2,4 */
21751 gcc_assert (REGNO (mask) == REGNO (scratch));
21752 op1 = lowpart_subreg (vmode, op1, mode);
21753 x = gen_rtx_AND (vmode, scratch, op1);
21755 emit_insn (gen_rtx_SET (scratch, x));
21757 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21759 dest = lowpart_subreg (vmode, op0, mode);
21760 x = gen_rtx_AND (vmode, dest, nmask);
21762 else /* alternative 3,4 */
21764 gcc_assert (REGNO (nmask) == REGNO (dest));
21765 dest = nmask;
21766 op0 = lowpart_subreg (vmode, op0, mode);
21767 x = gen_rtx_AND (vmode, dest, op0);
21769 emit_insn (gen_rtx_SET (dest, x));
21772 x = gen_rtx_IOR (vmode, dest, scratch);
21773 emit_insn (gen_rtx_SET (dest, x));
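/* Whatever register allocation alternative is taken, the net effect is

     dest = (op0 & ~signmask) | (op1 & signmask)

   i.e. the magnitude of op0 combined with the sign of op1.  */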
21776 /* Return TRUE or FALSE depending on whether the first SET in INSN
21777 has source and destination with matching CC modes, and that the
21778 CC mode is at least as constrained as REQ_MODE. */
21780 bool
21781 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21783 rtx set;
21784 machine_mode set_mode;
21786 set = PATTERN (insn);
21787 if (GET_CODE (set) == PARALLEL)
21788 set = XVECEXP (set, 0, 0);
21789 gcc_assert (GET_CODE (set) == SET);
21790 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21792 set_mode = GET_MODE (SET_DEST (set));
21793 switch (set_mode)
21795 case E_CCNOmode:
21796 if (req_mode != CCNOmode
21797 && (req_mode != CCmode
21798 || XEXP (SET_SRC (set), 1) != const0_rtx))
21799 return false;
21800 break;
21801 case E_CCmode:
21802 if (req_mode == CCGCmode)
21803 return false;
21804 /* FALLTHRU */
21805 case E_CCGCmode:
21806 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21807 return false;
21808 /* FALLTHRU */
21809 case E_CCGOCmode:
21810 if (req_mode == CCZmode)
21811 return false;
21812 /* FALLTHRU */
21813 case E_CCZmode:
21814 break;
21816 case E_CCGZmode:
21818 case E_CCAmode:
21819 case E_CCCmode:
21820 case E_CCOmode:
21821 case E_CCPmode:
21822 case E_CCSmode:
21823 if (set_mode != req_mode)
21824 return false;
21825 break;
21827 default:
21828 gcc_unreachable ();
21831 return GET_MODE (SET_SRC (set)) == set_mode;
21834 /* Generate insn patterns to do an integer compare of OPERANDS. */
21836 static rtx
21837 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21839 machine_mode cmpmode;
21840 rtx tmp, flags;
21842 cmpmode = SELECT_CC_MODE (code, op0, op1);
21843 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21845 /* This is very simple, but making the interface the same as in the
21846 FP case makes the rest of the code easier. */
21847 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21848 emit_insn (gen_rtx_SET (flags, tmp));
21850 /* Return the test that should be put into the flags user, i.e.
21851 the bcc, scc, or cmov instruction. */
21852 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21855 /* Figure out whether to use unordered fp comparisons. */
21857 static bool
21858 ix86_unordered_fp_compare (enum rtx_code code)
21860 if (!TARGET_IEEE_FP)
21861 return false;
21863 switch (code)
21865 case GT:
21866 case GE:
21867 case LT:
21868 case LE:
21869 return false;
21871 case EQ:
21872 case NE:
21874 case LTGT:
21875 case UNORDERED:
21876 case ORDERED:
21877 case UNLT:
21878 case UNLE:
21879 case UNGT:
21880 case UNGE:
21881 case UNEQ:
21882 return true;
21884 default:
21885 gcc_unreachable ();
21889 machine_mode
21890 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21892 machine_mode mode = GET_MODE (op0);
21894 if (SCALAR_FLOAT_MODE_P (mode))
21896 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21897 return CCFPmode;
21900 switch (code)
21902 /* Only zero flag is needed. */
21903 case EQ: /* ZF=0 */
21904 case NE: /* ZF!=0 */
21905 return CCZmode;
21906 /* Codes needing carry flag. */
21907 case GEU: /* CF=0 */
21908 case LTU: /* CF=1 */
21909 /* Detect overflow checks. They need just the carry flag. */
21910 if (GET_CODE (op0) == PLUS
21911 && (rtx_equal_p (op1, XEXP (op0, 0))
21912 || rtx_equal_p (op1, XEXP (op0, 1))))
21913 return CCCmode;
21914 else
21915 return CCmode;
21916 case GTU: /* CF=0 & ZF=0 */
21917 case LEU: /* CF=1 | ZF=1 */
21918 return CCmode;
21919 /* Codes possibly doable only with sign flag when
21920 comparing against zero. */
21921 case GE: /* SF=OF or SF=0 */
21922 case LT: /* SF<>OF or SF=1 */
21923 if (op1 == const0_rtx)
21924 return CCGOCmode;
21925 else
21926 /* For other cases Carry flag is not required. */
21927 return CCGCmode;
21928 /* Codes doable only with the sign flag when comparing
21929 against zero, but for which we lack a jump instruction,
21930 so we need to use relational tests against overflow,
21931 which therefore needs to be zero. */
21932 case GT: /* ZF=0 & SF=OF */
21933 case LE: /* ZF=1 | SF<>OF */
21934 if (op1 == const0_rtx)
21935 return CCNOmode;
21936 else
21937 return CCGCmode;
21938 /* The strcmp pattern does a (use flags), and combine may ask us for the
21939 proper mode. */
21940 case USE:
21941 return CCmode;
21942 default:
21943 gcc_unreachable ();
21947 /* Return the fixed registers used for condition codes. */
21949 static bool
21950 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21952 *p1 = FLAGS_REG;
21953 *p2 = FPSR_REG;
21954 return true;
21957 /* If two condition code modes are compatible, return a condition code
21958 mode which is compatible with both. Otherwise, return
21959 VOIDmode. */
21961 static machine_mode
21962 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21964 if (m1 == m2)
21965 return m1;
21967 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21968 return VOIDmode;
21970 if ((m1 == CCGCmode && m2 == CCGOCmode)
21971 || (m1 == CCGOCmode && m2 == CCGCmode))
21972 return CCGCmode;
21974 if ((m1 == CCNOmode && m2 == CCGOCmode)
21975 || (m1 == CCGOCmode && m2 == CCNOmode))
21976 return CCNOmode;
21978 if (m1 == CCZmode
21979 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21980 return m2;
21981 else if (m2 == CCZmode
21982 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21983 return m1;
21985 switch (m1)
21987 default:
21988 gcc_unreachable ();
21990 case E_CCmode:
21991 case E_CCGCmode:
21992 case E_CCGOCmode:
21993 case E_CCNOmode:
21994 case E_CCAmode:
21995 case E_CCCmode:
21996 case E_CCOmode:
21997 case E_CCPmode:
21998 case E_CCSmode:
21999 case E_CCZmode:
22000 switch (m2)
22002 default:
22003 return VOIDmode;
22005 case E_CCmode:
22006 case E_CCGCmode:
22007 case E_CCGOCmode:
22008 case E_CCNOmode:
22009 case E_CCAmode:
22010 case E_CCCmode:
22011 case E_CCOmode:
22012 case E_CCPmode:
22013 case E_CCSmode:
22014 case E_CCZmode:
22015 return CCmode;
22018 case E_CCFPmode:
22019 /* These are only compatible with themselves, which we already
22020 checked above. */
22021 return VOIDmode;
22026 /* Return a comparison we can do and that it is equivalent to
22027 swap_condition (code) apart possibly from orderedness.
22028 But, never change orderedness if TARGET_IEEE_FP, returning
22029 UNKNOWN in that case if necessary. */
22031 static enum rtx_code
22032 ix86_fp_swap_condition (enum rtx_code code)
22034 switch (code)
22036 case GT: /* GTU - CF=0 & ZF=0 */
22037 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22038 case GE: /* GEU - CF=0 */
22039 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22040 case UNLT: /* LTU - CF=1 */
22041 return TARGET_IEEE_FP ? UNKNOWN : GT;
22042 case UNLE: /* LEU - CF=1 | ZF=1 */
22043 return TARGET_IEEE_FP ? UNKNOWN : GE;
22044 default:
22045 return swap_condition (code);
22049 /* Return the cost of comparison CODE using the best strategy for performance.
22050 All of the following functions use the number of instructions as the cost metric.
22051 In the future this should be tweaked to compute bytes for optimize_size and
22052 take into account the performance of various instructions on various CPUs. */
22054 static int
22055 ix86_fp_comparison_cost (enum rtx_code code)
22057 int arith_cost;
22059 /* The cost of code using bit-twiddling on %ah. */
22060 switch (code)
22062 case UNLE:
22063 case UNLT:
22064 case LTGT:
22065 case GT:
22066 case GE:
22067 case UNORDERED:
22068 case ORDERED:
22069 case UNEQ:
22070 arith_cost = 4;
22071 break;
22072 case LT:
22073 case NE:
22074 case EQ:
22075 case UNGE:
22076 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22077 break;
22078 case LE:
22079 case UNGT:
22080 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22081 break;
22082 default:
22083 gcc_unreachable ();
22086 switch (ix86_fp_comparison_strategy (code))
22088 case IX86_FPCMP_COMI:
22089 return arith_cost > 4 ? 3 : 2;
22090 case IX86_FPCMP_SAHF:
22091 return arith_cost > 4 ? 4 : 3;
22092 default:
22093 return arith_cost;
22097 /* Return strategy to use for floating-point. We assume that fcomi is always
22098 preferable where available, since that is also true when looking at size
22099 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22101 enum ix86_fpcmp_strategy
22102 ix86_fp_comparison_strategy (enum rtx_code)
22104 /* Do fcomi/sahf based test when profitable. */
22106 if (TARGET_CMOVE)
22107 return IX86_FPCMP_COMI;
22109 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22110 return IX86_FPCMP_SAHF;
22112 return IX86_FPCMP_ARITH;
22115 /* Swap, force into registers, or otherwise massage the two operands
22116 to a fp comparison. The operands are updated in place; the new
22117 comparison code is returned. */
22119 static enum rtx_code
22120 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22122 bool unordered_compare = ix86_unordered_fp_compare (code);
22123 rtx op0 = *pop0, op1 = *pop1;
22124 machine_mode op_mode = GET_MODE (op0);
22125 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22127 /* All of the unordered compare instructions only work on registers.
22128 The same is true of the fcomi compare instructions. The XFmode
22129 compare instructions require registers except when comparing
22130 against zero or when converting operand 1 from fixed point to
22131 floating point. */
22133 if (!is_sse
22134 && (unordered_compare
22135 || (op_mode == XFmode
22136 && ! (standard_80387_constant_p (op0) == 1
22137 || standard_80387_constant_p (op1) == 1)
22138 && GET_CODE (op1) != FLOAT)
22139 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22141 op0 = force_reg (op_mode, op0);
22142 op1 = force_reg (op_mode, op1);
22144 else
22146 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22147 things around if they appear profitable, otherwise force op0
22148 into a register. */
22150 if (standard_80387_constant_p (op0) == 0
22151 || (MEM_P (op0)
22152 && ! (standard_80387_constant_p (op1) == 0
22153 || MEM_P (op1))))
22155 enum rtx_code new_code = ix86_fp_swap_condition (code);
22156 if (new_code != UNKNOWN)
22158 std::swap (op0, op1);
22159 code = new_code;
22163 if (!REG_P (op0))
22164 op0 = force_reg (op_mode, op0);
22166 if (CONSTANT_P (op1))
22168 int tmp = standard_80387_constant_p (op1);
22169 if (tmp == 0)
22170 op1 = validize_mem (force_const_mem (op_mode, op1));
22171 else if (tmp == 1)
22173 if (TARGET_CMOVE)
22174 op1 = force_reg (op_mode, op1);
22176 else
22177 op1 = force_reg (op_mode, op1);
22181 /* Try to rearrange the comparison to make it cheaper. */
22182 if (ix86_fp_comparison_cost (code)
22183 > ix86_fp_comparison_cost (swap_condition (code))
22184 && (REG_P (op1) || can_create_pseudo_p ()))
22186 std::swap (op0, op1);
22187 code = swap_condition (code);
22188 if (!REG_P (op0))
22189 op0 = force_reg (op_mode, op0);
22192 *pop0 = op0;
22193 *pop1 = op1;
22194 return code;
22197 /* Convert the comparison codes we use to represent an FP comparison to the
22198 integer code that will result in a proper branch. Return UNKNOWN if no
22199 such code is available. */
22201 enum rtx_code
22202 ix86_fp_compare_code_to_integer (enum rtx_code code)
22204 switch (code)
22206 case GT:
22207 return GTU;
22208 case GE:
22209 return GEU;
22210 case ORDERED:
22211 case UNORDERED:
22212 return code;
22213 case UNEQ:
22214 return EQ;
22215 case UNLT:
22216 return LTU;
22217 case UNLE:
22218 return LEU;
22219 case LTGT:
22220 return NE;
22221 default:
22222 return UNKNOWN;
22226 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22228 static rtx
22229 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22231 bool unordered_compare = ix86_unordered_fp_compare (code);
22232 machine_mode intcmp_mode;
22233 rtx tmp, tmp2;
22235 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22237 /* Do fcomi/sahf based test when profitable. */
22238 switch (ix86_fp_comparison_strategy (code))
22240 case IX86_FPCMP_COMI:
22241 intcmp_mode = CCFPmode;
22242 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22243 if (unordered_compare)
22244 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22245 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22246 break;
22248 case IX86_FPCMP_SAHF:
22249 intcmp_mode = CCFPmode;
22250 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22251 if (unordered_compare)
22252 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22253 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22254 if (!scratch)
22255 scratch = gen_reg_rtx (HImode);
22256 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22257 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22258 break;
22260 case IX86_FPCMP_ARITH:
22261 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22262 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22263 if (unordered_compare)
22264 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22265 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22266 if (!scratch)
22267 scratch = gen_reg_rtx (HImode);
22268 emit_insn (gen_rtx_SET (scratch, tmp));
22270 /* In the unordered case, we have to check C2 for NaN's, which
22271 doesn't happen to work out to anything nice combination-wise.
22272 So do some bit twiddling on the value we've got in AH to come
22273 up with an appropriate set of condition codes. */
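/* The scratch register holds the FPU status word; the tests below look at
   its high byte, where C0 = 0x01, C2 = 0x04 and C3 = 0x40 (C0 set means
   op0 < op1, C3 set means equal, all three set means unordered).  The
   recurring 0x45 mask therefore tests C3|C2|C0, and 0x04 tests just the
   "unordered" bit C2.  */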
22275 intcmp_mode = CCNOmode;
22276 switch (code)
22278 case GT:
22279 case UNGT:
22280 if (code == GT || !TARGET_IEEE_FP)
22282 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22283 code = EQ;
22285 else
22287 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22288 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22289 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22290 intcmp_mode = CCmode;
22291 code = GEU;
22293 break;
22294 case LT:
22295 case UNLT:
22296 if (code == LT && TARGET_IEEE_FP)
22298 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22299 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22300 intcmp_mode = CCmode;
22301 code = EQ;
22303 else
22305 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22306 code = NE;
22308 break;
22309 case GE:
22310 case UNGE:
22311 if (code == GE || !TARGET_IEEE_FP)
22313 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22314 code = EQ;
22316 else
22318 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22319 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22320 code = NE;
22322 break;
22323 case LE:
22324 case UNLE:
22325 if (code == LE && TARGET_IEEE_FP)
22327 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22328 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22329 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22330 intcmp_mode = CCmode;
22331 code = LTU;
22333 else
22335 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22336 code = NE;
22338 break;
22339 case EQ:
22340 case UNEQ:
22341 if (code == EQ && TARGET_IEEE_FP)
22343 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22344 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22345 intcmp_mode = CCmode;
22346 code = EQ;
22348 else
22350 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22351 code = NE;
22353 break;
22354 case NE:
22355 case LTGT:
22356 if (code == NE && TARGET_IEEE_FP)
22358 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22359 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22360 GEN_INT (0x40)));
22361 code = NE;
22363 else
22365 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22366 code = EQ;
22368 break;
22370 case UNORDERED:
22371 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22372 code = NE;
22373 break;
22374 case ORDERED:
22375 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22376 code = EQ;
22377 break;
22379 default:
22380 gcc_unreachable ();
22382 break;
22384 default:
22385 gcc_unreachable();
22388 /* Return the test that should be put into the flags user, i.e.
22389 the bcc, scc, or cmov instruction. */
22390 return gen_rtx_fmt_ee (code, VOIDmode,
22391 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22392 const0_rtx);
22395 static rtx
22396 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22398 rtx ret;
22400 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22401 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22403 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22405 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22406 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22408 else
22409 ret = ix86_expand_int_compare (code, op0, op1);
22411 return ret;
22414 void
22415 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22417 machine_mode mode = GET_MODE (op0);
22418 rtx tmp;
22420 /* Handle the special case of a vector comparison with a boolean result;
22421 transform it using the ptest instruction. */
22422 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22424 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22425 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22427 gcc_assert (code == EQ || code == NE);
22428 /* Generate XOR since we can't check that one operand is a zero vector. */
22429 tmp = gen_reg_rtx (mode);
22430 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22431 tmp = gen_lowpart (p_mode, tmp);
22432 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22433 gen_rtx_UNSPEC (CCmode,
22434 gen_rtvec (2, tmp, tmp),
22435 UNSPEC_PTEST)));
22436 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22437 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22438 gen_rtx_LABEL_REF (VOIDmode, label),
22439 pc_rtx);
22440 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22441 return;
22444 switch (mode)
22446 case E_SFmode:
22447 case E_DFmode:
22448 case E_XFmode:
22449 case E_QImode:
22450 case E_HImode:
22451 case E_SImode:
22452 simple:
22453 tmp = ix86_expand_compare (code, op0, op1);
22454 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22455 gen_rtx_LABEL_REF (VOIDmode, label),
22456 pc_rtx);
22457 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22458 return;
22460 case E_DImode:
22461 if (TARGET_64BIT)
22462 goto simple;
22463 /* For a 32-bit target a DImode comparison may be performed on
22464 SSE registers. To allow this we should avoid a split
22465 to SImode, which is achieved by doing the xor in DImode
22466 and then comparing with zero (which is recognized by the
22467 STV pass). We don't compare using xor when optimizing
22468 for size. */
22469 if (!optimize_insn_for_size_p ()
22470 && TARGET_STV
22471 && (code == EQ || code == NE))
22473 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22474 op1 = const0_rtx;
22476 /* FALLTHRU */
22477 case E_TImode:
22478 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
22480 rtx lo[2], hi[2];
22481 rtx_code_label *label2;
22482 enum rtx_code code1, code2, code3;
22483 machine_mode submode;
22485 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22487 std::swap (op0, op1);
22488 code = swap_condition (code);
22491 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22492 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22494 submode = mode == DImode ? SImode : DImode;
22496 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22497 avoid two branches. This costs one extra insn, so disable when
22498 optimizing for size. */
22500 if ((code == EQ || code == NE)
22501 && (!optimize_insn_for_size_p ()
22502 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22504 rtx xor0, xor1;
22506 xor1 = hi[0];
22507 if (hi[1] != const0_rtx)
22508 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22509 NULL_RTX, 0, OPTAB_WIDEN);
22511 xor0 = lo[0];
22512 if (lo[1] != const0_rtx)
22513 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22514 NULL_RTX, 0, OPTAB_WIDEN);
22516 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22517 NULL_RTX, 0, OPTAB_WIDEN);
22519 ix86_expand_branch (code, tmp, const0_rtx, label);
22520 return;
22523 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22524 op1 is a constant and the low word is zero, then we can just
22525 examine the high word. Similarly for low word -1 and
22526 less-or-equal-than or greater-than. */
22528 if (CONST_INT_P (hi[1]))
22529 switch (code)
22531 case LT: case LTU: case GE: case GEU:
22532 if (lo[1] == const0_rtx)
22534 ix86_expand_branch (code, hi[0], hi[1], label);
22535 return;
22537 break;
22538 case LE: case LEU: case GT: case GTU:
22539 if (lo[1] == constm1_rtx)
22541 ix86_expand_branch (code, hi[0], hi[1], label);
22542 return;
22544 break;
22545 default:
22546 break;
22549 /* Emulate comparisons that do not depend on Zero flag with
22550 double-word subtraction. Note that only Overflow, Sign
22551 and Carry flags are valid, so swap arguments and condition
22552 of comparisons that would otherwise test Zero flag. */
22554 switch (code)
22556 case LE: case LEU: case GT: case GTU:
22557 std::swap (lo[0], lo[1]);
22558 std::swap (hi[0], hi[1]);
22559 code = swap_condition (code);
22560 /* FALLTHRU */
22562 case LT: case LTU: case GE: case GEU:
22564 rtx (*cmp_insn) (rtx, rtx);
22565 rtx (*sbb_insn) (rtx, rtx, rtx);
22566 bool uns = (code == LTU || code == GEU);
22568 if (TARGET_64BIT)
22570 cmp_insn = gen_cmpdi_1;
22571 sbb_insn
22572 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22574 else
22576 cmp_insn = gen_cmpsi_1;
22577 sbb_insn
22578 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22581 if (!nonimmediate_operand (lo[0], submode))
22582 lo[0] = force_reg (submode, lo[0]);
22583 if (!x86_64_general_operand (lo[1], submode))
22584 lo[1] = force_reg (submode, lo[1]);
22586 if (!register_operand (hi[0], submode))
22587 hi[0] = force_reg (submode, hi[0]);
22588 if ((uns && !nonimmediate_operand (hi[1], submode))
22589 || (!uns && !x86_64_general_operand (hi[1], submode)))
22590 hi[1] = force_reg (submode, hi[1]);
22592 emit_insn (cmp_insn (lo[0], lo[1]));
22593 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22595 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22597 ix86_expand_branch (code, tmp, const0_rtx, label);
22598 return;
22601 default:
22602 break;
22605 /* Otherwise, we need two or three jumps. */
22607 label2 = gen_label_rtx ();
22609 code1 = code;
22610 code2 = swap_condition (code);
22611 code3 = unsigned_condition (code);
22613 switch (code)
22615 case LT: case GT: case LTU: case GTU:
22616 break;
22618 case LE: code1 = LT; code2 = GT; break;
22619 case GE: code1 = GT; code2 = LT; break;
22620 case LEU: code1 = LTU; code2 = GTU; break;
22621 case GEU: code1 = GTU; code2 = LTU; break;
22623 case EQ: code1 = UNKNOWN; code2 = NE; break;
22624 case NE: code2 = UNKNOWN; break;
22626 default:
22627 gcc_unreachable ();
22631 * a < b =>
22632 * if (hi(a) < hi(b)) goto true;
22633 * if (hi(a) > hi(b)) goto false;
22634 * if (lo(a) < lo(b)) goto true;
22635 * false:
22638 if (code1 != UNKNOWN)
22639 ix86_expand_branch (code1, hi[0], hi[1], label);
22640 if (code2 != UNKNOWN)
22641 ix86_expand_branch (code2, hi[0], hi[1], label2);
22643 ix86_expand_branch (code3, lo[0], lo[1], label);
22645 if (code2 != UNKNOWN)
22646 emit_label (label2);
22647 return;
22650 default:
22651 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22652 goto simple;
22656 void
22657 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22659 rtx ret;
22661 gcc_assert (GET_MODE (dest) == QImode);
22663 ret = ix86_expand_compare (code, op0, op1);
22664 PUT_MODE (ret, QImode);
22665 emit_insn (gen_rtx_SET (dest, ret));
22668 /* Expand comparison setting or clearing carry flag. Return true when
22669 successful and set pop for the operation. */
22670 static bool
22671 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22673 machine_mode mode =
22674 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22676 /* Do not handle double-mode compares that go through the special path. */
22677 if (mode == (TARGET_64BIT ? TImode : DImode))
22678 return false;
22680 if (SCALAR_FLOAT_MODE_P (mode))
22682 rtx compare_op;
22683 rtx_insn *compare_seq;
22685 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22687 /* Shortcut: the following common codes never translate
22688 into carry-flag compares. */
22689 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22690 || code == ORDERED || code == UNORDERED)
22691 return false;
22693 /* These comparisons require zero flag; swap operands so they won't. */
22694 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22695 && !TARGET_IEEE_FP)
22697 std::swap (op0, op1);
22698 code = swap_condition (code);
22701 /* Try to expand the comparison and verify that we end up with a
22702 carry-flag-based comparison. This fails to be true only when
22703 we decide to expand the comparison using arithmetic, which is not
22704 a very common scenario. */
22705 start_sequence ();
22706 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22707 compare_seq = get_insns ();
22708 end_sequence ();
22710 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22711 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22712 else
22713 code = GET_CODE (compare_op);
22715 if (code != LTU && code != GEU)
22716 return false;
22718 emit_insn (compare_seq);
22719 *pop = compare_op;
22720 return true;
22723 if (!INTEGRAL_MODE_P (mode))
22724 return false;
22726 switch (code)
22728 case LTU:
22729 case GEU:
22730 break;
22732 /* Convert a==0 into (unsigned)a<1. */
22733 case EQ:
22734 case NE:
22735 if (op1 != const0_rtx)
22736 return false;
22737 op1 = const1_rtx;
22738 code = (code == EQ ? LTU : GEU);
22739 break;
22741 /* Convert a>b into b<a or a>=b+1. */
22742 case GTU:
22743 case LEU:
22744 if (CONST_INT_P (op1))
22746 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22747 /* Bail out on overflow. We still can swap operands but that
22748 would force loading of the constant into register. */
22749 if (op1 == const0_rtx
22750 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22751 return false;
22752 code = (code == GTU ? GEU : LTU);
22754 else
22756 std::swap (op0, op1);
22757 code = (code == GTU ? LTU : GEU);
22759 break;
22761 /* Convert a>=0 into (unsigned)a<0x80000000. */
22762 case LT:
22763 case GE:
22764 if (mode == DImode || op1 != const0_rtx)
22765 return false;
22766 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22767 code = (code == LT ? GEU : LTU);
22768 break;
22769 case LE:
22770 case GT:
22771 if (mode == DImode || op1 != constm1_rtx)
22772 return false;
22773 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22774 code = (code == LE ? GEU : LTU);
22775 break;
22777 default:
22778 return false;
22780 /* Swapping operands may cause constant to appear as first operand. */
22781 if (!nonimmediate_operand (op0, VOIDmode))
22783 if (!can_create_pseudo_p ())
22784 return false;
22785 op0 = force_reg (mode, op0);
22787 *pop = ix86_expand_compare (code, op0, op1);
22788 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22789 return true;
22792 bool
22793 ix86_expand_int_movcc (rtx operands[])
22795 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22796 rtx_insn *compare_seq;
22797 rtx compare_op;
22798 machine_mode mode = GET_MODE (operands[0]);
22799 bool sign_bit_compare_p = false;
22800 rtx op0 = XEXP (operands[1], 0);
22801 rtx op1 = XEXP (operands[1], 1);
22803 if (GET_MODE (op0) == TImode
22804 || (GET_MODE (op0) == DImode
22805 && !TARGET_64BIT))
22806 return false;
22808 start_sequence ();
22809 compare_op = ix86_expand_compare (code, op0, op1);
22810 compare_seq = get_insns ();
22811 end_sequence ();
22813 compare_code = GET_CODE (compare_op);
22815 if ((op1 == const0_rtx && (code == GE || code == LT))
22816 || (op1 == constm1_rtx && (code == GT || code == LE)))
22817 sign_bit_compare_p = true;
22819 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22820 HImode insns, we'd be swallowed in word prefix ops. */
22822 if ((mode != HImode || TARGET_FAST_PREFIX)
22823 && (mode != (TARGET_64BIT ? TImode : DImode))
22824 && CONST_INT_P (operands[2])
22825 && CONST_INT_P (operands[3]))
22827 rtx out = operands[0];
22828 HOST_WIDE_INT ct = INTVAL (operands[2]);
22829 HOST_WIDE_INT cf = INTVAL (operands[3]);
22830 HOST_WIDE_INT diff;
22832 diff = ct - cf;
22833 /* Sign bit compares are better done using shifts than by using
22834 sbb. */
22835 if (sign_bit_compare_p
22836 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22838 /* Detect overlap between destination and compare sources. */
22839 rtx tmp = out;
22841 if (!sign_bit_compare_p)
22843 rtx flags;
22844 bool fpcmp = false;
22846 compare_code = GET_CODE (compare_op);
22848 flags = XEXP (compare_op, 0);
22850 if (GET_MODE (flags) == CCFPmode)
22852 fpcmp = true;
22853 compare_code
22854 = ix86_fp_compare_code_to_integer (compare_code);
22857 /* To simplify the rest of the code, restrict to the GEU case. */
22858 if (compare_code == LTU)
22860 std::swap (ct, cf);
22861 compare_code = reverse_condition (compare_code);
22862 code = reverse_condition (code);
22864 else
22866 if (fpcmp)
22867 PUT_CODE (compare_op,
22868 reverse_condition_maybe_unordered
22869 (GET_CODE (compare_op)));
22870 else
22871 PUT_CODE (compare_op,
22872 reverse_condition (GET_CODE (compare_op)));
22874 diff = ct - cf;
22876 if (reg_overlap_mentioned_p (out, op0)
22877 || reg_overlap_mentioned_p (out, op1))
22878 tmp = gen_reg_rtx (mode);
22880 if (mode == DImode)
22881 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22882 else
22883 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22884 flags, compare_op));
22886 else
22888 if (code == GT || code == GE)
22889 code = reverse_condition (code);
22890 else
22892 std::swap (ct, cf);
22893 diff = ct - cf;
22895 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22898 if (diff == 1)
22901 * cmpl op0,op1
22902 * sbbl dest,dest
22903 * [addl dest, ct]
22905 * Size 5 - 8.
22907 if (ct)
22908 tmp = expand_simple_binop (mode, PLUS,
22909 tmp, GEN_INT (ct),
22910 copy_rtx (tmp), 1, OPTAB_DIRECT);
22912 else if (cf == -1)
22915 * cmpl op0,op1
22916 * sbbl dest,dest
22917 * orl $ct, dest
22919 * Size 8.
22921 tmp = expand_simple_binop (mode, IOR,
22922 tmp, GEN_INT (ct),
22923 copy_rtx (tmp), 1, OPTAB_DIRECT);
22925 else if (diff == -1 && ct)
22928 * cmpl op0,op1
22929 * sbbl dest,dest
22930 * notl dest
22931 * [addl dest, cf]
22933 * Size 8 - 11.
22935 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22936 if (cf)
22937 tmp = expand_simple_binop (mode, PLUS,
22938 copy_rtx (tmp), GEN_INT (cf),
22939 copy_rtx (tmp), 1, OPTAB_DIRECT);
22941 else
22944 * cmpl op0,op1
22945 * sbbl dest,dest
22946 * [notl dest]
22947 * andl cf - ct, dest
22948 * [addl dest, ct]
22950 * Size 8 - 11.
22953 if (cf == 0)
22955 cf = ct;
22956 ct = 0;
22957 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22960 tmp = expand_simple_binop (mode, AND,
22961 copy_rtx (tmp),
22962 gen_int_mode (cf - ct, mode),
22963 copy_rtx (tmp), 1, OPTAB_DIRECT);
22964 if (ct)
22965 tmp = expand_simple_binop (mode, PLUS,
22966 copy_rtx (tmp), GEN_INT (ct),
22967 copy_rtx (tmp), 1, OPTAB_DIRECT);
22970 if (!rtx_equal_p (tmp, out))
22971 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22973 return true;
22976 if (diff < 0)
22978 machine_mode cmp_mode = GET_MODE (op0);
22979 enum rtx_code new_code;
22981 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22983 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22985 /* We may be reversing an unordered compare to a normal compare, which
22986 is not valid in general (we may convert a non-trapping condition
22987 to a trapping one); however, on i386 we currently emit all
22988 comparisons unordered. */
22989 new_code = reverse_condition_maybe_unordered (code);
22991 else
22992 new_code = ix86_reverse_condition (code, cmp_mode);
22993 if (new_code != UNKNOWN)
22995 std::swap (ct, cf);
22996 diff = -diff;
22997 code = new_code;
23001 compare_code = UNKNOWN;
23002 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23003 && CONST_INT_P (op1))
23005 if (op1 == const0_rtx
23006 && (code == LT || code == GE))
23007 compare_code = code;
23008 else if (op1 == constm1_rtx)
23010 if (code == LE)
23011 compare_code = LT;
23012 else if (code == GT)
23013 compare_code = GE;
23017 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23018 if (compare_code != UNKNOWN
23019 && GET_MODE (op0) == GET_MODE (out)
23020 && (cf == -1 || ct == -1))
23022 /* If lea code below could be used, only optimize
23023 if it results in a 2 insn sequence. */
23025 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23026 || diff == 3 || diff == 5 || diff == 9)
23027 || (compare_code == LT && ct == -1)
23028 || (compare_code == GE && cf == -1))
23031 * notl op1 (if necessary)
23032 * sarl $31, op1
23033 * orl cf, op1
23035 if (ct != -1)
23037 cf = ct;
23038 ct = -1;
23039 code = reverse_condition (code);
23042 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23044 out = expand_simple_binop (mode, IOR,
23045 out, GEN_INT (cf),
23046 out, 1, OPTAB_DIRECT);
23047 if (out != operands[0])
23048 emit_move_insn (operands[0], out);
23050 return true;
23055 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23056 || diff == 3 || diff == 5 || diff == 9)
23057 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23058 && (mode != DImode
23059 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23062 * xorl dest,dest
23063 * cmpl op1,op2
23064 * setcc dest
23065 * lea cf(dest*(ct-cf)),dest
23067 * Size 14.
23069 * This also catches the degenerate setcc-only case.
23072 rtx tmp;
23073 int nops;
23075 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23077 nops = 0;
23078 /* On x86_64 the lea instruction operates on Pmode, so we need
23079 to get the arithmetic done in the proper mode to match. */
23080 if (diff == 1)
23081 tmp = copy_rtx (out);
23082 else
23084 rtx out1;
23085 out1 = copy_rtx (out);
23086 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23087 nops++;
23088 if (diff & 1)
23090 tmp = gen_rtx_PLUS (mode, tmp, out1);
23091 nops++;
23094 if (cf != 0)
23096 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23097 nops++;
23099 if (!rtx_equal_p (tmp, out))
23101 if (nops == 1)
23102 out = force_operand (tmp, copy_rtx (out));
23103 else
23104 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23106 if (!rtx_equal_p (out, operands[0]))
23107 emit_move_insn (operands[0], copy_rtx (out));
23109 return true;
23113 * General case: Jumpful:
23114 * xorl dest,dest cmpl op1, op2
23115 * cmpl op1, op2 movl ct, dest
23116 * setcc dest jcc 1f
23117 * decl dest movl cf, dest
23118 * andl (cf-ct),dest 1:
23119 * addl ct,dest
23121 * Size 20. Size 14.
23123 * This is reasonably steep, but branch mispredict costs are
23124 * high on modern cpus, so consider failing only if optimizing
23125 * for space.
23128 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23129 && BRANCH_COST (optimize_insn_for_speed_p (),
23130 false) >= 2)
23132 if (cf == 0)
23134 machine_mode cmp_mode = GET_MODE (op0);
23135 enum rtx_code new_code;
23137 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23139 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23141 /* We may be reversing an unordered compare to a normal compare,
23142 which is not valid in general (we may convert a non-trapping
23143 condition to a trapping one); however, on i386 we currently
23144 emit all comparisons unordered. */
23145 new_code = reverse_condition_maybe_unordered (code);
23147 else
23149 new_code = ix86_reverse_condition (code, cmp_mode);
23150 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23151 compare_code = reverse_condition (compare_code);
23154 if (new_code != UNKNOWN)
23156 cf = ct;
23157 ct = 0;
23158 code = new_code;
23162 if (compare_code != UNKNOWN)
23164 /* notl op1 (if needed)
23165 sarl $31, op1
23166 andl (cf-ct), op1
23167 addl ct, op1
23169 For x < 0 (resp. x <= -1) there will be no notl,
23170 so if possible swap the constants to get rid of the
23171 complement.
23172 True/false will be -1/0 while code below (store flag
23173 followed by decrement) is 0/-1, so the constants need
23174 to be exchanged once more. */
23176 if (compare_code == GE || !cf)
23178 code = reverse_condition (code);
23179 compare_code = LT;
23181 else
23182 std::swap (ct, cf);
23184 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23186 else
23188 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23190 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23191 constm1_rtx,
23192 copy_rtx (out), 1, OPTAB_DIRECT);
23195 out = expand_simple_binop (mode, AND, copy_rtx (out),
23196 gen_int_mode (cf - ct, mode),
23197 copy_rtx (out), 1, OPTAB_DIRECT);
23198 if (ct)
23199 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23200 copy_rtx (out), 1, OPTAB_DIRECT);
23201 if (!rtx_equal_p (out, operands[0]))
23202 emit_move_insn (operands[0], copy_rtx (out));
23204 return true;
23208 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23210 /* Try a few things more with specific constants and a variable. */
23212 optab op;
23213 rtx var, orig_out, out, tmp;
23215 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23216 return false;
23218 /* If one of the two operands is an interesting constant, load a
23219 constant with the above and mask it in with a logical operation. */
23221 if (CONST_INT_P (operands[2]))
23223 var = operands[3];
23224 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23225 operands[3] = constm1_rtx, op = and_optab;
23226 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23227 operands[3] = const0_rtx, op = ior_optab;
23228 else
23229 return false;
23231 else if (CONST_INT_P (operands[3]))
23233 var = operands[2];
23234 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23235 operands[2] = constm1_rtx, op = and_optab;
23236 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23237 operands[2] = const0_rtx, op = ior_optab;
23238 else
23239 return false;
23241 else
23242 return false;
23244 orig_out = operands[0];
23245 tmp = gen_reg_rtx (mode);
23246 operands[0] = tmp;
23248 /* Recurse to get the constant loaded. */
23249 if (!ix86_expand_int_movcc (operands))
23250 return false;
23252 /* Mask in the interesting variable. */
23253 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23254 OPTAB_WIDEN);
23255 if (!rtx_equal_p (out, orig_out))
23256 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23258 return true;
23262 * For comparison with above,
23264 * movl cf,dest
23265 * movl ct,tmp
23266 * cmpl op1,op2
23267 * cmovcc tmp,dest
23269 * Size 15.
23272 if (! nonimmediate_operand (operands[2], mode))
23273 operands[2] = force_reg (mode, operands[2]);
23274 if (! nonimmediate_operand (operands[3], mode))
23275 operands[3] = force_reg (mode, operands[3]);
23277 if (! register_operand (operands[2], VOIDmode)
23278 && (mode == QImode
23279 || ! register_operand (operands[3], VOIDmode)))
23280 operands[2] = force_reg (mode, operands[2]);
23282 if (mode == QImode
23283 && ! register_operand (operands[3], VOIDmode))
23284 operands[3] = force_reg (mode, operands[3]);
23286 emit_insn (compare_seq);
23287 emit_insn (gen_rtx_SET (operands[0],
23288 gen_rtx_IF_THEN_ELSE (mode,
23289 compare_op, operands[2],
23290 operands[3])));
23291 return true;
23294 /* Swap, force into registers, or otherwise massage the two operands
23295 to an sse comparison with a mask result. Thus we differ a bit from
23296 ix86_prepare_fp_compare_args which expects to produce a flags result.
23298 The DEST operand exists to help determine whether to commute commutative
23299 operators. The POP0/POP1 operands are updated in place. The new
23300 comparison code is returned, or UNKNOWN if not implementable. */
23302 static enum rtx_code
23303 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23304 rtx *pop0, rtx *pop1)
23306 switch (code)
23308 case LTGT:
23309 case UNEQ:
23310 /* AVX supports all the needed comparisons. */
23311 if (TARGET_AVX)
23312 break;
23313 /* We have no LTGT as an operator. We could implement it with
23314 NE & ORDERED, but this requires an extra temporary. It's
23315 not clear that it's worth it. */
23316 return UNKNOWN;
23318 case LT:
23319 case LE:
23320 case UNGT:
23321 case UNGE:
23322 /* These are supported directly. */
23323 break;
23325 case EQ:
23326 case NE:
23327 case UNORDERED:
23328 case ORDERED:
23329 /* AVX has 3 operand comparisons, no need to swap anything. */
23330 if (TARGET_AVX)
23331 break;
23332 /* For commutative operators, try to canonicalize the destination
23333 operand to be first in the comparison - this helps reload to
23334 avoid extra moves. */
23335 if (!dest || !rtx_equal_p (dest, *pop1))
23336 break;
23337 /* FALLTHRU */
23339 case GE:
23340 case GT:
23341 case UNLE:
23342 case UNLT:
23343 /* These are not supported directly before AVX, and furthermore
23344 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23345 comparison operands to transform into something that is
23346 supported. */
23347 std::swap (*pop0, *pop1);
23348 code = swap_condition (code);
23349 break;
23351 default:
23352 gcc_unreachable ();
23355 return code;
23358 /* Detect conditional moves that exactly match min/max operational
23359 semantics. Note that this is IEEE safe, as long as we don't
23360 interchange the operands.
23362 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23363 and TRUE if the operation is successful and instructions are emitted. */
23365 static bool
23366 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23367 rtx cmp_op1, rtx if_true, rtx if_false)
23369 machine_mode mode;
23370 bool is_min;
23371 rtx tmp;
23373 if (code == LT)
23375 else if (code == UNGE)
23376 std::swap (if_true, if_false);
23377 else
23378 return false;
23380 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23381 is_min = true;
23382 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23383 is_min = false;
23384 else
23385 return false;
23387 mode = GET_MODE (dest);
23389 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23390 but MODE may be a vector mode and thus not appropriate. */
23391 if (!flag_finite_math_only || flag_signed_zeros)
23393 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23394 rtvec v;
23396 if_true = force_reg (mode, if_true);
23397 v = gen_rtvec (2, if_true, if_false);
23398 tmp = gen_rtx_UNSPEC (mode, v, u);
23400 else
23402 code = is_min ? SMIN : SMAX;
23403 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23406 emit_insn (gen_rtx_SET (dest, tmp));
23407 return true;
23410 /* Expand an sse vector comparison. Return the register with the result. */
23412 static rtx
23413 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23414 rtx op_true, rtx op_false)
23416 machine_mode mode = GET_MODE (dest);
23417 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23419 /* In the general case the result of a comparison can differ from the operands' type. */
23420 machine_mode cmp_mode;
23422 /* In AVX512F the result of comparison is an integer mask. */
23423 bool maskcmp = false;
23424 rtx x;
23426 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23428 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23429 cmp_mode = int_mode_for_size (nbits, 0).require ();
23430 maskcmp = true;
23432 else
23433 cmp_mode = cmp_ops_mode;
23436 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23437 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23438 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23440 if (optimize
23441 || (maskcmp && cmp_mode != mode)
23442 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23443 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23444 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23446 /* Compare patterns for integer modes are unspecs in AVX512F only. */
23447 if (maskcmp && (code == GT || code == EQ))
23449 rtx (*gen)(rtx, rtx, rtx);
23451 switch (cmp_ops_mode)
23453 case E_V64QImode:
23454 gcc_assert (TARGET_AVX512BW);
23455 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23456 break;
23457 case E_V32HImode:
23458 gcc_assert (TARGET_AVX512BW);
23459 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23460 break;
23461 case E_V16SImode:
23462 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23463 break;
23464 case E_V8DImode:
23465 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23466 break;
23467 default:
23468 gen = NULL;
23471 if (gen)
23473 emit_insn (gen (dest, cmp_op0, cmp_op1));
23474 return dest;
23477 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23479 if (cmp_mode != mode && !maskcmp)
23481 x = force_reg (cmp_ops_mode, x);
23482 convert_move (dest, x, false);
23484 else
23485 emit_insn (gen_rtx_SET (dest, x));
23487 return dest;
23490 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23491 operations. This is used for both scalar and vector conditional moves. */
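/* Schematically, the most generic fallback below computes the usual
   mask-blend identity
     dest = (cmp & op_true) | (~cmp & op_false);
   the earlier special cases are just simplifications of that
   expression.  */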
23493 void
23494 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23496 machine_mode mode = GET_MODE (dest);
23497 machine_mode cmpmode = GET_MODE (cmp);
23499 /* In AVX512F the result of comparison is an integer mask. */
23500 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23502 rtx t2, t3, x;
23504 /* If we have an integer mask and FP value then we need
23505 to cast mask to FP mode. */
23506 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23508 cmp = force_reg (cmpmode, cmp);
23509 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23512 if (vector_all_ones_operand (op_true, mode)
23513 && rtx_equal_p (op_false, CONST0_RTX (mode))
23514 && !maskcmp)
23516 emit_insn (gen_rtx_SET (dest, cmp));
23518 else if (op_false == CONST0_RTX (mode)
23519 && !maskcmp)
23521 op_true = force_reg (mode, op_true);
23522 x = gen_rtx_AND (mode, cmp, op_true);
23523 emit_insn (gen_rtx_SET (dest, x));
23525 else if (op_true == CONST0_RTX (mode)
23526 && !maskcmp)
23528 op_false = force_reg (mode, op_false);
23529 x = gen_rtx_NOT (mode, cmp);
23530 x = gen_rtx_AND (mode, x, op_false);
23531 emit_insn (gen_rtx_SET (dest, x));
23533 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23534 && !maskcmp)
23536 op_false = force_reg (mode, op_false);
23537 x = gen_rtx_IOR (mode, cmp, op_false);
23538 emit_insn (gen_rtx_SET (dest, x));
23540 else if (TARGET_XOP
23541 && !maskcmp)
23543 op_true = force_reg (mode, op_true);
23545 if (!nonimmediate_operand (op_false, mode))
23546 op_false = force_reg (mode, op_false);
23548 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23549 op_true,
23550 op_false)));
23552 else
23554 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23555 rtx d = dest;
23557 if (!nonimmediate_operand (op_true, mode))
23558 op_true = force_reg (mode, op_true);
23560 op_false = force_reg (mode, op_false);
23562 switch (mode)
23564 case E_V4SFmode:
23565 if (TARGET_SSE4_1)
23566 gen = gen_sse4_1_blendvps;
23567 break;
23568 case E_V2DFmode:
23569 if (TARGET_SSE4_1)
23570 gen = gen_sse4_1_blendvpd;
23571 break;
23572 case E_V16QImode:
23573 case E_V8HImode:
23574 case E_V4SImode:
23575 case E_V2DImode:
23576 if (TARGET_SSE4_1)
23578 gen = gen_sse4_1_pblendvb;
23579 if (mode != V16QImode)
23580 d = gen_reg_rtx (V16QImode);
23581 op_false = gen_lowpart (V16QImode, op_false);
23582 op_true = gen_lowpart (V16QImode, op_true);
23583 cmp = gen_lowpart (V16QImode, cmp);
23585 break;
23586 case E_V8SFmode:
23587 if (TARGET_AVX)
23588 gen = gen_avx_blendvps256;
23589 break;
23590 case E_V4DFmode:
23591 if (TARGET_AVX)
23592 gen = gen_avx_blendvpd256;
23593 break;
23594 case E_V32QImode:
23595 case E_V16HImode:
23596 case E_V8SImode:
23597 case E_V4DImode:
23598 if (TARGET_AVX2)
23600 gen = gen_avx2_pblendvb;
23601 if (mode != V32QImode)
23602 d = gen_reg_rtx (V32QImode);
23603 op_false = gen_lowpart (V32QImode, op_false);
23604 op_true = gen_lowpart (V32QImode, op_true);
23605 cmp = gen_lowpart (V32QImode, cmp);
23607 break;
23609 case E_V64QImode:
23610 gen = gen_avx512bw_blendmv64qi;
23611 break;
23612 case E_V32HImode:
23613 gen = gen_avx512bw_blendmv32hi;
23614 break;
23615 case E_V16SImode:
23616 gen = gen_avx512f_blendmv16si;
23617 break;
23618 case E_V8DImode:
23619 gen = gen_avx512f_blendmv8di;
23620 break;
23621 case E_V8DFmode:
23622 gen = gen_avx512f_blendmv8df;
23623 break;
23624 case E_V16SFmode:
23625 gen = gen_avx512f_blendmv16sf;
23626 break;
23628 default:
23629 break;
23632 if (gen != NULL)
23634 emit_insn (gen (d, op_false, op_true, cmp));
23635 if (d != dest)
23636 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23638 else
23640 op_true = force_reg (mode, op_true);
23642 t2 = gen_reg_rtx (mode);
23643 if (optimize)
23644 t3 = gen_reg_rtx (mode);
23645 else
23646 t3 = dest;
23648 x = gen_rtx_AND (mode, op_true, cmp);
23649 emit_insn (gen_rtx_SET (t2, x));
23651 x = gen_rtx_NOT (mode, cmp);
23652 x = gen_rtx_AND (mode, x, op_false);
23653 emit_insn (gen_rtx_SET (t3, x));
23655 x = gen_rtx_IOR (mode, t3, t2);
23656 emit_insn (gen_rtx_SET (dest, x));
23661 /* Expand a floating-point conditional move. Return true if successful. */
23663 bool
23664 ix86_expand_fp_movcc (rtx operands[])
23666 machine_mode mode = GET_MODE (operands[0]);
23667 enum rtx_code code = GET_CODE (operands[1]);
23668 rtx tmp, compare_op;
23669 rtx op0 = XEXP (operands[1], 0);
23670 rtx op1 = XEXP (operands[1], 1);
23672 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23674 machine_mode cmode;
23676 /* Since we've no cmove for sse registers, don't force bad register
23677 allocation just to gain access to it. Deny movcc when the
23678 comparison mode doesn't match the move mode. */
23679 cmode = GET_MODE (op0);
23680 if (cmode == VOIDmode)
23681 cmode = GET_MODE (op1);
23682 if (cmode != mode)
23683 return false;
23685 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23686 if (code == UNKNOWN)
23687 return false;
23689 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23690 operands[2], operands[3]))
23691 return true;
23693 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23694 operands[2], operands[3]);
23695 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23696 return true;
23699 if (GET_MODE (op0) == TImode
23700 || (GET_MODE (op0) == DImode
23701 && !TARGET_64BIT))
23702 return false;
23704 /* The floating point conditional move instructions don't directly
23705 support conditions resulting from a signed integer comparison. */
23707 compare_op = ix86_expand_compare (code, op0, op1);
23708 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23710 tmp = gen_reg_rtx (QImode);
23711 ix86_expand_setcc (tmp, code, op0, op1);
23713 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23716 emit_insn (gen_rtx_SET (operands[0],
23717 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23718 operands[2], operands[3])));
23720 return true;
23723 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23725 static int
23726 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23728 switch (code)
23730 case EQ:
23731 return 0;
23732 case LT:
23733 case LTU:
23734 return 1;
23735 case LE:
23736 case LEU:
23737 return 2;
23738 case NE:
23739 return 4;
23740 case GE:
23741 case GEU:
23742 return 5;
23743 case GT:
23744 case GTU:
23745 return 6;
23746 default:
23747 gcc_unreachable ();
23751 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23753 static int
23754 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23756 switch (code)
23758 case EQ:
23759 return 0x00;
23760 case NE:
23761 return 0x04;
23762 case GT:
23763 return 0x0e;
23764 case LE:
23765 return 0x02;
23766 case GE:
23767 return 0x0d;
23768 case LT:
23769 return 0x01;
23770 case UNLE:
23771 return 0x0a;
23772 case UNLT:
23773 return 0x09;
23774 case UNGE:
23775 return 0x05;
23776 case UNGT:
23777 return 0x06;
23778 case UNEQ:
23779 return 0x18;
23780 case LTGT:
23781 return 0x0c;
23782 case ORDERED:
23783 return 0x07;
23784 case UNORDERED:
23785 return 0x03;
23786 default:
23787 gcc_unreachable ();
23791 /* Return immediate value to be used in UNSPEC_PCMP
23792 for comparison CODE in MODE. */
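/* For example, (GT, V16SFmode) yields 0x0e (the ordered greater-than
   predicate) and (LTU, V16SImode) yields 1, matching the tables
   above; the caller then emits the masked compare with that
   immediate.  */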
23794 static int
23795 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23797 if (FLOAT_MODE_P (mode))
23798 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23799 return ix86_int_cmp_code_to_pcmp_immediate (code);
23802 /* Expand AVX-512 vector comparison. */
23804 bool
23805 ix86_expand_mask_vec_cmp (rtx operands[])
23807 machine_mode mask_mode = GET_MODE (operands[0]);
23808 machine_mode cmp_mode = GET_MODE (operands[2]);
23809 enum rtx_code code = GET_CODE (operands[1]);
23810 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23811 int unspec_code;
23812 rtx unspec;
23814 switch (code)
23816 case LEU:
23817 case GTU:
23818 case GEU:
23819 case LTU:
23820 unspec_code = UNSPEC_UNSIGNED_PCMP;
23821 break;
23823 default:
23824 unspec_code = UNSPEC_PCMP;
23827 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23828 operands[3], imm),
23829 unspec_code);
23830 emit_insn (gen_rtx_SET (operands[0], unspec));
23832 return true;
23835 /* Expand fp vector comparison. */
23837 bool
23838 ix86_expand_fp_vec_cmp (rtx operands[])
23840 enum rtx_code code = GET_CODE (operands[1]);
23841 rtx cmp;
23843 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23844 &operands[2], &operands[3]);
23845 if (code == UNKNOWN)
23847 rtx temp;
23848 switch (GET_CODE (operands[1]))
23850 case LTGT:
23851 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23852 operands[3], NULL, NULL);
23853 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23854 operands[3], NULL, NULL);
23855 code = AND;
23856 break;
23857 case UNEQ:
23858 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23859 operands[3], NULL, NULL);
23860 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23861 operands[3], NULL, NULL);
23862 code = IOR;
23863 break;
23864 default:
23865 gcc_unreachable ();
23867 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23868 OPTAB_DIRECT);
23870 else
23871 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23872 operands[1], operands[2]);
23874 if (operands[0] != cmp)
23875 emit_move_insn (operands[0], cmp);
23877 return true;
23880 static rtx
23881 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23882 rtx op_true, rtx op_false, bool *negate)
23884 machine_mode data_mode = GET_MODE (dest);
23885 machine_mode mode = GET_MODE (cop0);
23886 rtx x;
23888 *negate = false;
23890 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23891 if (TARGET_XOP
23892 && (mode == V16QImode || mode == V8HImode
23893 || mode == V4SImode || mode == V2DImode))
23895 else
23897 /* Canonicalize the comparison to EQ, GT, GTU. */
23898 switch (code)
23900 case EQ:
23901 case GT:
23902 case GTU:
23903 break;
23905 case NE:
23906 case LE:
23907 case LEU:
23908 code = reverse_condition (code);
23909 *negate = true;
23910 break;
23912 case GE:
23913 case GEU:
23914 code = reverse_condition (code);
23915 *negate = true;
23916 /* FALLTHRU */
23918 case LT:
23919 case LTU:
23920 std::swap (cop0, cop1);
23921 code = swap_condition (code);
23922 break;
23924 default:
23925 gcc_unreachable ();
23928 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23929 if (mode == V2DImode)
23931 switch (code)
23933 case EQ:
23934 /* SSE4.1 supports EQ. */
23935 if (!TARGET_SSE4_1)
23936 return NULL;
23937 break;
23939 case GT:
23940 case GTU:
23941 /* SSE4.2 supports GT/GTU. */
23942 if (!TARGET_SSE4_2)
23943 return NULL;
23944 break;
23946 default:
23947 gcc_unreachable ();
23951 /* Unsigned parallel compare is not supported by the hardware.
23952 Play some tricks to turn this into a signed comparison
23953 against 0. */
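/* A sketch of the trick used for the 32/64-bit element modes below:
   biasing both operands by the sign bit turns an unsigned compare
   into a signed one, i.e.
     a >u b  <==>  (a - 0x80000000) >s (b - 0x80000000)
   for 32-bit elements; e.g. 0xffffffff >u 1 becomes
   0x7fffffff >s 0x80000001, which is true as signed values.  */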
23954 if (code == GTU)
23956 cop0 = force_reg (mode, cop0);
23958 switch (mode)
23960 case E_V16SImode:
23961 case E_V8DImode:
23962 case E_V8SImode:
23963 case E_V4DImode:
23964 case E_V4SImode:
23965 case E_V2DImode:
23967 rtx t1, t2, mask;
23968 rtx (*gen_sub3) (rtx, rtx, rtx);
23970 switch (mode)
23972 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23973 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23974 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23975 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23976 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23977 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23978 default:
23979 gcc_unreachable ();
23981 /* Subtract (-(INT MAX) - 1) from both operands to make
23982 them signed. */
23983 mask = ix86_build_signbit_mask (mode, true, false);
23984 t1 = gen_reg_rtx (mode);
23985 emit_insn (gen_sub3 (t1, cop0, mask));
23987 t2 = gen_reg_rtx (mode);
23988 emit_insn (gen_sub3 (t2, cop1, mask));
23990 cop0 = t1;
23991 cop1 = t2;
23992 code = GT;
23994 break;
23996 case E_V64QImode:
23997 case E_V32HImode:
23998 case E_V32QImode:
23999 case E_V16HImode:
24000 case E_V16QImode:
24001 case E_V8HImode:
24002 /* Perform a parallel unsigned saturating subtraction. */
24003 x = gen_reg_rtx (mode);
24004 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24005 cop1)));
24007 cop0 = x;
24008 cop1 = CONST0_RTX (mode);
24009 code = EQ;
24010 *negate = !*negate;
24011 break;
24013 default:
24014 gcc_unreachable ();
24019 if (*negate)
24020 std::swap (op_true, op_false);
24022 /* Allow the comparison to be done in one mode, but the movcc to
24023 happen in another mode. */
24024 if (data_mode == mode)
24026 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24027 op_true, op_false);
24029 else
24031 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24032 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24033 op_true, op_false);
24034 if (GET_MODE (x) == mode)
24035 x = gen_lowpart (data_mode, x);
24038 return x;
24041 /* Expand integer vector comparison. */
24043 bool
24044 ix86_expand_int_vec_cmp (rtx operands[])
24046 rtx_code code = GET_CODE (operands[1]);
24047 bool negate = false;
24048 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24049 operands[3], NULL, NULL, &negate);
24051 if (!cmp)
24052 return false;
24054 if (negate)
24055 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24056 CONST0_RTX (GET_MODE (cmp)),
24057 NULL, NULL, &negate);
24059 gcc_assert (!negate);
24061 if (operands[0] != cmp)
24062 emit_move_insn (operands[0], cmp);
24064 return true;
24067 /* Expand a floating-point vector conditional move; a vcond operation
24068 rather than a movcc operation. */
24070 bool
24071 ix86_expand_fp_vcond (rtx operands[])
24073 enum rtx_code code = GET_CODE (operands[3]);
24074 rtx cmp;
24076 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24077 &operands[4], &operands[5]);
24078 if (code == UNKNOWN)
24080 rtx temp;
24081 switch (GET_CODE (operands[3]))
24083 case LTGT:
24084 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24085 operands[5], operands[0], operands[0]);
24086 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24087 operands[5], operands[1], operands[2]);
24088 code = AND;
24089 break;
24090 case UNEQ:
24091 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24092 operands[5], operands[0], operands[0]);
24093 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24094 operands[5], operands[1], operands[2]);
24095 code = IOR;
24096 break;
24097 default:
24098 gcc_unreachable ();
24100 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24101 OPTAB_DIRECT);
24102 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24103 return true;
24106 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24107 operands[5], operands[1], operands[2]))
24108 return true;
24110 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24111 operands[1], operands[2]);
24112 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24113 return true;
24116 /* Expand a signed/unsigned integral vector conditional move. */
24118 bool
24119 ix86_expand_int_vcond (rtx operands[])
24121 machine_mode data_mode = GET_MODE (operands[0]);
24122 machine_mode mode = GET_MODE (operands[4]);
24123 enum rtx_code code = GET_CODE (operands[3]);
24124 bool negate = false;
24125 rtx x, cop0, cop1;
24127 cop0 = operands[4];
24128 cop1 = operands[5];
24130 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24131 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
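/* E.g. for V4SImode (schematically): an arithmetic shift right by 31
   replicates the sign bit, so "x < 0 ? -1 : 0" is just x >>s 31,
   while a logical shift gives "x < 0 ? 1 : 0" as x >>u 31.  */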
24132 if ((code == LT || code == GE)
24133 && data_mode == mode
24134 && cop1 == CONST0_RTX (mode)
24135 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24136 && GET_MODE_UNIT_SIZE (data_mode) > 1
24137 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24138 && (GET_MODE_SIZE (data_mode) == 16
24139 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24141 rtx negop = operands[2 - (code == LT)];
24142 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24143 if (negop == CONST1_RTX (data_mode))
24145 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24146 operands[0], 1, OPTAB_DIRECT);
24147 if (res != operands[0])
24148 emit_move_insn (operands[0], res);
24149 return true;
24151 else if (GET_MODE_INNER (data_mode) != DImode
24152 && vector_all_ones_operand (negop, data_mode))
24154 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24155 operands[0], 0, OPTAB_DIRECT);
24156 if (res != operands[0])
24157 emit_move_insn (operands[0], res);
24158 return true;
24162 if (!nonimmediate_operand (cop1, mode))
24163 cop1 = force_reg (mode, cop1);
24164 if (!general_operand (operands[1], data_mode))
24165 operands[1] = force_reg (data_mode, operands[1]);
24166 if (!general_operand (operands[2], data_mode))
24167 operands[2] = force_reg (data_mode, operands[2]);
24169 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24170 operands[1], operands[2], &negate);
24172 if (!x)
24173 return false;
24175 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24176 operands[2-negate]);
24177 return true;
24180 /* AVX512F does support 64-byte integer vector operations,
24181 thus the longest vector we are faced with is V64QImode. */
24182 #define MAX_VECT_LEN 64
24184 struct expand_vec_perm_d
24186 rtx target, op0, op1;
24187 unsigned char perm[MAX_VECT_LEN];
24188 machine_mode vmode;
24189 unsigned char nelt;
24190 bool one_operand_p;
24191 bool testing_p;
24194 static bool
24195 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24196 struct expand_vec_perm_d *d)
24198 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24199 expander, so args are either in d, or in op0, op1 etc. */
24200 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24201 machine_mode maskmode = mode;
24202 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24204 switch (mode)
24206 case E_V8HImode:
24207 if (TARGET_AVX512VL && TARGET_AVX512BW)
24208 gen = gen_avx512vl_vpermt2varv8hi3;
24209 break;
24210 case E_V16HImode:
24211 if (TARGET_AVX512VL && TARGET_AVX512BW)
24212 gen = gen_avx512vl_vpermt2varv16hi3;
24213 break;
24214 case E_V64QImode:
24215 if (TARGET_AVX512VBMI)
24216 gen = gen_avx512bw_vpermt2varv64qi3;
24217 break;
24218 case E_V32HImode:
24219 if (TARGET_AVX512BW)
24220 gen = gen_avx512bw_vpermt2varv32hi3;
24221 break;
24222 case E_V4SImode:
24223 if (TARGET_AVX512VL)
24224 gen = gen_avx512vl_vpermt2varv4si3;
24225 break;
24226 case E_V8SImode:
24227 if (TARGET_AVX512VL)
24228 gen = gen_avx512vl_vpermt2varv8si3;
24229 break;
24230 case E_V16SImode:
24231 if (TARGET_AVX512F)
24232 gen = gen_avx512f_vpermt2varv16si3;
24233 break;
24234 case E_V4SFmode:
24235 if (TARGET_AVX512VL)
24237 gen = gen_avx512vl_vpermt2varv4sf3;
24238 maskmode = V4SImode;
24240 break;
24241 case E_V8SFmode:
24242 if (TARGET_AVX512VL)
24244 gen = gen_avx512vl_vpermt2varv8sf3;
24245 maskmode = V8SImode;
24247 break;
24248 case E_V16SFmode:
24249 if (TARGET_AVX512F)
24251 gen = gen_avx512f_vpermt2varv16sf3;
24252 maskmode = V16SImode;
24254 break;
24255 case E_V2DImode:
24256 if (TARGET_AVX512VL)
24257 gen = gen_avx512vl_vpermt2varv2di3;
24258 break;
24259 case E_V4DImode:
24260 if (TARGET_AVX512VL)
24261 gen = gen_avx512vl_vpermt2varv4di3;
24262 break;
24263 case E_V8DImode:
24264 if (TARGET_AVX512F)
24265 gen = gen_avx512f_vpermt2varv8di3;
24266 break;
24267 case E_V2DFmode:
24268 if (TARGET_AVX512VL)
24270 gen = gen_avx512vl_vpermt2varv2df3;
24271 maskmode = V2DImode;
24273 break;
24274 case E_V4DFmode:
24275 if (TARGET_AVX512VL)
24277 gen = gen_avx512vl_vpermt2varv4df3;
24278 maskmode = V4DImode;
24280 break;
24281 case E_V8DFmode:
24282 if (TARGET_AVX512F)
24284 gen = gen_avx512f_vpermt2varv8df3;
24285 maskmode = V8DImode;
24287 break;
24288 default:
24289 break;
24292 if (gen == NULL)
24293 return false;
24295 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24296 expander, so args are either in d, or in op0, op1 etc. */
24297 if (d)
24299 rtx vec[64];
24300 target = d->target;
24301 op0 = d->op0;
24302 op1 = d->op1;
24303 for (int i = 0; i < d->nelt; ++i)
24304 vec[i] = GEN_INT (d->perm[i]);
24305 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24308 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24309 return true;
24312 /* Expand a variable vector permutation. */
24314 void
24315 ix86_expand_vec_perm (rtx operands[])
24317 rtx target = operands[0];
24318 rtx op0 = operands[1];
24319 rtx op1 = operands[2];
24320 rtx mask = operands[3];
24321 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24322 machine_mode mode = GET_MODE (op0);
24323 machine_mode maskmode = GET_MODE (mask);
24324 int w, e, i;
24325 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24327 /* Number of elements in the vector. */
24328 w = GET_MODE_NUNITS (mode);
24329 e = GET_MODE_UNIT_SIZE (mode);
24330 gcc_assert (w <= 64);
24332 if (TARGET_AVX512F && one_operand_shuffle)
24334 rtx (*gen) (rtx, rtx, rtx) = NULL;
24335 switch (mode)
24337 case E_V16SImode:
24338 gen = gen_avx512f_permvarv16si;
24339 break;
24340 case E_V16SFmode:
24341 gen = gen_avx512f_permvarv16sf;
24342 break;
24343 case E_V8DImode:
24344 gen = gen_avx512f_permvarv8di;
24345 break;
24346 case E_V8DFmode:
24347 gen = gen_avx512f_permvarv8df;
24348 break;
24349 default:
24350 break;
24352 if (gen != NULL)
24354 emit_insn (gen (target, op0, mask));
24355 return;
24359 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24360 return;
24362 if (TARGET_AVX2)
24364 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24366 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24367 a constant shuffle operand. With a tiny bit of effort we can
24368 use VPERMD instead. A re-interpretation stall for V4DFmode is
24369 unfortunate but there's no avoiding it.
24370 Similarly for V16HImode we don't have instructions for variable
24371 shuffling, while for V32QImode we can use vpshufb; vpshufb;
24372 vpermq; vpor after preparing suitable masks. */
24374 if (mode == V16HImode)
24376 maskmode = mode = V32QImode;
24377 w = 32;
24378 e = 1;
24380 else
24382 maskmode = mode = V8SImode;
24383 w = 8;
24384 e = 4;
24386 t1 = gen_reg_rtx (maskmode);
24388 /* Replicate the low bits of the V4DImode mask into V8SImode:
24389 mask = { A B C D }
24390 t1 = { A A B B C C D D }. */
24391 for (i = 0; i < w / 2; ++i)
24392 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24393 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24394 vt = force_reg (maskmode, vt);
24395 mask = gen_lowpart (maskmode, mask);
24396 if (maskmode == V8SImode)
24397 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24398 else
24399 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24401 /* Multiply the shuffle indices by two. */
24402 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24403 OPTAB_DIRECT);
24405 /* Add one to the odd shuffle indices:
24406 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24407 for (i = 0; i < w / 2; ++i)
24409 vec[i * 2] = const0_rtx;
24410 vec[i * 2 + 1] = const1_rtx;
24412 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24413 vt = validize_mem (force_const_mem (maskmode, vt));
24414 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24415 OPTAB_DIRECT);
24417 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24418 operands[3] = mask = t1;
24419 target = gen_reg_rtx (mode);
24420 op0 = gen_lowpart (mode, op0);
24421 op1 = gen_lowpart (mode, op1);
24424 switch (mode)
24426 case E_V8SImode:
24427 /* The VPERMD and VPERMPS instructions already properly ignore
24428 the high bits of the shuffle elements. No need for us to
24429 perform an AND ourselves. */
24430 if (one_operand_shuffle)
24432 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24433 if (target != operands[0])
24434 emit_move_insn (operands[0],
24435 gen_lowpart (GET_MODE (operands[0]), target));
24437 else
24439 t1 = gen_reg_rtx (V8SImode);
24440 t2 = gen_reg_rtx (V8SImode);
24441 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24442 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24443 goto merge_two;
24445 return;
24447 case E_V8SFmode:
24448 mask = gen_lowpart (V8SImode, mask);
24449 if (one_operand_shuffle)
24450 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24451 else
24453 t1 = gen_reg_rtx (V8SFmode);
24454 t2 = gen_reg_rtx (V8SFmode);
24455 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24456 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24457 goto merge_two;
24459 return;
24461 case E_V4SImode:
24462 /* By combining the two 128-bit input vectors into one 256-bit
24463 input vector, we can use VPERMD and VPERMPS for the full
24464 two-operand shuffle. */
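/* A sketch of the steps below: the two V4SImode inputs are
   concatenated into one V8SImode vector, the V4SImode mask is
   duplicated into both halves, a single VPERMD selects from all
   eight elements, and the low 128 bits are extracted as the
   result.  */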
24465 t1 = gen_reg_rtx (V8SImode);
24466 t2 = gen_reg_rtx (V8SImode);
24467 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24468 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24469 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24470 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24471 return;
24473 case E_V4SFmode:
24474 t1 = gen_reg_rtx (V8SFmode);
24475 t2 = gen_reg_rtx (V8SImode);
24476 mask = gen_lowpart (V4SImode, mask);
24477 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24478 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24479 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24480 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24481 return;
24483 case E_V32QImode:
24484 t1 = gen_reg_rtx (V32QImode);
24485 t2 = gen_reg_rtx (V32QImode);
24486 t3 = gen_reg_rtx (V32QImode);
24487 vt2 = GEN_INT (-128);
24488 vt = gen_const_vec_duplicate (V32QImode, vt2);
24489 vt = force_reg (V32QImode, vt);
24490 for (i = 0; i < 32; i++)
24491 vec[i] = i < 16 ? vt2 : const0_rtx;
24492 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24493 vt2 = force_reg (V32QImode, vt2);
24494 /* From mask create two adjusted masks, which contain the same
24495 bits as mask in the low 7 bits of each vector element.
24496 The first mask will have the most significant bit clear
24497 if it requests element from the same 128-bit lane
24498 and MSB set if it requests element from the other 128-bit lane.
24499 The second mask will have the opposite values of the MSB,
24500 and additionally will have its 128-bit lanes swapped.
24501 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24502 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24503 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24504 stands for the other 12 bytes. */
24505 /* The bit whether element is from the same lane or the other
24506 lane is bit 4, so shift it up by 3 to the MSB position. */
24507 t5 = gen_reg_rtx (V4DImode);
24508 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24509 GEN_INT (3)));
24510 /* Clear MSB bits from the mask just in case it had them set. */
24511 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24512 /* After this t1 will have MSB set for elements from other lane. */
24513 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24514 /* Clear bits other than MSB. */
24515 emit_insn (gen_andv32qi3 (t1, t1, vt));
24516 /* Or in the lower bits from mask into t3. */
24517 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24518 /* And invert MSB bits in t1, so MSB is set for elements from the same
24519 lane. */
24520 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24521 /* Swap 128-bit lanes in t3. */
24522 t6 = gen_reg_rtx (V4DImode);
24523 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24524 const2_rtx, GEN_INT (3),
24525 const0_rtx, const1_rtx));
24526 /* And or in the lower bits from mask into t1. */
24527 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24528 if (one_operand_shuffle)
24530 /* Each of these shuffles will put 0s in places where
24531 element from the other 128-bit lane is needed, otherwise
24532 will shuffle in the requested value. */
24533 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24534 gen_lowpart (V32QImode, t6)));
24535 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24536 /* For t3 the 128-bit lanes are swapped again. */
24537 t7 = gen_reg_rtx (V4DImode);
24538 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24539 const2_rtx, GEN_INT (3),
24540 const0_rtx, const1_rtx));
24541 /* And oring both together leads to the result. */
24542 emit_insn (gen_iorv32qi3 (target, t1,
24543 gen_lowpart (V32QImode, t7)));
24544 if (target != operands[0])
24545 emit_move_insn (operands[0],
24546 gen_lowpart (GET_MODE (operands[0]), target));
24547 return;
24550 t4 = gen_reg_rtx (V32QImode);
24551 /* Similar to the one_operand_shuffle code above, just
24552 repeated twice, once for each operand. The merge_two:
24553 code below will merge the two results together. */
24554 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24555 gen_lowpart (V32QImode, t6)));
24556 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24557 gen_lowpart (V32QImode, t6)));
24558 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24559 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24560 t7 = gen_reg_rtx (V4DImode);
24561 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24562 const2_rtx, GEN_INT (3),
24563 const0_rtx, const1_rtx));
24564 t8 = gen_reg_rtx (V4DImode);
24565 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24566 const2_rtx, GEN_INT (3),
24567 const0_rtx, const1_rtx));
24568 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24569 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24570 t1 = t4;
24571 t2 = t3;
24572 goto merge_two;
24574 default:
24575 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24576 break;
24580 if (TARGET_XOP)
24582 /* The XOP VPPERM insn supports three inputs. By ignoring the
24583 one_operand_shuffle special case, we avoid creating another
24584 set of constant vectors in memory. */
24585 one_operand_shuffle = false;
24587 /* mask = mask & {2*w-1, ...} */
24588 vt = GEN_INT (2*w - 1);
24590 else
24592 /* mask = mask & {w-1, ...} */
24593 vt = GEN_INT (w - 1);
24596 vt = gen_const_vec_duplicate (maskmode, vt);
24597 mask = expand_simple_binop (maskmode, AND, mask, vt,
24598 NULL_RTX, 0, OPTAB_DIRECT);
24600 /* For non-QImode operations, convert the word permutation control
24601 into a byte permutation control. */
24602 if (mode != V16QImode)
24604 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24605 GEN_INT (exact_log2 (e)),
24606 NULL_RTX, 0, OPTAB_DIRECT);
24608 /* Convert mask to vector of chars. */
24609 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24611 /* Replicate each of the input bytes into byte positions:
24612 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24613 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24614 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24615 for (i = 0; i < 16; ++i)
24616 vec[i] = GEN_INT (i/e * e);
24617 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24618 vt = validize_mem (force_const_mem (V16QImode, vt));
24619 if (TARGET_XOP)
24620 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24621 else
24622 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24624 /* Convert it into the byte positions by doing
24625 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24626 for (i = 0; i < 16; ++i)
24627 vec[i] = GEN_INT (i % e);
24628 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24629 vt = validize_mem (force_const_mem (V16QImode, vt));
24630 emit_insn (gen_addv16qi3 (mask, mask, vt));
24633 /* The actual shuffle operations all operate on V16QImode. */
24634 op0 = gen_lowpart (V16QImode, op0);
24635 op1 = gen_lowpart (V16QImode, op1);
24637 if (TARGET_XOP)
24639 if (GET_MODE (target) != V16QImode)
24640 target = gen_reg_rtx (V16QImode);
24641 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24642 if (target != operands[0])
24643 emit_move_insn (operands[0],
24644 gen_lowpart (GET_MODE (operands[0]), target));
24646 else if (one_operand_shuffle)
24648 if (GET_MODE (target) != V16QImode)
24649 target = gen_reg_rtx (V16QImode);
24650 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24651 if (target != operands[0])
24652 emit_move_insn (operands[0],
24653 gen_lowpart (GET_MODE (operands[0]), target));
24655 else
24657 rtx xops[6];
24658 bool ok;
24660 /* Shuffle the two input vectors independently. */
24661 t1 = gen_reg_rtx (V16QImode);
24662 t2 = gen_reg_rtx (V16QImode);
24663 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24664 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24666 merge_two:
24667 /* Then merge them together. The key is whether any given control
24668 element contained a bit set that indicates the second word. */
24669 mask = operands[3];
24670 vt = GEN_INT (w);
24671 if (maskmode == V2DImode && !TARGET_SSE4_1)
24673 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24674 more shuffle to convert the V2DI input mask into a V4SI
24675 input mask, at which point the masking that expand_int_vcond
24676 performs will work as desired. */
24677 rtx t3 = gen_reg_rtx (V4SImode);
24678 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24679 const0_rtx, const0_rtx,
24680 const2_rtx, const2_rtx));
24681 mask = t3;
24682 maskmode = V4SImode;
24683 e = w = 4;
24686 vt = gen_const_vec_duplicate (maskmode, vt);
24687 vt = force_reg (maskmode, vt);
24688 mask = expand_simple_binop (maskmode, AND, mask, vt,
24689 NULL_RTX, 0, OPTAB_DIRECT);
24691 if (GET_MODE (target) != mode)
24692 target = gen_reg_rtx (mode);
24693 xops[0] = target;
24694 xops[1] = gen_lowpart (mode, t2);
24695 xops[2] = gen_lowpart (mode, t1);
24696 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24697 xops[4] = mask;
24698 xops[5] = vt;
24699 ok = ix86_expand_int_vcond (xops);
24700 gcc_assert (ok);
24701 if (target != operands[0])
24702 emit_move_insn (operands[0],
24703 gen_lowpart (GET_MODE (operands[0]), target));
24707 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
24708 true if we should do zero extension, else sign extension. HIGH_P is
24709 true if we want the N/2 high elements, else the low elements. */
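/* For example (not the literal rtl): unpacking a V8HImode SRC with
   UNSIGNED_P and !HIGH_P produces a V4SImode DEST holding the
   zero-extended elements 0..3 of SRC; with HIGH_P it holds the
   zero-extended elements 4..7 instead.  */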
24711 void
24712 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24714 machine_mode imode = GET_MODE (src);
24715 rtx tmp;
24717 if (TARGET_SSE4_1)
24719 rtx (*unpack)(rtx, rtx);
24720 rtx (*extract)(rtx, rtx) = NULL;
24721 machine_mode halfmode = BLKmode;
24723 switch (imode)
24725 case E_V64QImode:
24726 if (unsigned_p)
24727 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24728 else
24729 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24730 halfmode = V32QImode;
24731 extract
24732 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24733 break;
24734 case E_V32QImode:
24735 if (unsigned_p)
24736 unpack = gen_avx2_zero_extendv16qiv16hi2;
24737 else
24738 unpack = gen_avx2_sign_extendv16qiv16hi2;
24739 halfmode = V16QImode;
24740 extract
24741 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24742 break;
24743 case E_V32HImode:
24744 if (unsigned_p)
24745 unpack = gen_avx512f_zero_extendv16hiv16si2;
24746 else
24747 unpack = gen_avx512f_sign_extendv16hiv16si2;
24748 halfmode = V16HImode;
24749 extract
24750 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24751 break;
24752 case E_V16HImode:
24753 if (unsigned_p)
24754 unpack = gen_avx2_zero_extendv8hiv8si2;
24755 else
24756 unpack = gen_avx2_sign_extendv8hiv8si2;
24757 halfmode = V8HImode;
24758 extract
24759 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24760 break;
24761 case E_V16SImode:
24762 if (unsigned_p)
24763 unpack = gen_avx512f_zero_extendv8siv8di2;
24764 else
24765 unpack = gen_avx512f_sign_extendv8siv8di2;
24766 halfmode = V8SImode;
24767 extract
24768 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24769 break;
24770 case E_V8SImode:
24771 if (unsigned_p)
24772 unpack = gen_avx2_zero_extendv4siv4di2;
24773 else
24774 unpack = gen_avx2_sign_extendv4siv4di2;
24775 halfmode = V4SImode;
24776 extract
24777 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24778 break;
24779 case E_V16QImode:
24780 if (unsigned_p)
24781 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24782 else
24783 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24784 break;
24785 case E_V8HImode:
24786 if (unsigned_p)
24787 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24788 else
24789 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24790 break;
24791 case E_V4SImode:
24792 if (unsigned_p)
24793 unpack = gen_sse4_1_zero_extendv2siv2di2;
24794 else
24795 unpack = gen_sse4_1_sign_extendv2siv2di2;
24796 break;
24797 default:
24798 gcc_unreachable ();
24801 if (GET_MODE_SIZE (imode) >= 32)
24803 tmp = gen_reg_rtx (halfmode);
24804 emit_insn (extract (tmp, src));
24806 else if (high_p)
24808 /* Shift higher 8 bytes to lower 8 bytes. */
24809 tmp = gen_reg_rtx (V1TImode);
24810 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24811 GEN_INT (64)));
24812 tmp = gen_lowpart (imode, tmp);
24814 else
24815 tmp = src;
24817 emit_insn (unpack (dest, tmp));
24819 else
24821 rtx (*unpack)(rtx, rtx, rtx);
24823 switch (imode)
24825 case E_V16QImode:
24826 if (high_p)
24827 unpack = gen_vec_interleave_highv16qi;
24828 else
24829 unpack = gen_vec_interleave_lowv16qi;
24830 break;
24831 case E_V8HImode:
24832 if (high_p)
24833 unpack = gen_vec_interleave_highv8hi;
24834 else
24835 unpack = gen_vec_interleave_lowv8hi;
24836 break;
24837 case E_V4SImode:
24838 if (high_p)
24839 unpack = gen_vec_interleave_highv4si;
24840 else
24841 unpack = gen_vec_interleave_lowv4si;
24842 break;
24843 default:
24844 gcc_unreachable ();
24847 if (unsigned_p)
24848 tmp = force_reg (imode, CONST0_RTX (imode));
24849 else
24850 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24851 src, pc_rtx, pc_rtx);
24853 rtx tmp2 = gen_reg_rtx (imode);
24854 emit_insn (unpack (tmp2, src, tmp));
24855 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24859 /* Expand conditional increment or decrement using adc/sbb instructions.
24860 The default case using setcc followed by the conditional move can be
24861 done by generic code. */
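/* A sketch of the idea (the actual insn patterns are chosen below):
   for "x = (a < b) ? x + 1 : x" with an unsigned comparison, the
   compare leaves the carry flag set exactly when a < b, so the whole
   statement becomes a cmp followed by "adc $0, x" (x += carry); the
   sbb forms handle the decrement and inverted cases.  */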
24862 bool
24863 ix86_expand_int_addcc (rtx operands[])
24865 enum rtx_code code = GET_CODE (operands[1]);
24866 rtx flags;
24867 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24868 rtx compare_op;
24869 rtx val = const0_rtx;
24870 bool fpcmp = false;
24871 machine_mode mode;
24872 rtx op0 = XEXP (operands[1], 0);
24873 rtx op1 = XEXP (operands[1], 1);
24875 if (operands[3] != const1_rtx
24876 && operands[3] != constm1_rtx)
24877 return false;
24878 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24879 return false;
24880 code = GET_CODE (compare_op);
24882 flags = XEXP (compare_op, 0);
24884 if (GET_MODE (flags) == CCFPmode)
24886 fpcmp = true;
24887 code = ix86_fp_compare_code_to_integer (code);
24890 if (code != LTU)
24892 val = constm1_rtx;
24893 if (fpcmp)
24894 PUT_CODE (compare_op,
24895 reverse_condition_maybe_unordered
24896 (GET_CODE (compare_op)));
24897 else
24898 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24901 mode = GET_MODE (operands[0]);
24903 /* Construct either adc or sbb insn. */
24904 if ((code == LTU) == (operands[3] == constm1_rtx))
24906 switch (mode)
24908 case E_QImode:
24909 insn = gen_subqi3_carry;
24910 break;
24911 case E_HImode:
24912 insn = gen_subhi3_carry;
24913 break;
24914 case E_SImode:
24915 insn = gen_subsi3_carry;
24916 break;
24917 case E_DImode:
24918 insn = gen_subdi3_carry;
24919 break;
24920 default:
24921 gcc_unreachable ();
24924 else
24926 switch (mode)
24928 case E_QImode:
24929 insn = gen_addqi3_carry;
24930 break;
24931 case E_HImode:
24932 insn = gen_addhi3_carry;
24933 break;
24934 case E_SImode:
24935 insn = gen_addsi3_carry;
24936 break;
24937 case E_DImode:
24938 insn = gen_adddi3_carry;
24939 break;
24940 default:
24941 gcc_unreachable ();
24944 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24946 return true;
24950 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
24951 but works for floating point parameters and non-offsettable memories.
24952 For pushes, it returns just stack offsets; the values will be saved
24953 in the right order. At most four parts are generated. */
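/* For instance: on a 32-bit target a DFmode operand is split into two
   SImode parts and an XFmode operand into three, while on a 64-bit
   target a TFmode operand is split into two DImode parts.  */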
24955 static int
24956 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24958 int size;
24960 if (!TARGET_64BIT)
24961 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24962 else
24963 size = (GET_MODE_SIZE (mode) + 4) / 8;
24965 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24966 gcc_assert (size >= 2 && size <= 4);
24968 /* Optimize constant pool reference to immediates. This is used by fp
24969 moves, that force all constants to memory to allow combining. */
24970 if (MEM_P (operand) && MEM_READONLY_P (operand))
24971 operand = avoid_constant_pool_reference (operand);
24973 if (MEM_P (operand) && !offsettable_memref_p (operand))
24975 /* The only non-offsettable memories we handle are pushes. */
24976 int ok = push_operand (operand, VOIDmode);
24978 gcc_assert (ok);
24980 operand = copy_rtx (operand);
24981 PUT_MODE (operand, word_mode);
24982 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24983 return size;
24986 if (GET_CODE (operand) == CONST_VECTOR)
24988 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24989 /* Caution: if we looked through a constant pool memory above,
24990 the operand may actually have a different mode now. That's
24991 ok, since we want to pun this all the way back to an integer. */
24992 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
24993 gcc_assert (operand != NULL);
24994 mode = imode;
24997 if (!TARGET_64BIT)
24999 if (mode == DImode)
25000 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25001 else
25003 int i;
25005 if (REG_P (operand))
25007 gcc_assert (reload_completed);
25008 for (i = 0; i < size; i++)
25009 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25011 else if (offsettable_memref_p (operand))
25013 operand = adjust_address (operand, SImode, 0);
25014 parts[0] = operand;
25015 for (i = 1; i < size; i++)
25016 parts[i] = adjust_address (operand, SImode, 4 * i);
25018 else if (CONST_DOUBLE_P (operand))
25020 const REAL_VALUE_TYPE *r;
25021 long l[4];
25023 r = CONST_DOUBLE_REAL_VALUE (operand);
25024 switch (mode)
25026 case E_TFmode:
25027 real_to_target (l, r, mode);
25028 parts[3] = gen_int_mode (l[3], SImode);
25029 parts[2] = gen_int_mode (l[2], SImode);
25030 break;
25031 case E_XFmode:
25032 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25033 long double may not be 80-bit. */
25034 real_to_target (l, r, mode);
25035 parts[2] = gen_int_mode (l[2], SImode);
25036 break;
25037 case E_DFmode:
25038 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25039 break;
25040 default:
25041 gcc_unreachable ();
25043 parts[1] = gen_int_mode (l[1], SImode);
25044 parts[0] = gen_int_mode (l[0], SImode);
25046 else
25047 gcc_unreachable ();
25050 else
25052 if (mode == TImode)
25053 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25054 if (mode == XFmode || mode == TFmode)
25056 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25057 if (REG_P (operand))
25059 gcc_assert (reload_completed);
25060 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25061 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25063 else if (offsettable_memref_p (operand))
25065 operand = adjust_address (operand, DImode, 0);
25066 parts[0] = operand;
25067 parts[1] = adjust_address (operand, upper_mode, 8);
25069 else if (CONST_DOUBLE_P (operand))
25071 long l[4];
25073 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25075 /* real_to_target puts 32-bit pieces in each long. */
25076 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25077 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25078 << 32), DImode);
25080 if (upper_mode == SImode)
25081 parts[1] = gen_int_mode (l[2], SImode);
25082 else
25083 parts[1]
25084 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25085 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25086 << 32), DImode);
25088 else
25089 gcc_unreachable ();
25093 return size;
25096 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25097 All required insns are emitted here, so the caller need not
25098 emit any further moves. Operands 2-4 contain the input values
25099 in the correct order; operands 5-7 contain the output values. */
25101 void
25102 ix86_split_long_move (rtx operands[])
25104 rtx part[2][4];
25105 int nparts, i, j;
25106 int push = 0;
25107 int collisions = 0;
25108 machine_mode mode = GET_MODE (operands[0]);
25109 bool collisionparts[4];
25111 /* The DFmode expanders may ask us to move a double.
25112 For a 64-bit target this is a single move. By hiding that fact
25113 here we simplify the i386.md splitters. */
25114 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25116 /* Optimize constant pool reference to immediates. This is used by
25117 fp moves, that force all constants to memory to allow combining. */
25119 if (MEM_P (operands[1])
25120 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25121 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25122 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25123 if (push_operand (operands[0], VOIDmode))
25125 operands[0] = copy_rtx (operands[0]);
25126 PUT_MODE (operands[0], word_mode);
25128 else
25129 operands[0] = gen_lowpart (DImode, operands[0]);
25130 operands[1] = gen_lowpart (DImode, operands[1]);
25131 emit_move_insn (operands[0], operands[1]);
25132 return;
25135 /* The only non-offsettable memory we handle is push. */
25136 if (push_operand (operands[0], VOIDmode))
25137 push = 1;
25138 else
25139 gcc_assert (!MEM_P (operands[0])
25140 || offsettable_memref_p (operands[0]));
25142 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25143 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25145 /* When emitting push, take care for source operands on the stack. */
25146 if (push && MEM_P (operands[1])
25147 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25149 rtx src_base = XEXP (part[1][nparts - 1], 0);
25151 /* Compensate for the stack decrement by 4. */
25152 if (!TARGET_64BIT && nparts == 3
25153 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25154 src_base = plus_constant (Pmode, src_base, 4);
25156 /* src_base refers to the stack pointer and is
25157 automatically decreased by emitted push. */
25158 for (i = 0; i < nparts; i++)
25159 part[1][i] = change_address (part[1][i],
25160 GET_MODE (part[1][i]), src_base);
25163 /* We need to do copy in the right order in case an address register
25164 of the source overlaps the destination. */
25165 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25167 rtx tmp;
25169 for (i = 0; i < nparts; i++)
25171 collisionparts[i]
25172 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25173 if (collisionparts[i])
25174 collisions++;
25177 /* Collision in the middle part can be handled by reordering. */
25178 if (collisions == 1 && nparts == 3 && collisionparts [1])
25180 std::swap (part[0][1], part[0][2]);
25181 std::swap (part[1][1], part[1][2]);
25183 else if (collisions == 1
25184 && nparts == 4
25185 && (collisionparts [1] || collisionparts [2]))
25187 if (collisionparts [1])
25189 std::swap (part[0][1], part[0][2]);
25190 std::swap (part[1][1], part[1][2]);
25192 else
25194 std::swap (part[0][2], part[0][3]);
25195 std::swap (part[1][2], part[1][3]);
25199 /* If there are more collisions, we can't handle it by reordering.
25200 Do an lea to the last part and use only one colliding move. */
25201 else if (collisions > 1)
25203 rtx base, addr;
25205 collisions = 1;
25207 base = part[0][nparts - 1];
25209 /* Handle the case when the last part isn't valid for lea.
25210 Happens in 64-bit mode storing the 12-byte XFmode. */
25211 if (GET_MODE (base) != Pmode)
25212 base = gen_rtx_REG (Pmode, REGNO (base));
25214 addr = XEXP (part[1][0], 0);
25215 if (TARGET_TLS_DIRECT_SEG_REFS)
25217 struct ix86_address parts;
25218 int ok = ix86_decompose_address (addr, &parts);
25219 gcc_assert (ok);
25220 /* It is not valid to use %gs: or %fs: in lea. */
25221 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25223 emit_insn (gen_rtx_SET (base, addr));
25224 part[1][0] = replace_equiv_address (part[1][0], base);
25225 for (i = 1; i < nparts; i++)
25227 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25228 part[1][i] = replace_equiv_address (part[1][i], tmp);
25233 if (push)
25235 if (!TARGET_64BIT)
25237 if (nparts == 3)
25239 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25240 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25241 stack_pointer_rtx, GEN_INT (-4)));
25242 emit_move_insn (part[0][2], part[1][2]);
25244 else if (nparts == 4)
25246 emit_move_insn (part[0][3], part[1][3]);
25247 emit_move_insn (part[0][2], part[1][2]);
25250 else
25252 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25253 register, it is OK - we will just use the larger counterpart. We also
25254 retype memory - these references come from the attempt to avoid a REX
25255 prefix when moving the second half of a TFmode value. */
25256 if (GET_MODE (part[1][1]) == SImode)
25258 switch (GET_CODE (part[1][1]))
25260 case MEM:
25261 part[1][1] = adjust_address (part[1][1], DImode, 0);
25262 break;
25264 case REG:
25265 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25266 break;
25268 default:
25269 gcc_unreachable ();
25272 if (GET_MODE (part[1][0]) == SImode)
25273 part[1][0] = part[1][1];
25276 emit_move_insn (part[0][1], part[1][1]);
25277 emit_move_insn (part[0][0], part[1][0]);
25278 return;
25281 /* Choose correct order to not overwrite the source before it is copied. */
25282 if ((REG_P (part[0][0])
25283 && REG_P (part[1][1])
25284 && (REGNO (part[0][0]) == REGNO (part[1][1])
25285 || (nparts == 3
25286 && REGNO (part[0][0]) == REGNO (part[1][2]))
25287 || (nparts == 4
25288 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25289 || (collisions > 0
25290 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25292 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25294 operands[2 + i] = part[0][j];
25295 operands[6 + i] = part[1][j];
25298 else
25300 for (i = 0; i < nparts; i++)
25302 operands[2 + i] = part[0][i];
25303 operands[6 + i] = part[1][i];
25307 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25308 if (optimize_insn_for_size_p ())
25310 for (j = 0; j < nparts - 1; j++)
25311 if (CONST_INT_P (operands[6 + j])
25312 && operands[6 + j] != const0_rtx
25313 && REG_P (operands[2 + j]))
25314 for (i = j; i < nparts - 1; i++)
25315 if (CONST_INT_P (operands[7 + i])
25316 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25317 operands[7 + i] = operands[2 + j];
25320 for (i = 0; i < nparts; i++)
25321 emit_move_insn (operands[2 + i], operands[6 + i]);
25323 return;
25326 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25327 left shift by a constant, either using a single shift or
25328 a sequence of add instructions. */
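/* E.g. (schematically): a left shift by 2 may be emitted as two
   "add x, x" instructions, each of which doubles the value, when
   two adds are no more expensive than a shift by a constant and we
   are not optimizing for size; otherwise a single shift insn is
   used.  */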
25330 static void
25331 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25333 rtx (*insn)(rtx, rtx, rtx);
25335 if (count == 1
25336 || (count * ix86_cost->add <= ix86_cost->shift_const
25337 && !optimize_insn_for_size_p ()))
25339 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25340 while (count-- > 0)
25341 emit_insn (insn (operand, operand, operand));
25343 else
25345 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25346 emit_insn (insn (operand, operand, GEN_INT (count)));
25350 void
25351 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25353 rtx (*gen_ashl3)(rtx, rtx, rtx);
25354 rtx (*gen_shld)(rtx, rtx, rtx);
25355 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25357 rtx low[2], high[2];
25358 int count;
25360 if (CONST_INT_P (operands[2]))
25362 split_double_mode (mode, operands, 2, low, high);
25363 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25365 if (count >= half_width)
25367 emit_move_insn (high[0], low[1]);
25368 emit_move_insn (low[0], const0_rtx);
25370 if (count > half_width)
25371 ix86_expand_ashl_const (high[0], count - half_width, mode);
25373 else
25375 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25377 if (!rtx_equal_p (operands[0], operands[1]))
25378 emit_move_insn (operands[0], operands[1]);
25380 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25381 ix86_expand_ashl_const (low[0], count, mode);
25383 return;
25386 split_double_mode (mode, operands, 1, low, high);
25388 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25390 if (operands[1] == const1_rtx)
25392 /* Assuming we've chosen QImode-capable registers, then 1 << N
25393 can be done with two 32/64-bit shifts, no branches, no cmoves. */
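/* A worked example for a DImode shift on a 32-bit target: for
   1 << 37 the tested bit (count & 32) is set, so the setcc's below
   leave low = 0 and high = 1, and the 32-bit shifts (whose count the
   hardware masks mod 32) then produce low = 0, high = 1 << 5, the
   correct 64-bit result.  */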
25394 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25396 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25398 ix86_expand_clear (low[0]);
25399 ix86_expand_clear (high[0]);
25400 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25402 d = gen_lowpart (QImode, low[0]);
25403 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25404 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25405 emit_insn (gen_rtx_SET (d, s));
25407 d = gen_lowpart (QImode, high[0]);
25408 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25409 s = gen_rtx_NE (QImode, flags, const0_rtx);
25410 emit_insn (gen_rtx_SET (d, s));
25413 /* Otherwise, we can get the same results by manually performing
25414 a bit extract operation on bit 5/6, and then performing the two
25415 shifts. The two methods of getting 0/1 into low/high are exactly
25416 the same size. Avoiding the shift in the bit extract case helps
25417 pentium4 a bit; no one else seems to care much either way. */
25418 else
25420 machine_mode half_mode;
25421 rtx (*gen_lshr3)(rtx, rtx, rtx);
25422 rtx (*gen_and3)(rtx, rtx, rtx);
25423 rtx (*gen_xor3)(rtx, rtx, rtx);
25424 HOST_WIDE_INT bits;
25425 rtx x;
25427 if (mode == DImode)
25429 half_mode = SImode;
25430 gen_lshr3 = gen_lshrsi3;
25431 gen_and3 = gen_andsi3;
25432 gen_xor3 = gen_xorsi3;
25433 bits = 5;
25435 else
25437 half_mode = DImode;
25438 gen_lshr3 = gen_lshrdi3;
25439 gen_and3 = gen_anddi3;
25440 gen_xor3 = gen_xordi3;
25441 bits = 6;
25444 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25445 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25446 else
25447 x = gen_lowpart (half_mode, operands[2]);
25448 emit_insn (gen_rtx_SET (high[0], x));
25450 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25451 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25452 emit_move_insn (low[0], high[0]);
25453 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25456 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25457 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25458 return;
25461 if (operands[1] == constm1_rtx)
25463 /* For -1 << N, we can avoid the shld instruction, because we
25464 know that we're shifting 0...31/63 ones into a -1. */
25465 emit_move_insn (low[0], constm1_rtx);
25466 if (optimize_insn_for_size_p ())
25467 emit_move_insn (high[0], low[0]);
25468 else
25469 emit_move_insn (high[0], constm1_rtx);
25471 else
25473 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25475 if (!rtx_equal_p (operands[0], operands[1]))
25476 emit_move_insn (operands[0], operands[1]);
25478 split_double_mode (mode, operands, 1, low, high);
25479 emit_insn (gen_shld (high[0], low[0], operands[2]));
25482 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25484 if (TARGET_CMOVE && scratch)
25486 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25487 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25489 ix86_expand_clear (scratch);
25490 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25492 else
25494 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25495 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25497 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
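/* A rough sketch of what the constant-count split above produces for a
   DImode shift on a 32-bit target (C is the shift count, low/high are
   the halves of the operand):

       C >= 32:  high = low_src << (C - 32);  low = 0
       C <  32:  shld $C, low, high           // double-word shift of high:low
                 sall $C, low

   For a non-constant count the shld/sal pair is emitted first and the
   "count >= 32" case is then fixed up afterwards, either with cmove
   (when a scratch register is available) or with a conditional jump.  */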
25501 void
25502 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25504 rtx (*gen_ashr3)(rtx, rtx, rtx)
25505 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25506 rtx (*gen_shrd)(rtx, rtx, rtx);
25507 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25509 rtx low[2], high[2];
25510 int count;
25512 if (CONST_INT_P (operands[2]))
25514 split_double_mode (mode, operands, 2, low, high);
25515 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25517 if (count == GET_MODE_BITSIZE (mode) - 1)
25519 emit_move_insn (high[0], high[1]);
25520 emit_insn (gen_ashr3 (high[0], high[0],
25521 GEN_INT (half_width - 1)));
25522 emit_move_insn (low[0], high[0]);
25525 else if (count >= half_width)
25527 emit_move_insn (low[0], high[1]);
25528 emit_move_insn (high[0], low[0]);
25529 emit_insn (gen_ashr3 (high[0], high[0],
25530 GEN_INT (half_width - 1)));
25532 if (count > half_width)
25533 emit_insn (gen_ashr3 (low[0], low[0],
25534 GEN_INT (count - half_width)));
25536 else
25538 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25540 if (!rtx_equal_p (operands[0], operands[1]))
25541 emit_move_insn (operands[0], operands[1]);
25543 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25544 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25547 else
25549 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25551 if (!rtx_equal_p (operands[0], operands[1]))
25552 emit_move_insn (operands[0], operands[1]);
25554 split_double_mode (mode, operands, 1, low, high);
25556 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25557 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25559 if (TARGET_CMOVE && scratch)
25561 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25562 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25564 emit_move_insn (scratch, high[0]);
25565 emit_insn (gen_ashr3 (scratch, scratch,
25566 GEN_INT (half_width - 1)));
25567 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25568 scratch));
25570 else
25572 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25573 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25575 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
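/* A rough sketch of the constant-count arithmetic right shift split
   above, again for DImode on a 32-bit target (C is the shift count):

       C == 63:  high = high_src >> 31;  low = high      // pure sign fill
       C >= 32:  low = high_src >> (C - 32);  high = high_src >> 31
       C <  32:  shrd $C, high, low
                 sarl $C, high

   The non-constant count case mirrors ix86_split_ashl, using a
   cmove-based or branch-based fixup for counts of 32 and above.  */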
25580 void
25581 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25583 rtx (*gen_lshr3)(rtx, rtx, rtx)
25584 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25585 rtx (*gen_shrd)(rtx, rtx, rtx);
25586 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25588 rtx low[2], high[2];
25589 int count;
25591 if (CONST_INT_P (operands[2]))
25593 split_double_mode (mode, operands, 2, low, high);
25594 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25596 if (count >= half_width)
25598 emit_move_insn (low[0], high[1]);
25599 ix86_expand_clear (high[0]);
25601 if (count > half_width)
25602 emit_insn (gen_lshr3 (low[0], low[0],
25603 GEN_INT (count - half_width)));
25605 else
25607 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25609 if (!rtx_equal_p (operands[0], operands[1]))
25610 emit_move_insn (operands[0], operands[1]);
25612 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25613 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25616 else
25618 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25620 if (!rtx_equal_p (operands[0], operands[1]))
25621 emit_move_insn (operands[0], operands[1]);
25623 split_double_mode (mode, operands, 1, low, high);
25625 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25626 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25628 if (TARGET_CMOVE && scratch)
25630 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25631 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25633 ix86_expand_clear (scratch);
25634 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25635 scratch));
25637 else
25639 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25640 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25642 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25647 /* Predict just emitted jump instruction to be taken with probability PROB. */
25648 static void
25649 predict_jump (int prob)
25651 rtx_insn *insn = get_last_insn ();
25652 gcc_assert (JUMP_P (insn));
25653 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25656 /* Helper function for the string operations below. Test whether
25657 (VARIABLE & VALUE) is zero and if so, jump to the returned label. */
25658 static rtx_code_label *
25659 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25661 rtx_code_label *label = gen_label_rtx ();
25662 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25663 if (GET_MODE (variable) == DImode)
25664 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25665 else
25666 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25667 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25668 1, label);
25669 if (epilogue)
25670 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25671 else
25672 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25673 return label;
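/* For example (a sketch of the emitted sequence),
   ix86_expand_aligntest (count, 4, true) generates roughly

       tmp = count & 4;
       if (tmp == 0) goto label;

   so the instructions the caller emits before emit_label (label) run
   only when that bit of COUNT is set; the branch probability note is
   chosen according to whether this is prologue or epilogue code.  */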
25676 /* Adjust COUNTER by the VALUE. */
25677 static void
25678 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25680 rtx (*gen_add)(rtx, rtx, rtx)
25681 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25683 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25686 /* Zero extend possibly SImode EXP to Pmode register. */
25687 rtx
25688 ix86_zero_extend_to_Pmode (rtx exp)
25690 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25693 /* Divide COUNTREG by SCALE. */
25694 static rtx
25695 scale_counter (rtx countreg, int scale)
25697 rtx sc;
25699 if (scale == 1)
25700 return countreg;
25701 if (CONST_INT_P (countreg))
25702 return GEN_INT (INTVAL (countreg) / scale);
25703 gcc_assert (REG_P (countreg));
25705 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25706 GEN_INT (exact_log2 (scale)),
25707 NULL, 1, OPTAB_DIRECT);
25708 return sc;
25711 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25712 DImode for constant loop counts. */
25714 static machine_mode
25715 counter_mode (rtx count_exp)
25717 if (GET_MODE (count_exp) != VOIDmode)
25718 return GET_MODE (count_exp);
25719 if (!CONST_INT_P (count_exp))
25720 return Pmode;
25721 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25722 return DImode;
25723 return SImode;
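/* A few examples of the choice above (a sketch):

       counter_mode (SImode register)                  == SImode
       counter_mode (GEN_INT (200))                    == SImode
       counter_mode (GEN_INT (HOST_WIDE_INT_1 << 33))  == DImode (64-bit only)  */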
25726 /* Copy the address to a Pmode register. This is used for x32 to
25727 truncate DImode TLS address to a SImode register. */
25729 static rtx
25730 ix86_copy_addr_to_reg (rtx addr)
25732 rtx reg;
25733 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25735 reg = copy_addr_to_reg (addr);
25736 REG_POINTER (reg) = 1;
25737 return reg;
25739 else
25741 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25742 reg = copy_to_mode_reg (DImode, addr);
25743 REG_POINTER (reg) = 1;
25744 return gen_rtx_SUBREG (SImode, reg, 0);
25748 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
25749 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
25750 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
25751 loop to set memory with VALUE (supposed to be in MODE).
25753 The size is rounded down to a whole number of chunks moved at once.
25754 SRCMEM and DESTMEM provide MEM rtxen to feed proper aliasing info. */
25757 static void
25758 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25759 rtx destptr, rtx srcptr, rtx value,
25760 rtx count, machine_mode mode, int unroll,
25761 int expected_size, bool issetmem)
25763 rtx_code_label *out_label, *top_label;
25764 rtx iter, tmp;
25765 machine_mode iter_mode = counter_mode (count);
25766 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25767 rtx piece_size = GEN_INT (piece_size_n);
25768 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25769 rtx size;
25770 int i;
25772 top_label = gen_label_rtx ();
25773 out_label = gen_label_rtx ();
25774 iter = gen_reg_rtx (iter_mode);
25776 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25777 NULL, 1, OPTAB_DIRECT);
25778 /* Those two should combine. */
25779 if (piece_size == const1_rtx)
25781 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25782 true, out_label);
25783 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25785 emit_move_insn (iter, const0_rtx);
25787 emit_label (top_label);
25789 tmp = convert_modes (Pmode, iter_mode, iter, true);
25791 /* This assert could be relaxed - in that case we'd need to compute the
25792 smallest power of two containing PIECE_SIZE_N and pass it to
25793 offset_address. */
25794 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25795 destmem = offset_address (destmem, tmp, piece_size_n);
25796 destmem = adjust_address (destmem, mode, 0);
25798 if (!issetmem)
25800 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25801 srcmem = adjust_address (srcmem, mode, 0);
25803 /* When unrolling for chips that reorder memory reads and writes,
25804 we can save registers by using a single temporary.
25805 Using 4 temporaries is also overkill in 32-bit mode. */
25806 if (!TARGET_64BIT && 0)
25808 for (i = 0; i < unroll; i++)
25810 if (i)
25812 destmem =
25813 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25814 srcmem =
25815 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25817 emit_move_insn (destmem, srcmem);
25820 else
25822 rtx tmpreg[4];
25823 gcc_assert (unroll <= 4);
25824 for (i = 0; i < unroll; i++)
25826 tmpreg[i] = gen_reg_rtx (mode);
25827 if (i)
25829 srcmem =
25830 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25832 emit_move_insn (tmpreg[i], srcmem);
25834 for (i = 0; i < unroll; i++)
25836 if (i)
25838 destmem =
25839 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25841 emit_move_insn (destmem, tmpreg[i]);
25845 else
25846 for (i = 0; i < unroll; i++)
25848 if (i)
25849 destmem =
25850 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25851 emit_move_insn (destmem, value);
25854 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25855 true, OPTAB_LIB_WIDEN);
25856 if (tmp != iter)
25857 emit_move_insn (iter, tmp);
25859 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25860 true, top_label);
25861 if (expected_size != -1)
25863 expected_size /= GET_MODE_SIZE (mode) * unroll;
25864 if (expected_size == 0)
25865 predict_jump (0);
25866 else if (expected_size > REG_BR_PROB_BASE)
25867 predict_jump (REG_BR_PROB_BASE - 1);
25868 else
25869 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25871 else
25872 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25873 iter = ix86_zero_extend_to_Pmode (iter);
25874 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25875 true, OPTAB_LIB_WIDEN);
25876 if (tmp != destptr)
25877 emit_move_insn (destptr, tmp);
25878 if (!issetmem)
25880 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25881 true, OPTAB_LIB_WIDEN);
25882 if (tmp != srcptr)
25883 emit_move_insn (srcptr, tmp);
25885 emit_label (out_label);
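/* Roughly, the emitted loop has the following shape (a sketch for the
   copy case, with S = GET_MODE_SIZE (MODE) * UNROLL):

       size = count & ~(S - 1);              // whole chunks only
       iter = 0;
     top:
       copy (or store) S bytes at offset iter;
       iter += S;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;     // leftovers go to the epilogue

   with a branch-probability note attached according to EXPECTED_SIZE.  */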
25888 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
25889 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25890 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25891 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25892 ORIG_VALUE is the original value passed to memset to fill the memory with.
25893 Other arguments have same meaning as for previous function. */
25895 static void
25896 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25897 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25898 rtx count,
25899 machine_mode mode, bool issetmem)
25901 rtx destexp;
25902 rtx srcexp;
25903 rtx countreg;
25904 HOST_WIDE_INT rounded_count;
25906 /* If possible, it is shorter to use rep movs.
25907 TODO: Maybe it is better to move this logic to decide_alg. */
25908 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25909 && (!issetmem || orig_value == const0_rtx))
25910 mode = SImode;
25912 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25913 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25915 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25916 GET_MODE_SIZE (mode)));
25917 if (mode != QImode)
25919 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25920 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25921 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25923 else
25924 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25925 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25927 rounded_count
25928 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25929 destmem = shallow_copy_rtx (destmem);
25930 set_mem_size (destmem, rounded_count);
25932 else if (MEM_SIZE_KNOWN_P (destmem))
25933 clear_mem_size (destmem);
25935 if (issetmem)
25937 value = force_reg (mode, gen_lowpart (mode, value));
25938 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25940 else
25942 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25943 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25944 if (mode != QImode)
25946 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25947 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25948 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25950 else
25951 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25952 if (CONST_INT_P (count))
25954 rounded_count
25955 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25956 srcmem = shallow_copy_rtx (srcmem);
25957 set_mem_size (srcmem, rounded_count);
25959 else
25961 if (MEM_SIZE_KNOWN_P (srcmem))
25962 clear_mem_size (srcmem);
25964 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25965 destexp, srcexp));
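/* As a concrete sketch, the rep_prefix_8_byte copy case boils down to

       count >>= 3;          // scale byte count to 8-byte chunks
       rep movsq             // rsi/rdi/rcx are implicit operands

   while the memset case loads the promoted value into rax and uses
   "rep stosq" instead; trailing bytes (count % 8) are left for the
   epilogue.  */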
25969 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25970 DESTMEM.
25971 SRCMEM is passed by pointer so it can be updated on return.
25972 Return value is updated DST. */
25973 static rtx
25974 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25975 HOST_WIDE_INT size_to_move)
25977 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25978 enum insn_code code;
25979 machine_mode move_mode;
25980 int piece_size, i;
25982 /* Find the widest mode in which we could perform moves.
25983 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
25984 it until a move of that size is supported. */
25985 piece_size = 1 << floor_log2 (size_to_move);
25986 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25987 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25989 gcc_assert (piece_size > 1);
25990 piece_size >>= 1;
25993 /* Find the corresponding vector mode with the same size as MOVE_MODE.
25994 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
25995 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
25997 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
25998 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
25999 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26001 move_mode = word_mode;
26002 piece_size = GET_MODE_SIZE (move_mode);
26003 code = optab_handler (mov_optab, move_mode);
26006 gcc_assert (code != CODE_FOR_nothing);
26008 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26009 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26011 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26012 gcc_assert (size_to_move % piece_size == 0);
26013 adjust = GEN_INT (piece_size);
26014 for (i = 0; i < size_to_move; i += piece_size)
26016 /* We move from memory to memory, so we'll need to do it via
26017 a temporary register. */
26018 tempreg = gen_reg_rtx (move_mode);
26019 emit_insn (GEN_FCN (code) (tempreg, src));
26020 emit_insn (GEN_FCN (code) (dst, tempreg));
26022 emit_move_insn (destptr,
26023 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26024 emit_move_insn (srcptr,
26025 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26027 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26028 piece_size);
26029 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26030 piece_size);
26033 /* Update DST and SRC rtx. */
26034 *srcmem = src;
26035 return dst;
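/* For example (a sketch), a call with SIZE_TO_MOVE == 8 on a 64-bit
   target emits one DImode load into a fresh pseudo and one DImode store,
   then advances DESTPTR and SRCPTR by 8; the MEMs handed back (the
   return value and *SRCMEM) are re-based on the updated pointers, ready
   for whatever the caller copies next.  */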
26038 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26039 static void
26040 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26041 rtx destptr, rtx srcptr, rtx count, int max_size)
26043 rtx src, dest;
26044 if (CONST_INT_P (count))
26046 HOST_WIDE_INT countval = INTVAL (count);
26047 HOST_WIDE_INT epilogue_size = countval % max_size;
26048 int i;
26050 /* For now MAX_SIZE should be a power of 2. This assert could be
26051 relaxed, but it'll require a bit more complicated epilogue
26052 expanding. */
26053 gcc_assert ((max_size & (max_size - 1)) == 0);
26054 for (i = max_size; i >= 1; i >>= 1)
26056 if (epilogue_size & i)
26057 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26059 return;
26061 if (max_size > 8)
26063 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26064 count, 1, OPTAB_DIRECT);
26065 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26066 count, QImode, 1, 4, false);
26067 return;
26070 /* When there are stringops, we can cheaply increase dest and src pointers.
26071 Otherwise we save code size by maintaining offset (zero is readily
26072 available from preceding rep operation) and using x86 addressing modes. */
26074 if (TARGET_SINGLE_STRINGOP)
26076 if (max_size > 4)
26078 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26079 src = change_address (srcmem, SImode, srcptr);
26080 dest = change_address (destmem, SImode, destptr);
26081 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26082 emit_label (label);
26083 LABEL_NUSES (label) = 1;
26085 if (max_size > 2)
26087 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26088 src = change_address (srcmem, HImode, srcptr);
26089 dest = change_address (destmem, HImode, destptr);
26090 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26091 emit_label (label);
26092 LABEL_NUSES (label) = 1;
26094 if (max_size > 1)
26096 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26097 src = change_address (srcmem, QImode, srcptr);
26098 dest = change_address (destmem, QImode, destptr);
26099 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26100 emit_label (label);
26101 LABEL_NUSES (label) = 1;
26104 else
26106 rtx offset = force_reg (Pmode, const0_rtx);
26107 rtx tmp;
26109 if (max_size > 4)
26111 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26112 src = change_address (srcmem, SImode, srcptr);
26113 dest = change_address (destmem, SImode, destptr);
26114 emit_move_insn (dest, src);
26115 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26116 true, OPTAB_LIB_WIDEN);
26117 if (tmp != offset)
26118 emit_move_insn (offset, tmp);
26119 emit_label (label);
26120 LABEL_NUSES (label) = 1;
26122 if (max_size > 2)
26124 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26125 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26126 src = change_address (srcmem, HImode, tmp);
26127 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26128 dest = change_address (destmem, HImode, tmp);
26129 emit_move_insn (dest, src);
26130 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26131 true, OPTAB_LIB_WIDEN);
26132 if (tmp != offset)
26133 emit_move_insn (offset, tmp);
26134 emit_label (label);
26135 LABEL_NUSES (label) = 1;
26137 if (max_size > 1)
26139 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26140 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26141 src = change_address (srcmem, QImode, tmp);
26142 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26143 dest = change_address (destmem, QImode, tmp);
26144 emit_move_insn (dest, src);
26145 emit_label (label);
26146 LABEL_NUSES (label) = 1;
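/* For instance (a sketch), with a constant COUNT whose remainder modulo
   MAX_SIZE is 7, the code above emits three straight-line copies of 4, 2
   and 1 bytes via emit_memmov, with no compares or branches; only the
   non-constant count cases need the aligntest-guarded blocks.  */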
26151 /* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM
26152 with value PROMOTED_VAL.
26154 Return value is updated DST. */
26155 static rtx
26156 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26157 HOST_WIDE_INT size_to_move)
26159 rtx dst = destmem, adjust;
26160 enum insn_code code;
26161 machine_mode move_mode;
26162 int piece_size, i;
26164 /* Find the widest mode in which we could perform moves.
26165 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
26166 it until a move of that size is supported. */
26167 move_mode = GET_MODE (promoted_val);
26168 if (move_mode == VOIDmode)
26169 move_mode = QImode;
26170 if (size_to_move < GET_MODE_SIZE (move_mode))
26172 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26173 move_mode = int_mode_for_size (move_bits, 0).require ();
26174 promoted_val = gen_lowpart (move_mode, promoted_val);
26176 piece_size = GET_MODE_SIZE (move_mode);
26177 code = optab_handler (mov_optab, move_mode);
26178 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26180 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26182 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26183 gcc_assert (size_to_move % piece_size == 0);
26184 adjust = GEN_INT (piece_size);
26185 for (i = 0; i < size_to_move; i += piece_size)
26187 if (piece_size <= GET_MODE_SIZE (word_mode))
26189 emit_insn (gen_strset (destptr, dst, promoted_val));
26190 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26191 piece_size);
26192 continue;
26195 emit_insn (GEN_FCN (code) (dst, promoted_val));
26197 emit_move_insn (destptr,
26198 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26200 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26201 piece_size);
26204 /* Update DST rtx. */
26205 return dst;
26207 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26208 static void
26209 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26210 rtx count, int max_size)
26212 count =
26213 expand_simple_binop (counter_mode (count), AND, count,
26214 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26215 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26216 gen_lowpart (QImode, value), count, QImode,
26217 1, max_size / 2, true);
26220 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26221 static void
26222 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26223 rtx count, int max_size)
26225 rtx dest;
26227 if (CONST_INT_P (count))
26229 HOST_WIDE_INT countval = INTVAL (count);
26230 HOST_WIDE_INT epilogue_size = countval % max_size;
26231 int i;
26233 /* For now MAX_SIZE should be a power of 2. This assert could be
26234 relaxed, but it'll require a bit more complicated epilogue
26235 expanding. */
26236 gcc_assert ((max_size & (max_size - 1)) == 0);
26237 for (i = max_size; i >= 1; i >>= 1)
26239 if (epilogue_size & i)
26241 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26242 destmem = emit_memset (destmem, destptr, vec_value, i);
26243 else
26244 destmem = emit_memset (destmem, destptr, value, i);
26247 return;
26249 if (max_size > 32)
26251 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26252 return;
26254 if (max_size > 16)
26256 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26257 if (TARGET_64BIT)
26259 dest = change_address (destmem, DImode, destptr);
26260 emit_insn (gen_strset (destptr, dest, value));
26261 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26262 emit_insn (gen_strset (destptr, dest, value));
26264 else
26266 dest = change_address (destmem, SImode, destptr);
26267 emit_insn (gen_strset (destptr, dest, value));
26268 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26269 emit_insn (gen_strset (destptr, dest, value));
26270 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26271 emit_insn (gen_strset (destptr, dest, value));
26272 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26273 emit_insn (gen_strset (destptr, dest, value));
26275 emit_label (label);
26276 LABEL_NUSES (label) = 1;
26278 if (max_size > 8)
26280 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26281 if (TARGET_64BIT)
26283 dest = change_address (destmem, DImode, destptr);
26284 emit_insn (gen_strset (destptr, dest, value));
26286 else
26288 dest = change_address (destmem, SImode, destptr);
26289 emit_insn (gen_strset (destptr, dest, value));
26290 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26291 emit_insn (gen_strset (destptr, dest, value));
26293 emit_label (label);
26294 LABEL_NUSES (label) = 1;
26296 if (max_size > 4)
26298 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26299 dest = change_address (destmem, SImode, destptr);
26300 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26301 emit_label (label);
26302 LABEL_NUSES (label) = 1;
26304 if (max_size > 2)
26306 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26307 dest = change_address (destmem, HImode, destptr);
26308 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26309 emit_label (label);
26310 LABEL_NUSES (label) = 1;
26312 if (max_size > 1)
26314 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26315 dest = change_address (destmem, QImode, destptr);
26316 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26317 emit_label (label);
26318 LABEL_NUSES (label) = 1;
26322 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
26323 store enough bytes to DESTMEM, to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26324 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26325 ignored.
26326 Return value is updated DESTMEM. */
26327 static rtx
26328 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26329 rtx destptr, rtx srcptr, rtx value,
26330 rtx vec_value, rtx count, int align,
26331 int desired_alignment, bool issetmem)
26333 int i;
26334 for (i = 1; i < desired_alignment; i <<= 1)
26336 if (align <= i)
26338 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26339 if (issetmem)
26341 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26342 destmem = emit_memset (destmem, destptr, vec_value, i);
26343 else
26344 destmem = emit_memset (destmem, destptr, value, i);
26346 else
26347 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26348 ix86_adjust_counter (count, i);
26349 emit_label (label);
26350 LABEL_NUSES (label) = 1;
26351 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26354 return destmem;
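/* A sketch of the prologue emitted above for ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 in the copy case:

       if (destptr & 1) { copy 1 byte;  count -= 1; }
       if (destptr & 2) { copy 2 bytes; count -= 2; }
       if (destptr & 4) { copy 4 bytes; count -= 4; }

   after which DESTPTR is 8-byte aligned and the alignment recorded on
   DESTMEM is bumped accordingly.  */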
26357 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26358 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26359 and jump to DONE_LABEL. */
26360 static void
26361 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26362 rtx destptr, rtx srcptr,
26363 rtx value, rtx vec_value,
26364 rtx count, int size,
26365 rtx done_label, bool issetmem)
26367 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26368 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26369 rtx modesize;
26370 int n;
26372 /* If we do not have vector value to copy, we must reduce size. */
26373 if (issetmem)
26375 if (!vec_value)
26377 if (GET_MODE (value) == VOIDmode && size > 8)
26378 mode = Pmode;
26379 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26380 mode = GET_MODE (value);
26382 else
26383 mode = GET_MODE (vec_value), value = vec_value;
26385 else
26387 /* Choose appropriate vector mode. */
26388 if (size >= 32)
26389 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26390 else if (size >= 16)
26391 mode = TARGET_SSE ? V16QImode : DImode;
26392 srcmem = change_address (srcmem, mode, srcptr);
26394 destmem = change_address (destmem, mode, destptr);
26395 modesize = GEN_INT (GET_MODE_SIZE (mode));
26396 gcc_assert (GET_MODE_SIZE (mode) <= size);
26397 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26399 if (issetmem)
26400 emit_move_insn (destmem, gen_lowpart (mode, value));
26401 else
26403 emit_move_insn (destmem, srcmem);
26404 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26406 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26409 destmem = offset_address (destmem, count, 1);
26410 destmem = offset_address (destmem, GEN_INT (-2 * size),
26411 GET_MODE_SIZE (mode));
26412 if (!issetmem)
26414 srcmem = offset_address (srcmem, count, 1);
26415 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26416 GET_MODE_SIZE (mode));
26418 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26420 if (issetmem)
26421 emit_move_insn (destmem, gen_lowpart (mode, value));
26422 else
26424 emit_move_insn (destmem, srcmem);
26425 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26427 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26429 emit_jump_insn (gen_jump (done_label));
26430 emit_barrier ();
26432 emit_label (label);
26433 LABEL_NUSES (label) = 1;
26436 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26437 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26438 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26439 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26440 DONE_LABEL is a label after the whole copying sequence. The label is created
26441 on demand if *DONE_LABEL is NULL.
26442 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for new
26443 bounds after the initial copies.
26445 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26446 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26447 we will dispatch to a library call for large blocks.
26449 In pseudocode we do:
26451 if (COUNT < SIZE)
26453 Assume that SIZE is 4. Bigger sizes are handled analogously
26454 if (COUNT & 4)
26456 copy 4 bytes from SRCPTR to DESTPTR
26457 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26458 goto done_label
26460 if (!COUNT)
26461 goto done_label;
26462 copy 1 byte from SRCPTR to DESTPTR
26463 if (COUNT & 2)
26465 copy 2 bytes from SRCPTR to DESTPTR
26466 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26469 else
26471 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26472 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26474 OLD_DESTPTR = DESTPTR;
26475 Align DESTPTR up to DESIRED_ALIGN
26476 SRCPTR += DESTPTR - OLD_DESTPTR
26477 COUNT -= DESTPTR - OLD_DESTPTR
26478 if (DYNAMIC_CHECK)
26479 Round COUNT down to multiple of SIZE
26480 << optional caller supplied zero size guard is here >>
26481 << optional caller supplied dynamic check is here >>
26482 << caller supplied main copy loop is here >>
26484 done_label:
26486 static void
26487 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26488 rtx *destptr, rtx *srcptr,
26489 machine_mode mode,
26490 rtx value, rtx vec_value,
26491 rtx *count,
26492 rtx_code_label **done_label,
26493 int size,
26494 int desired_align,
26495 int align,
26496 unsigned HOST_WIDE_INT *min_size,
26497 bool dynamic_check,
26498 bool issetmem)
26500 rtx_code_label *loop_label = NULL, *label;
26501 int n;
26502 rtx modesize;
26503 int prolog_size = 0;
26504 rtx mode_value;
26506 /* Choose the proper value to copy. */
26507 if (issetmem && VECTOR_MODE_P (mode))
26508 mode_value = vec_value;
26509 else
26510 mode_value = value;
26511 gcc_assert (GET_MODE_SIZE (mode) <= size);
26513 /* See if block is big or small, handle small blocks. */
26514 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26516 int size2 = size;
26517 loop_label = gen_label_rtx ();
26519 if (!*done_label)
26520 *done_label = gen_label_rtx ();
26522 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26523 1, loop_label);
26524 size2 >>= 1;
26526 /* Handle sizes > 3. */
26527 for (;size2 > 2; size2 >>= 1)
26528 expand_small_movmem_or_setmem (destmem, srcmem,
26529 *destptr, *srcptr,
26530 value, vec_value,
26531 *count,
26532 size2, *done_label, issetmem);
26533 /* Nothing to copy? Jump to DONE_LABEL if so. */
26534 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26535 1, *done_label);
26537 /* Do a byte copy. */
26538 destmem = change_address (destmem, QImode, *destptr);
26539 if (issetmem)
26540 emit_move_insn (destmem, gen_lowpart (QImode, value));
26541 else
26543 srcmem = change_address (srcmem, QImode, *srcptr);
26544 emit_move_insn (destmem, srcmem);
26547 /* Handle sizes 2 and 3. */
26548 label = ix86_expand_aligntest (*count, 2, false);
26549 destmem = change_address (destmem, HImode, *destptr);
26550 destmem = offset_address (destmem, *count, 1);
26551 destmem = offset_address (destmem, GEN_INT (-2), 2);
26552 if (issetmem)
26553 emit_move_insn (destmem, gen_lowpart (HImode, value));
26554 else
26556 srcmem = change_address (srcmem, HImode, *srcptr);
26557 srcmem = offset_address (srcmem, *count, 1);
26558 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26559 emit_move_insn (destmem, srcmem);
26562 emit_label (label);
26563 LABEL_NUSES (label) = 1;
26564 emit_jump_insn (gen_jump (*done_label));
26565 emit_barrier ();
26567 else
26568 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26569 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26571 /* Start memcpy for COUNT >= SIZE. */
26572 if (loop_label)
26574 emit_label (loop_label);
26575 LABEL_NUSES (loop_label) = 1;
26578 /* Copy first desired_align bytes. */
26579 if (!issetmem)
26580 srcmem = change_address (srcmem, mode, *srcptr);
26581 destmem = change_address (destmem, mode, *destptr);
26582 modesize = GEN_INT (GET_MODE_SIZE (mode));
26583 for (n = 0; prolog_size < desired_align - align; n++)
26585 if (issetmem)
26586 emit_move_insn (destmem, mode_value);
26587 else
26589 emit_move_insn (destmem, srcmem);
26590 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26592 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26593 prolog_size += GET_MODE_SIZE (mode);
26597 /* Copy last SIZE bytes. */
26598 destmem = offset_address (destmem, *count, 1);
26599 destmem = offset_address (destmem,
26600 GEN_INT (-size - prolog_size),
26602 if (issetmem)
26603 emit_move_insn (destmem, mode_value);
26604 else
26606 srcmem = offset_address (srcmem, *count, 1);
26607 srcmem = offset_address (srcmem,
26608 GEN_INT (-size - prolog_size),
26610 emit_move_insn (destmem, srcmem);
26612 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26614 destmem = offset_address (destmem, modesize, 1);
26615 if (issetmem)
26616 emit_move_insn (destmem, mode_value);
26617 else
26619 srcmem = offset_address (srcmem, modesize, 1);
26620 emit_move_insn (destmem, srcmem);
26624 /* Align destination. */
26625 if (desired_align > 1 && desired_align > align)
26627 rtx saveddest = *destptr;
26629 gcc_assert (desired_align <= size);
26630 /* Align destptr up, place it to new register. */
26631 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26632 GEN_INT (prolog_size),
26633 NULL_RTX, 1, OPTAB_DIRECT);
26634 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26635 REG_POINTER (*destptr) = 1;
26636 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26637 GEN_INT (-desired_align),
26638 *destptr, 1, OPTAB_DIRECT);
26639 /* See how many bytes we skipped. */
26640 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26641 *destptr,
26642 saveddest, 1, OPTAB_DIRECT);
26643 /* Adjust srcptr and count. */
26644 if (!issetmem)
26645 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26646 saveddest, *srcptr, 1, OPTAB_DIRECT);
26647 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26648 saveddest, *count, 1, OPTAB_DIRECT);
26649 /* We copied at most size + prolog_size. */
26650 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26651 *min_size
26652 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26653 else
26654 *min_size = 0;
26656 /* Our loops always round down the block size, but for dispatch to
26657 library we need precise value. */
26658 if (dynamic_check)
26659 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26660 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26662 else
26664 gcc_assert (prolog_size == 0);
26665 /* Decrease count, so we won't end up copying last word twice. */
26666 if (!CONST_INT_P (*count))
26667 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26668 constm1_rtx, *count, 1, OPTAB_DIRECT);
26669 else
26670 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26671 (unsigned HOST_WIDE_INT)size));
26672 if (*min_size)
26673 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26678 /* This function is like the previous one, except here we know how many bytes
26679 need to be copied. That allows us to update alignment not only of DST, which
26680 is returned, but also of SRC, which is passed as a pointer for that
26681 reason. */
26682 static rtx
26683 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26684 rtx srcreg, rtx value, rtx vec_value,
26685 int desired_align, int align_bytes,
26686 bool issetmem)
26688 rtx src = NULL;
26689 rtx orig_dst = dst;
26690 rtx orig_src = NULL;
26691 int piece_size = 1;
26692 int copied_bytes = 0;
26694 if (!issetmem)
26696 gcc_assert (srcp != NULL);
26697 src = *srcp;
26698 orig_src = src;
26701 for (piece_size = 1;
26702 piece_size <= desired_align && copied_bytes < align_bytes;
26703 piece_size <<= 1)
26705 if (align_bytes & piece_size)
26707 if (issetmem)
26709 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26710 dst = emit_memset (dst, destreg, vec_value, piece_size);
26711 else
26712 dst = emit_memset (dst, destreg, value, piece_size);
26714 else
26715 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26716 copied_bytes += piece_size;
26719 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26720 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26721 if (MEM_SIZE_KNOWN_P (orig_dst))
26722 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26724 if (!issetmem)
26726 int src_align_bytes = get_mem_align_offset (src, desired_align
26727 * BITS_PER_UNIT);
26728 if (src_align_bytes >= 0)
26729 src_align_bytes = desired_align - src_align_bytes;
26730 if (src_align_bytes >= 0)
26732 unsigned int src_align;
26733 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26735 if ((src_align_bytes & (src_align - 1))
26736 == (align_bytes & (src_align - 1)))
26737 break;
26739 if (src_align > (unsigned int) desired_align)
26740 src_align = desired_align;
26741 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26742 set_mem_align (src, src_align * BITS_PER_UNIT);
26744 if (MEM_SIZE_KNOWN_P (orig_src))
26745 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26746 *srcp = src;
26749 return dst;
26752 /* Return true if ALG can be used in current context.
26753 Assume we expand memset if MEMSET is true. */
26754 static bool
26755 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26757 if (alg == no_stringop)
26758 return false;
26759 if (alg == vector_loop)
26760 return TARGET_SSE || TARGET_AVX;
26761 /* Algorithms using the rep prefix want at least edi and ecx;
26762 additionally, memset wants eax and memcpy wants esi. Don't
26763 consider such algorithms if the user has appropriated those
26764 registers for their own purposes, or if we have a non-default
26765 address space, since some string insns cannot override the segment. */
26766 if (alg == rep_prefix_1_byte
26767 || alg == rep_prefix_4_byte
26768 || alg == rep_prefix_8_byte)
26770 if (have_as)
26771 return false;
26772 if (fixed_regs[CX_REG]
26773 || fixed_regs[DI_REG]
26774 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26775 return false;
26777 return true;
26780 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26781 static enum stringop_alg
26782 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26783 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26784 bool memset, bool zero_memset, bool have_as,
26785 int *dynamic_check, bool *noalign, bool recur)
26787 const struct stringop_algs *algs;
26788 bool optimize_for_speed;
26789 int max = 0;
26790 const struct processor_costs *cost;
26791 int i;
26792 bool any_alg_usable_p = false;
26794 *noalign = false;
26795 *dynamic_check = -1;
26797 /* Even if the string operation call is cold, we still might spend a lot
26798 of time processing large blocks. */
26799 if (optimize_function_for_size_p (cfun)
26800 || (optimize_insn_for_size_p ()
26801 && (max_size < 256
26802 || (expected_size != -1 && expected_size < 256))))
26803 optimize_for_speed = false;
26804 else
26805 optimize_for_speed = true;
26807 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26808 if (memset)
26809 algs = &cost->memset[TARGET_64BIT != 0];
26810 else
26811 algs = &cost->memcpy[TARGET_64BIT != 0];
26813 /* See maximal size for user defined algorithm. */
26814 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26816 enum stringop_alg candidate = algs->size[i].alg;
26817 bool usable = alg_usable_p (candidate, memset, have_as);
26818 any_alg_usable_p |= usable;
26820 if (candidate != libcall && candidate && usable)
26821 max = algs->size[i].max;
26824 /* If the expected size is not known but the max size is small enough
26825 that the inline version is a win, set the expected size into
26826 the range. */
26827 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26828 && expected_size == -1)
26829 expected_size = min_size / 2 + max_size / 2;
26831 /* If user specified the algorithm, honor it if possible. */
26832 if (ix86_stringop_alg != no_stringop
26833 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26834 return ix86_stringop_alg;
26835 /* rep; movq or rep; movl is the smallest variant. */
26836 else if (!optimize_for_speed)
26838 *noalign = true;
26839 if (!count || (count & 3) || (memset && !zero_memset))
26840 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26841 ? rep_prefix_1_byte : loop_1_byte;
26842 else
26843 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26844 ? rep_prefix_4_byte : loop;
26846 /* Very tiny blocks are best handled via the loop, REP is expensive to
26847 setup. */
26848 else if (expected_size != -1 && expected_size < 4)
26849 return loop_1_byte;
26850 else if (expected_size != -1)
26852 enum stringop_alg alg = libcall;
26853 bool alg_noalign = false;
26854 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26856 /* We get here if the algorithms that were not libcall-based
26857 were rep-prefix based and we are unable to use rep prefixes
26858 based on global register usage. Break out of the loop and
26859 use the heuristic below. */
26860 if (algs->size[i].max == 0)
26861 break;
26862 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26864 enum stringop_alg candidate = algs->size[i].alg;
26866 if (candidate != libcall
26867 && alg_usable_p (candidate, memset, have_as))
26869 alg = candidate;
26870 alg_noalign = algs->size[i].noalign;
26872 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26873 last non-libcall inline algorithm. */
26874 if (TARGET_INLINE_ALL_STRINGOPS)
26876 /* When the current size is best to be copied by a libcall,
26877 but we are still forced to inline, run the heuristic below
26878 that will pick code for medium sized blocks. */
26879 if (alg != libcall)
26881 *noalign = alg_noalign;
26882 return alg;
26884 else if (!any_alg_usable_p)
26885 break;
26887 else if (alg_usable_p (candidate, memset, have_as))
26889 *noalign = algs->size[i].noalign;
26890 return candidate;
26895 /* When asked to inline the call anyway, try to pick a meaningful choice.
26896 We look for the maximal size of block that is faster to copy by hand and
26897 take blocks of at most that size, guessing that the average size will
26898 be roughly half of the block.
26900 If this turns out to be bad, we might simply specify the preferred
26901 choice in ix86_costs. */
26902 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26903 && (algs->unknown_size == libcall
26904 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26906 enum stringop_alg alg;
26907 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26909 /* If there aren't any usable algorithms or if recursing already,
26910 then recursing on smaller sizes or same size isn't going to
26911 find anything. Just return the simple byte-at-a-time copy loop. */
26912 if (!any_alg_usable_p || recur)
26914 /* Pick something reasonable. */
26915 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26916 *dynamic_check = 128;
26917 return loop_1_byte;
26919 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26920 zero_memset, have_as, dynamic_check, noalign, true);
26921 gcc_assert (*dynamic_check == -1);
26922 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26923 *dynamic_check = max;
26924 else
26925 gcc_assert (alg != libcall);
26926 return alg;
26928 return (alg_usable_p (algs->unknown_size, memset, have_as)
26929 ? algs->unknown_size : libcall);
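/* Worked example (a sketch; the real tables come from the -mtune cost
   structure): given stringop_algs {{256, loop}, {8192, rep_prefix_8_byte},
   {-1, libcall}} and a known EXPECTED_SIZE of 1000, the scan above stops
   at the first entry whose max covers the size and returns
   rep_prefix_8_byte; for a much larger expected size the table yields
   libcall and the caller falls back to the library routine.  */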
26932 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26933 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26934 static int
26935 decide_alignment (int align,
26936 enum stringop_alg alg,
26937 int expected_size,
26938 machine_mode move_mode)
26940 int desired_align = 0;
26942 gcc_assert (alg != no_stringop);
26944 if (alg == libcall)
26945 return 0;
26946 if (move_mode == VOIDmode)
26947 return 0;
26949 desired_align = GET_MODE_SIZE (move_mode);
26950 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
26951 copying whole cachelines at once. */
26952 if (TARGET_PENTIUMPRO
26953 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26954 desired_align = 8;
26956 if (optimize_size)
26957 desired_align = 1;
26958 if (desired_align < align)
26959 desired_align = align;
26960 if (expected_size != -1 && expected_size < 4)
26961 desired_align = align;
26963 return desired_align;
26967 /* Helper function for memset. For a QImode value 0xXY produce
26968 0xXYXYXYXY of the width specified by MODE. This is essentially
26969 a multiplication by 0x01010101, but we can do slightly better than
26970 synth_mult by unwinding the sequence by hand on CPUs with
26971 a slow multiply. */
26972 static rtx
26973 promote_duplicated_reg (machine_mode mode, rtx val)
26975 machine_mode valmode = GET_MODE (val);
26976 rtx tmp;
26977 int nops = mode == DImode ? 3 : 2;
26979 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26980 if (val == const0_rtx)
26981 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26982 if (CONST_INT_P (val))
26984 HOST_WIDE_INT v = INTVAL (val) & 255;
26986 v |= v << 8;
26987 v |= v << 16;
26988 if (mode == DImode)
26989 v |= (v << 16) << 16;
26990 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
26993 if (valmode == VOIDmode)
26994 valmode = QImode;
26995 if (valmode != QImode)
26996 val = gen_lowpart (QImode, val);
26997 if (mode == QImode)
26998 return val;
26999 if (!TARGET_PARTIAL_REG_STALL)
27000 nops--;
27001 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27002 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27003 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27004 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27006 rtx reg = convert_modes (mode, QImode, val, true);
27007 tmp = promote_duplicated_reg (mode, const1_rtx);
27008 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27009 OPTAB_DIRECT);
27011 else
27013 rtx reg = convert_modes (mode, QImode, val, true);
27015 if (!TARGET_PARTIAL_REG_STALL)
27016 if (mode == SImode)
27017 emit_insn (gen_insvsi_1 (reg, reg));
27018 else
27019 emit_insn (gen_insvdi_1 (reg, reg));
27020 else
27022 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27023 NULL, 1, OPTAB_DIRECT);
27024 reg =
27025 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27027 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27028 NULL, 1, OPTAB_DIRECT);
27029 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27030 if (mode == SImode)
27031 return reg;
27032 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27033 NULL, 1, OPTAB_DIRECT);
27034 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27035 return reg;
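/* For example, promoting the constant byte 0x41 to SImode simply
   produces the immediate 0x41414141.  A non-constant byte B is widened
   and then replicated; a sketch of the shift/or path above:

       reg  = zero_extend (B);
       reg |= reg << 8;
       reg |= reg << 16;        // and reg |= reg << 32 for DImode

   or, when the CPU multiplies quickly enough, as B * 0x01010101.  */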
27039 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
27040 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
27041 alignment from ALIGN to DESIRED_ALIGN. */
27042 static rtx
27043 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27044 int align)
27046 rtx promoted_val;
27048 if (TARGET_64BIT
27049 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27050 promoted_val = promote_duplicated_reg (DImode, val);
27051 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27052 promoted_val = promote_duplicated_reg (SImode, val);
27053 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27054 promoted_val = promote_duplicated_reg (HImode, val);
27055 else
27056 promoted_val = val;
27058 return promoted_val;
27061 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27062 operations when profitable. The code depends upon architecture, block size
27063 and alignment, but always has one of the following overall structures:
27065 Aligned move sequence:
27067 1) Prologue guard: Conditional that jumps up to epilogues for small
27068 blocks that can be handled by epilogue alone. This is faster
27069 but also needed for correctness, since the prologue assumes the block
27070 is larger than the desired alignment.
27072 Optional dynamic check for size and libcall for large
27073 blocks is emitted here too, with -minline-stringops-dynamically.
27075 2) Prologue: copy first few bytes in order to get destination
27076 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27077 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27078 copied. We emit either a jump tree on power of two sized
27079 blocks, or a byte loop.
27081 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27082 with specified algorithm.
27084 4) Epilogue: code copying tail of the block that is too small to be
27085 handled by main body (or up to size guarded by prologue guard).
27087 Misaligned move sequence:
27089 1) misaligned move prologue/epilogue containing:
27090 a) Prologue handling small memory blocks and jumping to done_label
27091 (skipped if blocks are known to be large enough)
27092 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27093 needed by single possibly misaligned move
27094 (skipped if alignment is not needed)
27095 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27097 2) Zero size guard dispatching to done_label, if needed
27099 3) Dispatch to a library call, if needed.
27101 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27102 with specified algorithm. */
27103 bool
27104 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27105 rtx align_exp, rtx expected_align_exp,
27106 rtx expected_size_exp, rtx min_size_exp,
27107 rtx max_size_exp, rtx probable_max_size_exp,
27108 bool issetmem)
27110 rtx destreg;
27111 rtx srcreg = NULL;
27112 rtx_code_label *label = NULL;
27113 rtx tmp;
27114 rtx_code_label *jump_around_label = NULL;
27115 HOST_WIDE_INT align = 1;
27116 unsigned HOST_WIDE_INT count = 0;
27117 HOST_WIDE_INT expected_size = -1;
27118 int size_needed = 0, epilogue_size_needed;
27119 int desired_align = 0, align_bytes = 0;
27120 enum stringop_alg alg;
27121 rtx promoted_val = NULL;
27122 rtx vec_promoted_val = NULL;
27123 bool force_loopy_epilogue = false;
27124 int dynamic_check;
27125 bool need_zero_guard = false;
27126 bool noalign;
27127 machine_mode move_mode = VOIDmode;
27128 machine_mode wider_mode;
27129 int unroll_factor = 1;
27130 /* TODO: Once value ranges are available, fill in proper data. */
27131 unsigned HOST_WIDE_INT min_size = 0;
27132 unsigned HOST_WIDE_INT max_size = -1;
27133 unsigned HOST_WIDE_INT probable_max_size = -1;
27134 bool misaligned_prologue_used = false;
27135 bool have_as;
27137 if (CONST_INT_P (align_exp))
27138 align = INTVAL (align_exp);
27139 /* i386 can do misaligned access at reasonably increased cost. */
27140 if (CONST_INT_P (expected_align_exp)
27141 && INTVAL (expected_align_exp) > align)
27142 align = INTVAL (expected_align_exp);
27143 /* ALIGN is the minimum of destination and source alignment, but we care here
27144 just about destination alignment. */
27145 else if (!issetmem
27146 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27147 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27149 if (CONST_INT_P (count_exp))
27151 min_size = max_size = probable_max_size = count = expected_size
27152 = INTVAL (count_exp);
27153 /* When COUNT is 0, there is nothing to do. */
27154 if (!count)
27155 return true;
27157 else
27159 if (min_size_exp)
27160 min_size = INTVAL (min_size_exp);
27161 if (max_size_exp)
27162 max_size = INTVAL (max_size_exp);
27163 if (probable_max_size_exp)
27164 probable_max_size = INTVAL (probable_max_size_exp);
27165 if (CONST_INT_P (expected_size_exp))
27166 expected_size = INTVAL (expected_size_exp);
27169 /* Make sure we don't need to care about overflow later on. */
27170 if (count > (HOST_WIDE_INT_1U << 30))
27171 return false;
27173 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27174 if (!issetmem)
27175 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27177 /* Step 0: Decide on preferred algorithm, desired alignment and
27178 size of chunks to be copied by main loop. */
27179 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27180 issetmem,
27181 issetmem && val_exp == const0_rtx, have_as,
27182 &dynamic_check, &noalign, false);
27183 if (alg == libcall)
27184 return false;
27185 gcc_assert (alg != no_stringop);
27187 /* For now the vector version of memset is generated only for memory zeroing, as
27188 creating the promoted vector value is very cheap in this case. */
27189 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27190 alg = unrolled_loop;
27192 if (!count)
27193 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27194 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27195 if (!issetmem)
27196 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27198 unroll_factor = 1;
27199 move_mode = word_mode;
27200 switch (alg)
27202 case libcall:
27203 case no_stringop:
27204 case last_alg:
27205 gcc_unreachable ();
27206 case loop_1_byte:
27207 need_zero_guard = true;
27208 move_mode = QImode;
27209 break;
27210 case loop:
27211 need_zero_guard = true;
27212 break;
27213 case unrolled_loop:
27214 need_zero_guard = true;
27215 unroll_factor = (TARGET_64BIT ? 4 : 2);
27216 break;
27217 case vector_loop:
27218 need_zero_guard = true;
27219 unroll_factor = 4;
27220 /* Find the widest supported mode. */
27221 move_mode = word_mode;
27222 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27223 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27224 move_mode = wider_mode;
27226 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27227 move_mode = TImode;
27229 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27230 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27231 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27233 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27234 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27235 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27236 move_mode = word_mode;
27238 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27239 break;
27240 case rep_prefix_8_byte:
27241 move_mode = DImode;
27242 break;
27243 case rep_prefix_4_byte:
27244 move_mode = SImode;
27245 break;
27246 case rep_prefix_1_byte:
27247 move_mode = QImode;
27248 break;
27250 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27251 epilogue_size_needed = size_needed;
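/* As a worked example of the values just computed (assuming a 64-bit
   target): the unrolled_loop algorithm uses word_mode (8 bytes) with an
   unroll factor of 4, so each main-loop iteration handles
   size_needed == 32 bytes, while rep_prefix_1_byte uses QImode with no
   unrolling, giving size_needed == 1.  */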
27253 /* If we are going to call any library calls conditionally, make sure any
27254 pending stack adjustments happen before the first conditional branch,
27255 otherwise they will be emitted only before the library call and won't
27256 happen on the other branches. */
27257 if (dynamic_check != -1)
27258 do_pending_stack_adjust ();
27260 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27261 if (!TARGET_ALIGN_STRINGOPS || noalign)
27262 align = desired_align;
27264 /* Step 1: Prologue guard. */
27266 /* Alignment code needs count to be in register. */
27267 if (CONST_INT_P (count_exp) && desired_align > align)
27269 if (INTVAL (count_exp) > desired_align
27270 && INTVAL (count_exp) > size_needed)
27272 align_bytes
27273 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27274 if (align_bytes <= 0)
27275 align_bytes = 0;
27276 else
27277 align_bytes = desired_align - align_bytes;
27279 if (align_bytes == 0)
27280 count_exp = force_reg (counter_mode (count_exp), count_exp);
27282 gcc_assert (desired_align >= 1 && align >= 1);
27284 /* Misaligned move sequences handle both prologue and epilogue at once.
27285 Default code generation results in smaller code for large alignments
27286 and also avoids redundant work when sizes are known precisely. */
27287 misaligned_prologue_used
27288 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27289 && MAX (desired_align, epilogue_size_needed) <= 32
27290 && desired_align <= epilogue_size_needed
27291 && ((desired_align > align && !align_bytes)
27292 || (!count && epilogue_size_needed > 1)));
27294 /* Do the cheap promotion to allow better CSE across the
27295 main loop and epilogue (i.e. one load of the big constant in
27296 front of all the code).
27297 For now the misaligned move sequences do not have a fast path
27298 without broadcasting. */
27299 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27301 if (alg == vector_loop)
27303 gcc_assert (val_exp == const0_rtx);
27304 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27305 promoted_val = promote_duplicated_reg_to_size (val_exp,
27306 GET_MODE_SIZE (word_mode),
27307 desired_align, align);
27309 else
27311 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27312 desired_align, align);
27315 /* Misaligned move sequences handle both prologues and epilogues at once.
27316 Default code generation results in smaller code for large alignments and
27317 also avoids redundant work when sizes are known precisely. */
27318 if (misaligned_prologue_used)
27320 /* The misaligned move prologue handles small blocks by itself. */
27321 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27322 (dst, src, &destreg, &srcreg,
27323 move_mode, promoted_val, vec_promoted_val,
27324 &count_exp,
27325 &jump_around_label,
27326 desired_align < align
27327 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27328 desired_align, align, &min_size, dynamic_check, issetmem);
27329 if (!issetmem)
27330 src = change_address (src, BLKmode, srcreg);
27331 dst = change_address (dst, BLKmode, destreg);
27332 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27333 epilogue_size_needed = 0;
27334 if (need_zero_guard
27335 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27337 /* It is possible that we copied enough so the main loop will not
27338 execute. */
27339 gcc_assert (size_needed > 1);
27340 if (jump_around_label == NULL_RTX)
27341 jump_around_label = gen_label_rtx ();
27342 emit_cmp_and_jump_insns (count_exp,
27343 GEN_INT (size_needed),
27344 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27345 if (expected_size == -1
27346 || expected_size < (desired_align - align) / 2 + size_needed)
27347 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27348 else
27349 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27352 /* Ensure that alignment prologue won't copy past end of block. */
27353 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27355 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27356 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27357 Make sure it is a power of 2. */
27358 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
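/* The shift above rounds EPILOGUE_SIZE_NEEDED up to the next power of two
   above it.  For example, if size_needed is 32 the epilogue may have to
   handle up to 31 trailing bytes; floor_log2 (31) == 4, so
   epilogue_size_needed becomes 1 << 5 == 32.  */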
27360 /* To improve performance of small blocks, we jump around the VAL
27361 promoting code. This means that if the promoted VAL is not constant,
27362 we might not use it in the epilogue and have to fall back to the byte
27363 loop variant. */
27364 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27365 force_loopy_epilogue = true;
27366 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27367 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27369 /* If main algorithm works on QImode, no epilogue is needed.
27370 For small sizes just don't align anything. */
27371 if (size_needed == 1)
27372 desired_align = align;
27373 else
27374 goto epilogue;
27376 else if (!count
27377 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27379 label = gen_label_rtx ();
27380 emit_cmp_and_jump_insns (count_exp,
27381 GEN_INT (epilogue_size_needed),
27382 LTU, 0, counter_mode (count_exp), 1, label);
27383 if (expected_size == -1 || expected_size < epilogue_size_needed)
27384 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27385 else
27386 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27390 /* Emit code to decide at runtime whether a library call or the inline
27391 expansion should be used. */
27392 if (dynamic_check != -1)
27394 if (!issetmem && CONST_INT_P (count_exp))
27396 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27398 emit_block_copy_via_libcall (dst, src, count_exp);
27399 count_exp = const0_rtx;
27400 goto epilogue;
27403 else
27405 rtx_code_label *hot_label = gen_label_rtx ();
27406 if (jump_around_label == NULL_RTX)
27407 jump_around_label = gen_label_rtx ();
27408 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27409 LEU, 0, counter_mode (count_exp),
27410 1, hot_label);
27411 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27412 if (issetmem)
27413 set_storage_via_libcall (dst, count_exp, val_exp);
27414 else
27415 emit_block_copy_via_libcall (dst, src, count_exp);
27416 emit_jump (jump_around_label);
27417 emit_label (hot_label);
27421 /* Step 2: Alignment prologue. */
27422 /* Do the expensive promotion once we branched off the small blocks. */
27423 if (issetmem && !promoted_val)
27424 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27425 desired_align, align);
27427 if (desired_align > align && !misaligned_prologue_used)
27429 if (align_bytes == 0)
27431 /* Except for the first move in the prologue, we no longer know
27432 the constant offset in the aliasing info. It does not seem worth
27433 the pain to maintain it for the first move, so throw away
27434 the info early. */
27435 dst = change_address (dst, BLKmode, destreg);
27436 if (!issetmem)
27437 src = change_address (src, BLKmode, srcreg);
27438 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27439 promoted_val, vec_promoted_val,
27440 count_exp, align, desired_align,
27441 issetmem);
27442 /* At most desired_align - align bytes are copied. */
27443 if (min_size < (unsigned)(desired_align - align))
27444 min_size = 0;
27445 else
27446 min_size -= desired_align - align;
27448 else
27450 /* If we know how many bytes need to be stored before dst is
27451 sufficiently aligned, maintain aliasing info accurately. */
27452 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27453 srcreg,
27454 promoted_val,
27455 vec_promoted_val,
27456 desired_align,
27457 align_bytes,
27458 issetmem);
27460 count_exp = plus_constant (counter_mode (count_exp),
27461 count_exp, -align_bytes);
27462 count -= align_bytes;
27463 min_size -= align_bytes;
27464 max_size -= align_bytes;
27466 if (need_zero_guard
27467 && min_size < (unsigned HOST_WIDE_INT) size_needed
27468 && (count < (unsigned HOST_WIDE_INT) size_needed
27469 || (align_bytes == 0
27470 && count < ((unsigned HOST_WIDE_INT) size_needed
27471 + desired_align - align))))
27473 /* It is possible that we copied enough so the main loop will not
27474 execute. */
27475 gcc_assert (size_needed > 1);
27476 if (label == NULL_RTX)
27477 label = gen_label_rtx ();
27478 emit_cmp_and_jump_insns (count_exp,
27479 GEN_INT (size_needed),
27480 LTU, 0, counter_mode (count_exp), 1, label);
27481 if (expected_size == -1
27482 || expected_size < (desired_align - align) / 2 + size_needed)
27483 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27484 else
27485 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27488 if (label && size_needed == 1)
27490 emit_label (label);
27491 LABEL_NUSES (label) = 1;
27492 label = NULL;
27493 epilogue_size_needed = 1;
27494 if (issetmem)
27495 promoted_val = val_exp;
27497 else if (label == NULL_RTX && !misaligned_prologue_used)
27498 epilogue_size_needed = size_needed;
27500 /* Step 3: Main loop. */
27502 switch (alg)
27504 case libcall:
27505 case no_stringop:
27506 case last_alg:
27507 gcc_unreachable ();
27508 case loop_1_byte:
27509 case loop:
27510 case unrolled_loop:
27511 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27512 count_exp, move_mode, unroll_factor,
27513 expected_size, issetmem);
27514 break;
27515 case vector_loop:
27516 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27517 vec_promoted_val, count_exp, move_mode,
27518 unroll_factor, expected_size, issetmem);
27519 break;
27520 case rep_prefix_8_byte:
27521 case rep_prefix_4_byte:
27522 case rep_prefix_1_byte:
27523 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27524 val_exp, count_exp, move_mode, issetmem);
27525 break;
27527 /* Adjust properly the offset of src and dest memory for aliasing. */
27528 if (CONST_INT_P (count_exp))
27530 if (!issetmem)
27531 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27532 (count / size_needed) * size_needed);
27533 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27534 (count / size_needed) * size_needed);
27536 else
27538 if (!issetmem)
27539 src = change_address (src, BLKmode, srcreg);
27540 dst = change_address (dst, BLKmode, destreg);
27543 /* Step 4: Epilogue to copy the remaining bytes. */
27544 epilogue:
27545 if (label)
27547 /* When the main loop is done, COUNT_EXP might hold original count,
27548 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27549 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27550 bytes. Compensate if needed. */
27552 if (size_needed < epilogue_size_needed)
27554 tmp =
27555 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27556 GEN_INT (size_needed - 1), count_exp, 1,
27557 OPTAB_DIRECT);
27558 if (tmp != count_exp)
27559 emit_move_insn (count_exp, tmp);
27561 emit_label (label);
27562 LABEL_NUSES (label) = 1;
27565 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27567 if (force_loopy_epilogue)
27568 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27569 epilogue_size_needed);
27570 else
27572 if (issetmem)
27573 expand_setmem_epilogue (dst, destreg, promoted_val,
27574 vec_promoted_val, count_exp,
27575 epilogue_size_needed);
27576 else
27577 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27578 epilogue_size_needed);
27581 if (jump_around_label)
27582 emit_label (jump_around_label);
27583 return true;
27587 /* Expand the appropriate insns for doing strlen if not just doing
27588 repnz; scasb
27590 out = result, initialized with the start address
27591 align_rtx = alignment of the address.
27592 scratch = scratch register, initialized with the start address when
27593 not aligned, otherwise undefined
27595 This is just the body. It needs the initializations mentioned above and
27596 some address computation at the end. These things are done in i386.md. */
27598 static void
27599 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27601 int align;
27602 rtx tmp;
27603 rtx_code_label *align_2_label = NULL;
27604 rtx_code_label *align_3_label = NULL;
27605 rtx_code_label *align_4_label = gen_label_rtx ();
27606 rtx_code_label *end_0_label = gen_label_rtx ();
27607 rtx mem;
27608 rtx tmpreg = gen_reg_rtx (SImode);
27609 rtx scratch = gen_reg_rtx (SImode);
27610 rtx cmp;
27612 align = 0;
27613 if (CONST_INT_P (align_rtx))
27614 align = INTVAL (align_rtx);
27616 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27618 /* Is there a known alignment and is it less than 4? */
27619 if (align < 4)
27621 rtx scratch1 = gen_reg_rtx (Pmode);
27622 emit_move_insn (scratch1, out);
27623 /* Is there a known alignment and is it not 2? */
27624 if (align != 2)
27626 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27627 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27629 /* Leave just the 3 lower bits. */
27630 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27631 NULL_RTX, 0, OPTAB_WIDEN);
27633 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27634 Pmode, 1, align_4_label);
27635 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27636 Pmode, 1, align_2_label);
27637 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27638 Pmode, 1, align_3_label);
27640 else
27642 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27643 check whether it is aligned to a 4-byte boundary. */
27645 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27646 NULL_RTX, 0, OPTAB_WIDEN);
27648 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27649 Pmode, 1, align_4_label);
27652 mem = change_address (src, QImode, out);
27654 /* Now compare the bytes. */
27656 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27657 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27658 QImode, 1, end_0_label);
27660 /* Increment the address. */
27661 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27663 /* Not needed with an alignment of 2 */
27664 if (align != 2)
27666 emit_label (align_2_label);
27668 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27669 end_0_label);
27671 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27673 emit_label (align_3_label);
27676 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27677 end_0_label);
27679 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27682 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27683 align this loop; it only makes the program larger and does not help to
27684 speed it up. */
27685 emit_label (align_4_label);
27687 mem = change_address (src, SImode, out);
27688 emit_move_insn (scratch, mem);
27689 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27691 /* This formula yields a nonzero result iff one of the bytes is zero.
27692 This saves three branches inside the loop and many cycles. */
27694 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27695 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27696 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27697 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27698 gen_int_mode (0x80808080, SImode)));
27699 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27700 align_4_label);
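/* The add/not/and sequence emitted above implements the classic zero-byte
   test; a scalar sketch of the same computation, assuming 32-bit unsigned
   arithmetic, would be

     tmp = v - 0x01010101;
     tmp &= ~v;
     tmp &= 0x80808080;    /- nonzero iff some byte of v is zero -/

   e.g. v == 0x41420043 yields tmp == 0x00008000, flagging the zero byte.  */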
27702 if (TARGET_CMOVE)
27704 rtx reg = gen_reg_rtx (SImode);
27705 rtx reg2 = gen_reg_rtx (Pmode);
27706 emit_move_insn (reg, tmpreg);
27707 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27709 /* If zero is not in the first two bytes, move two bytes forward. */
27710 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27711 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27712 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27713 emit_insn (gen_rtx_SET (tmpreg,
27714 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27715 reg,
27716 tmpreg)));
27717 /* Emit lea manually to avoid clobbering of flags. */
27718 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27720 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27721 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27722 emit_insn (gen_rtx_SET (out,
27723 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27724 reg2,
27725 out)));
27727 else
27729 rtx_code_label *end_2_label = gen_label_rtx ();
27730 /* Is zero in the first two bytes? */
27732 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27733 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27734 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27735 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27736 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27737 pc_rtx);
27738 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27739 JUMP_LABEL (tmp) = end_2_label;
27741 /* Not in the first two. Move two bytes forward. */
27742 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27743 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27745 emit_label (end_2_label);
27749 /* Avoid branch in fixing the byte. */
27750 tmpreg = gen_lowpart (QImode, tmpreg);
27751 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27752 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27753 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27754 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27756 emit_label (end_0_label);
27759 /* Expand strlen. */
27761 bool
27762 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27764 rtx addr, scratch1, scratch2, scratch3, scratch4;
27766 /* The generic case of the strlen expander is long. Avoid
27767 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
27769 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27770 && !TARGET_INLINE_ALL_STRINGOPS
27771 && !optimize_insn_for_size_p ()
27772 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27773 return false;
27775 addr = force_reg (Pmode, XEXP (src, 0));
27776 scratch1 = gen_reg_rtx (Pmode);
27778 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27779 && !optimize_insn_for_size_p ())
27781 /* Well it seems that some optimizer does not combine a call like
27782 foo(strlen(bar), strlen(bar));
27783 when the move and the subtraction are done here. It does calculate
27784 the length just once when these instructions are done inside of
27785 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
27786 often used and I use one fewer register for the lifetime of
27787 output_strlen_unroll() this is better. */
27789 emit_move_insn (out, addr);
27791 ix86_expand_strlensi_unroll_1 (out, src, align);
27793 /* strlensi_unroll_1 returns the address of the zero at the end of
27794 the string, like memchr(), so compute the length by subtracting
27795 the start address. */
27796 emit_insn (ix86_gen_sub3 (out, out, addr));
27798 else
27800 rtx unspec;
27802 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27803 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27804 return false;
27805 /* Can't use this for non-default address spaces. */
27806 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27807 return false;
27809 scratch2 = gen_reg_rtx (Pmode);
27810 scratch3 = gen_reg_rtx (Pmode);
27811 scratch4 = force_reg (Pmode, constm1_rtx);
27813 emit_move_insn (scratch3, addr);
27814 eoschar = force_reg (QImode, eoschar);
27816 src = replace_equiv_address_nv (src, scratch3);
27818 /* If .md starts supporting :P, this can be done in .md. */
27819 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27820 scratch4), UNSPEC_SCAS);
27821 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27822 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27823 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
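/* A sketch of what this rep-prefixed path computes, assuming the usual
   repnz scasb semantics: the counter starts at -1 and is decremented once
   per scanned byte including the terminator, so after the scan
   scratch1 == -(len + 2); the complement and the add of -1 above then
   give out == ~scratch1 - 1 == len.  */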
27825 return true;
27828 /* For a given symbol (function), construct code to compute the address of its
27829 PLT entry in the large x86-64 PIC model. */
27830 static rtx
27831 construct_plt_address (rtx symbol)
27833 rtx tmp, unspec;
27835 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27836 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27837 gcc_assert (Pmode == DImode);
27839 tmp = gen_reg_rtx (Pmode);
27840 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27842 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27843 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27844 return tmp;
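/* In RTL terms the sequence built above is roughly

     (set (reg tmp) (const (unspec [symbol] UNSPEC_PLTOFF)))
     (set (reg tmp) (plus (reg tmp) (reg pic_offset_table)))

   i.e. the 64-bit offset of the symbol's PLT entry from the GOT base is
   loaded as an immediate and added to the PIC register.  */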
27848 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27849 rtx callarg2,
27850 rtx pop, bool sibcall)
27852 rtx vec[3];
27853 rtx use = NULL, call;
27854 unsigned int vec_len = 0;
27855 tree fndecl;
27857 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27859 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27860 if (fndecl
27861 && (lookup_attribute ("interrupt",
27862 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27863 error ("interrupt service routine can't be called directly");
27865 else
27866 fndecl = NULL_TREE;
27868 if (pop == const0_rtx)
27869 pop = NULL;
27870 gcc_assert (!TARGET_64BIT || !pop);
27872 if (TARGET_MACHO && !TARGET_64BIT)
27874 #if TARGET_MACHO
27875 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27876 fnaddr = machopic_indirect_call_target (fnaddr);
27877 #endif
27879 else
27881 /* Static functions and indirect calls don't need the PIC register. Also,
27882 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
27883 making it an indirect call. */
27884 rtx addr = XEXP (fnaddr, 0);
27885 if (flag_pic
27886 && GET_CODE (addr) == SYMBOL_REF
27887 && !SYMBOL_REF_LOCAL_P (addr))
27889 if (flag_plt
27890 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27891 || !lookup_attribute ("noplt",
27892 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27894 if (!TARGET_64BIT
27895 || (ix86_cmodel == CM_LARGE_PIC
27896 && DEFAULT_ABI != MS_ABI))
27898 use_reg (&use, gen_rtx_REG (Pmode,
27899 REAL_PIC_OFFSET_TABLE_REGNUM));
27900 if (ix86_use_pseudo_pic_reg ())
27901 emit_move_insn (gen_rtx_REG (Pmode,
27902 REAL_PIC_OFFSET_TABLE_REGNUM),
27903 pic_offset_table_rtx);
27906 else if (!TARGET_PECOFF && !TARGET_MACHO)
27908 if (TARGET_64BIT)
27910 fnaddr = gen_rtx_UNSPEC (Pmode,
27911 gen_rtvec (1, addr),
27912 UNSPEC_GOTPCREL);
27913 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27915 else
27917 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27918 UNSPEC_GOT);
27919 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27920 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27921 fnaddr);
27923 fnaddr = gen_const_mem (Pmode, fnaddr);
27924 /* Pmode may not be the same as word_mode for x32, which
27925 doesn't support indirect branch via 32-bit memory slot.
27926 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27927 indirect branch via x32 GOT slot is OK. */
27928 if (GET_MODE (fnaddr) != word_mode)
27929 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27930 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27935 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27936 parameters passed in vector registers. */
27937 if (TARGET_64BIT
27938 && (INTVAL (callarg2) > 0
27939 || (INTVAL (callarg2) == 0
27940 && (TARGET_SSE || !flag_skip_rax_setup))))
27942 rtx al = gen_rtx_REG (QImode, AX_REG);
27943 emit_move_insn (al, callarg2);
27944 use_reg (&use, al);
27947 if (ix86_cmodel == CM_LARGE_PIC
27948 && !TARGET_PECOFF
27949 && MEM_P (fnaddr)
27950 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27951 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27952 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27953 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27954 branch via x32 GOT slot is OK. */
27955 else if (!(TARGET_X32
27956 && MEM_P (fnaddr)
27957 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27958 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27959 && (sibcall
27960 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27961 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27963 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27964 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27967 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27969 if (retval)
27971 /* We should add bounds as a destination register in case
27972 a pointer with bounds may be returned. */
27973 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27975 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27976 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27977 if (GET_CODE (retval) == PARALLEL)
27979 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27980 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27981 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27982 retval = chkp_join_splitted_slot (retval, par);
27984 else
27986 retval = gen_rtx_PARALLEL (VOIDmode,
27987 gen_rtvec (3, retval, b0, b1));
27988 chkp_put_regs_to_expr_list (retval);
27992 call = gen_rtx_SET (retval, call);
27994 vec[vec_len++] = call;
27996 if (pop)
27998 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
27999 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28000 vec[vec_len++] = pop;
28003 if (cfun->machine->no_caller_saved_registers
28004 && (!fndecl
28005 || (!TREE_THIS_VOLATILE (fndecl)
28006 && !lookup_attribute ("no_caller_saved_registers",
28007 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28009 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28010 bool is_64bit_ms_abi = (TARGET_64BIT
28011 && ix86_function_abi (fndecl) == MS_ABI);
28012 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28014 /* If there are no caller-saved registers, add all registers
28015 that are clobbered by the call which returns. */
28016 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28017 if (!fixed_regs[i]
28018 && (ix86_call_used_regs[i] == 1
28019 || (ix86_call_used_regs[i] & c_mask))
28020 && !STACK_REGNO_P (i)
28021 && !MMX_REGNO_P (i))
28022 clobber_reg (&use,
28023 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28025 else if (TARGET_64BIT_MS_ABI
28026 && (!callarg2 || INTVAL (callarg2) != -2))
28028 unsigned i;
28030 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28032 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28033 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28035 clobber_reg (&use, gen_rtx_REG (mode, regno));
28038 /* Set here, but it may get cleared later. */
28039 if (TARGET_CALL_MS2SYSV_XLOGUES)
28041 if (!TARGET_SSE)
28044 /* Don't break hot-patched functions. */
28045 else if (ix86_function_ms_hook_prologue (current_function_decl))
28048 /* TODO: Cases not yet examined. */
28049 else if (flag_split_stack)
28050 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28052 else
28054 gcc_assert (!reload_completed);
28055 cfun->machine->call_ms2sysv = true;
28060 if (vec_len > 1)
28061 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28062 call = emit_call_insn (call);
28063 if (use)
28064 CALL_INSN_FUNCTION_USAGE (call) = use;
28066 return call;
28069 /* Return true if the function being called was marked with attribute
28070 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28071 to handle the non-PIC case in the backend because there is no easy
28072 interface for the front-end to force non-PLT calls to use the GOT.
28073 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28074 to call the function marked "noplt" indirectly. */
28076 static bool
28077 ix86_nopic_noplt_attribute_p (rtx call_op)
28079 if (flag_pic || ix86_cmodel == CM_LARGE
28080 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28081 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28082 || SYMBOL_REF_LOCAL_P (call_op))
28083 return false;
28085 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28087 if (!flag_plt
28088 || (symbol_decl != NULL_TREE
28089 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28090 return true;
28092 return false;
28095 /* Output the assembly for a call instruction. */
28097 const char *
28098 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28100 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28101 bool seh_nop_p = false;
28102 const char *xasm;
28104 if (SIBLING_CALL_P (insn))
28106 if (direct_p)
28108 if (ix86_nopic_noplt_attribute_p (call_op))
28110 if (TARGET_64BIT)
28111 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28112 else
28113 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28115 else
28116 xasm = "%!jmp\t%P0";
28118 /* SEH epilogue detection requires the indirect branch case
28119 to include REX.W. */
28120 else if (TARGET_SEH)
28121 xasm = "%!rex.W jmp\t%A0";
28122 else
28123 xasm = "%!jmp\t%A0";
28125 output_asm_insn (xasm, &call_op);
28126 return "";
28129 /* SEH unwinding can require an extra nop to be emitted in several
28130 circumstances. Determine if we have one of those. */
28131 if (TARGET_SEH)
28133 rtx_insn *i;
28135 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28137 /* If we get to another real insn, we don't need the nop. */
28138 if (INSN_P (i))
28139 break;
28141 /* If we get to the epilogue note, prevent a catch region from
28142 being adjacent to the standard epilogue sequence. If non-call
28143 exceptions are enabled, we'll have done this during epilogue emission. */
28144 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28145 && !flag_non_call_exceptions
28146 && !can_throw_internal (insn))
28148 seh_nop_p = true;
28149 break;
28153 /* If we didn't find a real insn following the call, prevent the
28154 unwinder from looking into the next function. */
28155 if (i == NULL)
28156 seh_nop_p = true;
28159 if (direct_p)
28161 if (ix86_nopic_noplt_attribute_p (call_op))
28163 if (TARGET_64BIT)
28164 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28165 else
28166 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28168 else
28169 xasm = "%!call\t%P0";
28171 else
28172 xasm = "%!call\t%A0";
28174 output_asm_insn (xasm, &call_op);
28176 if (seh_nop_p)
28177 return "nop";
28179 return "";
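/* With the templates above, a direct call to a function marked "noplt" on
   x86-64 comes out (in AT&T syntax, ignoring any prefix added by %!) as

     call *foo@GOTPCREL(%rip)

   whereas an ordinary direct call is simply "call foo" and an indirect
   call through a register is "call *%rax".  */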
28182 /* Clear stack slot assignments remembered from previous functions.
28183 This is called from INIT_EXPANDERS once before RTL is emitted for each
28184 function. */
28186 static struct machine_function *
28187 ix86_init_machine_status (void)
28189 struct machine_function *f;
28191 f = ggc_cleared_alloc<machine_function> ();
28192 f->call_abi = ix86_abi;
28194 return f;
28197 /* Return a MEM corresponding to a stack slot with mode MODE.
28198 Allocate a new slot if necessary.
28200 The RTL for a function can have several slots available: N is
28201 which slot to use. */
28204 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28206 struct stack_local_entry *s;
28208 gcc_assert (n < MAX_386_STACK_LOCALS);
28210 for (s = ix86_stack_locals; s; s = s->next)
28211 if (s->mode == mode && s->n == n)
28212 return validize_mem (copy_rtx (s->rtl));
28214 s = ggc_alloc<stack_local_entry> ();
28215 s->n = n;
28216 s->mode = mode;
28217 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28219 s->next = ix86_stack_locals;
28220 ix86_stack_locals = s;
28221 return validize_mem (copy_rtx (s->rtl));
28224 static void
28225 ix86_instantiate_decls (void)
28227 struct stack_local_entry *s;
28229 for (s = ix86_stack_locals; s; s = s->next)
28230 if (s->rtl != NULL_RTX)
28231 instantiate_decl_rtl (s->rtl);
28234 /* Return the number used for encoding REG, in the range 0..7. */
28236 static int
28237 reg_encoded_number (rtx reg)
28239 unsigned regno = REGNO (reg);
28240 switch (regno)
28242 case AX_REG:
28243 return 0;
28244 case CX_REG:
28245 return 1;
28246 case DX_REG:
28247 return 2;
28248 case BX_REG:
28249 return 3;
28250 case SP_REG:
28251 return 4;
28252 case BP_REG:
28253 return 5;
28254 case SI_REG:
28255 return 6;
28256 case DI_REG:
28257 return 7;
28258 default:
28259 break;
28261 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28262 return regno - FIRST_STACK_REG;
28263 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28264 return regno - FIRST_SSE_REG;
28265 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28266 return regno - FIRST_MMX_REG;
28267 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28268 return regno - FIRST_REX_SSE_REG;
28269 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28270 return regno - FIRST_REX_INT_REG;
28271 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28272 return regno - FIRST_MASK_REG;
28273 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28274 return regno - FIRST_BND_REG;
28275 return -1;
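/* The encodings returned above match the hardware register numbers used in
   ModR/M and SIB fields; e.g. %eax encodes as 0, %esp as 4 and %edi as 7.
   xmm8-xmm15 likewise map to 0-7, with the extra bit carried in a
   REX/VEX/EVEX prefix.  */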
28278 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28279 in its encoding if it could be relevant for ROP mitigation, otherwise
28280 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28281 used for calculating it into them. */
28283 static int
28284 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28285 int *popno0 = 0, int *popno1 = 0)
28287 if (asm_noperands (PATTERN (insn)) >= 0)
28288 return -1;
28289 int has_modrm = get_attr_modrm (insn);
28290 if (!has_modrm)
28291 return -1;
28292 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28293 rtx op0, op1;
28294 switch (cls)
28296 case MODRM_CLASS_OP02:
28297 gcc_assert (noperands >= 3);
28298 if (popno0)
28300 *popno0 = 0;
28301 *popno1 = 2;
28303 op0 = operands[0];
28304 op1 = operands[2];
28305 break;
28306 case MODRM_CLASS_OP01:
28307 gcc_assert (noperands >= 2);
28308 if (popno0)
28310 *popno0 = 0;
28311 *popno1 = 1;
28313 op0 = operands[0];
28314 op1 = operands[1];
28315 break;
28316 default:
28317 return -1;
28319 if (REG_P (op0) && REG_P (op1))
28321 int enc0 = reg_encoded_number (op0);
28322 int enc1 = reg_encoded_number (op1);
28323 return 0xc0 + (enc1 << 3) + enc0;
28325 return -1;
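/* The byte returned above follows the usual ModR/M layout:
   mod (2 bits) | reg (3 bits) | r/m (3 bits), with mod == 11 for a
   register-register form.  For example, op0 == %ecx (1) and op1 == %eax (0)
   give 0xc0 + (0 << 3) + 1 == 0xc1.  */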
28328 /* Check whether x86 address PARTS is a pc-relative address. */
28330 bool
28331 ix86_rip_relative_addr_p (struct ix86_address *parts)
28333 rtx base, index, disp;
28335 base = parts->base;
28336 index = parts->index;
28337 disp = parts->disp;
28339 if (disp && !base && !index)
28341 if (TARGET_64BIT)
28343 rtx symbol = disp;
28345 if (GET_CODE (disp) == CONST)
28346 symbol = XEXP (disp, 0);
28347 if (GET_CODE (symbol) == PLUS
28348 && CONST_INT_P (XEXP (symbol, 1)))
28349 symbol = XEXP (symbol, 0);
28351 if (GET_CODE (symbol) == LABEL_REF
28352 || (GET_CODE (symbol) == SYMBOL_REF
28353 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28354 || (GET_CODE (symbol) == UNSPEC
28355 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28356 || XINT (symbol, 1) == UNSPEC_PCREL
28357 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28358 return true;
28361 return false;
28364 /* Calculate the length of the memory address in the instruction encoding.
28365 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28366 or other prefixes. We never generate addr32 prefix for LEA insn. */
28369 memory_address_length (rtx addr, bool lea)
28371 struct ix86_address parts;
28372 rtx base, index, disp;
28373 int len;
28374 int ok;
28376 if (GET_CODE (addr) == PRE_DEC
28377 || GET_CODE (addr) == POST_INC
28378 || GET_CODE (addr) == PRE_MODIFY
28379 || GET_CODE (addr) == POST_MODIFY)
28380 return 0;
28382 ok = ix86_decompose_address (addr, &parts);
28383 gcc_assert (ok);
28385 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28387 /* If this is not LEA instruction, add the length of addr32 prefix. */
28388 if (TARGET_64BIT && !lea
28389 && (SImode_address_operand (addr, VOIDmode)
28390 || (parts.base && GET_MODE (parts.base) == SImode)
28391 || (parts.index && GET_MODE (parts.index) == SImode)))
28392 len++;
28394 base = parts.base;
28395 index = parts.index;
28396 disp = parts.disp;
28398 if (base && SUBREG_P (base))
28399 base = SUBREG_REG (base);
28400 if (index && SUBREG_P (index))
28401 index = SUBREG_REG (index);
28403 gcc_assert (base == NULL_RTX || REG_P (base));
28404 gcc_assert (index == NULL_RTX || REG_P (index));
28406 /* Rule of thumb:
28407 - esp as the base always wants an index,
28408 - ebp as the base always wants a displacement,
28409 - r12 as the base always wants an index,
28410 - r13 as the base always wants a displacement. */
28412 /* Register Indirect. */
28413 if (base && !index && !disp)
28415 /* esp (for its index) and ebp (for its displacement) need
28416 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28417 code. */
28418 if (base == arg_pointer_rtx
28419 || base == frame_pointer_rtx
28420 || REGNO (base) == SP_REG
28421 || REGNO (base) == BP_REG
28422 || REGNO (base) == R12_REG
28423 || REGNO (base) == R13_REG)
28424 len++;
28427 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28428 is not disp32, but disp32(%rip), so for disp32
28429 SIB byte is needed, unless print_operand_address
28430 optimizes it into disp32(%rip) or (%rip) is implied
28431 by UNSPEC. */
28432 else if (disp && !base && !index)
28434 len += 4;
28435 if (!ix86_rip_relative_addr_p (&parts))
28436 len++;
28438 else
28440 /* Find the length of the displacement constant. */
28441 if (disp)
28443 if (base && satisfies_constraint_K (disp))
28444 len += 1;
28445 else
28446 len += 4;
28448 /* ebp always wants a displacement. Similarly r13. */
28449 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28450 len++;
28452 /* An index requires the two-byte modrm form.... */
28453 if (index
28454 /* ...like esp (or r12), which always wants an index. */
28455 || base == arg_pointer_rtx
28456 || base == frame_pointer_rtx
28457 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28458 len++;
28461 return len;
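/* A few examples of the rules above on a 64-bit target (lengths exclude
   the opcode and the ModR/M byte itself):

     (%rax)            -> 0
     (%rsp)            -> 1   (needs a SIB byte)
     (%rbp)            -> 1   (needs a zero disp8)
     16(%rax,%rcx,4)   -> 2   (SIB byte + disp8)
     foo(%rip)         -> 4   (disp32 only)  */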
28464 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28465 is set, expect that the insn has an 8-bit immediate alternative. */
28467 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28469 int len = 0;
28470 int i;
28471 extract_insn_cached (insn);
28472 for (i = recog_data.n_operands - 1; i >= 0; --i)
28473 if (CONSTANT_P (recog_data.operand[i]))
28475 enum attr_mode mode = get_attr_mode (insn);
28477 gcc_assert (!len);
28478 if (shortform && CONST_INT_P (recog_data.operand[i]))
28480 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28481 switch (mode)
28483 case MODE_QI:
28484 len = 1;
28485 continue;
28486 case MODE_HI:
28487 ival = trunc_int_for_mode (ival, HImode);
28488 break;
28489 case MODE_SI:
28490 ival = trunc_int_for_mode (ival, SImode);
28491 break;
28492 default:
28493 break;
28495 if (IN_RANGE (ival, -128, 127))
28497 len = 1;
28498 continue;
28501 switch (mode)
28503 case MODE_QI:
28504 len = 1;
28505 break;
28506 case MODE_HI:
28507 len = 2;
28508 break;
28509 case MODE_SI:
28510 len = 4;
28511 break;
28512 /* Immediates for DImode instructions are encoded
28513 as 32-bit sign-extended values. */
28514 case MODE_DI:
28515 len = 4;
28516 break;
28517 default:
28518 fatal_insn ("unknown insn mode", insn);
28521 return len;
28524 /* Compute default value for "length_address" attribute. */
28526 ix86_attr_length_address_default (rtx_insn *insn)
28528 int i;
28530 if (get_attr_type (insn) == TYPE_LEA)
28532 rtx set = PATTERN (insn), addr;
28534 if (GET_CODE (set) == PARALLEL)
28535 set = XVECEXP (set, 0, 0);
28537 gcc_assert (GET_CODE (set) == SET);
28539 addr = SET_SRC (set);
28541 return memory_address_length (addr, true);
28544 extract_insn_cached (insn);
28545 for (i = recog_data.n_operands - 1; i >= 0; --i)
28547 rtx op = recog_data.operand[i];
28548 if (MEM_P (op))
28550 constrain_operands_cached (insn, reload_completed);
28551 if (which_alternative != -1)
28553 const char *constraints = recog_data.constraints[i];
28554 int alt = which_alternative;
28556 while (*constraints == '=' || *constraints == '+')
28557 constraints++;
28558 while (alt-- > 0)
28559 while (*constraints++ != ',')
28561 /* Skip ignored operands. */
28562 if (*constraints == 'X')
28563 continue;
28566 int len = memory_address_length (XEXP (op, 0), false);
28568 /* Account for segment prefix for non-default addr spaces. */
28569 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28570 len++;
28572 return len;
28575 return 0;
28578 /* Compute default value for "length_vex" attribute. It includes
28579 2 or 3 byte VEX prefix and 1 opcode byte. */
28582 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28583 bool has_vex_w)
28585 int i;
28587 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
28588 requires the 3-byte VEX prefix. */
28589 if (!has_0f_opcode || has_vex_w)
28590 return 3 + 1;
28592 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28593 if (!TARGET_64BIT)
28594 return 2 + 1;
28596 extract_insn_cached (insn);
28598 for (i = recog_data.n_operands - 1; i >= 0; --i)
28599 if (REG_P (recog_data.operand[i]))
28601 /* REX.W bit uses 3 byte VEX prefix. */
28602 if (GET_MODE (recog_data.operand[i]) == DImode
28603 && GENERAL_REG_P (recog_data.operand[i]))
28604 return 3 + 1;
28606 else
28608 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28609 if (MEM_P (recog_data.operand[i])
28610 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28611 return 3 + 1;
28614 return 2 + 1;
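/* In other words: an insn in the 0f opcode map that needs neither VEX.W nor
   any REX-only register bits gets the 2-byte (C5) VEX prefix plus the opcode
   byte, i.e. length 3; VEX.W, a non-0f opcode map, a DImode general register
   operand in 64-bit mode, or an extended register mentioned in a memory
   operand forces the 3-byte (C4) form, i.e. length 4.  */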
28618 static bool
28619 ix86_class_likely_spilled_p (reg_class_t);
28621 /* Return true if the lhs of INSN is a HW function argument register, and set
28622 IS_SPILLED to true if it is a likely-spilled HW register. */
28623 static bool
28624 insn_is_function_arg (rtx insn, bool* is_spilled)
28626 rtx dst;
28628 if (!NONDEBUG_INSN_P (insn))
28629 return false;
28630 /* Call instructions are not movable; ignore them. */
28631 if (CALL_P (insn))
28632 return false;
28633 insn = PATTERN (insn);
28634 if (GET_CODE (insn) == PARALLEL)
28635 insn = XVECEXP (insn, 0, 0);
28636 if (GET_CODE (insn) != SET)
28637 return false;
28638 dst = SET_DEST (insn);
28639 if (REG_P (dst) && HARD_REGISTER_P (dst)
28640 && ix86_function_arg_regno_p (REGNO (dst)))
28642 /* Is it a likely-spilled HW register? */
28643 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28644 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28645 *is_spilled = true;
28646 return true;
28648 return false;
28651 /* Add output dependencies for a chain of adjacent function arguments, but only
28652 if there is a move to a likely-spilled HW register. Return the first argument
28653 if at least one dependence was added, or NULL otherwise. */
28654 static rtx_insn *
28655 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28657 rtx_insn *insn;
28658 rtx_insn *last = call;
28659 rtx_insn *first_arg = NULL;
28660 bool is_spilled = false;
28662 head = PREV_INSN (head);
28664 /* Find the argument-passing instruction nearest to the call. */
28665 while (true)
28667 last = PREV_INSN (last);
28668 if (last == head)
28669 return NULL;
28670 if (!NONDEBUG_INSN_P (last))
28671 continue;
28672 if (insn_is_function_arg (last, &is_spilled))
28673 break;
28674 return NULL;
28677 first_arg = last;
28678 while (true)
28680 insn = PREV_INSN (last);
28681 if (!INSN_P (insn))
28682 break;
28683 if (insn == head)
28684 break;
28685 if (!NONDEBUG_INSN_P (insn))
28687 last = insn;
28688 continue;
28690 if (insn_is_function_arg (insn, &is_spilled))
28692 /* Add an output dependence between two function arguments if the chain
28693 of output arguments contains likely-spilled HW registers. */
28694 if (is_spilled)
28695 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28696 first_arg = last = insn;
28698 else
28699 break;
28701 if (!is_spilled)
28702 return NULL;
28703 return first_arg;
28706 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its code
28707 motion. */
28708 static void
28709 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28711 rtx set;
28712 rtx tmp;
28714 /* Add anti dependencies for bounds stores. */
28715 if (INSN_P (insn)
28716 && GET_CODE (PATTERN (insn)) == PARALLEL
28717 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28718 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28720 add_dependence (first_arg, insn, REG_DEP_ANTI);
28721 return;
28724 set = single_set (insn);
28725 if (!set)
28726 return;
28727 tmp = SET_DEST (set);
28728 if (REG_P (tmp))
28730 /* Add output dependency to the first function argument. */
28731 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28732 return;
28734 /* Add anti dependency. */
28735 add_dependence (first_arg, insn, REG_DEP_ANTI);
28738 /* Avoid cross-block motion of a function argument by adding a dependency
28739 from the first non-jump instruction in BB. */
28740 static void
28741 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28743 rtx_insn *insn = BB_END (bb);
28745 while (insn)
28747 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28749 rtx set = single_set (insn);
28750 if (set)
28752 avoid_func_arg_motion (arg, insn);
28753 return;
28756 if (insn == BB_HEAD (bb))
28757 return;
28758 insn = PREV_INSN (insn);
28762 /* Hook for pre-reload schedule - avoid motion of function arguments
28763 passed in likely spilled HW registers. */
28764 static void
28765 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28767 rtx_insn *insn;
28768 rtx_insn *first_arg = NULL;
28769 if (reload_completed)
28770 return;
28771 while (head != tail && DEBUG_INSN_P (head))
28772 head = NEXT_INSN (head);
28773 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28774 if (INSN_P (insn) && CALL_P (insn))
28776 first_arg = add_parameter_dependencies (insn, head);
28777 if (first_arg)
28779 /* Add a dependee for the first argument to predecessors, but only
28780 if the region contains more than one block. */
28781 basic_block bb = BLOCK_FOR_INSN (insn);
28782 int rgn = CONTAINING_RGN (bb->index);
28783 int nr_blks = RGN_NR_BLOCKS (rgn);
28784 /* Skip trivial regions and region head blocks that can have
28785 predecessors outside of region. */
28786 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28788 edge e;
28789 edge_iterator ei;
28791 /* Regions are SCCs with the exception of selective
28792 scheduling with pipelining of outer blocks enabled.
28793 So also check that immediate predecessors of a non-head
28794 block are in the same region. */
28795 FOR_EACH_EDGE (e, ei, bb->preds)
28797 /* Avoid creating loop-carried dependencies by
28798 using the topological ordering of the region. */
28799 if (rgn == CONTAINING_RGN (e->src->index)
28800 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28801 add_dependee_for_func_arg (first_arg, e->src);
28804 insn = first_arg;
28805 if (insn == head)
28806 break;
28809 else if (first_arg)
28810 avoid_func_arg_motion (first_arg, insn);
28813 /* Hook for pre-reload scheduling - set the priority of moves from likely-spilled
28814 HW registers to the maximum, to schedule them as soon as possible. These are
28815 moves from function argument registers at the top of the function entry
28816 and moves from function return value registers after a call. */
28817 static int
28818 ix86_adjust_priority (rtx_insn *insn, int priority)
28820 rtx set;
28822 if (reload_completed)
28823 return priority;
28825 if (!NONDEBUG_INSN_P (insn))
28826 return priority;
28828 set = single_set (insn);
28829 if (set)
28831 rtx tmp = SET_SRC (set);
28832 if (REG_P (tmp)
28833 && HARD_REGISTER_P (tmp)
28834 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28835 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28836 return current_sched_info->sched_max_insns_priority;
28839 return priority;
28842 /* Prepare for scheduling pass. */
28843 static void
28844 ix86_sched_init_global (FILE *, int, int)
28846 /* Install scheduling hooks for current CPU. Some of these hooks are used
28847 in time-critical parts of the scheduler, so we only set them up when
28848 they are actually used. */
28849 switch (ix86_tune)
28851 case PROCESSOR_CORE2:
28852 case PROCESSOR_NEHALEM:
28853 case PROCESSOR_SANDYBRIDGE:
28854 case PROCESSOR_HASWELL:
28855 case PROCESSOR_GENERIC:
28856 /* Do not perform multipass scheduling for pre-reload schedule
28857 to save compile time. */
28858 if (reload_completed)
28860 ix86_core2i7_init_hooks ();
28861 break;
28863 /* Fall through. */
28864 default:
28865 targetm.sched.dfa_post_advance_cycle = NULL;
28866 targetm.sched.first_cycle_multipass_init = NULL;
28867 targetm.sched.first_cycle_multipass_begin = NULL;
28868 targetm.sched.first_cycle_multipass_issue = NULL;
28869 targetm.sched.first_cycle_multipass_backtrack = NULL;
28870 targetm.sched.first_cycle_multipass_end = NULL;
28871 targetm.sched.first_cycle_multipass_fini = NULL;
28872 break;
28877 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28879 static HOST_WIDE_INT
28880 ix86_static_rtx_alignment (machine_mode mode)
28882 if (mode == DFmode)
28883 return 64;
28884 if (ALIGN_MODE_128 (mode))
28885 return MAX (128, GET_MODE_ALIGNMENT (mode));
28886 return GET_MODE_ALIGNMENT (mode);
28889 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28891 static HOST_WIDE_INT
28892 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28894 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28895 || TREE_CODE (exp) == INTEGER_CST)
28897 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28898 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28899 return MAX (mode_align, align);
28901 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28902 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28903 return BITS_PER_WORD;
28905 return align;
28908 /* Implement TARGET_EMPTY_RECORD_P. */
28910 static bool
28911 ix86_is_empty_record (const_tree type)
28913 if (!TARGET_64BIT)
28914 return false;
28915 return default_is_empty_record (type);
28918 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28920 static void
28921 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28923 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28925 if (!cum->warn_empty)
28926 return;
28928 if (!TYPE_EMPTY_P (type))
28929 return;
28931 const_tree ctx = get_ultimate_context (cum->decl);
28932 if (ctx != NULL_TREE
28933 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28934 return;
28936 /* If the actual size of the type is zero, then there is no change
28937 in how objects of this size are passed. */
28938 if (int_size_in_bytes (type) == 0)
28939 return;
28941 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
28942 "changes in -fabi-version=12 (GCC 8)", type);
28944 /* Only warn once. */
28945 cum->warn_empty = false;
28948 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28949 the data type, and ALIGN is the alignment that the object would
28950 ordinarily have. */
28952 static int
28953 iamcu_alignment (tree type, int align)
28955 machine_mode mode;
28957 if (align < 32 || TYPE_USER_ALIGN (type))
28958 return align;
28960 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28961 bytes. */
28962 mode = TYPE_MODE (strip_array_types (type));
28963 switch (GET_MODE_CLASS (mode))
28965 case MODE_INT:
28966 case MODE_COMPLEX_INT:
28967 case MODE_COMPLEX_FLOAT:
28968 case MODE_FLOAT:
28969 case MODE_DECIMAL_FLOAT:
28970 return 32;
28971 default:
28972 return align;
28976 /* Compute the alignment for a static variable.
28977 TYPE is the data type, and ALIGN is the alignment that
28978 the object would ordinarily have. The value of this function is used
28979 instead of that alignment to align the object. */
28982 ix86_data_alignment (tree type, int align, bool opt)
28984 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28985 for symbols from other compilation units or symbols that don't need
28986 to bind locally. In order to preserve some ABI compatibility with
28987 those compilers, ensure we don't decrease alignment from what we
28988 used to assume. */
28990 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
28992 /* A data structure equal to or greater than the size of a cache line
28993 (64 bytes in the Pentium 4 and other recent Intel processors, including
28994 processors based on the Intel Core microarchitecture) should be aligned
28995 so that its base address is a multiple of the cache line size. */
28997 int max_align
28998 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29000 if (max_align < BITS_PER_WORD)
29001 max_align = BITS_PER_WORD;
29003 switch (ix86_align_data_type)
29005 case ix86_align_data_type_abi: opt = false; break;
29006 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29007 case ix86_align_data_type_cacheline: break;
29010 if (TARGET_IAMCU)
29011 align = iamcu_alignment (type, align);
29013 if (opt
29014 && AGGREGATE_TYPE_P (type)
29015 && TYPE_SIZE (type)
29016 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29018 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29019 && align < max_align_compat)
29020 align = max_align_compat;
29021 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29022 && align < max_align)
29023 align = max_align;
29026 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
29027 to a 16-byte boundary. */
29028 if (TARGET_64BIT)
29030 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29031 && TYPE_SIZE (type)
29032 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29033 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29034 && align < 128)
29035 return 128;
29038 if (!opt)
29039 return align;
29041 if (TREE_CODE (type) == ARRAY_TYPE)
29043 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29044 return 64;
29045 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29046 return 128;
29048 else if (TREE_CODE (type) == COMPLEX_TYPE)
29051 if (TYPE_MODE (type) == DCmode && align < 64)
29052 return 64;
29053 if ((TYPE_MODE (type) == XCmode
29054 || TYPE_MODE (type) == TCmode) && align < 128)
29055 return 128;
29057 else if ((TREE_CODE (type) == RECORD_TYPE
29058 || TREE_CODE (type) == UNION_TYPE
29059 || TREE_CODE (type) == QUAL_UNION_TYPE)
29060 && TYPE_FIELDS (type))
29062 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29063 return 64;
29064 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29065 return 128;
29067 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29068 || TREE_CODE (type) == INTEGER_TYPE)
29070 if (TYPE_MODE (type) == DFmode && align < 64)
29071 return 64;
29072 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29073 return 128;
29076 return align;
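/* For example, when OPT is true, with the default cache-line policy and a
   64-byte prefetch block, a 100-byte static array of char ends up with
   512-bit (cache-line) alignment: it is an aggregate of at least 32 bytes,
   so it first gets the GCC 4.8 compatibility alignment of 256 bits, and
   being at least 64 bytes it is then raised to max_align == 512 bits.  */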
29079 /* Compute the alignment for a local variable or a stack slot. EXP is
29080 the data type or decl itself, MODE is the widest mode available and
29081 ALIGN is the alignment that the object would ordinarily have. The
29082 value of this macro is used instead of that alignment to align the
29083 object. */
29085 unsigned int
29086 ix86_local_alignment (tree exp, machine_mode mode,
29087 unsigned int align)
29089 tree type, decl;
29091 if (exp && DECL_P (exp))
29093 type = TREE_TYPE (exp);
29094 decl = exp;
29096 else
29098 type = exp;
29099 decl = NULL;
29102 /* Don't do dynamic stack realignment for long long objects with
29103 -mpreferred-stack-boundary=2. */
29104 if (!TARGET_64BIT
29105 && align == 64
29106 && ix86_preferred_stack_boundary < 64
29107 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29108 && (!type || !TYPE_USER_ALIGN (type))
29109 && (!decl || !DECL_USER_ALIGN (decl)))
29110 align = 32;
29112 /* If TYPE is NULL, we are allocating a stack slot for caller-save
29113 register in MODE. We will return the largest alignment of XF
29114 and DF. */
29115 if (!type)
29117 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29118 align = GET_MODE_ALIGNMENT (DFmode);
29119 return align;
29122 /* Don't increase alignment for Intel MCU psABI. */
29123 if (TARGET_IAMCU)
29124 return align;
29126 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
29127 to a 16-byte boundary. The exact wording is:
29129 An array uses the same alignment as its elements, except that a local or
29130 global array variable of length at least 16 bytes or
29131 a C99 variable-length array variable always has alignment of at least 16 bytes.
29133 This was added to allow the use of aligned SSE instructions on arrays. The
29134 rule is meant for static storage (where the compiler cannot do the analysis
29135 by itself). We follow it for automatic variables only when convenient.
29136 We fully control everything in the function being compiled, and functions
29137 from other units cannot rely on the alignment.
29139 Exclude the va_list type. It is the common case of a local array where
29140 we cannot benefit from the alignment.
29142 TODO: Probably we should optimize for size only when the variable does not escape. */
29143 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29144 && TARGET_SSE)
29146 if (AGGREGATE_TYPE_P (type)
29147 && (va_list_type_node == NULL_TREE
29148 || (TYPE_MAIN_VARIANT (type)
29149 != TYPE_MAIN_VARIANT (va_list_type_node)))
29150 && TYPE_SIZE (type)
29151 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29152 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29153 && align < 128)
29154 return 128;
29156 if (TREE_CODE (type) == ARRAY_TYPE)
29158 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29159 return 64;
29160 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29161 return 128;
29163 else if (TREE_CODE (type) == COMPLEX_TYPE)
29165 if (TYPE_MODE (type) == DCmode && align < 64)
29166 return 64;
29167 if ((TYPE_MODE (type) == XCmode
29168 || TYPE_MODE (type) == TCmode) && align < 128)
29169 return 128;
29171 else if ((TREE_CODE (type) == RECORD_TYPE
29172 || TREE_CODE (type) == UNION_TYPE
29173 || TREE_CODE (type) == QUAL_UNION_TYPE)
29174 && TYPE_FIELDS (type))
29176 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29177 return 64;
29178 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29179 return 128;
29181 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29182 || TREE_CODE (type) == INTEGER_TYPE)
29185 if (TYPE_MODE (type) == DFmode && align < 64)
29186 return 64;
29187 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29188 return 128;
29190 return align;
29193 /* Compute the minimum required alignment for dynamic stack realignment
29194 purposes for a local variable, parameter or a stack slot. EXP is
29195 the data type or decl itself, MODE is its mode and ALIGN is the
29196 alignment that the object would ordinarily have. */
29198 unsigned int
29199 ix86_minimum_alignment (tree exp, machine_mode mode,
29200 unsigned int align)
29202 tree type, decl;
29204 if (exp && DECL_P (exp))
29206 type = TREE_TYPE (exp);
29207 decl = exp;
29209 else
29211 type = exp;
29212 decl = NULL;
29215 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29216 return align;
29218 /* Don't do dynamic stack realignment for long long objects with
29219 -mpreferred-stack-boundary=2. */
29220 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29221 && (!type || !TYPE_USER_ALIGN (type))
29222 && (!decl || !DECL_USER_ALIGN (decl)))
29224 gcc_checking_assert (!TARGET_STV);
29225 return 32;
29228 return align;
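/* As an illustration of the case handled above (hypothetical example,
   assuming -m32 -mpreferred-stack-boundary=2): in

       void f (void) { long long x; g (&x); }

   the DImode local X would ordinarily ask for 64-bit alignment and thus
   force dynamic stack realignment; this function instead reports a minimum
   of 32 bits, so the 4-byte-aligned incoming stack can be kept as is.
   G here is just some external function that takes the address.  */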
29231 /* Find a location for the static chain incoming to a nested function.
29232 This is a register, unless all free registers are used by arguments. */
29234 static rtx
29235 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29237 unsigned regno;
29239 /* While this function won't be called by the middle-end when a static
29240 chain isn't needed, it's also used throughout the backend so it's
29241 easiest to keep this check centralized. */
29242 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
29243 return NULL;
29245 if (TARGET_64BIT)
29247 /* We always use R10 in 64-bit mode. */
29248 regno = R10_REG;
29250 else
29252 const_tree fntype, fndecl;
29253 unsigned int ccvt;
29255 /* By default in 32-bit mode we use ECX to pass the static chain. */
29256 regno = CX_REG;
29258 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29260 fntype = TREE_TYPE (fndecl_or_type);
29261 fndecl = fndecl_or_type;
29263 else
29265 fntype = fndecl_or_type;
29266 fndecl = NULL;
29269 ccvt = ix86_get_callcvt (fntype);
29270 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29272 /* Fastcall functions use ecx/edx for arguments, which leaves
29273 us with EAX for the static chain.
29274 Thiscall functions use ecx for arguments, which also
29275 leaves us with EAX for the static chain. */
29276 regno = AX_REG;
29278 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29280 /* Thiscall functions use ecx for arguments, which leaves
29281 us with EAX and EDX for the static chain.
29282 For ABI compatibility we use EAX. */
29283 regno = AX_REG;
29285 else if (ix86_function_regparm (fntype, fndecl) == 3)
29287 /* For regparm 3, we have no free call-clobbered registers in
29288 which to store the static chain. In order to implement this,
29289 we have the trampoline push the static chain to the stack.
29290 However, we can't push a value below the return address when
29291 we call the nested function directly, so we have to use an
29292 alternate entry point. For this we use ESI, and have the
29293 alternate entry point push ESI, so that things appear the
29294 same once we're executing the nested function. */
29295 if (incoming_p)
29297 if (fndecl == current_function_decl
29298 && !ix86_static_chain_on_stack)
29300 gcc_assert (!reload_completed);
29301 ix86_static_chain_on_stack = true;
29303 return gen_frame_mem (SImode,
29304 plus_constant (Pmode,
29305 arg_pointer_rtx, -8));
29307 regno = SI_REG;
29311 return gen_rtx_REG (Pmode, regno);
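/* For illustration (hypothetical GNU C example, not part of GCC itself),
   a nested function that needs the static chain looks like:

       int outer (int a)
       {
         int inner (int b) { return a + b; }
         return inner (1);
       }

   INNER needs a pointer to OUTER's frame.  Per the code above, that pointer
   arrives in R10 on 64-bit targets; on 32-bit targets it arrives in ECX by
   default, in EAX for fastcall/thiscall functions, or on the stack (with ESI
   used at the alternate entry point) when regparm 3 leaves no free
   call-clobbered register.  */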
29314 /* Emit RTL insns to initialize the variable parts of a trampoline.
29315 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29316 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29317 to be passed to the target function. */
29319 static void
29320 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29322 rtx mem, fnaddr;
29323 int opcode;
29324 int offset = 0;
29326 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29328 if (TARGET_64BIT)
29330 int size;
29332 /* Load the function address into r11. Try to load the address using
29333 the shorter movl instead of movabs. We may want to support
29334 movq for kernel mode, but the kernel does not use trampolines at
29335 the moment. FNADDR is a 32-bit address and may not be in
29336 DImode when ptr_mode == SImode. Always use movl in this
29337 case. */
29338 if (ptr_mode == SImode
29339 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29341 fnaddr = copy_addr_to_reg (fnaddr);
29343 mem = adjust_address (m_tramp, HImode, offset);
29344 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29346 mem = adjust_address (m_tramp, SImode, offset + 2);
29347 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29348 offset += 6;
29350 else
29352 mem = adjust_address (m_tramp, HImode, offset);
29353 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29355 mem = adjust_address (m_tramp, DImode, offset + 2);
29356 emit_move_insn (mem, fnaddr);
29357 offset += 10;
29360 /* Load static chain using movabs to r10. Use the shorter movl
29361 instead of movabs when ptr_mode == SImode. */
29362 if (ptr_mode == SImode)
29364 opcode = 0xba41;
29365 size = 6;
29367 else
29369 opcode = 0xba49;
29370 size = 10;
29373 mem = adjust_address (m_tramp, HImode, offset);
29374 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29376 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29377 emit_move_insn (mem, chain_value);
29378 offset += size;
29380 /* Jump to r11; the last (unused) byte is a nop, only there to
29381 pad the write out to a single 32-bit store. */
29382 mem = adjust_address (m_tramp, SImode, offset);
29383 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29384 offset += 4;
29386 else
29388 rtx disp, chain;
29390 /* Depending on the static chain location, either load a register
29391 with a constant, or push the constant to the stack. All of the
29392 instructions are the same size. */
29393 chain = ix86_static_chain (fndecl, true);
29394 if (REG_P (chain))
29396 switch (REGNO (chain))
29398 case AX_REG:
29399 opcode = 0xb8; break;
29400 case CX_REG:
29401 opcode = 0xb9; break;
29402 default:
29403 gcc_unreachable ();
29406 else
29407 opcode = 0x68;
29409 mem = adjust_address (m_tramp, QImode, offset);
29410 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29412 mem = adjust_address (m_tramp, SImode, offset + 1);
29413 emit_move_insn (mem, chain_value);
29414 offset += 5;
29416 mem = adjust_address (m_tramp, QImode, offset);
29417 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29419 mem = adjust_address (m_tramp, SImode, offset + 1);
29421 /* Compute offset from the end of the jmp to the target function.
29422 In the case in which the trampoline stores the static chain on
29423 the stack, we need to skip the first insn which pushes the
29424 (call-saved) register static chain; this push is 1 byte. */
29425 offset += 5;
29426 disp = expand_binop (SImode, sub_optab, fnaddr,
29427 plus_constant (Pmode, XEXP (m_tramp, 0),
29428 offset - (MEM_P (chain) ? 1 : 0)),
29429 NULL_RTX, 1, OPTAB_DIRECT);
29430 emit_move_insn (mem, disp);
29433 gcc_assert (offset <= TRAMPOLINE_SIZE);
29435 #ifdef HAVE_ENABLE_EXECUTE_STACK
29436 #ifdef CHECK_EXECUTE_STACK_ENABLED
29437 if (CHECK_EXECUTE_STACK_ENABLED)
29438 #endif
29439 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29440 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29441 #endif
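/* For reference, the 64-bit trampoline built above has the following byte
   layout when the movabs forms are used (the movl forms are 4 bytes shorter
   per load; <fnaddr> and <chain> stand for the immediates filled in at run
   time):

       49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
       49 ba <8-byte chain>     movabs $chain,  %r10
       49 ff e3                 jmp    *%r11
       90                       nop    (pads the final 32-bit store)

   These bytes correspond to the 0xbb49, 0xba49 and 0x90e3ff49 constants
   stored above.  */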
29444 static bool
29445 ix86_allocate_stack_slots_for_args (void)
29447 /* Naked functions should not allocate stack slots for arguments. */
29448 return !ix86_function_naked (current_function_decl);
29451 static bool
29452 ix86_warn_func_return (tree decl)
29454 /* Naked functions are implemented entirely in assembly, including the
29455 return sequence, so suppress warnings about this. */
29456 return !ix86_function_naked (decl);
29459 /* The following file contains several enumerations and data structures
29460 built from the definitions in i386-builtin-types.def. */
29462 #include "i386-builtin-types.inc"
29464 /* Table for the ix86 builtin non-function types. */
29465 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29467 /* Retrieve an element from the above table, building some of
29468 the types lazily. */
29470 static tree
29471 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29473 unsigned int index;
29474 tree type, itype;
29476 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29478 type = ix86_builtin_type_tab[(int) tcode];
29479 if (type != NULL)
29480 return type;
29482 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29483 if (tcode <= IX86_BT_LAST_VECT)
29485 machine_mode mode;
29487 index = tcode - IX86_BT_LAST_PRIM - 1;
29488 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29489 mode = ix86_builtin_type_vect_mode[index];
29491 type = build_vector_type_for_mode (itype, mode);
29493 else
29495 int quals;
29497 index = tcode - IX86_BT_LAST_VECT - 1;
29498 if (tcode <= IX86_BT_LAST_PTR)
29499 quals = TYPE_UNQUALIFIED;
29500 else
29501 quals = TYPE_QUAL_CONST;
29503 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29504 if (quals != TYPE_UNQUALIFIED)
29505 itype = build_qualified_type (itype, quals);
29507 type = build_pointer_type (itype);
29510 ix86_builtin_type_tab[(int) tcode] = type;
29511 return type;
29514 /* Table for the ix86 builtin function types. */
29515 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29517 /* Retrieve an element from the above table, building some of
29518 the types lazily. */
29520 static tree
29521 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29523 tree type;
29525 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29527 type = ix86_builtin_func_type_tab[(int) tcode];
29528 if (type != NULL)
29529 return type;
29531 if (tcode <= IX86_BT_LAST_FUNC)
29533 unsigned start = ix86_builtin_func_start[(int) tcode];
29534 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29535 tree rtype, atype, args = void_list_node;
29536 unsigned i;
29538 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29539 for (i = after - 1; i > start; --i)
29541 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29542 args = tree_cons (NULL, atype, args);
29545 type = build_function_type (rtype, args);
29547 else
29549 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29550 enum ix86_builtin_func_type icode;
29552 icode = ix86_builtin_func_alias_base[index];
29553 type = ix86_get_builtin_func_type (icode);
29556 ix86_builtin_func_type_tab[(int) tcode] = type;
29557 return type;
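/* A sketch of how the tables above are consumed, using a hypothetical row
   (the concrete rows live in i386-builtin-types.inc): if
   ix86_builtin_func_args[start..after-1] contained the three entries
   V4SF, V4SF, V4SF, the first entry would become the return type and the
   remaining ones the argument list (built back to front), producing the
   function type V4SF (*) (V4SF, V4SF), i.e. what the code elsewhere calls
   V4SF_FTYPE_V4SF_V4SF.  */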
29561 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29562 bdesc_* arrays below should come first, then builtins for each bdesc_*
29563 array in ascending order, so that we can use direct array accesses. */
29564 enum ix86_builtins
29566 IX86_BUILTIN_MASKMOVQ,
29567 IX86_BUILTIN_LDMXCSR,
29568 IX86_BUILTIN_STMXCSR,
29569 IX86_BUILTIN_MASKMOVDQU,
29570 IX86_BUILTIN_PSLLDQ128,
29571 IX86_BUILTIN_CLFLUSH,
29572 IX86_BUILTIN_MONITOR,
29573 IX86_BUILTIN_MWAIT,
29574 IX86_BUILTIN_CLZERO,
29575 IX86_BUILTIN_VEC_INIT_V2SI,
29576 IX86_BUILTIN_VEC_INIT_V4HI,
29577 IX86_BUILTIN_VEC_INIT_V8QI,
29578 IX86_BUILTIN_VEC_EXT_V2DF,
29579 IX86_BUILTIN_VEC_EXT_V2DI,
29580 IX86_BUILTIN_VEC_EXT_V4SF,
29581 IX86_BUILTIN_VEC_EXT_V4SI,
29582 IX86_BUILTIN_VEC_EXT_V8HI,
29583 IX86_BUILTIN_VEC_EXT_V2SI,
29584 IX86_BUILTIN_VEC_EXT_V4HI,
29585 IX86_BUILTIN_VEC_EXT_V16QI,
29586 IX86_BUILTIN_VEC_SET_V2DI,
29587 IX86_BUILTIN_VEC_SET_V4SF,
29588 IX86_BUILTIN_VEC_SET_V4SI,
29589 IX86_BUILTIN_VEC_SET_V8HI,
29590 IX86_BUILTIN_VEC_SET_V4HI,
29591 IX86_BUILTIN_VEC_SET_V16QI,
29592 IX86_BUILTIN_GATHERSIV2DF,
29593 IX86_BUILTIN_GATHERSIV4DF,
29594 IX86_BUILTIN_GATHERDIV2DF,
29595 IX86_BUILTIN_GATHERDIV4DF,
29596 IX86_BUILTIN_GATHERSIV4SF,
29597 IX86_BUILTIN_GATHERSIV8SF,
29598 IX86_BUILTIN_GATHERDIV4SF,
29599 IX86_BUILTIN_GATHERDIV8SF,
29600 IX86_BUILTIN_GATHERSIV2DI,
29601 IX86_BUILTIN_GATHERSIV4DI,
29602 IX86_BUILTIN_GATHERDIV2DI,
29603 IX86_BUILTIN_GATHERDIV4DI,
29604 IX86_BUILTIN_GATHERSIV4SI,
29605 IX86_BUILTIN_GATHERSIV8SI,
29606 IX86_BUILTIN_GATHERDIV4SI,
29607 IX86_BUILTIN_GATHERDIV8SI,
29608 IX86_BUILTIN_VFMSUBSD3_MASK3,
29609 IX86_BUILTIN_VFMSUBSS3_MASK3,
29610 IX86_BUILTIN_GATHER3SIV8SF,
29611 IX86_BUILTIN_GATHER3SIV4SF,
29612 IX86_BUILTIN_GATHER3SIV4DF,
29613 IX86_BUILTIN_GATHER3SIV2DF,
29614 IX86_BUILTIN_GATHER3DIV8SF,
29615 IX86_BUILTIN_GATHER3DIV4SF,
29616 IX86_BUILTIN_GATHER3DIV4DF,
29617 IX86_BUILTIN_GATHER3DIV2DF,
29618 IX86_BUILTIN_GATHER3SIV8SI,
29619 IX86_BUILTIN_GATHER3SIV4SI,
29620 IX86_BUILTIN_GATHER3SIV4DI,
29621 IX86_BUILTIN_GATHER3SIV2DI,
29622 IX86_BUILTIN_GATHER3DIV8SI,
29623 IX86_BUILTIN_GATHER3DIV4SI,
29624 IX86_BUILTIN_GATHER3DIV4DI,
29625 IX86_BUILTIN_GATHER3DIV2DI,
29626 IX86_BUILTIN_SCATTERSIV8SF,
29627 IX86_BUILTIN_SCATTERSIV4SF,
29628 IX86_BUILTIN_SCATTERSIV4DF,
29629 IX86_BUILTIN_SCATTERSIV2DF,
29630 IX86_BUILTIN_SCATTERDIV8SF,
29631 IX86_BUILTIN_SCATTERDIV4SF,
29632 IX86_BUILTIN_SCATTERDIV4DF,
29633 IX86_BUILTIN_SCATTERDIV2DF,
29634 IX86_BUILTIN_SCATTERSIV8SI,
29635 IX86_BUILTIN_SCATTERSIV4SI,
29636 IX86_BUILTIN_SCATTERSIV4DI,
29637 IX86_BUILTIN_SCATTERSIV2DI,
29638 IX86_BUILTIN_SCATTERDIV8SI,
29639 IX86_BUILTIN_SCATTERDIV4SI,
29640 IX86_BUILTIN_SCATTERDIV4DI,
29641 IX86_BUILTIN_SCATTERDIV2DI,
29642 /* Alternate 4- and 8-element gather/scatter builtins for the vectorizer,
29643 where all operands are 32 or 64 bytes wide, respectively. */
29644 IX86_BUILTIN_GATHERALTSIV4DF,
29645 IX86_BUILTIN_GATHERALTDIV8SF,
29646 IX86_BUILTIN_GATHERALTSIV4DI,
29647 IX86_BUILTIN_GATHERALTDIV8SI,
29648 IX86_BUILTIN_GATHER3ALTDIV16SF,
29649 IX86_BUILTIN_GATHER3ALTDIV16SI,
29650 IX86_BUILTIN_GATHER3ALTSIV4DF,
29651 IX86_BUILTIN_GATHER3ALTDIV8SF,
29652 IX86_BUILTIN_GATHER3ALTSIV4DI,
29653 IX86_BUILTIN_GATHER3ALTDIV8SI,
29654 IX86_BUILTIN_GATHER3ALTSIV8DF,
29655 IX86_BUILTIN_GATHER3ALTSIV8DI,
29656 IX86_BUILTIN_GATHER3DIV16SF,
29657 IX86_BUILTIN_GATHER3DIV16SI,
29658 IX86_BUILTIN_GATHER3DIV8DF,
29659 IX86_BUILTIN_GATHER3DIV8DI,
29660 IX86_BUILTIN_GATHER3SIV16SF,
29661 IX86_BUILTIN_GATHER3SIV16SI,
29662 IX86_BUILTIN_GATHER3SIV8DF,
29663 IX86_BUILTIN_GATHER3SIV8DI,
29664 IX86_BUILTIN_SCATTERALTSIV8DF,
29665 IX86_BUILTIN_SCATTERALTDIV16SF,
29666 IX86_BUILTIN_SCATTERALTSIV8DI,
29667 IX86_BUILTIN_SCATTERALTDIV16SI,
29668 IX86_BUILTIN_SCATTERDIV16SF,
29669 IX86_BUILTIN_SCATTERDIV16SI,
29670 IX86_BUILTIN_SCATTERDIV8DF,
29671 IX86_BUILTIN_SCATTERDIV8DI,
29672 IX86_BUILTIN_SCATTERSIV16SF,
29673 IX86_BUILTIN_SCATTERSIV16SI,
29674 IX86_BUILTIN_SCATTERSIV8DF,
29675 IX86_BUILTIN_SCATTERSIV8DI,
29676 IX86_BUILTIN_GATHERPFQPD,
29677 IX86_BUILTIN_GATHERPFDPS,
29678 IX86_BUILTIN_GATHERPFDPD,
29679 IX86_BUILTIN_GATHERPFQPS,
29680 IX86_BUILTIN_SCATTERPFDPD,
29681 IX86_BUILTIN_SCATTERPFDPS,
29682 IX86_BUILTIN_SCATTERPFQPD,
29683 IX86_BUILTIN_SCATTERPFQPS,
29684 IX86_BUILTIN_CLWB,
29685 IX86_BUILTIN_CLFLUSHOPT,
29686 IX86_BUILTIN_INFQ,
29687 IX86_BUILTIN_HUGE_VALQ,
29688 IX86_BUILTIN_NANQ,
29689 IX86_BUILTIN_NANSQ,
29690 IX86_BUILTIN_XABORT,
29691 IX86_BUILTIN_ADDCARRYX32,
29692 IX86_BUILTIN_ADDCARRYX64,
29693 IX86_BUILTIN_SBB32,
29694 IX86_BUILTIN_SBB64,
29695 IX86_BUILTIN_RDRAND16_STEP,
29696 IX86_BUILTIN_RDRAND32_STEP,
29697 IX86_BUILTIN_RDRAND64_STEP,
29698 IX86_BUILTIN_RDSEED16_STEP,
29699 IX86_BUILTIN_RDSEED32_STEP,
29700 IX86_BUILTIN_RDSEED64_STEP,
29701 IX86_BUILTIN_MONITORX,
29702 IX86_BUILTIN_MWAITX,
29703 IX86_BUILTIN_CFSTRING,
29704 IX86_BUILTIN_CPU_INIT,
29705 IX86_BUILTIN_CPU_IS,
29706 IX86_BUILTIN_CPU_SUPPORTS,
29707 IX86_BUILTIN_READ_FLAGS,
29708 IX86_BUILTIN_WRITE_FLAGS,
29710 /* All the remaining builtins are tracked in bdesc_* arrays in
29711 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29712 this point. */
29713 #define BDESC(mask, icode, name, code, comparison, flag) \
29714 code,
29715 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29716 code, \
29717 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29718 #define BDESC_END(kind, next_kind)
29720 #include "i386-builtin.def"
29722 #undef BDESC
29723 #undef BDESC_FIRST
29724 #undef BDESC_END
29726 IX86_BUILTIN_MAX,
29728 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29730 /* Now just the aliases for bdesc_* start/end. */
29731 #define BDESC(mask, icode, name, code, comparison, flag)
29732 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29733 #define BDESC_END(kind, next_kind) \
29734 IX86_BUILTIN__BDESC_##kind##_LAST \
29735 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29737 #include "i386-builtin.def"
29739 #undef BDESC
29740 #undef BDESC_FIRST
29741 #undef BDESC_END
29743 /* Just to make sure there is no comma after the last enumerator. */
29744 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
29747 /* Table for the ix86 builtin decls. */
29748 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29750 /* Table of all of the builtin functions that are possible with different ISAs,
29751 but whose declaration is deferred until a function is declared to use that
29752 ISA. */
29753 struct builtin_isa {
29754 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29755 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29756 const char *name; /* function name */
29757 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29758 unsigned char const_p:1; /* true if the declaration is constant */
29759 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29760 bool leaf_p; /* true if the declaration has leaf attribute */
29761 bool nothrow_p; /* true if the declaration has nothrow attribute */
29762 bool set_and_not_built_p;
29765 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29767 /* Bits that can still enable the inclusion of a deferred builtin. */
29768 static HOST_WIDE_INT deferred_isa_values = 0;
29769 static HOST_WIDE_INT deferred_isa_values2 = 0;
29771 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
29772 of isa_flags this builtin requires in the ix86_builtins_isa array. Store the
29773 function decl in the ix86_builtins array. Return the function decl, or
29774 NULL_TREE if the builtin was not added.
29776 If the front end has a special hook for builtin functions, delay adding
29777 builtin functions that aren't in the current ISA until the ISA is changed
29778 with function-specific optimization. Doing so can save about 300K for the
29779 default compiler. When the builtin is expanded, check at that time whether
29780 it is valid.
29782 If the front end doesn't have a special hook, record all builtins, even if
29783 they aren't in the current ISA, in case the user uses function-specific
29784 options for a different ISA, so that we don't get scope errors if a
29785 builtin is added in the middle of a function scope. */
29787 static inline tree
29788 def_builtin (HOST_WIDE_INT mask, const char *name,
29789 enum ix86_builtin_func_type tcode,
29790 enum ix86_builtins code)
29792 tree decl = NULL_TREE;
29794 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29796 ix86_builtins_isa[(int) code].isa = mask;
29798 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
29799 where any set bit means the built-in is enabled, this bit must be *and-ed*
29800 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
29801 means that *both* cpuid bits must be set for the built-in to be available.
29802 Handle this here. */
29803 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29804 mask &= ~OPTION_MASK_ISA_AVX512VL;
29806 mask &= ~OPTION_MASK_ISA_64BIT;
29807 if (mask == 0
29808 || (mask & ix86_isa_flags) != 0
29809 || (lang_hooks.builtin_function
29810 == lang_hooks.builtin_function_ext_scope))
29813 tree type = ix86_get_builtin_func_type (tcode);
29814 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29815 NULL, NULL_TREE);
29816 ix86_builtins[(int) code] = decl;
29817 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29819 else
29821 /* Just a MASK where set_and_not_built_p == true can potentially
29822 include a builtin. */
29823 deferred_isa_values |= mask;
29824 ix86_builtins[(int) code] = NULL_TREE;
29825 ix86_builtins_isa[(int) code].tcode = tcode;
29826 ix86_builtins_isa[(int) code].name = name;
29827 ix86_builtins_isa[(int) code].leaf_p = false;
29828 ix86_builtins_isa[(int) code].nothrow_p = false;
29829 ix86_builtins_isa[(int) code].const_p = false;
29830 ix86_builtins_isa[(int) code].pure_p = false;
29831 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29835 return decl;
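/* A sketch of what the deferral above means in practice (hypothetical
   scenario): a builtin registered for OPTION_MASK_ISA_AVX2 while compiling
   with only -msse2 is, when the front end permits deferred registration,
   merely recorded in ix86_builtins_isa; its decl is created later by
   ix86_add_new_builtins once a function enables AVX2, for instance via

       __attribute__ ((target ("avx2"))) void f (void);

   or the corresponding #pragma GCC target.  */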
29838 /* Like def_builtin, but also marks the function decl "const". */
29840 static inline tree
29841 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29842 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29844 tree decl = def_builtin (mask, name, tcode, code);
29845 if (decl)
29846 TREE_READONLY (decl) = 1;
29847 else
29848 ix86_builtins_isa[(int) code].const_p = true;
29850 return decl;
29853 /* Like def_builtin, but also marks the function decl "pure". */
29855 static inline tree
29856 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29857 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29859 tree decl = def_builtin (mask, name, tcode, code);
29860 if (decl)
29861 DECL_PURE_P (decl) = 1;
29862 else
29863 ix86_builtins_isa[(int) code].pure_p = true;
29865 return decl;
29868 /* Like def_builtin, but for additional isa2 flags. */
29870 static inline tree
29871 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29872 enum ix86_builtin_func_type tcode,
29873 enum ix86_builtins code)
29875 tree decl = NULL_TREE;
29877 ix86_builtins_isa[(int) code].isa2 = mask;
29879 if (mask == 0
29880 || (mask & ix86_isa_flags2) != 0
29881 || (lang_hooks.builtin_function
29882 == lang_hooks.builtin_function_ext_scope))
29885 tree type = ix86_get_builtin_func_type (tcode);
29886 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29887 NULL, NULL_TREE);
29888 ix86_builtins[(int) code] = decl;
29889 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29891 else
29893 /* Just a MASK where set_and_not_built_p == true can potentially
29894 include a builtin. */
29895 deferred_isa_values2 |= mask;
29896 ix86_builtins[(int) code] = NULL_TREE;
29897 ix86_builtins_isa[(int) code].tcode = tcode;
29898 ix86_builtins_isa[(int) code].name = name;
29899 ix86_builtins_isa[(int) code].leaf_p = false;
29900 ix86_builtins_isa[(int) code].nothrow_p = false;
29901 ix86_builtins_isa[(int) code].const_p = false;
29902 ix86_builtins_isa[(int) code].pure_p = false;
29903 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29906 return decl;
29909 /* Like def_builtin2, but also marks the function decl "const". */
29911 static inline tree
29912 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29913 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29915 tree decl = def_builtin2 (mask, name, tcode, code);
29916 if (decl)
29917 TREE_READONLY (decl) = 1;
29918 else
29919 ix86_builtins_isa[(int) code].const_p = true;
29921 return decl;
29924 /* Like def_builtin2, but also marks the function decl "pure". */
29926 static inline tree
29927 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29928 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29930 tree decl = def_builtin2 (mask, name, tcode, code);
29931 if (decl)
29932 DECL_PURE_P (decl) = 1;
29933 else
29934 ix86_builtins_isa[(int) code].pure_p = true;
29936 return decl;
29939 /* Add any new builtin functions for a given ISA that may not have been
29940 declared yet. This saves a bit of space compared to adding all of the
29941 declarations to the tree up front, even when they are never used. */
29943 static void
29944 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29946 if ((isa & deferred_isa_values) == 0
29947 && (isa2 & deferred_isa_values2) == 0)
29948 return;
29950 /* Bits in ISA and ISA2 can now be removed from the deferred isa values. */
29951 deferred_isa_values &= ~isa;
29952 deferred_isa_values2 &= ~isa2;
29954 int i;
29955 tree saved_current_target_pragma = current_target_pragma;
29956 current_target_pragma = NULL_TREE;
29958 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29960 if (((ix86_builtins_isa[i].isa & isa) != 0
29961 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29962 && ix86_builtins_isa[i].set_and_not_built_p)
29964 tree decl, type;
29966 /* Don't define the builtin again. */
29967 ix86_builtins_isa[i].set_and_not_built_p = false;
29969 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29970 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29971 type, i, BUILT_IN_MD, NULL,
29972 NULL_TREE);
29974 ix86_builtins[i] = decl;
29975 if (ix86_builtins_isa[i].const_p)
29976 TREE_READONLY (decl) = 1;
29977 if (ix86_builtins_isa[i].pure_p)
29978 DECL_PURE_P (decl) = 1;
29979 if (ix86_builtins_isa[i].leaf_p)
29980 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29981 NULL_TREE);
29982 if (ix86_builtins_isa[i].nothrow_p)
29983 TREE_NOTHROW (decl) = 1;
29987 current_target_pragma = saved_current_target_pragma;
29990 /* Bits for builtin_description.flag. */
29992 /* Set when we don't support the comparison natively, and should
29993 swap the comparison operands in order to support it. */
29994 #define BUILTIN_DESC_SWAP_OPERANDS 1
29996 struct builtin_description
29998 const HOST_WIDE_INT mask;
29999 const enum insn_code icode;
30000 const char *const name;
30001 const enum ix86_builtins code;
30002 const enum rtx_code comparison;
30003 const int flag;
30006 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30007 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30008 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30009 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30010 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30011 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30012 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30013 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30014 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30015 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30016 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30017 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30018 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30019 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30020 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30021 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30022 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30023 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30024 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30025 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30026 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30027 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30028 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30029 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30030 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30031 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30032 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30033 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30034 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30035 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30036 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30037 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30038 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30039 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30040 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30041 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30042 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30043 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30044 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30045 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30046 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30047 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30048 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30049 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30050 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30051 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30052 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30053 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30054 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30055 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30056 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30057 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30059 #define BDESC(mask, icode, name, code, comparison, flag) \
30060 { mask, icode, name, code, comparison, flag },
30061 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30062 static const struct builtin_description bdesc_##kind[] = \
30064 BDESC (mask, icode, name, code, comparison, flag)
30065 #define BDESC_END(kind, next_kind) \
30068 #include "i386-builtin.def"
30070 #undef BDESC
30071 #undef BDESC_FIRST
30072 #undef BDESC_END
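/* A sketch of the double expansion of i386-builtin.def, using a hypothetical
   entry (the names below do not exist in the real .def file):

       BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_foo, "__builtin_ia32_foo",
              IX86_BUILTIN_FOO, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)

   The first inclusion, in enum ix86_builtins above, expands this to the bare
   enumerator IX86_BUILTIN_FOO; the second inclusion here expands it to the
   initializer

       { OPTION_MASK_ISA_SSE2, CODE_FOR_foo, "__builtin_ia32_foo",
         IX86_BUILTIN_FOO, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

   in the matching bdesc_* array, which is why the enumerator order and the
   array order must agree (see the BDESC_VERIFY checks below).  */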
30074 /* TM vector builtins. */
30076 /* Reuse the existing x86-specific `struct builtin_description' because it is
30077 convenient. Add casts to make the fields fit. */
30078 static const struct builtin_description bdesc_tm[] =
30080 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30081 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30082 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30083 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30084 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30085 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30086 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30088 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30089 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30090 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30091 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30092 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30093 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30094 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30096 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30097 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30098 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30099 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30100 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30101 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30102 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30104 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30105 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30106 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30109 /* Initialize the transactional memory vector load/store builtins. */
30111 static void
30112 ix86_init_tm_builtins (void)
30114 enum ix86_builtin_func_type ftype;
30115 const struct builtin_description *d;
30116 size_t i;
30117 tree decl;
30118 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30119 tree attrs_log, attrs_type_log;
30121 if (!flag_tm)
30122 return;
30124 /* If there are no builtins defined, we must be compiling in a
30125 language without trans-mem support. */
30126 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30127 return;
30129 /* Use whatever attributes a normal TM load has. */
30130 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30131 attrs_load = DECL_ATTRIBUTES (decl);
30132 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30133 /* Use whatever attributes a normal TM store has. */
30134 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30135 attrs_store = DECL_ATTRIBUTES (decl);
30136 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30137 /* Use whatever attributes a normal TM log has. */
30138 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30139 attrs_log = DECL_ATTRIBUTES (decl);
30140 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30142 for (i = 0, d = bdesc_tm;
30143 i < ARRAY_SIZE (bdesc_tm);
30144 i++, d++)
30146 if ((d->mask & ix86_isa_flags) != 0
30147 || (lang_hooks.builtin_function
30148 == lang_hooks.builtin_function_ext_scope))
30150 tree type, attrs, attrs_type;
30151 enum built_in_function code = (enum built_in_function) d->code;
30153 ftype = (enum ix86_builtin_func_type) d->flag;
30154 type = ix86_get_builtin_func_type (ftype);
30156 if (BUILTIN_TM_LOAD_P (code))
30158 attrs = attrs_load;
30159 attrs_type = attrs_type_load;
30161 else if (BUILTIN_TM_STORE_P (code))
30163 attrs = attrs_store;
30164 attrs_type = attrs_type_store;
30166 else
30168 attrs = attrs_log;
30169 attrs_type = attrs_type_log;
30171 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30172 /* The builtin without the prefix for
30173 calling it directly. */
30174 d->name + strlen ("__builtin_"),
30175 attrs);
30176 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
30177 set the TYPE_ATTRIBUTES. */
30178 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30180 set_builtin_decl (code, decl, false);
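/* Note that the second name passed to add_builtin_function above is d->name
   with the "__builtin_" prefix stripped, so each TM vector builtin is also
   reachable under its bare library name; "__builtin__ITM_WM64", for example,
   can likewise be called as "_ITM_WM64".  */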
30185 /* Macros for verification of enum ix86_builtins order. */
30186 #define BDESC_VERIFY(x, y, z) \
30187 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30188 #define BDESC_VERIFYS(x, y, z) \
30189 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30191 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30192 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30193 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30194 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30195 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30196 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30197 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30198 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30199 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30200 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30201 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30202 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30203 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30204 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30205 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30206 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
30207 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30208 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30209 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30210 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30211 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30212 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30213 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30214 IX86_BUILTIN__BDESC_CET_LAST, 1);
30215 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30216 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30218 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30219 in the current target ISA, so that the user can compile particular modules
30220 with target-specific options that differ from the command-line
30221 options. */
30222 static void
30223 ix86_init_mmx_sse_builtins (void)
30225 const struct builtin_description * d;
30226 enum ix86_builtin_func_type ftype;
30227 size_t i;
30229 /* Add all special builtins with variable number of operands. */
30230 for (i = 0, d = bdesc_special_args;
30231 i < ARRAY_SIZE (bdesc_special_args);
30232 i++, d++)
30234 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30235 if (d->name == 0)
30236 continue;
30238 ftype = (enum ix86_builtin_func_type) d->flag;
30239 def_builtin (d->mask, d->name, ftype, d->code);
30241 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30242 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30243 ARRAY_SIZE (bdesc_special_args) - 1);
30245 /* Add all builtins with variable number of operands. */
30246 for (i = 0, d = bdesc_args;
30247 i < ARRAY_SIZE (bdesc_args);
30248 i++, d++)
30250 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30251 if (d->name == 0)
30252 continue;
30254 ftype = (enum ix86_builtin_func_type) d->flag;
30255 def_builtin_const (d->mask, d->name, ftype, d->code);
30257 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30258 IX86_BUILTIN__BDESC_ARGS_FIRST,
30259 ARRAY_SIZE (bdesc_args) - 1);
30261 /* Add all builtins with variable number of operands. */
30262 for (i = 0, d = bdesc_args2;
30263 i < ARRAY_SIZE (bdesc_args2);
30264 i++, d++)
30266 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30267 if (d->name == 0)
30268 continue;
30270 ftype = (enum ix86_builtin_func_type) d->flag;
30271 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30273 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30274 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30275 ARRAY_SIZE (bdesc_args2) - 1);
30277 for (i = 0, d = bdesc_special_args2;
30278 i < ARRAY_SIZE (bdesc_special_args2);
30279 i++, d++)
30281 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
30282 if (d->name == 0)
30283 continue;
30285 ftype = (enum ix86_builtin_func_type) d->flag;
30286 def_builtin2 (d->mask, d->name, ftype, d->code);
30288 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
30289 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30290 ARRAY_SIZE (bdesc_special_args2) - 1);
30292 /* Add all builtins with rounding. */
30293 for (i = 0, d = bdesc_round_args;
30294 i < ARRAY_SIZE (bdesc_round_args);
30295 i++, d++)
30297 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30298 if (d->name == 0)
30299 continue;
30301 ftype = (enum ix86_builtin_func_type) d->flag;
30302 def_builtin_const (d->mask, d->name, ftype, d->code);
30304 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30305 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30306 ARRAY_SIZE (bdesc_round_args) - 1);
30308 /* pcmpestr[im] insns. */
30309 for (i = 0, d = bdesc_pcmpestr;
30310 i < ARRAY_SIZE (bdesc_pcmpestr);
30311 i++, d++)
30313 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30314 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30315 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30316 else
30317 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30318 def_builtin_const (d->mask, d->name, ftype, d->code);
30320 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30321 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30322 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30324 /* pcmpistr[im] insns. */
30325 for (i = 0, d = bdesc_pcmpistr;
30326 i < ARRAY_SIZE (bdesc_pcmpistr);
30327 i++, d++)
30329 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30330 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30331 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30332 else
30333 ftype = INT_FTYPE_V16QI_V16QI_INT;
30334 def_builtin_const (d->mask, d->name, ftype, d->code);
30336 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30337 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30338 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30340 /* comi/ucomi insns. */
30341 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30343 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30344 if (d->mask == OPTION_MASK_ISA_SSE2)
30345 ftype = INT_FTYPE_V2DF_V2DF;
30346 else
30347 ftype = INT_FTYPE_V4SF_V4SF;
30348 def_builtin_const (d->mask, d->name, ftype, d->code);
30350 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30351 IX86_BUILTIN__BDESC_COMI_FIRST,
30352 ARRAY_SIZE (bdesc_comi) - 1);
30354 /* SSE */
30355 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30356 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30357 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30358 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30360 /* SSE or 3DNow!A */
30361 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30362 /* As it uses V4HImode, we have to require -mmmx too. */
30363 | OPTION_MASK_ISA_MMX,
30364 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30365 IX86_BUILTIN_MASKMOVQ);
30367 /* SSE2 */
30368 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30369 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30371 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30372 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30373 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30374 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30376 /* SSE3. */
30377 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30378 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30379 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30380 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30382 /* AES */
30383 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30384 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30385 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30386 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30387 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30388 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30389 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30390 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30391 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30392 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30393 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30394 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30396 /* PCLMUL */
30397 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30398 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30400 /* RDRND */
30401 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30402 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30403 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30404 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30405 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30406 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30407 IX86_BUILTIN_RDRAND64_STEP);
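/* A minimal usage sketch for the RDRND step builtins defined above (they are
   normally reached through the _rdrand16/32/64_step intrinsic wrappers):

       unsigned int r;
       if (__builtin_ia32_rdrand32_step (&r))
         consume (r);

   A nonzero return value means R now holds a random value; CONSUME here is
   only a placeholder for whatever uses it.  */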
30409 /* AVX2 */
30410 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30411 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30412 IX86_BUILTIN_GATHERSIV2DF);
30414 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30415 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30416 IX86_BUILTIN_GATHERSIV4DF);
30418 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30419 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30420 IX86_BUILTIN_GATHERDIV2DF);
30422 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30423 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30424 IX86_BUILTIN_GATHERDIV4DF);
30426 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30427 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30428 IX86_BUILTIN_GATHERSIV4SF);
30430 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30431 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30432 IX86_BUILTIN_GATHERSIV8SF);
30434 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30435 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30436 IX86_BUILTIN_GATHERDIV4SF);
30438 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30439 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30440 IX86_BUILTIN_GATHERDIV8SF);
30442 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30443 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30444 IX86_BUILTIN_GATHERSIV2DI);
30446 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30447 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30448 IX86_BUILTIN_GATHERSIV4DI);
30450 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30451 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30452 IX86_BUILTIN_GATHERDIV2DI);
30454 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30455 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30456 IX86_BUILTIN_GATHERDIV4DI);
30458 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30459 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30460 IX86_BUILTIN_GATHERSIV4SI);
30462 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30463 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30464 IX86_BUILTIN_GATHERSIV8SI);
30466 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30467 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30468 IX86_BUILTIN_GATHERDIV4SI);
30470 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30471 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30472 IX86_BUILTIN_GATHERDIV8SI);
30474 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30475 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30476 IX86_BUILTIN_GATHERALTSIV4DF);
30478 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30479 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30480 IX86_BUILTIN_GATHERALTDIV8SF);
30482 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30483 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30484 IX86_BUILTIN_GATHERALTSIV4DI);
30486 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30487 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30488 IX86_BUILTIN_GATHERALTDIV8SI);
30490 /* AVX512F */
30491 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30492 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30493 IX86_BUILTIN_GATHER3SIV16SF);
30495 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30496 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30497 IX86_BUILTIN_GATHER3SIV8DF);
30499 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30500 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30501 IX86_BUILTIN_GATHER3DIV16SF);
30503 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30504 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30505 IX86_BUILTIN_GATHER3DIV8DF);
30507 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30508 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30509 IX86_BUILTIN_GATHER3SIV16SI);
30511 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30512 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30513 IX86_BUILTIN_GATHER3SIV8DI);
30515 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30516 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30517 IX86_BUILTIN_GATHER3DIV16SI);
30519 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30520 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30521 IX86_BUILTIN_GATHER3DIV8DI);
30523 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30524 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30525 IX86_BUILTIN_GATHER3ALTSIV8DF);
30527 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30528 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30529 IX86_BUILTIN_GATHER3ALTDIV16SF);
30531 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30532 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30533 IX86_BUILTIN_GATHER3ALTSIV8DI);
30535 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30536 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30537 IX86_BUILTIN_GATHER3ALTDIV16SI);
30539 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30540 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30541 IX86_BUILTIN_SCATTERSIV16SF);
30543 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30544 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30545 IX86_BUILTIN_SCATTERSIV8DF);
30547 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30548 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30549 IX86_BUILTIN_SCATTERDIV16SF);
30551 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30552 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30553 IX86_BUILTIN_SCATTERDIV8DF);
30555 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30556 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30557 IX86_BUILTIN_SCATTERSIV16SI);
30559 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30560 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30561 IX86_BUILTIN_SCATTERSIV8DI);
30563 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30564 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30565 IX86_BUILTIN_SCATTERDIV16SI);
30567 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30568 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30569 IX86_BUILTIN_SCATTERDIV8DI);
30571 /* AVX512VL */
30572 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30573 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30574 IX86_BUILTIN_GATHER3SIV2DF);
30576 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30577 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30578 IX86_BUILTIN_GATHER3SIV4DF);
30580 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30581 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30582 IX86_BUILTIN_GATHER3DIV2DF);
30584 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30585 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30586 IX86_BUILTIN_GATHER3DIV4DF);
30588 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30589 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30590 IX86_BUILTIN_GATHER3SIV4SF);
30592 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30593 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30594 IX86_BUILTIN_GATHER3SIV8SF);
30596 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30597 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30598 IX86_BUILTIN_GATHER3DIV4SF);
30600 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30601 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30602 IX86_BUILTIN_GATHER3DIV8SF);
30604 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30605 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30606 IX86_BUILTIN_GATHER3SIV2DI);
30608 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30609 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30610 IX86_BUILTIN_GATHER3SIV4DI);
30612 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30613 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30614 IX86_BUILTIN_GATHER3DIV2DI);
30616 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30617 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30618 IX86_BUILTIN_GATHER3DIV4DI);
30620 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30621 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30622 IX86_BUILTIN_GATHER3SIV4SI);
30624 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30625 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30626 IX86_BUILTIN_GATHER3SIV8SI);
30628 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30629 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30630 IX86_BUILTIN_GATHER3DIV4SI);
30632 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30633 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30634 IX86_BUILTIN_GATHER3DIV8SI);
30636 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30637 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30638 IX86_BUILTIN_GATHER3ALTSIV4DF);
30640 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30641 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30642 IX86_BUILTIN_GATHER3ALTDIV8SF);
30644 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30645 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30646 IX86_BUILTIN_GATHER3ALTSIV4DI);
30648 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30649 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30650 IX86_BUILTIN_GATHER3ALTDIV8SI);
30652 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30653 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30654 IX86_BUILTIN_SCATTERSIV8SF);
30656 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30657 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30658 IX86_BUILTIN_SCATTERSIV4SF);
30660 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30661 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30662 IX86_BUILTIN_SCATTERSIV4DF);
30664 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30665 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30666 IX86_BUILTIN_SCATTERSIV2DF);
30668 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30669 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30670 IX86_BUILTIN_SCATTERDIV8SF);
30672 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30673 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30674 IX86_BUILTIN_SCATTERDIV4SF);
30676 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30677 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30678 IX86_BUILTIN_SCATTERDIV4DF);
30680 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30681 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30682 IX86_BUILTIN_SCATTERDIV2DF);
30684 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30685 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30686 IX86_BUILTIN_SCATTERSIV8SI);
30688 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30689 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30690 IX86_BUILTIN_SCATTERSIV4SI);
30692 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30693 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30694 IX86_BUILTIN_SCATTERSIV4DI);
30696 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30697 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30698 IX86_BUILTIN_SCATTERSIV2DI);
30700 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30701 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30702 IX86_BUILTIN_SCATTERDIV8SI);
30704 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30705 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30706 IX86_BUILTIN_SCATTERDIV4SI);
30708 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30709 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30710 IX86_BUILTIN_SCATTERDIV4DI);
30712 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30713 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30714 IX86_BUILTIN_SCATTERDIV2DI);
30715 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30716 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30717 IX86_BUILTIN_SCATTERALTSIV8DF);
30719 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30720 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30721 IX86_BUILTIN_SCATTERALTDIV16SF);
30723 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30724 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30725 IX86_BUILTIN_SCATTERALTSIV8DI);
30727 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30728 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30729 IX86_BUILTIN_SCATTERALTDIV16SI);
30731 /* AVX512PF */
30732 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30733 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30734 IX86_BUILTIN_GATHERPFDPD);
30735 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30736 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30737 IX86_BUILTIN_GATHERPFDPS);
30738 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30739 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30740 IX86_BUILTIN_GATHERPFQPD);
30741 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30742 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30743 IX86_BUILTIN_GATHERPFQPS);
30744 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30745 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30746 IX86_BUILTIN_SCATTERPFDPD);
30747 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30748 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30749 IX86_BUILTIN_SCATTERPFDPS);
30750 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30751 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30752 IX86_BUILTIN_SCATTERPFQPD);
30753 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30754 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30755 IX86_BUILTIN_SCATTERPFQPS);
30757 /* SHA */
30758 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30759 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30760 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30761 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30762 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30763 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30764 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30765 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30766 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30767 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30768 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30769 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30770 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30771 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30773 /* RTM. */
30774 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30775 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30777 /* MMX access to the vec_init patterns. */
30778 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30779 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30781 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30782 V4HI_FTYPE_HI_HI_HI_HI,
30783 IX86_BUILTIN_VEC_INIT_V4HI);
30785 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30786 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30787 IX86_BUILTIN_VEC_INIT_V8QI);
30789 /* Access to the vec_extract patterns. */
30790 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30791 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30792 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30793 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30794 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30795 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30796 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30797 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30798 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30799 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30801 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30802 /* As it uses V4HImode, we have to require -mmmx too. */
30803 | OPTION_MASK_ISA_MMX,
30804 "__builtin_ia32_vec_ext_v4hi",
30805 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30807 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30808 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30810 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30811 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30813 /* Access to the vec_set patterns. */
30814 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30815 "__builtin_ia32_vec_set_v2di",
30816 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30818 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30819 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30821 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30822 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30824 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30825 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30827 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30828 /* As it uses V4HImode, we have to require -mmmx too. */
30829 | OPTION_MASK_ISA_MMX,
30830 "__builtin_ia32_vec_set_v4hi",
30831 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30833 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30834 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30836 /* RDSEED */
30837 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30838 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30839 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30840 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30841 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30842 "__builtin_ia32_rdseed_di_step",
30843 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
30845 /* ADCX */
30846 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30847 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30848 def_builtin (OPTION_MASK_ISA_64BIT,
30849 "__builtin_ia32_addcarryx_u64",
30850 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30851 IX86_BUILTIN_ADDCARRYX64);
30853 /* SBB */
30854 def_builtin (0, "__builtin_ia32_sbb_u32",
30855 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30856 def_builtin (OPTION_MASK_ISA_64BIT,
30857 "__builtin_ia32_sbb_u64",
30858 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30859 IX86_BUILTIN_SBB64);
30861 /* Read/write FLAGS. */
30862 def_builtin (0, "__builtin_ia32_readeflags_u32",
30863 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30864 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30865 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30866 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30867 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30868 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30869 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
30871 /* CLFLUSHOPT. */
30872 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30873 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30875 /* CLWB. */
30876 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30877 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30879 /* MONITORX and MWAITX. */
30880 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30881 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30882 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30883 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30885 /* CLZERO. */
30886 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30887 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30889 /* Add FMA4 multi-arg instructions. */
30890 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30892 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30893 if (d->name == 0)
30894 continue;
30896 ftype = (enum ix86_builtin_func_type) d->flag;
30897 def_builtin_const (d->mask, d->name, ftype, d->code);
30899 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30900 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30901 ARRAY_SIZE (bdesc_multi_arg) - 1);
30903 /* Add CET intrinsics. */
30904 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30906 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30907 if (d->name == 0)
30908 continue;
30910 ftype = (enum ix86_builtin_func_type) d->flag;
30911 def_builtin2 (d->mask, d->name, ftype, d->code);
30913 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30914 IX86_BUILTIN__BDESC_CET_FIRST,
30915 ARRAY_SIZE (bdesc_cet) - 1);
30917 for (i = 0, d = bdesc_cet_rdssp;
30918 i < ARRAY_SIZE (bdesc_cet_rdssp);
30919 i++, d++)
30921 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30922 if (d->name == 0)
30923 continue;
30925 ftype = (enum ix86_builtin_func_type) d->flag;
30926 def_builtin2 (d->mask, d->name, ftype, d->code);
30928 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30929 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30930 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30933 static void
30934 ix86_init_mpx_builtins ()
30936 const struct builtin_description * d;
30937 enum ix86_builtin_func_type ftype;
30938 tree decl;
30939 size_t i;
30941 for (i = 0, d = bdesc_mpx;
30942 i < ARRAY_SIZE (bdesc_mpx);
30943 i++, d++)
30945 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30946 if (d->name == 0)
30947 continue;
30949 ftype = (enum ix86_builtin_func_type) d->flag;
30950 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30952 /* Without the leaf and nothrow flags on MPX builtins,
30953 abnormal edges may follow their calls when setjmp
30954 is present in the function. Since we may have a lot
30955 of MPX builtin calls, this causes lots of useless
30956 edges and enormous PHI nodes. To avoid this we mark
30957 MPX builtins as leaf and nothrow. */
30958 if (decl)
30960 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30961 NULL_TREE);
30962 TREE_NOTHROW (decl) = 1;
30964 else
30966 ix86_builtins_isa[(int)d->code].leaf_p = true;
30967 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30970 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30971 IX86_BUILTIN__BDESC_MPX_FIRST,
30972 ARRAY_SIZE (bdesc_mpx) - 1);
30974 for (i = 0, d = bdesc_mpx_const;
30975 i < ARRAY_SIZE (bdesc_mpx_const);
30976 i++, d++)
30978 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30979 if (d->name == 0)
30980 continue;
30982 ftype = (enum ix86_builtin_func_type) d->flag;
30983 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
30985 if (decl)
30987 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30988 NULL_TREE);
30989 TREE_NOTHROW (decl) = 1;
30991 else
30993 ix86_builtins_isa[(int)d->code].leaf_p = true;
30994 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30997 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30998 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30999 ARRAY_SIZE (bdesc_mpx_const) - 1);
31001 #undef BDESC_VERIFY
31002 #undef BDESC_VERIFYS
31004 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31005 to return a pointer to VERSION_DECL if the outcome of the expression
31006 formed by PREDICATE_CHAIN is true. This function will be called during
31007 version dispatch to decide which function version to execute. It returns
31008 the basic block at the end, to which more conditions can be added. */
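/* Illustrative sketch (not literal output): for a predicate chain that
   checks a single feature, the statements appended to NEW_BB amount to

     cond = __builtin_cpu_supports ("avx2");
     if (cond > 0)
       return (void *) &foo_avx2;

   where "avx2" and foo_avx2 stand in for the actual predicate argument
   and VERSION_DECL, and the false edge leads to the returned block.  */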
31010 static basic_block
31011 add_condition_to_bb (tree function_decl, tree version_decl,
31012 tree predicate_chain, basic_block new_bb)
31014 gimple *return_stmt;
31015 tree convert_expr, result_var;
31016 gimple *convert_stmt;
31017 gimple *call_cond_stmt;
31018 gimple *if_else_stmt;
31020 basic_block bb1, bb2, bb3;
31021 edge e12, e23;
31023 tree cond_var, and_expr_var = NULL_TREE;
31024 gimple_seq gseq;
31026 tree predicate_decl, predicate_arg;
31028 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31030 gcc_assert (new_bb != NULL);
31031 gseq = bb_seq (new_bb);
31034 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31035 build_fold_addr_expr (version_decl));
31036 result_var = create_tmp_var (ptr_type_node);
31037 convert_stmt = gimple_build_assign (result_var, convert_expr);
31038 return_stmt = gimple_build_return (result_var);
31040 if (predicate_chain == NULL_TREE)
31042 gimple_seq_add_stmt (&gseq, convert_stmt);
31043 gimple_seq_add_stmt (&gseq, return_stmt);
31044 set_bb_seq (new_bb, gseq);
31045 gimple_set_bb (convert_stmt, new_bb);
31046 gimple_set_bb (return_stmt, new_bb);
31047 pop_cfun ();
31048 return new_bb;
31051 while (predicate_chain != NULL)
31053 cond_var = create_tmp_var (integer_type_node);
31054 predicate_decl = TREE_PURPOSE (predicate_chain);
31055 predicate_arg = TREE_VALUE (predicate_chain);
31056 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31057 gimple_call_set_lhs (call_cond_stmt, cond_var);
31059 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31060 gimple_set_bb (call_cond_stmt, new_bb);
31061 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31063 predicate_chain = TREE_CHAIN (predicate_chain);
31065 if (and_expr_var == NULL)
31066 and_expr_var = cond_var;
31067 else
31069 gimple *assign_stmt;
31070 /* Use MIN_EXPR to check whether any integer is zero:
31071 and_expr_var = min_expr <cond_var, and_expr_var>. */
31072 assign_stmt = gimple_build_assign (and_expr_var,
31073 build2 (MIN_EXPR, integer_type_node,
31074 cond_var, and_expr_var));
31076 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31077 gimple_set_bb (assign_stmt, new_bb);
31078 gimple_seq_add_stmt (&gseq, assign_stmt);
31082 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31083 integer_zero_node,
31084 NULL_TREE, NULL_TREE);
31085 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31086 gimple_set_bb (if_else_stmt, new_bb);
31087 gimple_seq_add_stmt (&gseq, if_else_stmt);
31089 gimple_seq_add_stmt (&gseq, convert_stmt);
31090 gimple_seq_add_stmt (&gseq, return_stmt);
31091 set_bb_seq (new_bb, gseq);
31093 bb1 = new_bb;
31094 e12 = split_block (bb1, if_else_stmt);
31095 bb2 = e12->dest;
31096 e12->flags &= ~EDGE_FALLTHRU;
31097 e12->flags |= EDGE_TRUE_VALUE;
31099 e23 = split_block (bb2, return_stmt);
31101 gimple_set_bb (convert_stmt, bb2);
31102 gimple_set_bb (return_stmt, bb2);
31104 bb3 = e23->dest;
31105 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31107 remove_edge (e23);
31108 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31110 pop_cfun ();
31112 return bb3;
31115 /* This parses the attribute arguments to target in DECL and determines
31116 the right builtin to use to match the platform specification.
31117 It returns the priority value for this version decl. If PREDICATE_LIST
31118 is not NULL, it stores the list of cpu features that need to be checked
31119 before dispatching this function. */
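/* Illustrative example (hypothetical declaration): for a version declared
   with __attribute__ ((target ("arch=core2,ssse3"))), the predicate list
   receives __builtin_cpu_is ("core2") and __builtin_cpu_supports ("ssse3"),
   and the returned priority is the highest priority seen while parsing
   (P_PROC_SSSE3, from the core2 arch, in this case).  */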
31121 static unsigned int
31122 get_builtin_code_for_version (tree decl, tree *predicate_list)
31124 tree attrs;
31125 struct cl_target_option cur_target;
31126 tree target_node;
31127 struct cl_target_option *new_target;
31128 const char *arg_str = NULL;
31129 const char *attrs_str = NULL;
31130 char *tok_str = NULL;
31131 char *token;
31133 /* Priority of i386 features, greater value is higher priority. This is
31134 used to decide the order in which function dispatch must happen. For
31135 instance, a version specialized for SSE4.2 should be checked for dispatch
31136 before a version for SSE3, as SSE4.2 implies SSE3. */
31137 enum feature_priority
31139 P_ZERO = 0,
31140 P_MMX,
31141 P_SSE,
31142 P_SSE2,
31143 P_SSE3,
31144 P_SSSE3,
31145 P_PROC_SSSE3,
31146 P_SSE4_A,
31147 P_PROC_SSE4_A,
31148 P_SSE4_1,
31149 P_SSE4_2,
31150 P_PROC_SSE4_2,
31151 P_POPCNT,
31152 P_AES,
31153 P_PCLMUL,
31154 P_AVX,
31155 P_PROC_AVX,
31156 P_BMI,
31157 P_PROC_BMI,
31158 P_FMA4,
31159 P_XOP,
31160 P_PROC_XOP,
31161 P_FMA,
31162 P_PROC_FMA,
31163 P_BMI2,
31164 P_AVX2,
31165 P_PROC_AVX2,
31166 P_AVX512F,
31167 P_PROC_AVX512F
31170 enum feature_priority priority = P_ZERO;
31172 /* These are the target attribute strings for which a dispatcher is
31173 available, from fold_builtin_cpu. */
31175 static struct _feature_list
31177 const char *const name;
31178 const enum feature_priority priority;
31180 const feature_list[] =
31182 {"mmx", P_MMX},
31183 {"sse", P_SSE},
31184 {"sse2", P_SSE2},
31185 {"sse3", P_SSE3},
31186 {"sse4a", P_SSE4_A},
31187 {"ssse3", P_SSSE3},
31188 {"sse4.1", P_SSE4_1},
31189 {"sse4.2", P_SSE4_2},
31190 {"popcnt", P_POPCNT},
31191 {"aes", P_AES},
31192 {"pclmul", P_PCLMUL},
31193 {"avx", P_AVX},
31194 {"bmi", P_BMI},
31195 {"fma4", P_FMA4},
31196 {"xop", P_XOP},
31197 {"fma", P_FMA},
31198 {"bmi2", P_BMI2},
31199 {"avx2", P_AVX2},
31200 {"avx512f", P_AVX512F}
31204 static unsigned int NUM_FEATURES
31205 = sizeof (feature_list) / sizeof (struct _feature_list);
31207 unsigned int i;
31209 tree predicate_chain = NULL_TREE;
31210 tree predicate_decl, predicate_arg;
31212 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31213 gcc_assert (attrs != NULL);
31215 attrs = TREE_VALUE (TREE_VALUE (attrs));
31217 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31218 attrs_str = TREE_STRING_POINTER (attrs);
31220 /* Return priority zero for default function. */
31221 if (strcmp (attrs_str, "default") == 0)
31222 return 0;
31224 /* Handle arch= if specified. For priority, set it to be 1 more than
31225 the best instruction set the processor can handle. For instance, if
31226 there is a version for atom and a version for ssse3 (the highest ISA
31227 priority for atom), the atom version must be checked for dispatch
31228 before the ssse3 version. */
31229 if (strstr (attrs_str, "arch=") != NULL)
31231 cl_target_option_save (&cur_target, &global_options);
31232 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31233 &global_options_set);
31235 gcc_assert (target_node);
31236 new_target = TREE_TARGET_OPTION (target_node);
31237 gcc_assert (new_target);
31239 if (new_target->arch_specified && new_target->arch > 0)
31241 switch (new_target->arch)
31243 case PROCESSOR_CORE2:
31244 arg_str = "core2";
31245 priority = P_PROC_SSSE3;
31246 break;
31247 case PROCESSOR_NEHALEM:
31248 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31250 arg_str = "westmere";
31251 priority = P_AES;
31253 else
31255 /* We translate "arch=corei7" and "arch=nehalem" to
31256 "corei7" so that it will be mapped to M_INTEL_COREI7
31257 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31258 arg_str = "corei7";
31259 priority = P_PROC_SSE4_2;
31261 break;
31262 case PROCESSOR_SANDYBRIDGE:
31263 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31264 arg_str = "ivybridge";
31265 else
31266 arg_str = "sandybridge";
31267 priority = P_PROC_AVX;
31268 break;
31269 case PROCESSOR_HASWELL:
31270 case PROCESSOR_SKYLAKE_AVX512:
31271 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31272 arg_str = "cannonlake";
31273 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31274 arg_str = "skylake-avx512";
31275 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31276 arg_str = "skylake";
31277 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31278 arg_str = "broadwell";
31279 else
31280 arg_str = "haswell";
31281 priority = P_PROC_AVX2;
31282 break;
31283 case PROCESSOR_BONNELL:
31284 arg_str = "bonnell";
31285 priority = P_PROC_SSSE3;
31286 break;
31287 case PROCESSOR_KNL:
31288 arg_str = "knl";
31289 priority = P_PROC_AVX512F;
31290 break;
31291 case PROCESSOR_KNM:
31292 arg_str = "knm";
31293 priority = P_PROC_AVX512F;
31294 break;
31295 case PROCESSOR_SILVERMONT:
31296 arg_str = "silvermont";
31297 priority = P_PROC_SSE4_2;
31298 break;
31299 case PROCESSOR_AMDFAM10:
31300 arg_str = "amdfam10h";
31301 priority = P_PROC_SSE4_A;
31302 break;
31303 case PROCESSOR_BTVER1:
31304 arg_str = "btver1";
31305 priority = P_PROC_SSE4_A;
31306 break;
31307 case PROCESSOR_BTVER2:
31308 arg_str = "btver2";
31309 priority = P_PROC_BMI;
31310 break;
31311 case PROCESSOR_BDVER1:
31312 arg_str = "bdver1";
31313 priority = P_PROC_XOP;
31314 break;
31315 case PROCESSOR_BDVER2:
31316 arg_str = "bdver2";
31317 priority = P_PROC_FMA;
31318 break;
31319 case PROCESSOR_BDVER3:
31320 arg_str = "bdver3";
31321 priority = P_PROC_FMA;
31322 break;
31323 case PROCESSOR_BDVER4:
31324 arg_str = "bdver4";
31325 priority = P_PROC_AVX2;
31326 break;
31327 case PROCESSOR_ZNVER1:
31328 arg_str = "znver1";
31329 priority = P_PROC_AVX2;
31330 break;
31334 cl_target_option_restore (&global_options, &cur_target);
31336 if (predicate_list && arg_str == NULL)
31338 error_at (DECL_SOURCE_LOCATION (decl),
31339 "No dispatcher found for the versioning attributes");
31340 return 0;
31343 if (predicate_list)
31345 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31346 /* For a C string literal the length includes the trailing NULL. */
31347 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31348 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31349 predicate_chain);
31353 /* Process feature name. */
31354 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31355 strcpy (tok_str, attrs_str);
31356 token = strtok (tok_str, ",");
31357 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31359 while (token != NULL)
31361 /* Do not process "arch=" */
31362 if (strncmp (token, "arch=", 5) == 0)
31364 token = strtok (NULL, ",");
31365 continue;
31367 for (i = 0; i < NUM_FEATURES; ++i)
31369 if (strcmp (token, feature_list[i].name) == 0)
31371 if (predicate_list)
31373 predicate_arg = build_string_literal (
31374 strlen (feature_list[i].name) + 1,
31375 feature_list[i].name);
31376 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31377 predicate_chain);
31379 /* Find the maximum priority feature. */
31380 if (feature_list[i].priority > priority)
31381 priority = feature_list[i].priority;
31383 break;
31386 if (predicate_list && i == NUM_FEATURES)
31388 error_at (DECL_SOURCE_LOCATION (decl),
31389 "No dispatcher found for %s", token);
31390 return 0;
31392 token = strtok (NULL, ",");
31394 free (tok_str);
31396 if (predicate_list && predicate_chain == NULL_TREE)
31398 error_at (DECL_SOURCE_LOCATION (decl),
31399 "No dispatcher found for the versioning attributes : %s",
31400 attrs_str);
31401 return 0;
31403 else if (predicate_list)
31405 predicate_chain = nreverse (predicate_chain);
31406 *predicate_list = predicate_chain;
31409 return priority;
31412 /* This compares the priority of target features in function DECL1
31413 and DECL2. It returns positive value if DECL1 is higher priority,
31414 negative value if DECL2 is higher priority and 0 if they are the
31415 same. */
31417 static int
31418 ix86_compare_version_priority (tree decl1, tree decl2)
31420 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31421 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31423 return (int)priority1 - (int)priority2;
31426 /* V1 and V2 point to function versions with different priorities
31427 based on the target ISA. This function compares their priorities. */
31429 static int
31430 feature_compare (const void *v1, const void *v2)
31432 typedef struct _function_version_info
31434 tree version_decl;
31435 tree predicate_chain;
31436 unsigned int dispatch_priority;
31437 } function_version_info;
31439 const function_version_info c1 = *(const function_version_info *)v1;
31440 const function_version_info c2 = *(const function_version_info *)v2;
31441 return (c2.dispatch_priority - c1.dispatch_priority);
31444 /* This function generates the dispatch function for
31445 multi-versioned functions. DISPATCH_DECL is the function which will
31446 contain the dispatch logic. FNDECLS are the function choices for
31447 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
31448 in DISPATCH_DECL in which the dispatch code is generated. */
31450 static int
31451 dispatch_function_versions (tree dispatch_decl,
31452 void *fndecls_p,
31453 basic_block *empty_bb)
31455 tree default_decl;
31456 gimple *ifunc_cpu_init_stmt;
31457 gimple_seq gseq;
31458 int ix;
31459 tree ele;
31460 vec<tree> *fndecls;
31461 unsigned int num_versions = 0;
31462 unsigned int actual_versions = 0;
31463 unsigned int i;
31465 struct _function_version_info
31467 tree version_decl;
31468 tree predicate_chain;
31469 unsigned int dispatch_priority;
31470 }*function_version_info;
31472 gcc_assert (dispatch_decl != NULL
31473 && fndecls_p != NULL
31474 && empty_bb != NULL);
31476 /* fndecls_p is actually a vector. */
31477 fndecls = static_cast<vec<tree> *> (fndecls_p);
31479 /* At least one more version other than the default. */
31480 num_versions = fndecls->length ();
31481 gcc_assert (num_versions >= 2);
31483 function_version_info = (struct _function_version_info *)
31484 XNEWVEC (struct _function_version_info, (num_versions - 1));
31486 /* The first version in the vector is the default decl. */
31487 default_decl = (*fndecls)[0];
31489 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31491 gseq = bb_seq (*empty_bb);
31492 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31493 constructors, so explicitly call __builtin_cpu_init here. */
31494 ifunc_cpu_init_stmt = gimple_build_call_vec (
31495 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31496 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31497 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31498 set_bb_seq (*empty_bb, gseq);
31500 pop_cfun ();
31503 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31505 tree version_decl = ele;
31506 tree predicate_chain = NULL_TREE;
31507 unsigned int priority;
31508 /* Get attribute string, parse it and find the right predicate decl.
31509 The predicate function could be a lengthy combination of many
31510 features, like arch-type and various isa-variants. */
31511 priority = get_builtin_code_for_version (version_decl,
31512 &predicate_chain);
31514 if (predicate_chain == NULL_TREE)
31515 continue;
31517 function_version_info [actual_versions].version_decl = version_decl;
31518 function_version_info [actual_versions].predicate_chain
31519 = predicate_chain;
31520 function_version_info [actual_versions].dispatch_priority = priority;
31521 actual_versions++;
31524 /* Sort the versions according to descending order of dispatch priority. The
31525 priority is based on the ISA. This is not a perfect solution. There
31526 could still be ambiguity. If more than one function version is suitable
31527 to execute, which one should be dispatched? In future, allow the user
31528 to specify a dispatch priority next to the version. */
31529 qsort (function_version_info, actual_versions,
31530 sizeof (struct _function_version_info), feature_compare);
31532 for (i = 0; i < actual_versions; ++i)
31533 *empty_bb = add_condition_to_bb (dispatch_decl,
31534 function_version_info[i].version_decl,
31535 function_version_info[i].predicate_chain,
31536 *empty_bb);
31538 /* Dispatch the default version at the end. */
31539 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31540 NULL, *empty_bb);
31542 free (function_version_info);
31543 return 0;
31546 /* This function changes the assembler name for functions that are
31547 versions. If DECL is a function version and has a "target"
31548 attribute, it appends the attribute string to its assembler name. */
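/* For example, a version of a hypothetical function foo declared with
   target ("avx2") gets the assembler name "foo.avx2"; the "default"
   version keeps its original assembler name.  */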
31550 static tree
31551 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31553 tree version_attr;
31554 const char *orig_name, *version_string;
31555 char *attr_str, *assembler_name;
31557 if (DECL_DECLARED_INLINE_P (decl)
31558 && lookup_attribute ("gnu_inline",
31559 DECL_ATTRIBUTES (decl)))
31560 error_at (DECL_SOURCE_LOCATION (decl),
31561 "Function versions cannot be marked as gnu_inline,"
31562 " bodies have to be generated");
31564 if (DECL_VIRTUAL_P (decl)
31565 || DECL_VINDEX (decl))
31566 sorry ("Virtual function multiversioning not supported");
31568 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31570 /* target attribute string cannot be NULL. */
31571 gcc_assert (version_attr != NULL_TREE);
31573 orig_name = IDENTIFIER_POINTER (id);
31574 version_string
31575 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31577 if (strcmp (version_string, "default") == 0)
31578 return id;
31580 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31581 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31583 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31585 /* Allow assembler name to be modified if already set. */
31586 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31587 SET_DECL_RTL (decl, NULL);
31589 tree ret = get_identifier (assembler_name);
31590 XDELETEVEC (attr_str);
31591 XDELETEVEC (assembler_name);
31592 return ret;
31596 static tree
31597 ix86_mangle_decl_assembler_name (tree decl, tree id)
31599 /* For function version, add the target suffix to the assembler name. */
31600 if (TREE_CODE (decl) == FUNCTION_DECL
31601 && DECL_FUNCTION_VERSIONED (decl))
31602 id = ix86_mangle_function_version_assembler_name (decl, id);
31603 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31604 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31605 #endif
31607 return id;
31610 /* Make a dispatcher declaration for the multi-versioned function DECL.
31611 Calls to DECL function will be replaced with calls to the dispatcher
31612 by the front-end. Returns the decl of the dispatcher function. */
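/* Hedged user-level sketch of the multiversioning this serves (foo is
   hypothetical; the target attribute syntax is the documented one):

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2"))) int foo (void) { return 1; }
     __attribute__ ((target ("arch=haswell"))) int foo (void) { return 2; }

   Callers of foo are redirected to the dispatcher returned below.  */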
31614 static tree
31615 ix86_get_function_versions_dispatcher (void *decl)
31617 tree fn = (tree) decl;
31618 struct cgraph_node *node = NULL;
31619 struct cgraph_node *default_node = NULL;
31620 struct cgraph_function_version_info *node_v = NULL;
31621 struct cgraph_function_version_info *first_v = NULL;
31623 tree dispatch_decl = NULL;
31625 struct cgraph_function_version_info *default_version_info = NULL;
31627 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31629 node = cgraph_node::get (fn);
31630 gcc_assert (node != NULL);
31632 node_v = node->function_version ();
31633 gcc_assert (node_v != NULL);
31635 if (node_v->dispatcher_resolver != NULL)
31636 return node_v->dispatcher_resolver;
31638 /* Find the default version and make it the first node. */
31639 first_v = node_v;
31640 /* Go to the beginning of the chain. */
31641 while (first_v->prev != NULL)
31642 first_v = first_v->prev;
31643 default_version_info = first_v;
31644 while (default_version_info != NULL)
31646 if (is_function_default_version
31647 (default_version_info->this_node->decl))
31648 break;
31649 default_version_info = default_version_info->next;
31652 /* If there is no default node, just return NULL. */
31653 if (default_version_info == NULL)
31654 return NULL;
31656 /* Make default info the first node. */
31657 if (first_v != default_version_info)
31659 default_version_info->prev->next = default_version_info->next;
31660 if (default_version_info->next)
31661 default_version_info->next->prev = default_version_info->prev;
31662 first_v->prev = default_version_info;
31663 default_version_info->next = first_v;
31664 default_version_info->prev = NULL;
31667 default_node = default_version_info->this_node;
31669 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31670 if (targetm.has_ifunc_p ())
31672 struct cgraph_function_version_info *it_v = NULL;
31673 struct cgraph_node *dispatcher_node = NULL;
31674 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31676 /* Right now, the dispatching is done via ifunc. */
31677 dispatch_decl = make_dispatcher_decl (default_node->decl);
31679 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31680 gcc_assert (dispatcher_node != NULL);
31681 dispatcher_node->dispatcher_function = 1;
31682 dispatcher_version_info
31683 = dispatcher_node->insert_new_function_version ();
31684 dispatcher_version_info->next = default_version_info;
31685 dispatcher_node->definition = 1;
31687 /* Set the dispatcher for all the versions. */
31688 it_v = default_version_info;
31689 while (it_v != NULL)
31691 it_v->dispatcher_resolver = dispatch_decl;
31692 it_v = it_v->next;
31695 else
31696 #endif
31698 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31699 "multiversioning needs ifunc which is not supported "
31700 "on this target");
31703 return dispatch_decl;
31706 /* Make the resolver function decl to dispatch the versions of
31707 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
31708 ifunc alias that will point to the created resolver. Create an
31709 empty basic block in the resolver and store the pointer in
31710 EMPTY_BB. Return the decl of the resolver function. */
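/* Simplified sketch of the resolver that ends up being emitted for a
   hypothetical versioned function foo (the real body is built in GIMPLE
   by dispatch_function_versions):

     void *foo.resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
	 return foo.avx2;
       return foo;
     }
*/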
31712 static tree
31713 make_resolver_func (const tree default_decl,
31714 const tree ifunc_alias_decl,
31715 basic_block *empty_bb)
31717 char *resolver_name;
31718 tree decl, type, decl_name, t;
31720 /* IFUNC's have to be globally visible. So, if the default_decl is
31721 not, then the name of the IFUNC should be made unique. */
31722 if (TREE_PUBLIC (default_decl) == 0)
31724 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31725 symtab->change_decl_assembler_name (ifunc_alias_decl,
31726 get_identifier (ifunc_name));
31727 XDELETEVEC (ifunc_name);
31730 resolver_name = make_unique_name (default_decl, "resolver", false);
31732 /* The resolver function should return a (void *). */
31733 type = build_function_type_list (ptr_type_node, NULL_TREE);
31735 decl = build_fn_decl (resolver_name, type);
31736 decl_name = get_identifier (resolver_name);
31737 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31739 DECL_NAME (decl) = decl_name;
31740 TREE_USED (decl) = 1;
31741 DECL_ARTIFICIAL (decl) = 1;
31742 DECL_IGNORED_P (decl) = 1;
31743 TREE_PUBLIC (decl) = 0;
31744 DECL_UNINLINABLE (decl) = 1;
31746 /* Resolver is not external, body is generated. */
31747 DECL_EXTERNAL (decl) = 0;
31748 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31750 DECL_CONTEXT (decl) = NULL_TREE;
31751 DECL_INITIAL (decl) = make_node (BLOCK);
31752 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31754 if (DECL_COMDAT_GROUP (default_decl)
31755 || TREE_PUBLIC (default_decl))
31757 /* In this case, each translation unit with a call to this
31758 versioned function will put out a resolver. Ensure it
31759 is comdat to keep just one copy. */
31760 DECL_COMDAT (decl) = 1;
31761 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31763 /* Build result decl and add to function_decl. */
31764 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31765 DECL_ARTIFICIAL (t) = 1;
31766 DECL_IGNORED_P (t) = 1;
31767 DECL_RESULT (decl) = t;
31769 gimplify_function_tree (decl);
31770 push_cfun (DECL_STRUCT_FUNCTION (decl));
31771 *empty_bb = init_lowered_empty_function (decl, false,
31772 profile_count::uninitialized ());
31774 cgraph_node::add_new_function (decl, true);
31775 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31777 pop_cfun ();
31779 gcc_assert (ifunc_alias_decl != NULL);
31780 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31781 DECL_ATTRIBUTES (ifunc_alias_decl)
31782 = make_attribute ("ifunc", resolver_name,
31783 DECL_ATTRIBUTES (ifunc_alias_decl));
31785 /* Create the alias for dispatch to resolver here. */
31786 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31787 XDELETEVEC (resolver_name);
31788 return decl;
31791 /* Generate the dispatching code body to dispatch multi-versioned function
31792 DECL. The target hook is called to process the "target" attributes and
31793 provide the code to dispatch the right function at run-time. NODE points
31794 to the dispatcher decl whose body will be created. */
31796 static tree
31797 ix86_generate_version_dispatcher_body (void *node_p)
31799 tree resolver_decl;
31800 basic_block empty_bb;
31801 tree default_ver_decl;
31802 struct cgraph_node *versn;
31803 struct cgraph_node *node;
31805 struct cgraph_function_version_info *node_version_info = NULL;
31806 struct cgraph_function_version_info *versn_info = NULL;
31808 node = (cgraph_node *)node_p;
31810 node_version_info = node->function_version ();
31811 gcc_assert (node->dispatcher_function
31812 && node_version_info != NULL);
31814 if (node_version_info->dispatcher_resolver)
31815 return node_version_info->dispatcher_resolver;
31817 /* The first version in the chain corresponds to the default version. */
31818 default_ver_decl = node_version_info->next->this_node->decl;
31820 /* node is going to be an alias, so remove the finalized bit. */
31821 node->definition = false;
31823 resolver_decl = make_resolver_func (default_ver_decl,
31824 node->decl, &empty_bb);
31826 node_version_info->dispatcher_resolver = resolver_decl;
31828 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31830 auto_vec<tree, 2> fn_ver_vec;
31832 for (versn_info = node_version_info->next; versn_info;
31833 versn_info = versn_info->next)
31835 versn = versn_info->this_node;
31836 /* Check for virtual functions here again, as by this time it should
31837 have been determined if this function needs a vtable index or
31838 not. This happens for methods in derived classes that override
31839 virtual methods in base classes but are not explicitly marked as
31840 virtual. */
31841 if (DECL_VINDEX (versn->decl))
31842 sorry ("Virtual function multiversioning not supported");
31844 fn_ver_vec.safe_push (versn->decl);
31847 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31848 cgraph_edge::rebuild_edges ();
31849 pop_cfun ();
31850 return resolver_decl;
31852 /* This builds the processor_model struct type defined in
31853 libgcc/config/i386/cpuinfo.c */
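/* For reference, the struct mirrored here is declared in
   libgcc/config/i386/cpuinfo.c roughly as:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/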
31855 static tree
31856 build_processor_model_struct (void)
31858 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31859 "__cpu_features"};
31860 tree field = NULL_TREE, field_chain = NULL_TREE;
31861 int i;
31862 tree type = make_node (RECORD_TYPE);
31864 /* The first 3 fields are unsigned int. */
31865 for (i = 0; i < 3; ++i)
31867 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31868 get_identifier (field_name[i]), unsigned_type_node);
31869 if (field_chain != NULL_TREE)
31870 DECL_CHAIN (field) = field_chain;
31871 field_chain = field;
31874 /* The last field is an array of unsigned integers of size one. */
31875 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31876 get_identifier (field_name[3]),
31877 build_array_type (unsigned_type_node,
31878 build_index_type (size_one_node)));
31879 if (field_chain != NULL_TREE)
31880 DECL_CHAIN (field) = field_chain;
31881 field_chain = field;
31883 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31884 return type;
31887 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31889 static tree
31890 make_var_decl (tree type, const char *name)
31892 tree new_decl;
31894 new_decl = build_decl (UNKNOWN_LOCATION,
31895 VAR_DECL,
31896 get_identifier(name),
31897 type);
31899 DECL_EXTERNAL (new_decl) = 1;
31900 TREE_STATIC (new_decl) = 1;
31901 TREE_PUBLIC (new_decl) = 1;
31902 DECL_INITIAL (new_decl) = 0;
31903 DECL_ARTIFICIAL (new_decl) = 0;
31904 DECL_PRESERVE_P (new_decl) = 1;
31906 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31907 assemble_variable (new_decl, 0, 0, 0);
31909 return new_decl;
31912 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31913 into an integer defined in libgcc/config/i386/cpuinfo.c */
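/* Informally, __builtin_cpu_is ("amd") folds to something equivalent to

     (int) (__cpu_model.__cpu_vendor == M_AMD)

   and __builtin_cpu_supports ("avx2") folds to

     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

   (illustrative only; the trees are built explicitly below).  */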
31915 static tree
31916 fold_builtin_cpu (tree fndecl, tree *args)
31918 unsigned int i;
31919 enum ix86_builtins fn_code = (enum ix86_builtins)
31920 DECL_FUNCTION_CODE (fndecl);
31921 tree param_string_cst = NULL;
31923 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31924 enum processor_features
31926 F_CMOV = 0,
31927 F_MMX,
31928 F_POPCNT,
31929 F_SSE,
31930 F_SSE2,
31931 F_SSE3,
31932 F_SSSE3,
31933 F_SSE4_1,
31934 F_SSE4_2,
31935 F_AVX,
31936 F_AVX2,
31937 F_SSE4_A,
31938 F_FMA4,
31939 F_XOP,
31940 F_FMA,
31941 F_AVX512F,
31942 F_BMI,
31943 F_BMI2,
31944 F_AES,
31945 F_PCLMUL,
31946 F_AVX512VL,
31947 F_AVX512BW,
31948 F_AVX512DQ,
31949 F_AVX512CD,
31950 F_AVX512ER,
31951 F_AVX512PF,
31952 F_AVX512VBMI,
31953 F_AVX512IFMA,
31954 F_AVX5124VNNIW,
31955 F_AVX5124FMAPS,
31956 F_AVX512VPOPCNTDQ,
31957 F_MAX
31960 /* These are the values for vendor types and cpu types and subtypes
31961 in cpuinfo.c. Cpu type and subtype values have the corresponding
31962 start value subtracted from them. */
31963 enum processor_model
31965 M_INTEL = 1,
31966 M_AMD,
31967 M_CPU_TYPE_START,
31968 M_INTEL_BONNELL,
31969 M_INTEL_CORE2,
31970 M_INTEL_COREI7,
31971 M_AMDFAM10H,
31972 M_AMDFAM15H,
31973 M_INTEL_SILVERMONT,
31974 M_INTEL_KNL,
31975 M_AMD_BTVER1,
31976 M_AMD_BTVER2,
31977 M_AMDFAM17H,
31978 M_INTEL_KNM,
31979 M_CPU_SUBTYPE_START,
31980 M_INTEL_COREI7_NEHALEM,
31981 M_INTEL_COREI7_WESTMERE,
31982 M_INTEL_COREI7_SANDYBRIDGE,
31983 M_AMDFAM10H_BARCELONA,
31984 M_AMDFAM10H_SHANGHAI,
31985 M_AMDFAM10H_ISTANBUL,
31986 M_AMDFAM15H_BDVER1,
31987 M_AMDFAM15H_BDVER2,
31988 M_AMDFAM15H_BDVER3,
31989 M_AMDFAM15H_BDVER4,
31990 M_AMDFAM17H_ZNVER1,
31991 M_INTEL_COREI7_IVYBRIDGE,
31992 M_INTEL_COREI7_HASWELL,
31993 M_INTEL_COREI7_BROADWELL,
31994 M_INTEL_COREI7_SKYLAKE,
31995 M_INTEL_COREI7_SKYLAKE_AVX512,
31996 M_INTEL_COREI7_CANNONLAKE
31999 static struct _arch_names_table
32001 const char *const name;
32002 const enum processor_model model;
32004 const arch_names_table[] =
32006 {"amd", M_AMD},
32007 {"intel", M_INTEL},
32008 {"atom", M_INTEL_BONNELL},
32009 {"slm", M_INTEL_SILVERMONT},
32010 {"core2", M_INTEL_CORE2},
32011 {"corei7", M_INTEL_COREI7},
32012 {"nehalem", M_INTEL_COREI7_NEHALEM},
32013 {"westmere", M_INTEL_COREI7_WESTMERE},
32014 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32015 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32016 {"haswell", M_INTEL_COREI7_HASWELL},
32017 {"broadwell", M_INTEL_COREI7_BROADWELL},
32018 {"skylake", M_INTEL_COREI7_SKYLAKE},
32019 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32020 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32021 {"bonnell", M_INTEL_BONNELL},
32022 {"silvermont", M_INTEL_SILVERMONT},
32023 {"knl", M_INTEL_KNL},
32024 {"knm", M_INTEL_KNM},
32025 {"amdfam10h", M_AMDFAM10H},
32026 {"barcelona", M_AMDFAM10H_BARCELONA},
32027 {"shanghai", M_AMDFAM10H_SHANGHAI},
32028 {"istanbul", M_AMDFAM10H_ISTANBUL},
32029 {"btver1", M_AMD_BTVER1},
32030 {"amdfam15h", M_AMDFAM15H},
32031 {"bdver1", M_AMDFAM15H_BDVER1},
32032 {"bdver2", M_AMDFAM15H_BDVER2},
32033 {"bdver3", M_AMDFAM15H_BDVER3},
32034 {"bdver4", M_AMDFAM15H_BDVER4},
32035 {"btver2", M_AMD_BTVER2},
32036 {"amdfam17h", M_AMDFAM17H},
32037 {"znver1", M_AMDFAM17H_ZNVER1},
32040 static struct _isa_names_table
32042 const char *const name;
32043 const enum processor_features feature;
32045 const isa_names_table[] =
32047 {"cmov", F_CMOV},
32048 {"mmx", F_MMX},
32049 {"popcnt", F_POPCNT},
32050 {"sse", F_SSE},
32051 {"sse2", F_SSE2},
32052 {"sse3", F_SSE3},
32053 {"ssse3", F_SSSE3},
32054 {"sse4a", F_SSE4_A},
32055 {"sse4.1", F_SSE4_1},
32056 {"sse4.2", F_SSE4_2},
32057 {"avx", F_AVX},
32058 {"fma4", F_FMA4},
32059 {"xop", F_XOP},
32060 {"fma", F_FMA},
32061 {"avx2", F_AVX2},
32062 {"avx512f", F_AVX512F},
32063 {"bmi", F_BMI},
32064 {"bmi2", F_BMI2},
32065 {"aes", F_AES},
32066 {"pclmul", F_PCLMUL},
32067 {"avx512vl",F_AVX512VL},
32068 {"avx512bw",F_AVX512BW},
32069 {"avx512dq",F_AVX512DQ},
32070 {"avx512cd",F_AVX512CD},
32071 {"avx512er",F_AVX512ER},
32072 {"avx512pf",F_AVX512PF},
32073 {"avx512vbmi",F_AVX512VBMI},
32074 {"avx512ifma",F_AVX512IFMA},
32075 {"avx5124vnniw",F_AVX5124VNNIW},
32076 {"avx5124fmaps",F_AVX5124FMAPS},
32077 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32080 tree __processor_model_type = build_processor_model_struct ();
32081 tree __cpu_model_var = make_var_decl (__processor_model_type,
32082 "__cpu_model");
32085 varpool_node::add (__cpu_model_var);
32087 gcc_assert ((args != NULL) && (*args != NULL));
32089 param_string_cst = *args;
32090 while (param_string_cst
32091 && TREE_CODE (param_string_cst) != STRING_CST)
32093 /* *args must be an expr that can contain other EXPRS leading to a
32094 STRING_CST. */
32095 if (!EXPR_P (param_string_cst))
32097 error ("Parameter to builtin must be a string constant or literal");
32098 return integer_zero_node;
32100 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32103 gcc_assert (param_string_cst);
32105 if (fn_code == IX86_BUILTIN_CPU_IS)
32107 tree ref;
32108 tree field;
32109 tree final;
32111 unsigned int field_val = 0;
32112 unsigned int NUM_ARCH_NAMES
32113 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32115 for (i = 0; i < NUM_ARCH_NAMES; i++)
32116 if (strcmp (arch_names_table[i].name,
32117 TREE_STRING_POINTER (param_string_cst)) == 0)
32118 break;
32120 if (i == NUM_ARCH_NAMES)
32122 error ("Parameter to builtin not valid: %s",
32123 TREE_STRING_POINTER (param_string_cst));
32124 return integer_zero_node;
32127 field = TYPE_FIELDS (__processor_model_type);
32128 field_val = arch_names_table[i].model;
32130 /* CPU types are stored in the next field. */
32131 if (field_val > M_CPU_TYPE_START
32132 && field_val < M_CPU_SUBTYPE_START)
32134 field = DECL_CHAIN (field);
32135 field_val -= M_CPU_TYPE_START;
32138 /* CPU subtypes are stored in the next field. */
32139 if (field_val > M_CPU_SUBTYPE_START)
32141 field = DECL_CHAIN (DECL_CHAIN (field));
32142 field_val -= M_CPU_SUBTYPE_START;
32145 /* Get the appropriate field in __cpu_model. */
32146 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32147 field, NULL_TREE);
32149 /* Check the value. */
32150 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32151 build_int_cstu (unsigned_type_node, field_val));
32152 return build1 (CONVERT_EXPR, integer_type_node, final);
32154 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32156 tree ref;
32157 tree array_elt;
32158 tree field;
32159 tree final;
32161 unsigned int field_val = 0;
32162 unsigned int NUM_ISA_NAMES
32163 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32165 for (i = 0; i < NUM_ISA_NAMES; i++)
32166 if (strcmp (isa_names_table[i].name,
32167 TREE_STRING_POINTER (param_string_cst)) == 0)
32168 break;
32170 if (i == NUM_ISA_NAMES)
32172 error ("Parameter to builtin not valid: %s",
32173 TREE_STRING_POINTER (param_string_cst));
32174 return integer_zero_node;
32177 field = TYPE_FIELDS (__processor_model_type);
32178 /* Get the last field, which is __cpu_features. */
32179 while (DECL_CHAIN (field))
32180 field = DECL_CHAIN (field);
32182 /* Get the appropriate field: __cpu_model.__cpu_features */
32183 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32184 field, NULL_TREE);
32186 /* Access the 0th element of __cpu_features array. */
32187 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32188 integer_zero_node, NULL_TREE, NULL_TREE);
32190 field_val = (1 << isa_names_table[i].feature);
32191 /* Return __cpu_model.__cpu_features[0] & field_val */
32192 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32193 build_int_cstu (unsigned_type_node, field_val));
32194 return build1 (CONVERT_EXPR, integer_type_node, final);
32196 gcc_unreachable ();
32199 static tree
32200 ix86_fold_builtin (tree fndecl, int n_args,
32201 tree *args, bool ignore ATTRIBUTE_UNUSED)
32203 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32205 enum ix86_builtins fn_code = (enum ix86_builtins)
32206 DECL_FUNCTION_CODE (fndecl);
32207 switch (fn_code)
32209 case IX86_BUILTIN_CPU_IS:
32210 case IX86_BUILTIN_CPU_SUPPORTS:
32211 gcc_assert (n_args == 1);
32212 return fold_builtin_cpu (fndecl, args);
32214 case IX86_BUILTIN_NANQ:
32215 case IX86_BUILTIN_NANSQ:
32217 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32218 const char *str = c_getstr (*args);
32219 int quiet = fn_code == IX86_BUILTIN_NANQ;
32220 REAL_VALUE_TYPE real;
32222 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32223 return build_real (type, real);
32224 return NULL_TREE;
32227 case IX86_BUILTIN_INFQ:
32228 case IX86_BUILTIN_HUGE_VALQ:
32230 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32231 REAL_VALUE_TYPE inf;
32232 real_inf (&inf);
32233 return build_real (type, inf);
32236 case IX86_BUILTIN_TZCNT16:
32237 case IX86_BUILTIN_CTZS:
32238 case IX86_BUILTIN_TZCNT32:
32239 case IX86_BUILTIN_TZCNT64:
32240 gcc_assert (n_args == 1);
32241 if (TREE_CODE (args[0]) == INTEGER_CST)
32243 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32244 tree arg = args[0];
32245 if (fn_code == IX86_BUILTIN_TZCNT16
32246 || fn_code == IX86_BUILTIN_CTZS)
32247 arg = fold_convert (short_unsigned_type_node, arg);
32248 if (integer_zerop (arg))
32249 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32250 else
32251 return fold_const_call (CFN_CTZ, type, arg);
32253 break;
32255 case IX86_BUILTIN_LZCNT16:
32256 case IX86_BUILTIN_CLZS:
32257 case IX86_BUILTIN_LZCNT32:
32258 case IX86_BUILTIN_LZCNT64:
32259 gcc_assert (n_args == 1);
32260 if (TREE_CODE (args[0]) == INTEGER_CST)
32262 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32263 tree arg = args[0];
32264 if (fn_code == IX86_BUILTIN_LZCNT16
32265 || fn_code == IX86_BUILTIN_CLZS)
32266 arg = fold_convert (short_unsigned_type_node, arg);
32267 if (integer_zerop (arg))
32268 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32269 else
32270 return fold_const_call (CFN_CLZ, type, arg);
32272 break;
32274 case IX86_BUILTIN_BEXTR32:
32275 case IX86_BUILTIN_BEXTR64:
32276 case IX86_BUILTIN_BEXTRI32:
32277 case IX86_BUILTIN_BEXTRI64:
32278 gcc_assert (n_args == 2);
32279 if (tree_fits_uhwi_p (args[1]))
32281 unsigned HOST_WIDE_INT res = 0;
32282 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32283 unsigned int start = tree_to_uhwi (args[1]);
32284 unsigned int len = (start & 0xff00) >> 8;
32285 start &= 0xff;
32286 if (start >= prec || len == 0)
32287 res = 0;
32288 else if (!tree_fits_uhwi_p (args[0]))
32289 break;
32290 else
32291 res = tree_to_uhwi (args[0]) >> start;
32292 if (len > prec)
32293 len = prec;
32294 if (len < HOST_BITS_PER_WIDE_INT)
32295 res &= (HOST_WIDE_INT_1U << len) - 1;
32296 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32298 break;
32300 case IX86_BUILTIN_BZHI32:
32301 case IX86_BUILTIN_BZHI64:
32302 gcc_assert (n_args == 2);
32303 if (tree_fits_uhwi_p (args[1]))
32305 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32306 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32307 return args[0];
32308 if (!tree_fits_uhwi_p (args[0]))
32309 break;
32310 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32311 res &= ~(HOST_WIDE_INT_M1U << idx);
32312 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32314 break;
32316 case IX86_BUILTIN_PDEP32:
32317 case IX86_BUILTIN_PDEP64:
32318 gcc_assert (n_args == 2);
32319 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32321 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32322 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32323 unsigned HOST_WIDE_INT res = 0;
32324 unsigned HOST_WIDE_INT m, k = 1;
32325 for (m = 1; m; m <<= 1)
32326 if ((mask & m) != 0)
32328 if ((src & k) != 0)
32329 res |= m;
32330 k <<= 1;
32332 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32334 break;
32336 case IX86_BUILTIN_PEXT32:
32337 case IX86_BUILTIN_PEXT64:
32338 gcc_assert (n_args == 2);
32339 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32341 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32342 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32343 unsigned HOST_WIDE_INT res = 0;
32344 unsigned HOST_WIDE_INT m, k = 1;
32345 for (m = 1; m; m <<= 1)
32346 if ((mask & m) != 0)
32348 if ((src & m) != 0)
32349 res |= k;
32350 k <<= 1;
32352 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32354 break;
32356 default:
32357 break;
32361 #ifdef SUBTARGET_FOLD_BUILTIN
32362 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32363 #endif
32365 return NULL_TREE;
32368 /* Fold a MD builtin (use ix86_fold_builtin for folding into
32369 constant) in GIMPLE. */
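/* E.g. a TZCNT32 builtin call whose argument is provably non-zero is
   rewritten into a call to __builtin_ctz, and BZHI/PDEP/PEXT calls with
   trivial masks are replaced by a plain copy of the first argument
   (see the cases below).  */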
32371 bool
32372 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32374 gimple *stmt = gsi_stmt (*gsi);
32375 tree fndecl = gimple_call_fndecl (stmt);
32376 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32377 int n_args = gimple_call_num_args (stmt);
32378 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32379 tree decl = NULL_TREE;
32380 tree arg0, arg1;
32382 switch (fn_code)
32384 case IX86_BUILTIN_TZCNT32:
32385 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32386 goto fold_tzcnt_lzcnt;
32388 case IX86_BUILTIN_TZCNT64:
32389 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32390 goto fold_tzcnt_lzcnt;
32392 case IX86_BUILTIN_LZCNT32:
32393 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32394 goto fold_tzcnt_lzcnt;
32396 case IX86_BUILTIN_LZCNT64:
32397 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32398 goto fold_tzcnt_lzcnt;
32400 fold_tzcnt_lzcnt:
32401 gcc_assert (n_args == 1);
32402 arg0 = gimple_call_arg (stmt, 0);
32403 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32405 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32406 /* If arg0 is provably non-zero, optimize it into the generic
32407 __builtin_c[tl]z{,ll} function, which the middle-end handles
32408 better. */
32409 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32410 return false;
32412 location_t loc = gimple_location (stmt);
32413 gimple *g = gimple_build_call (decl, 1, arg0);
32414 gimple_set_location (g, loc);
32415 tree lhs = make_ssa_name (integer_type_node);
32416 gimple_call_set_lhs (g, lhs);
32417 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32418 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32419 gimple_set_location (g, loc);
32420 gsi_replace (gsi, g, false);
32421 return true;
32423 break;
32425 case IX86_BUILTIN_BZHI32:
32426 case IX86_BUILTIN_BZHI64:
32427 gcc_assert (n_args == 2);
32428 arg1 = gimple_call_arg (stmt, 1);
32429 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32431 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32432 arg0 = gimple_call_arg (stmt, 0);
32433 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32434 break;
32435 location_t loc = gimple_location (stmt);
32436 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32437 gimple_set_location (g, loc);
32438 gsi_replace (gsi, g, false);
32439 return true;
32441 break;
32443 case IX86_BUILTIN_PDEP32:
32444 case IX86_BUILTIN_PDEP64:
32445 case IX86_BUILTIN_PEXT32:
32446 case IX86_BUILTIN_PEXT64:
32447 gcc_assert (n_args == 2);
32448 arg1 = gimple_call_arg (stmt, 1);
32449 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32451 location_t loc = gimple_location (stmt);
32452 arg0 = gimple_call_arg (stmt, 0);
32453 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32454 gimple_set_location (g, loc);
32455 gsi_replace (gsi, g, false);
32456 return true;
32458 break;
32460 default:
32461 break;
32464 return false;
32467 /* Make builtins to detect cpu type and features supported. NAME is
32468 the builtin name, CODE is the builtin code, and FTYPE is the function
32469 type of the builtin. */
32471 static void
32472 make_cpu_type_builtin (const char* name, int code,
32473 enum ix86_builtin_func_type ftype, bool is_const)
32475 tree decl;
32476 tree type;
32478 type = ix86_get_builtin_func_type (ftype);
32479 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32480 NULL, NULL_TREE);
32481 gcc_assert (decl != NULL_TREE);
32482 ix86_builtins[(int) code] = decl;
32483 TREE_READONLY (decl) = is_const;
32486 /* Make builtins to get the CPU type and supported features. The created
32487 builtins are:
32489 __builtin_cpu_init (), to detect cpu type and features,
32490 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32491 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
32494 static void
32495 ix86_init_platform_type_builtins (void)
32497 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32498 INT_FTYPE_VOID, false);
32499 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32500 INT_FTYPE_PCCHAR, true);
32501 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32502 INT_FTYPE_PCCHAR, true);
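/* A minimal user-level sketch of the three builtins registered above; the
   "intel" and "avx2" strings are examples of the names the GCC manual
   documents for __builtin_cpu_is and __builtin_cpu_supports:

     #include <stdio.h>

     int
     main (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("intel"))
         puts ("running on an Intel CPU");
       if (__builtin_cpu_supports ("avx2"))
         puts ("AVX2 is available");
       return 0;
     }
*/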
32505 /* Internal method for ix86_init_builtins. */
32507 static void
32508 ix86_init_builtins_va_builtins_abi (void)
32510 tree ms_va_ref, sysv_va_ref;
32511 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32512 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32513 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32514 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32516 if (!TARGET_64BIT)
32517 return;
32518 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32519 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32520 ms_va_ref = build_reference_type (ms_va_list_type_node);
32521 sysv_va_ref =
32522 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32524 fnvoid_va_end_ms =
32525 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32526 fnvoid_va_start_ms =
32527 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32528 fnvoid_va_end_sysv =
32529 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32530 fnvoid_va_start_sysv =
32531 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32532 NULL_TREE);
32533 fnvoid_va_copy_ms =
32534 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32535 NULL_TREE);
32536 fnvoid_va_copy_sysv =
32537 build_function_type_list (void_type_node, sysv_va_ref,
32538 sysv_va_ref, NULL_TREE);
32540 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32541 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32542 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32543 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32544 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32545 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32546 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32547 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32548 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32549 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32550 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32551 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
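/* A hedged sketch of how the ABI-specific va builtins created above are
   used from 64-bit user code; the exact spellings follow the GCC manual
   and should be treated as illustrative:

     int __attribute__ ((ms_abi))
     sum_ints (int count, ...)
     {
       __builtin_ms_va_list ap;
       int i, total = 0;
       __builtin_ms_va_start (ap, count);
       for (i = 0; i < count; i++)
         total += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return total;
     }
*/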
32554 static void
32555 ix86_init_builtin_types (void)
32557 tree float80_type_node, const_string_type_node;
32559 /* The __float80 type. */
32560 float80_type_node = long_double_type_node;
32561 if (TYPE_MODE (float80_type_node) != XFmode)
32563 if (float64x_type_node != NULL_TREE
32564 && TYPE_MODE (float64x_type_node) == XFmode)
32565 float80_type_node = float64x_type_node;
32566 else
32568 /* The __float80 type. */
32569 float80_type_node = make_node (REAL_TYPE);
32571 TYPE_PRECISION (float80_type_node) = 80;
32572 layout_type (float80_type_node);
32575 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32577 /* The __float128 type. The node has already been created as
32578 _Float128, so we only need to register the __float128 name for
32579 it. */
32580 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32582 const_string_type_node
32583 = build_pointer_type (build_qualified_type
32584 (char_type_node, TYPE_QUAL_CONST));
32586 /* This macro is built by i386-builtin-types.awk. */
32587 DEFINE_BUILTIN_PRIMITIVE_TYPES;
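/* User-visible effect of the type registrations above, as a small sketch.
   The 'w' and 'q' literal suffixes are the usual x86 spellings for these
   types and are an assumption here, not something this function sets up:

     __float80  ext  = 1.5w;    // 80-bit x87 extended precision
     __float128 quad = 1.5q;    // 128-bit quad precision
     _Static_assert (sizeof (__float128) == 16, "__float128 is 16 bytes");
*/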
32590 static void
32591 ix86_init_builtins (void)
32593 tree ftype, decl;
32595 ix86_init_builtin_types ();
32597 /* Builtins to get CPU type and features. */
32598 ix86_init_platform_type_builtins ();
32600 /* TFmode support builtins. */
32601 def_builtin_const (0, "__builtin_infq",
32602 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32603 def_builtin_const (0, "__builtin_huge_valq",
32604 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32606 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32607 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32608 BUILT_IN_MD, "nanq", NULL_TREE);
32609 TREE_READONLY (decl) = 1;
32610 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32612 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32613 BUILT_IN_MD, "nansq", NULL_TREE);
32614 TREE_READONLY (decl) = 1;
32615 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32617 /* We will expand them to a normal call if SSE isn't available, since
32618 they are used by libgcc. */
32619 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32620 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32621 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32622 TREE_READONLY (decl) = 1;
32623 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32625 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32626 decl = add_builtin_function ("__builtin_copysignq", ftype,
32627 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32628 "__copysigntf3", NULL_TREE);
32629 TREE_READONLY (decl) = 1;
32630 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32632 ix86_init_tm_builtins ();
32633 ix86_init_mmx_sse_builtins ();
32634 ix86_init_mpx_builtins ();
32636 if (TARGET_LP64)
32637 ix86_init_builtins_va_builtins_abi ();
32639 #ifdef SUBTARGET_INIT_BUILTINS
32640 SUBTARGET_INIT_BUILTINS;
32641 #endif
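/* Sketch of the TFmode helper builtins registered in this function as they
   appear in user code (illustrative only; they fall back to libgcc's
   __fabstf2 and __copysigntf3 when SSE is unavailable):

     __float128 pinf = __builtin_infq ();
     __float128 qnan = __builtin_nanq ("");
     __float128 mag  = __builtin_fabsq (-3.0);
     __float128 neg  = __builtin_copysignq (mag, -1.0);
*/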
32644 /* Return the ix86 builtin for CODE. */
32646 static tree
32647 ix86_builtin_decl (unsigned code, bool)
32649 if (code >= IX86_BUILTIN_MAX)
32650 return error_mark_node;
32652 return ix86_builtins[code];
32655 /* Errors in the source file can cause expand_expr to return const0_rtx
32656 where we expect a vector. To avoid crashing, use one of the vector
32657 clear instructions. */
32658 static rtx
32659 safe_vector_operand (rtx x, machine_mode mode)
32661 if (x == const0_rtx)
32662 x = CONST0_RTX (mode);
32663 return x;
32666 /* Fix up modeless constants to fit the required mode. */
32667 static rtx
32668 fixup_modeless_constant (rtx x, machine_mode mode)
32670 if (GET_MODE (x) == VOIDmode)
32671 x = convert_to_mode (mode, x, 1);
32672 return x;
32675 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32677 static rtx
32678 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32680 rtx pat;
32681 tree arg0 = CALL_EXPR_ARG (exp, 0);
32682 tree arg1 = CALL_EXPR_ARG (exp, 1);
32683 rtx op0 = expand_normal (arg0);
32684 rtx op1 = expand_normal (arg1);
32685 machine_mode tmode = insn_data[icode].operand[0].mode;
32686 machine_mode mode0 = insn_data[icode].operand[1].mode;
32687 machine_mode mode1 = insn_data[icode].operand[2].mode;
32689 if (VECTOR_MODE_P (mode0))
32690 op0 = safe_vector_operand (op0, mode0);
32691 if (VECTOR_MODE_P (mode1))
32692 op1 = safe_vector_operand (op1, mode1);
32694 if (optimize || !target
32695 || GET_MODE (target) != tmode
32696 || !insn_data[icode].operand[0].predicate (target, tmode))
32697 target = gen_reg_rtx (tmode);
32699 if (GET_MODE (op1) == SImode && mode1 == TImode)
32701 rtx x = gen_reg_rtx (V4SImode);
32702 emit_insn (gen_sse2_loadd (x, op1));
32703 op1 = gen_lowpart (TImode, x);
32706 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32707 op0 = copy_to_mode_reg (mode0, op0);
32708 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32709 op1 = copy_to_mode_reg (mode1, op1);
32711 pat = GEN_FCN (icode) (target, op0, op1);
32712 if (! pat)
32713 return 0;
32715 emit_insn (pat);
32717 return target;
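/* Example of a two-operand builtin this expander handles, written against
   the generic vector extension so it is self-contained (illustrative; the
   builtin name comes from the SSE intrinsic headers and needs -msse):

     typedef float v4sf __attribute__ ((vector_size (16)));

     v4sf
     add_ps (v4sf a, v4sf b)
     {
       return __builtin_ia32_addps (a, b);   // expanded via this helper
     }
*/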
32720 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32722 static rtx
32723 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32724 enum ix86_builtin_func_type m_type,
32725 enum rtx_code sub_code)
32727 rtx pat;
32728 int i;
32729 int nargs;
32730 bool comparison_p = false;
32731 bool tf_p = false;
32732 bool last_arg_constant = false;
32733 int num_memory = 0;
32734 struct {
32735 rtx op;
32736 machine_mode mode;
32737 } args[4];
32739 machine_mode tmode = insn_data[icode].operand[0].mode;
32741 switch (m_type)
32743 case MULTI_ARG_4_DF2_DI_I:
32744 case MULTI_ARG_4_DF2_DI_I1:
32745 case MULTI_ARG_4_SF2_SI_I:
32746 case MULTI_ARG_4_SF2_SI_I1:
32747 nargs = 4;
32748 last_arg_constant = true;
32749 break;
32751 case MULTI_ARG_3_SF:
32752 case MULTI_ARG_3_DF:
32753 case MULTI_ARG_3_SF2:
32754 case MULTI_ARG_3_DF2:
32755 case MULTI_ARG_3_DI:
32756 case MULTI_ARG_3_SI:
32757 case MULTI_ARG_3_SI_DI:
32758 case MULTI_ARG_3_HI:
32759 case MULTI_ARG_3_HI_SI:
32760 case MULTI_ARG_3_QI:
32761 case MULTI_ARG_3_DI2:
32762 case MULTI_ARG_3_SI2:
32763 case MULTI_ARG_3_HI2:
32764 case MULTI_ARG_3_QI2:
32765 nargs = 3;
32766 break;
32768 case MULTI_ARG_2_SF:
32769 case MULTI_ARG_2_DF:
32770 case MULTI_ARG_2_DI:
32771 case MULTI_ARG_2_SI:
32772 case MULTI_ARG_2_HI:
32773 case MULTI_ARG_2_QI:
32774 nargs = 2;
32775 break;
32777 case MULTI_ARG_2_DI_IMM:
32778 case MULTI_ARG_2_SI_IMM:
32779 case MULTI_ARG_2_HI_IMM:
32780 case MULTI_ARG_2_QI_IMM:
32781 nargs = 2;
32782 last_arg_constant = true;
32783 break;
32785 case MULTI_ARG_1_SF:
32786 case MULTI_ARG_1_DF:
32787 case MULTI_ARG_1_SF2:
32788 case MULTI_ARG_1_DF2:
32789 case MULTI_ARG_1_DI:
32790 case MULTI_ARG_1_SI:
32791 case MULTI_ARG_1_HI:
32792 case MULTI_ARG_1_QI:
32793 case MULTI_ARG_1_SI_DI:
32794 case MULTI_ARG_1_HI_DI:
32795 case MULTI_ARG_1_HI_SI:
32796 case MULTI_ARG_1_QI_DI:
32797 case MULTI_ARG_1_QI_SI:
32798 case MULTI_ARG_1_QI_HI:
32799 nargs = 1;
32800 break;
32802 case MULTI_ARG_2_DI_CMP:
32803 case MULTI_ARG_2_SI_CMP:
32804 case MULTI_ARG_2_HI_CMP:
32805 case MULTI_ARG_2_QI_CMP:
32806 nargs = 2;
32807 comparison_p = true;
32808 break;
32810 case MULTI_ARG_2_SF_TF:
32811 case MULTI_ARG_2_DF_TF:
32812 case MULTI_ARG_2_DI_TF:
32813 case MULTI_ARG_2_SI_TF:
32814 case MULTI_ARG_2_HI_TF:
32815 case MULTI_ARG_2_QI_TF:
32816 nargs = 2;
32817 tf_p = true;
32818 break;
32820 default:
32821 gcc_unreachable ();
32824 if (optimize || !target
32825 || GET_MODE (target) != tmode
32826 || !insn_data[icode].operand[0].predicate (target, tmode))
32827 target = gen_reg_rtx (tmode);
32828 else if (memory_operand (target, tmode))
32829 num_memory++;
32831 gcc_assert (nargs <= 4);
32833 for (i = 0; i < nargs; i++)
32835 tree arg = CALL_EXPR_ARG (exp, i);
32836 rtx op = expand_normal (arg);
32837 int adjust = (comparison_p) ? 1 : 0;
32838 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32840 if (last_arg_constant && i == nargs - 1)
32842 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32844 enum insn_code new_icode = icode;
32845 switch (icode)
32847 case CODE_FOR_xop_vpermil2v2df3:
32848 case CODE_FOR_xop_vpermil2v4sf3:
32849 case CODE_FOR_xop_vpermil2v4df3:
32850 case CODE_FOR_xop_vpermil2v8sf3:
32851 error ("the last argument must be a 2-bit immediate");
32852 return gen_reg_rtx (tmode);
32853 case CODE_FOR_xop_rotlv2di3:
32854 new_icode = CODE_FOR_rotlv2di3;
32855 goto xop_rotl;
32856 case CODE_FOR_xop_rotlv4si3:
32857 new_icode = CODE_FOR_rotlv4si3;
32858 goto xop_rotl;
32859 case CODE_FOR_xop_rotlv8hi3:
32860 new_icode = CODE_FOR_rotlv8hi3;
32861 goto xop_rotl;
32862 case CODE_FOR_xop_rotlv16qi3:
32863 new_icode = CODE_FOR_rotlv16qi3;
32864 xop_rotl:
32865 if (CONST_INT_P (op))
32867 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32868 op = GEN_INT (INTVAL (op) & mask);
32869 gcc_checking_assert
32870 (insn_data[icode].operand[i + 1].predicate (op, mode));
32872 else
32874 gcc_checking_assert
32875 (nargs == 2
32876 && insn_data[new_icode].operand[0].mode == tmode
32877 && insn_data[new_icode].operand[1].mode == tmode
32878 && insn_data[new_icode].operand[2].mode == mode
32879 && insn_data[new_icode].operand[0].predicate
32880 == insn_data[icode].operand[0].predicate
32881 && insn_data[new_icode].operand[1].predicate
32882 == insn_data[icode].operand[1].predicate);
32883 icode = new_icode;
32884 goto non_constant;
32886 break;
32887 default:
32888 gcc_unreachable ();
32892 else
32894 non_constant:
32895 if (VECTOR_MODE_P (mode))
32896 op = safe_vector_operand (op, mode);
32898 /* If we aren't optimizing, only allow one memory operand to be
32899 generated. */
32900 if (memory_operand (op, mode))
32901 num_memory++;
32903 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32905 if (optimize
32906 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32907 || num_memory > 1)
32908 op = force_reg (mode, op);
32911 args[i].op = op;
32912 args[i].mode = mode;
32915 switch (nargs)
32917 case 1:
32918 pat = GEN_FCN (icode) (target, args[0].op);
32919 break;
32921 case 2:
32922 if (tf_p)
32923 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32924 GEN_INT ((int)sub_code));
32925 else if (! comparison_p)
32926 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32927 else
32929 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32930 args[0].op,
32931 args[1].op);
32933 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32935 break;
32937 case 3:
32938 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32939 break;
32941 case 4:
32942 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32943 break;
32945 default:
32946 gcc_unreachable ();
32949 if (! pat)
32950 return 0;
32952 emit_insn (pat);
32953 return target;
32956 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32957 insns with vec_merge. */
32959 static rtx
32960 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32961 rtx target)
32963 rtx pat;
32964 tree arg0 = CALL_EXPR_ARG (exp, 0);
32965 rtx op1, op0 = expand_normal (arg0);
32966 machine_mode tmode = insn_data[icode].operand[0].mode;
32967 machine_mode mode0 = insn_data[icode].operand[1].mode;
32969 if (optimize || !target
32970 || GET_MODE (target) != tmode
32971 || !insn_data[icode].operand[0].predicate (target, tmode))
32972 target = gen_reg_rtx (tmode);
32974 if (VECTOR_MODE_P (mode0))
32975 op0 = safe_vector_operand (op0, mode0);
32977 if ((optimize && !register_operand (op0, mode0))
32978 || !insn_data[icode].operand[1].predicate (op0, mode0))
32979 op0 = copy_to_mode_reg (mode0, op0);
32981 op1 = op0;
32982 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32983 op1 = copy_to_mode_reg (mode0, op1);
32985 pat = GEN_FCN (icode) (target, op0, op1);
32986 if (! pat)
32987 return 0;
32988 emit_insn (pat);
32989 return target;
32992 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32994 static rtx
32995 ix86_expand_sse_compare (const struct builtin_description *d,
32996 tree exp, rtx target, bool swap)
32998 rtx pat;
32999 tree arg0 = CALL_EXPR_ARG (exp, 0);
33000 tree arg1 = CALL_EXPR_ARG (exp, 1);
33001 rtx op0 = expand_normal (arg0);
33002 rtx op1 = expand_normal (arg1);
33003 rtx op2;
33004 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33005 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33006 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33007 enum rtx_code comparison = d->comparison;
33009 if (VECTOR_MODE_P (mode0))
33010 op0 = safe_vector_operand (op0, mode0);
33011 if (VECTOR_MODE_P (mode1))
33012 op1 = safe_vector_operand (op1, mode1);
33014 /* Swap operands if we have a comparison that isn't available in
33015 hardware. */
33016 if (swap)
33017 std::swap (op0, op1);
33019 if (optimize || !target
33020 || GET_MODE (target) != tmode
33021 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33022 target = gen_reg_rtx (tmode);
33024 if ((optimize && !register_operand (op0, mode0))
33025 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33026 op0 = copy_to_mode_reg (mode0, op0);
33027 if ((optimize && !register_operand (op1, mode1))
33028 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33029 op1 = copy_to_mode_reg (mode1, op1);
33031 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33032 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33033 if (! pat)
33034 return 0;
33035 emit_insn (pat);
33036 return target;
33039 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33041 static rtx
33042 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33043 rtx target)
33045 rtx pat;
33046 tree arg0 = CALL_EXPR_ARG (exp, 0);
33047 tree arg1 = CALL_EXPR_ARG (exp, 1);
33048 rtx op0 = expand_normal (arg0);
33049 rtx op1 = expand_normal (arg1);
33050 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33051 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33052 enum rtx_code comparison = d->comparison;
33054 if (VECTOR_MODE_P (mode0))
33055 op0 = safe_vector_operand (op0, mode0);
33056 if (VECTOR_MODE_P (mode1))
33057 op1 = safe_vector_operand (op1, mode1);
33059 /* Swap operands if we have a comparison that isn't available in
33060 hardware. */
33061 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33062 std::swap (op0, op1);
33064 target = gen_reg_rtx (SImode);
33065 emit_move_insn (target, const0_rtx);
33066 target = gen_rtx_SUBREG (QImode, target, 0);
33068 if ((optimize && !register_operand (op0, mode0))
33069 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33070 op0 = copy_to_mode_reg (mode0, op0);
33071 if ((optimize && !register_operand (op1, mode1))
33072 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33073 op1 = copy_to_mode_reg (mode1, op1);
33075 pat = GEN_FCN (d->icode) (op0, op1);
33076 if (! pat)
33077 return 0;
33078 emit_insn (pat);
33079 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33080 gen_rtx_fmt_ee (comparison, QImode,
33081 SET_DEST (pat),
33082 const0_rtx)));
33084 return SUBREG_REG (target);
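/* A comi builtin as user code reaches it through xmmintrin.h (illustrative);
   the expander above compares the low lanes and materializes the flags
   result into the low byte of an SImode pseudo:

     #include <xmmintrin.h>

     int
     equal_first_lane (__m128 a, __m128 b)
     {
       return _mm_comieq_ss (a, b);   // __builtin_ia32_comieq
     }
*/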
33087 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33089 static rtx
33090 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33091 rtx target)
33093 rtx pat;
33094 tree arg0 = CALL_EXPR_ARG (exp, 0);
33095 rtx op1, op0 = expand_normal (arg0);
33096 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33097 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33099 if (optimize || target == 0
33100 || GET_MODE (target) != tmode
33101 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33102 target = gen_reg_rtx (tmode);
33104 if (VECTOR_MODE_P (mode0))
33105 op0 = safe_vector_operand (op0, mode0);
33107 if ((optimize && !register_operand (op0, mode0))
33108 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33109 op0 = copy_to_mode_reg (mode0, op0);
33111 op1 = GEN_INT (d->comparison);
33113 pat = GEN_FCN (d->icode) (target, op0, op1);
33114 if (! pat)
33115 return 0;
33116 emit_insn (pat);
33117 return target;
33120 static rtx
33121 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33122 tree exp, rtx target)
33124 rtx pat;
33125 tree arg0 = CALL_EXPR_ARG (exp, 0);
33126 tree arg1 = CALL_EXPR_ARG (exp, 1);
33127 rtx op0 = expand_normal (arg0);
33128 rtx op1 = expand_normal (arg1);
33129 rtx op2;
33130 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33131 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33132 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33134 if (optimize || target == 0
33135 || GET_MODE (target) != tmode
33136 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33137 target = gen_reg_rtx (tmode);
33139 op0 = safe_vector_operand (op0, mode0);
33140 op1 = safe_vector_operand (op1, mode1);
33142 if ((optimize && !register_operand (op0, mode0))
33143 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33144 op0 = copy_to_mode_reg (mode0, op0);
33145 if ((optimize && !register_operand (op1, mode1))
33146 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33147 op1 = copy_to_mode_reg (mode1, op1);
33149 op2 = GEN_INT (d->comparison);
33151 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33152 if (! pat)
33153 return 0;
33154 emit_insn (pat);
33155 return target;
33158 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33160 static rtx
33161 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33162 rtx target)
33164 rtx pat;
33165 tree arg0 = CALL_EXPR_ARG (exp, 0);
33166 tree arg1 = CALL_EXPR_ARG (exp, 1);
33167 rtx op0 = expand_normal (arg0);
33168 rtx op1 = expand_normal (arg1);
33169 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33170 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33171 enum rtx_code comparison = d->comparison;
33173 if (VECTOR_MODE_P (mode0))
33174 op0 = safe_vector_operand (op0, mode0);
33175 if (VECTOR_MODE_P (mode1))
33176 op1 = safe_vector_operand (op1, mode1);
33178 target = gen_reg_rtx (SImode);
33179 emit_move_insn (target, const0_rtx);
33180 target = gen_rtx_SUBREG (QImode, target, 0);
33182 if ((optimize && !register_operand (op0, mode0))
33183 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33184 op0 = copy_to_mode_reg (mode0, op0);
33185 if ((optimize && !register_operand (op1, mode1))
33186 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33187 op1 = copy_to_mode_reg (mode1, op1);
33189 pat = GEN_FCN (d->icode) (op0, op1);
33190 if (! pat)
33191 return 0;
33192 emit_insn (pat);
33193 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33194 gen_rtx_fmt_ee (comparison, QImode,
33195 SET_DEST (pat),
33196 const0_rtx)));
33198 return SUBREG_REG (target);
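/* How a ptest builtin reaches this expander from user code (illustrative):

     #include <smmintrin.h>

     int
     all_zero (__m128i v)
     {
       return _mm_testz_si128 (v, v);   // __builtin_ia32_ptestz128
     }
*/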
33201 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33203 static rtx
33204 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33205 tree exp, rtx target)
33207 rtx pat;
33208 tree arg0 = CALL_EXPR_ARG (exp, 0);
33209 tree arg1 = CALL_EXPR_ARG (exp, 1);
33210 tree arg2 = CALL_EXPR_ARG (exp, 2);
33211 tree arg3 = CALL_EXPR_ARG (exp, 3);
33212 tree arg4 = CALL_EXPR_ARG (exp, 4);
33213 rtx scratch0, scratch1;
33214 rtx op0 = expand_normal (arg0);
33215 rtx op1 = expand_normal (arg1);
33216 rtx op2 = expand_normal (arg2);
33217 rtx op3 = expand_normal (arg3);
33218 rtx op4 = expand_normal (arg4);
33219 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33221 tmode0 = insn_data[d->icode].operand[0].mode;
33222 tmode1 = insn_data[d->icode].operand[1].mode;
33223 modev2 = insn_data[d->icode].operand[2].mode;
33224 modei3 = insn_data[d->icode].operand[3].mode;
33225 modev4 = insn_data[d->icode].operand[4].mode;
33226 modei5 = insn_data[d->icode].operand[5].mode;
33227 modeimm = insn_data[d->icode].operand[6].mode;
33229 if (VECTOR_MODE_P (modev2))
33230 op0 = safe_vector_operand (op0, modev2);
33231 if (VECTOR_MODE_P (modev4))
33232 op2 = safe_vector_operand (op2, modev4);
33234 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33235 op0 = copy_to_mode_reg (modev2, op0);
33236 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33237 op1 = copy_to_mode_reg (modei3, op1);
33238 if ((optimize && !register_operand (op2, modev4))
33239 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33240 op2 = copy_to_mode_reg (modev4, op2);
33241 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33242 op3 = copy_to_mode_reg (modei5, op3);
33244 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33246 error ("the fifth argument must be an 8-bit immediate");
33247 return const0_rtx;
33250 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33252 if (optimize || !target
33253 || GET_MODE (target) != tmode0
33254 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33255 target = gen_reg_rtx (tmode0);
33257 scratch1 = gen_reg_rtx (tmode1);
33259 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33261 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33263 if (optimize || !target
33264 || GET_MODE (target) != tmode1
33265 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33266 target = gen_reg_rtx (tmode1);
33268 scratch0 = gen_reg_rtx (tmode0);
33270 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33272 else
33274 gcc_assert (d->flag);
33276 scratch0 = gen_reg_rtx (tmode0);
33277 scratch1 = gen_reg_rtx (tmode1);
33279 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33282 if (! pat)
33283 return 0;
33285 emit_insn (pat);
33287 if (d->flag)
33289 target = gen_reg_rtx (SImode);
33290 emit_move_insn (target, const0_rtx);
33291 target = gen_rtx_SUBREG (QImode, target, 0);
33293 emit_insn
33294 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33295 gen_rtx_fmt_ee (EQ, QImode,
33296 gen_rtx_REG ((machine_mode) d->flag,
33297 FLAGS_REG),
33298 const0_rtx)));
33299 return SUBREG_REG (target);
33301 else
33302 return target;
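/* User-level view of a pcmpestr builtin through smmintrin.h (illustrative;
   the flag macros are the documented _SIDD_* names):

     #include <smmintrin.h>

     int
     find_any_needle (__m128i hay, int hay_len, __m128i needles, int n_len)
     {
       return _mm_cmpestri (hay, hay_len, needles, n_len,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
     }
*/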
33306 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33308 static rtx
33309 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33310 tree exp, rtx target)
33312 rtx pat;
33313 tree arg0 = CALL_EXPR_ARG (exp, 0);
33314 tree arg1 = CALL_EXPR_ARG (exp, 1);
33315 tree arg2 = CALL_EXPR_ARG (exp, 2);
33316 rtx scratch0, scratch1;
33317 rtx op0 = expand_normal (arg0);
33318 rtx op1 = expand_normal (arg1);
33319 rtx op2 = expand_normal (arg2);
33320 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33322 tmode0 = insn_data[d->icode].operand[0].mode;
33323 tmode1 = insn_data[d->icode].operand[1].mode;
33324 modev2 = insn_data[d->icode].operand[2].mode;
33325 modev3 = insn_data[d->icode].operand[3].mode;
33326 modeimm = insn_data[d->icode].operand[4].mode;
33328 if (VECTOR_MODE_P (modev2))
33329 op0 = safe_vector_operand (op0, modev2);
33330 if (VECTOR_MODE_P (modev3))
33331 op1 = safe_vector_operand (op1, modev3);
33333 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33334 op0 = copy_to_mode_reg (modev2, op0);
33335 if ((optimize && !register_operand (op1, modev3))
33336 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33337 op1 = copy_to_mode_reg (modev3, op1);
33339 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33341 error ("the third argument must be an 8-bit immediate");
33342 return const0_rtx;
33345 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33347 if (optimize || !target
33348 || GET_MODE (target) != tmode0
33349 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33350 target = gen_reg_rtx (tmode0);
33352 scratch1 = gen_reg_rtx (tmode1);
33354 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33356 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33358 if (optimize || !target
33359 || GET_MODE (target) != tmode1
33360 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33361 target = gen_reg_rtx (tmode1);
33363 scratch0 = gen_reg_rtx (tmode0);
33365 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33367 else
33369 gcc_assert (d->flag);
33371 scratch0 = gen_reg_rtx (tmode0);
33372 scratch1 = gen_reg_rtx (tmode1);
33374 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33377 if (! pat)
33378 return 0;
33380 emit_insn (pat);
33382 if (d->flag)
33384 target = gen_reg_rtx (SImode);
33385 emit_move_insn (target, const0_rtx);
33386 target = gen_rtx_SUBREG (QImode, target, 0);
33388 emit_insn
33389 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33390 gen_rtx_fmt_ee (EQ, QImode,
33391 gen_rtx_REG ((machine_mode) d->flag,
33392 FLAGS_REG),
33393 const0_rtx)));
33394 return SUBREG_REG (target);
33396 else
33397 return target;
33400 /* Subroutine of ix86_expand_builtin to take care of insns with a
33401 variable number of operands. */
33403 static rtx
33404 ix86_expand_args_builtin (const struct builtin_description *d,
33405 tree exp, rtx target)
33407 rtx pat, real_target;
33408 unsigned int i, nargs;
33409 unsigned int nargs_constant = 0;
33410 unsigned int mask_pos = 0;
33411 int num_memory = 0;
33412 struct
33414 rtx op;
33415 machine_mode mode;
33416 } args[6];
33417 bool second_arg_count = false;
33418 enum insn_code icode = d->icode;
33419 const struct insn_data_d *insn_p = &insn_data[icode];
33420 machine_mode tmode = insn_p->operand[0].mode;
33421 machine_mode rmode = VOIDmode;
33422 bool swap = false;
33423 enum rtx_code comparison = d->comparison;
33425 switch ((enum ix86_builtin_func_type) d->flag)
33427 case V2DF_FTYPE_V2DF_ROUND:
33428 case V4DF_FTYPE_V4DF_ROUND:
33429 case V8DF_FTYPE_V8DF_ROUND:
33430 case V4SF_FTYPE_V4SF_ROUND:
33431 case V8SF_FTYPE_V8SF_ROUND:
33432 case V16SF_FTYPE_V16SF_ROUND:
33433 case V4SI_FTYPE_V4SF_ROUND:
33434 case V8SI_FTYPE_V8SF_ROUND:
33435 case V16SI_FTYPE_V16SF_ROUND:
33436 return ix86_expand_sse_round (d, exp, target);
33437 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33438 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33439 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33440 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33441 case INT_FTYPE_V8SF_V8SF_PTEST:
33442 case INT_FTYPE_V4DI_V4DI_PTEST:
33443 case INT_FTYPE_V4DF_V4DF_PTEST:
33444 case INT_FTYPE_V4SF_V4SF_PTEST:
33445 case INT_FTYPE_V2DI_V2DI_PTEST:
33446 case INT_FTYPE_V2DF_V2DF_PTEST:
33447 return ix86_expand_sse_ptest (d, exp, target);
33448 case FLOAT128_FTYPE_FLOAT128:
33449 case FLOAT_FTYPE_FLOAT:
33450 case INT_FTYPE_INT:
33451 case UINT_FTYPE_UINT:
33452 case UINT16_FTYPE_UINT16:
33453 case UINT64_FTYPE_INT:
33454 case UINT64_FTYPE_UINT64:
33455 case INT64_FTYPE_INT64:
33456 case INT64_FTYPE_V4SF:
33457 case INT64_FTYPE_V2DF:
33458 case INT_FTYPE_V16QI:
33459 case INT_FTYPE_V8QI:
33460 case INT_FTYPE_V8SF:
33461 case INT_FTYPE_V4DF:
33462 case INT_FTYPE_V4SF:
33463 case INT_FTYPE_V2DF:
33464 case INT_FTYPE_V32QI:
33465 case V16QI_FTYPE_V16QI:
33466 case V8SI_FTYPE_V8SF:
33467 case V8SI_FTYPE_V4SI:
33468 case V8HI_FTYPE_V8HI:
33469 case V8HI_FTYPE_V16QI:
33470 case V8QI_FTYPE_V8QI:
33471 case V8SF_FTYPE_V8SF:
33472 case V8SF_FTYPE_V8SI:
33473 case V8SF_FTYPE_V4SF:
33474 case V8SF_FTYPE_V8HI:
33475 case V4SI_FTYPE_V4SI:
33476 case V4SI_FTYPE_V16QI:
33477 case V4SI_FTYPE_V4SF:
33478 case V4SI_FTYPE_V8SI:
33479 case V4SI_FTYPE_V8HI:
33480 case V4SI_FTYPE_V4DF:
33481 case V4SI_FTYPE_V2DF:
33482 case V4HI_FTYPE_V4HI:
33483 case V4DF_FTYPE_V4DF:
33484 case V4DF_FTYPE_V4SI:
33485 case V4DF_FTYPE_V4SF:
33486 case V4DF_FTYPE_V2DF:
33487 case V4SF_FTYPE_V4SF:
33488 case V4SF_FTYPE_V4SI:
33489 case V4SF_FTYPE_V8SF:
33490 case V4SF_FTYPE_V4DF:
33491 case V4SF_FTYPE_V8HI:
33492 case V4SF_FTYPE_V2DF:
33493 case V2DI_FTYPE_V2DI:
33494 case V2DI_FTYPE_V16QI:
33495 case V2DI_FTYPE_V8HI:
33496 case V2DI_FTYPE_V4SI:
33497 case V2DF_FTYPE_V2DF:
33498 case V2DF_FTYPE_V4SI:
33499 case V2DF_FTYPE_V4DF:
33500 case V2DF_FTYPE_V4SF:
33501 case V2DF_FTYPE_V2SI:
33502 case V2SI_FTYPE_V2SI:
33503 case V2SI_FTYPE_V4SF:
33504 case V2SI_FTYPE_V2SF:
33505 case V2SI_FTYPE_V2DF:
33506 case V2SF_FTYPE_V2SF:
33507 case V2SF_FTYPE_V2SI:
33508 case V32QI_FTYPE_V32QI:
33509 case V32QI_FTYPE_V16QI:
33510 case V16HI_FTYPE_V16HI:
33511 case V16HI_FTYPE_V8HI:
33512 case V8SI_FTYPE_V8SI:
33513 case V16HI_FTYPE_V16QI:
33514 case V8SI_FTYPE_V16QI:
33515 case V4DI_FTYPE_V16QI:
33516 case V8SI_FTYPE_V8HI:
33517 case V4DI_FTYPE_V8HI:
33518 case V4DI_FTYPE_V4SI:
33519 case V4DI_FTYPE_V2DI:
33520 case UQI_FTYPE_UQI:
33521 case UHI_FTYPE_UHI:
33522 case USI_FTYPE_USI:
33523 case USI_FTYPE_UQI:
33524 case USI_FTYPE_UHI:
33525 case UDI_FTYPE_UDI:
33526 case UHI_FTYPE_V16QI:
33527 case USI_FTYPE_V32QI:
33528 case UDI_FTYPE_V64QI:
33529 case V16QI_FTYPE_UHI:
33530 case V32QI_FTYPE_USI:
33531 case V64QI_FTYPE_UDI:
33532 case V8HI_FTYPE_UQI:
33533 case V16HI_FTYPE_UHI:
33534 case V32HI_FTYPE_USI:
33535 case V4SI_FTYPE_UQI:
33536 case V8SI_FTYPE_UQI:
33537 case V4SI_FTYPE_UHI:
33538 case V8SI_FTYPE_UHI:
33539 case UQI_FTYPE_V8HI:
33540 case UHI_FTYPE_V16HI:
33541 case USI_FTYPE_V32HI:
33542 case UQI_FTYPE_V4SI:
33543 case UQI_FTYPE_V8SI:
33544 case UHI_FTYPE_V16SI:
33545 case UQI_FTYPE_V2DI:
33546 case UQI_FTYPE_V4DI:
33547 case UQI_FTYPE_V8DI:
33548 case V16SI_FTYPE_UHI:
33549 case V2DI_FTYPE_UQI:
33550 case V4DI_FTYPE_UQI:
33551 case V16SI_FTYPE_INT:
33552 case V16SF_FTYPE_V8SF:
33553 case V16SI_FTYPE_V8SI:
33554 case V16SF_FTYPE_V4SF:
33555 case V16SI_FTYPE_V4SI:
33556 case V16SI_FTYPE_V16SF:
33557 case V16SI_FTYPE_V16SI:
33558 case V16SF_FTYPE_V16SF:
33559 case V8DI_FTYPE_UQI:
33560 case V8DI_FTYPE_V8DI:
33561 case V8DF_FTYPE_V4DF:
33562 case V8DF_FTYPE_V2DF:
33563 case V8DF_FTYPE_V8DF:
33564 nargs = 1;
33565 break;
33566 case V4SF_FTYPE_V4SF_VEC_MERGE:
33567 case V2DF_FTYPE_V2DF_VEC_MERGE:
33568 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33569 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33570 case V16QI_FTYPE_V16QI_V16QI:
33571 case V16QI_FTYPE_V8HI_V8HI:
33572 case V16SF_FTYPE_V16SF_V16SF:
33573 case V8QI_FTYPE_V8QI_V8QI:
33574 case V8QI_FTYPE_V4HI_V4HI:
33575 case V8HI_FTYPE_V8HI_V8HI:
33576 case V8HI_FTYPE_V16QI_V16QI:
33577 case V8HI_FTYPE_V4SI_V4SI:
33578 case V8SF_FTYPE_V8SF_V8SF:
33579 case V8SF_FTYPE_V8SF_V8SI:
33580 case V8DF_FTYPE_V8DF_V8DF:
33581 case V4SI_FTYPE_V4SI_V4SI:
33582 case V4SI_FTYPE_V8HI_V8HI:
33583 case V4SI_FTYPE_V2DF_V2DF:
33584 case V4HI_FTYPE_V4HI_V4HI:
33585 case V4HI_FTYPE_V8QI_V8QI:
33586 case V4HI_FTYPE_V2SI_V2SI:
33587 case V4DF_FTYPE_V4DF_V4DF:
33588 case V4DF_FTYPE_V4DF_V4DI:
33589 case V4SF_FTYPE_V4SF_V4SF:
33590 case V4SF_FTYPE_V4SF_V4SI:
33591 case V4SF_FTYPE_V4SF_V2SI:
33592 case V4SF_FTYPE_V4SF_V2DF:
33593 case V4SF_FTYPE_V4SF_UINT:
33594 case V4SF_FTYPE_V4SF_DI:
33595 case V4SF_FTYPE_V4SF_SI:
33596 case V2DI_FTYPE_V2DI_V2DI:
33597 case V2DI_FTYPE_V16QI_V16QI:
33598 case V2DI_FTYPE_V4SI_V4SI:
33599 case V2DI_FTYPE_V2DI_V16QI:
33600 case V2SI_FTYPE_V2SI_V2SI:
33601 case V2SI_FTYPE_V4HI_V4HI:
33602 case V2SI_FTYPE_V2SF_V2SF:
33603 case V2DF_FTYPE_V2DF_V2DF:
33604 case V2DF_FTYPE_V2DF_V4SF:
33605 case V2DF_FTYPE_V2DF_V2DI:
33606 case V2DF_FTYPE_V2DF_DI:
33607 case V2DF_FTYPE_V2DF_SI:
33608 case V2DF_FTYPE_V2DF_UINT:
33609 case V2SF_FTYPE_V2SF_V2SF:
33610 case V1DI_FTYPE_V1DI_V1DI:
33611 case V1DI_FTYPE_V8QI_V8QI:
33612 case V1DI_FTYPE_V2SI_V2SI:
33613 case V32QI_FTYPE_V16HI_V16HI:
33614 case V16HI_FTYPE_V8SI_V8SI:
33615 case V64QI_FTYPE_V64QI_V64QI:
33616 case V32QI_FTYPE_V32QI_V32QI:
33617 case V16HI_FTYPE_V32QI_V32QI:
33618 case V16HI_FTYPE_V16HI_V16HI:
33619 case V8SI_FTYPE_V4DF_V4DF:
33620 case V8SI_FTYPE_V8SI_V8SI:
33621 case V8SI_FTYPE_V16HI_V16HI:
33622 case V4DI_FTYPE_V4DI_V4DI:
33623 case V4DI_FTYPE_V8SI_V8SI:
33624 case V8DI_FTYPE_V64QI_V64QI:
33625 if (comparison == UNKNOWN)
33626 return ix86_expand_binop_builtin (icode, exp, target);
33627 nargs = 2;
33628 break;
33629 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33630 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33631 gcc_assert (comparison != UNKNOWN);
33632 nargs = 2;
33633 swap = true;
33634 break;
33635 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33636 case V16HI_FTYPE_V16HI_SI_COUNT:
33637 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33638 case V8SI_FTYPE_V8SI_SI_COUNT:
33639 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33640 case V4DI_FTYPE_V4DI_INT_COUNT:
33641 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33642 case V8HI_FTYPE_V8HI_SI_COUNT:
33643 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33644 case V4SI_FTYPE_V4SI_SI_COUNT:
33645 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33646 case V4HI_FTYPE_V4HI_SI_COUNT:
33647 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33648 case V2DI_FTYPE_V2DI_SI_COUNT:
33649 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33650 case V2SI_FTYPE_V2SI_SI_COUNT:
33651 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33652 case V1DI_FTYPE_V1DI_SI_COUNT:
33653 nargs = 2;
33654 second_arg_count = true;
33655 break;
33656 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33657 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33658 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33659 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33660 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33661 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33662 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33663 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33664 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33665 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33666 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33667 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33668 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33669 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33670 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33671 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33672 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33673 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33674 nargs = 4;
33675 second_arg_count = true;
33676 break;
33677 case UINT64_FTYPE_UINT64_UINT64:
33678 case UINT_FTYPE_UINT_UINT:
33679 case UINT_FTYPE_UINT_USHORT:
33680 case UINT_FTYPE_UINT_UCHAR:
33681 case UINT16_FTYPE_UINT16_INT:
33682 case UINT8_FTYPE_UINT8_INT:
33683 case UQI_FTYPE_UQI_UQI:
33684 case UHI_FTYPE_UHI_UHI:
33685 case USI_FTYPE_USI_USI:
33686 case UDI_FTYPE_UDI_UDI:
33687 case V16SI_FTYPE_V8DF_V8DF:
33688 nargs = 2;
33689 break;
33690 case V2DI_FTYPE_V2DI_INT_CONVERT:
33691 nargs = 2;
33692 rmode = V1TImode;
33693 nargs_constant = 1;
33694 break;
33695 case V4DI_FTYPE_V4DI_INT_CONVERT:
33696 nargs = 2;
33697 rmode = V2TImode;
33698 nargs_constant = 1;
33699 break;
33700 case V8DI_FTYPE_V8DI_INT_CONVERT:
33701 nargs = 2;
33702 rmode = V4TImode;
33703 nargs_constant = 1;
33704 break;
33705 case V8HI_FTYPE_V8HI_INT:
33706 case V8HI_FTYPE_V8SF_INT:
33707 case V16HI_FTYPE_V16SF_INT:
33708 case V8HI_FTYPE_V4SF_INT:
33709 case V8SF_FTYPE_V8SF_INT:
33710 case V4SF_FTYPE_V16SF_INT:
33711 case V16SF_FTYPE_V16SF_INT:
33712 case V4SI_FTYPE_V4SI_INT:
33713 case V4SI_FTYPE_V8SI_INT:
33714 case V4HI_FTYPE_V4HI_INT:
33715 case V4DF_FTYPE_V4DF_INT:
33716 case V4DF_FTYPE_V8DF_INT:
33717 case V4SF_FTYPE_V4SF_INT:
33718 case V4SF_FTYPE_V8SF_INT:
33719 case V2DI_FTYPE_V2DI_INT:
33720 case V2DF_FTYPE_V2DF_INT:
33721 case V2DF_FTYPE_V4DF_INT:
33722 case V16HI_FTYPE_V16HI_INT:
33723 case V8SI_FTYPE_V8SI_INT:
33724 case V16SI_FTYPE_V16SI_INT:
33725 case V4SI_FTYPE_V16SI_INT:
33726 case V4DI_FTYPE_V4DI_INT:
33727 case V2DI_FTYPE_V4DI_INT:
33728 case V4DI_FTYPE_V8DI_INT:
33729 case QI_FTYPE_V4SF_INT:
33730 case QI_FTYPE_V2DF_INT:
33731 case UQI_FTYPE_UQI_UQI_CONST:
33732 case UHI_FTYPE_UHI_UQI:
33733 case USI_FTYPE_USI_UQI:
33734 case UDI_FTYPE_UDI_UQI:
33735 nargs = 2;
33736 nargs_constant = 1;
33737 break;
33738 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33739 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33740 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33741 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33742 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33743 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33744 case UHI_FTYPE_V16SI_V16SI_UHI:
33745 case UQI_FTYPE_V8DI_V8DI_UQI:
33746 case V16HI_FTYPE_V16SI_V16HI_UHI:
33747 case V16QI_FTYPE_V16SI_V16QI_UHI:
33748 case V16QI_FTYPE_V8DI_V16QI_UQI:
33749 case V16SF_FTYPE_V16SF_V16SF_UHI:
33750 case V16SF_FTYPE_V4SF_V16SF_UHI:
33751 case V16SI_FTYPE_SI_V16SI_UHI:
33752 case V16SI_FTYPE_V16HI_V16SI_UHI:
33753 case V16SI_FTYPE_V16QI_V16SI_UHI:
33754 case V8SF_FTYPE_V4SF_V8SF_UQI:
33755 case V4DF_FTYPE_V2DF_V4DF_UQI:
33756 case V8SI_FTYPE_V4SI_V8SI_UQI:
33757 case V8SI_FTYPE_SI_V8SI_UQI:
33758 case V4SI_FTYPE_V4SI_V4SI_UQI:
33759 case V4SI_FTYPE_SI_V4SI_UQI:
33760 case V4DI_FTYPE_V2DI_V4DI_UQI:
33761 case V4DI_FTYPE_DI_V4DI_UQI:
33762 case V2DI_FTYPE_V2DI_V2DI_UQI:
33763 case V2DI_FTYPE_DI_V2DI_UQI:
33764 case V64QI_FTYPE_V64QI_V64QI_UDI:
33765 case V64QI_FTYPE_V16QI_V64QI_UDI:
33766 case V64QI_FTYPE_QI_V64QI_UDI:
33767 case V32QI_FTYPE_V32QI_V32QI_USI:
33768 case V32QI_FTYPE_V16QI_V32QI_USI:
33769 case V32QI_FTYPE_QI_V32QI_USI:
33770 case V16QI_FTYPE_V16QI_V16QI_UHI:
33771 case V16QI_FTYPE_QI_V16QI_UHI:
33772 case V32HI_FTYPE_V8HI_V32HI_USI:
33773 case V32HI_FTYPE_HI_V32HI_USI:
33774 case V16HI_FTYPE_V8HI_V16HI_UHI:
33775 case V16HI_FTYPE_HI_V16HI_UHI:
33776 case V8HI_FTYPE_V8HI_V8HI_UQI:
33777 case V8HI_FTYPE_HI_V8HI_UQI:
33778 case V8SF_FTYPE_V8HI_V8SF_UQI:
33779 case V4SF_FTYPE_V8HI_V4SF_UQI:
33780 case V8SI_FTYPE_V8SF_V8SI_UQI:
33781 case V4SI_FTYPE_V4SF_V4SI_UQI:
33782 case V4DI_FTYPE_V4SF_V4DI_UQI:
33783 case V2DI_FTYPE_V4SF_V2DI_UQI:
33784 case V4SF_FTYPE_V4DI_V4SF_UQI:
33785 case V4SF_FTYPE_V2DI_V4SF_UQI:
33786 case V4DF_FTYPE_V4DI_V4DF_UQI:
33787 case V2DF_FTYPE_V2DI_V2DF_UQI:
33788 case V16QI_FTYPE_V8HI_V16QI_UQI:
33789 case V16QI_FTYPE_V16HI_V16QI_UHI:
33790 case V16QI_FTYPE_V4SI_V16QI_UQI:
33791 case V16QI_FTYPE_V8SI_V16QI_UQI:
33792 case V8HI_FTYPE_V4SI_V8HI_UQI:
33793 case V8HI_FTYPE_V8SI_V8HI_UQI:
33794 case V16QI_FTYPE_V2DI_V16QI_UQI:
33795 case V16QI_FTYPE_V4DI_V16QI_UQI:
33796 case V8HI_FTYPE_V2DI_V8HI_UQI:
33797 case V8HI_FTYPE_V4DI_V8HI_UQI:
33798 case V4SI_FTYPE_V2DI_V4SI_UQI:
33799 case V4SI_FTYPE_V4DI_V4SI_UQI:
33800 case V32QI_FTYPE_V32HI_V32QI_USI:
33801 case UHI_FTYPE_V16QI_V16QI_UHI:
33802 case USI_FTYPE_V32QI_V32QI_USI:
33803 case UDI_FTYPE_V64QI_V64QI_UDI:
33804 case UQI_FTYPE_V8HI_V8HI_UQI:
33805 case UHI_FTYPE_V16HI_V16HI_UHI:
33806 case USI_FTYPE_V32HI_V32HI_USI:
33807 case UQI_FTYPE_V4SI_V4SI_UQI:
33808 case UQI_FTYPE_V8SI_V8SI_UQI:
33809 case UQI_FTYPE_V2DI_V2DI_UQI:
33810 case UQI_FTYPE_V4DI_V4DI_UQI:
33811 case V4SF_FTYPE_V2DF_V4SF_UQI:
33812 case V4SF_FTYPE_V4DF_V4SF_UQI:
33813 case V16SI_FTYPE_V16SI_V16SI_UHI:
33814 case V16SI_FTYPE_V4SI_V16SI_UHI:
33815 case V2DI_FTYPE_V4SI_V2DI_UQI:
33816 case V2DI_FTYPE_V8HI_V2DI_UQI:
33817 case V2DI_FTYPE_V16QI_V2DI_UQI:
33818 case V4DI_FTYPE_V4DI_V4DI_UQI:
33819 case V4DI_FTYPE_V4SI_V4DI_UQI:
33820 case V4DI_FTYPE_V8HI_V4DI_UQI:
33821 case V4DI_FTYPE_V16QI_V4DI_UQI:
33822 case V4DI_FTYPE_V4DF_V4DI_UQI:
33823 case V2DI_FTYPE_V2DF_V2DI_UQI:
33824 case V4SI_FTYPE_V4DF_V4SI_UQI:
33825 case V4SI_FTYPE_V2DF_V4SI_UQI:
33826 case V4SI_FTYPE_V8HI_V4SI_UQI:
33827 case V4SI_FTYPE_V16QI_V4SI_UQI:
33828 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33829 case V8DF_FTYPE_V2DF_V8DF_UQI:
33830 case V8DF_FTYPE_V4DF_V8DF_UQI:
33831 case V8DF_FTYPE_V8DF_V8DF_UQI:
33832 case V8SF_FTYPE_V8SF_V8SF_UQI:
33833 case V8SF_FTYPE_V8SI_V8SF_UQI:
33834 case V4DF_FTYPE_V4DF_V4DF_UQI:
33835 case V4SF_FTYPE_V4SF_V4SF_UQI:
33836 case V2DF_FTYPE_V2DF_V2DF_UQI:
33837 case V2DF_FTYPE_V4SF_V2DF_UQI:
33838 case V2DF_FTYPE_V4SI_V2DF_UQI:
33839 case V4SF_FTYPE_V4SI_V4SF_UQI:
33840 case V4DF_FTYPE_V4SF_V4DF_UQI:
33841 case V4DF_FTYPE_V4SI_V4DF_UQI:
33842 case V8SI_FTYPE_V8SI_V8SI_UQI:
33843 case V8SI_FTYPE_V8HI_V8SI_UQI:
33844 case V8SI_FTYPE_V16QI_V8SI_UQI:
33845 case V8DF_FTYPE_V8SI_V8DF_UQI:
33846 case V8DI_FTYPE_DI_V8DI_UQI:
33847 case V16SF_FTYPE_V8SF_V16SF_UHI:
33848 case V16SI_FTYPE_V8SI_V16SI_UHI:
33849 case V16HI_FTYPE_V16HI_V16HI_UHI:
33850 case V8HI_FTYPE_V16QI_V8HI_UQI:
33851 case V16HI_FTYPE_V16QI_V16HI_UHI:
33852 case V32HI_FTYPE_V32HI_V32HI_USI:
33853 case V32HI_FTYPE_V32QI_V32HI_USI:
33854 case V8DI_FTYPE_V16QI_V8DI_UQI:
33855 case V8DI_FTYPE_V2DI_V8DI_UQI:
33856 case V8DI_FTYPE_V4DI_V8DI_UQI:
33857 case V8DI_FTYPE_V8DI_V8DI_UQI:
33858 case V8DI_FTYPE_V8HI_V8DI_UQI:
33859 case V8DI_FTYPE_V8SI_V8DI_UQI:
33860 case V8HI_FTYPE_V8DI_V8HI_UQI:
33861 case V8SI_FTYPE_V8DI_V8SI_UQI:
33862 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33863 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33864 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33865 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33866 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33867 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33868 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33869 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33870 nargs = 3;
33871 break;
33872 case V32QI_FTYPE_V32QI_V32QI_INT:
33873 case V16HI_FTYPE_V16HI_V16HI_INT:
33874 case V16QI_FTYPE_V16QI_V16QI_INT:
33875 case V4DI_FTYPE_V4DI_V4DI_INT:
33876 case V8HI_FTYPE_V8HI_V8HI_INT:
33877 case V8SI_FTYPE_V8SI_V8SI_INT:
33878 case V8SI_FTYPE_V8SI_V4SI_INT:
33879 case V8SF_FTYPE_V8SF_V8SF_INT:
33880 case V8SF_FTYPE_V8SF_V4SF_INT:
33881 case V4SI_FTYPE_V4SI_V4SI_INT:
33882 case V4DF_FTYPE_V4DF_V4DF_INT:
33883 case V16SF_FTYPE_V16SF_V16SF_INT:
33884 case V16SF_FTYPE_V16SF_V4SF_INT:
33885 case V16SI_FTYPE_V16SI_V4SI_INT:
33886 case V4DF_FTYPE_V4DF_V2DF_INT:
33887 case V4SF_FTYPE_V4SF_V4SF_INT:
33888 case V2DI_FTYPE_V2DI_V2DI_INT:
33889 case V4DI_FTYPE_V4DI_V2DI_INT:
33890 case V2DF_FTYPE_V2DF_V2DF_INT:
33891 case UQI_FTYPE_V8DI_V8UDI_INT:
33892 case UQI_FTYPE_V8DF_V8DF_INT:
33893 case UQI_FTYPE_V2DF_V2DF_INT:
33894 case UQI_FTYPE_V4SF_V4SF_INT:
33895 case UHI_FTYPE_V16SI_V16SI_INT:
33896 case UHI_FTYPE_V16SF_V16SF_INT:
33897 case V64QI_FTYPE_V64QI_V64QI_INT:
33898 case V32HI_FTYPE_V32HI_V32HI_INT:
33899 case V16SI_FTYPE_V16SI_V16SI_INT:
33900 case V8DI_FTYPE_V8DI_V8DI_INT:
33901 nargs = 3;
33902 nargs_constant = 1;
33903 break;
33904 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33905 nargs = 3;
33906 rmode = V4DImode;
33907 nargs_constant = 1;
33908 break;
33909 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33910 nargs = 3;
33911 rmode = V2DImode;
33912 nargs_constant = 1;
33913 break;
33914 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33915 nargs = 3;
33916 rmode = DImode;
33917 nargs_constant = 1;
33918 break;
33919 case V2DI_FTYPE_V2DI_UINT_UINT:
33920 nargs = 3;
33921 nargs_constant = 2;
33922 break;
33923 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33924 nargs = 3;
33925 rmode = V8DImode;
33926 nargs_constant = 1;
33927 break;
33928 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33929 nargs = 5;
33930 rmode = V8DImode;
33931 mask_pos = 2;
33932 nargs_constant = 1;
33933 break;
33934 case QI_FTYPE_V8DF_INT_UQI:
33935 case QI_FTYPE_V4DF_INT_UQI:
33936 case QI_FTYPE_V2DF_INT_UQI:
33937 case HI_FTYPE_V16SF_INT_UHI:
33938 case QI_FTYPE_V8SF_INT_UQI:
33939 case QI_FTYPE_V4SF_INT_UQI:
33940 nargs = 3;
33941 mask_pos = 1;
33942 nargs_constant = 1;
33943 break;
33944 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33945 nargs = 5;
33946 rmode = V4DImode;
33947 mask_pos = 2;
33948 nargs_constant = 1;
33949 break;
33950 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33951 nargs = 5;
33952 rmode = V2DImode;
33953 mask_pos = 2;
33954 nargs_constant = 1;
33955 break;
33956 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33957 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33958 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33959 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33960 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33961 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33962 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33963 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33964 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33965 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33966 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33967 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33968 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33969 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33970 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33971 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33972 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33973 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33974 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33975 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33976 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33977 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33978 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33979 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33980 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33981 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33982 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33983 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33984 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33985 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33986 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33987 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33988 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33989 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33990 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33991 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33992 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33993 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33994 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33995 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33996 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33997 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33998 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
33999 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34000 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34001 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34002 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34003 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34004 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34005 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34006 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34007 nargs = 4;
34008 break;
34009 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34010 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34011 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34012 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34013 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34014 nargs = 4;
34015 nargs_constant = 1;
34016 break;
34017 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34018 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34019 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34020 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34021 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34022 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34023 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34024 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34025 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34026 case USI_FTYPE_V32QI_V32QI_INT_USI:
34027 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34028 case USI_FTYPE_V32HI_V32HI_INT_USI:
34029 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34030 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34031 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34032 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34033 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34034 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34035 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34036 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34037 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34038 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34039 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34040 nargs = 4;
34041 mask_pos = 1;
34042 nargs_constant = 1;
34043 break;
34044 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34045 nargs = 4;
34046 nargs_constant = 2;
34047 break;
34048 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34049 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34050 nargs = 4;
34051 break;
34052 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34053 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34054 mask_pos = 1;
34055 nargs = 4;
34056 nargs_constant = 1;
34057 break;
34058 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34059 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34060 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34061 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34062 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34063 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34064 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34065 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34066 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34067 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34068 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34069 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34070 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34071 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34072 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34073 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34074 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34075 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34076 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34077 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34078 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34079 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34080 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34081 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34082 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34083 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34084 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34085 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34086 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34087 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34088 nargs = 4;
34089 mask_pos = 2;
34090 nargs_constant = 1;
34091 break;
34092 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34093 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34094 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34095 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34096 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34097 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34098 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34099 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34100 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34101 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34102 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34103 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34104 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34105 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34106 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34107 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34108 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34109 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34110 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34111 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34112 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34113 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34114 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34115 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34116 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34117 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34118 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34119 nargs = 5;
34120 mask_pos = 2;
34121 nargs_constant = 1;
34122 break;
34123 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34124 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34125 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34126 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34127 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34128 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34129 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34130 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34131 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34132 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34133 nargs = 5;
34134 mask_pos = 1;
34135 nargs_constant = 1;
34136 break;
34137 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34138 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34139 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34140 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34141 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34142 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34143 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34144 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34145 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34146 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34147 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34148 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34149 nargs = 5;
34150 mask_pos = 1;
34151 nargs_constant = 2;
34152 break;
34154 default:
34155 gcc_unreachable ();
34158 gcc_assert (nargs <= ARRAY_SIZE (args));
34160 if (comparison != UNKNOWN)
34162 gcc_assert (nargs == 2);
34163 return ix86_expand_sse_compare (d, exp, target, swap);
34166 if (rmode == VOIDmode || rmode == tmode)
34168 if (optimize
34169 || target == 0
34170 || GET_MODE (target) != tmode
34171 || !insn_p->operand[0].predicate (target, tmode))
34172 target = gen_reg_rtx (tmode);
34173 else if (memory_operand (target, tmode))
34174 num_memory++;
34175 real_target = target;
34177 else
34179 real_target = gen_reg_rtx (tmode);
34180 target = lowpart_subreg (rmode, real_target, tmode);
34183 for (i = 0; i < nargs; i++)
34185 tree arg = CALL_EXPR_ARG (exp, i);
34186 rtx op = expand_normal (arg);
34187 machine_mode mode = insn_p->operand[i + 1].mode;
34188 bool match = insn_p->operand[i + 1].predicate (op, mode);
34190 if (second_arg_count && i == 1)
34192 /* SIMD shift insns take either an 8-bit immediate or a
34193 register as the count. But the builtin functions take int as
34194 the count. If the count doesn't match, we put it in a register.
34195 The instructions use a 64-bit count; if op is only
34196 32-bit, zero-extend it, since negative shift counts
34197 are undefined behavior and zero-extension is more
34198 efficient. */
34199 if (!match)
34201 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34202 op = convert_modes (mode, GET_MODE (op), op, 1);
34203 else
34204 op = lowpart_subreg (mode, op, GET_MODE (op));
34205 if (!insn_p->operand[i + 1].predicate (op, mode))
34206 op = copy_to_reg (op);
34209 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34210 (!mask_pos && (nargs - i) <= nargs_constant))
34212 if (!match)
34213 switch (icode)
34215 case CODE_FOR_avx_vinsertf128v4di:
34216 case CODE_FOR_avx_vextractf128v4di:
34217 error ("the last argument must be an 1-bit immediate");
34218 return const0_rtx;
34220 case CODE_FOR_avx512f_cmpv8di3_mask:
34221 case CODE_FOR_avx512f_cmpv16si3_mask:
34222 case CODE_FOR_avx512f_ucmpv8di3_mask:
34223 case CODE_FOR_avx512f_ucmpv16si3_mask:
34224 case CODE_FOR_avx512vl_cmpv4di3_mask:
34225 case CODE_FOR_avx512vl_cmpv8si3_mask:
34226 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34227 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34228 case CODE_FOR_avx512vl_cmpv2di3_mask:
34229 case CODE_FOR_avx512vl_cmpv4si3_mask:
34230 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34231 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34232 error ("the last argument must be a 3-bit immediate");
34233 return const0_rtx;
34235 case CODE_FOR_sse4_1_roundsd:
34236 case CODE_FOR_sse4_1_roundss:
34238 case CODE_FOR_sse4_1_roundpd:
34239 case CODE_FOR_sse4_1_roundps:
34240 case CODE_FOR_avx_roundpd256:
34241 case CODE_FOR_avx_roundps256:
34243 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34244 case CODE_FOR_sse4_1_roundps_sfix:
34245 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34246 case CODE_FOR_avx_roundps_sfix256:
34248 case CODE_FOR_sse4_1_blendps:
34249 case CODE_FOR_avx_blendpd256:
34250 case CODE_FOR_avx_vpermilv4df:
34251 case CODE_FOR_avx_vpermilv4df_mask:
34252 case CODE_FOR_avx512f_getmantv8df_mask:
34253 case CODE_FOR_avx512f_getmantv16sf_mask:
34254 case CODE_FOR_avx512vl_getmantv8sf_mask:
34255 case CODE_FOR_avx512vl_getmantv4df_mask:
34256 case CODE_FOR_avx512vl_getmantv4sf_mask:
34257 case CODE_FOR_avx512vl_getmantv2df_mask:
34258 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34259 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34260 case CODE_FOR_avx512dq_rangepv4df_mask:
34261 case CODE_FOR_avx512dq_rangepv8sf_mask:
34262 case CODE_FOR_avx512dq_rangepv2df_mask:
34263 case CODE_FOR_avx512dq_rangepv4sf_mask:
34264 case CODE_FOR_avx_shufpd256_mask:
34265 error ("the last argument must be a 4-bit immediate");
34266 return const0_rtx;
34268 case CODE_FOR_sha1rnds4:
34269 case CODE_FOR_sse4_1_blendpd:
34270 case CODE_FOR_avx_vpermilv2df:
34271 case CODE_FOR_avx_vpermilv2df_mask:
34272 case CODE_FOR_xop_vpermil2v2df3:
34273 case CODE_FOR_xop_vpermil2v4sf3:
34274 case CODE_FOR_xop_vpermil2v4df3:
34275 case CODE_FOR_xop_vpermil2v8sf3:
34276 case CODE_FOR_avx512f_vinsertf32x4_mask:
34277 case CODE_FOR_avx512f_vinserti32x4_mask:
34278 case CODE_FOR_avx512f_vextractf32x4_mask:
34279 case CODE_FOR_avx512f_vextracti32x4_mask:
34280 case CODE_FOR_sse2_shufpd:
34281 case CODE_FOR_sse2_shufpd_mask:
34282 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34283 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34284 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34285 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34286 error ("the last argument must be a 2-bit immediate");
34287 return const0_rtx;
34289 case CODE_FOR_avx_vextractf128v4df:
34290 case CODE_FOR_avx_vextractf128v8sf:
34291 case CODE_FOR_avx_vextractf128v8si:
34292 case CODE_FOR_avx_vinsertf128v4df:
34293 case CODE_FOR_avx_vinsertf128v8sf:
34294 case CODE_FOR_avx_vinsertf128v8si:
34295 case CODE_FOR_avx512f_vinsertf64x4_mask:
34296 case CODE_FOR_avx512f_vinserti64x4_mask:
34297 case CODE_FOR_avx512f_vextractf64x4_mask:
34298 case CODE_FOR_avx512f_vextracti64x4_mask:
34299 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34300 case CODE_FOR_avx512dq_vinserti32x8_mask:
34301 case CODE_FOR_avx512vl_vinsertv4df:
34302 case CODE_FOR_avx512vl_vinsertv4di:
34303 case CODE_FOR_avx512vl_vinsertv8sf:
34304 case CODE_FOR_avx512vl_vinsertv8si:
34305 error ("the last argument must be a 1-bit immediate");
34306 return const0_rtx;
34308 case CODE_FOR_avx_vmcmpv2df3:
34309 case CODE_FOR_avx_vmcmpv4sf3:
34310 case CODE_FOR_avx_cmpv2df3:
34311 case CODE_FOR_avx_cmpv4sf3:
34312 case CODE_FOR_avx_cmpv4df3:
34313 case CODE_FOR_avx_cmpv8sf3:
34314 case CODE_FOR_avx512f_cmpv8df3_mask:
34315 case CODE_FOR_avx512f_cmpv16sf3_mask:
34316 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34317 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34318 error ("the last argument must be a 5-bit immediate");
34319 return const0_rtx;
34321 default:
34322 switch (nargs_constant)
34324 case 2:
34325 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
34326 || (!mask_pos && (nargs - i) == nargs_constant))
34328 error ("the next to last argument must be an 8-bit immediate");
34329 break;
34331 /* FALLTHRU */
34332 case 1:
34333 error ("the last argument must be an 8-bit immediate");
34334 break;
34335 default:
34336 gcc_unreachable ();
34338 return const0_rtx;
34341 else
34343 if (VECTOR_MODE_P (mode))
34344 op = safe_vector_operand (op, mode);
34346 /* If we aren't optimizing, only allow one memory operand to
34347 be generated. */
34348 if (memory_operand (op, mode))
34349 num_memory++;
34351 op = fixup_modeless_constant (op, mode);
34353 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34355 if (optimize || !match || num_memory > 1)
34356 op = copy_to_mode_reg (mode, op);
34358 else
34360 op = copy_to_reg (op);
34361 op = lowpart_subreg (mode, op, GET_MODE (op));
34365 args[i].op = op;
34366 args[i].mode = mode;
34369 switch (nargs)
34371 case 1:
34372 pat = GEN_FCN (icode) (real_target, args[0].op);
34373 break;
34374 case 2:
34375 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34376 break;
34377 case 3:
34378 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34379 args[2].op);
34380 break;
34381 case 4:
34382 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34383 args[2].op, args[3].op);
34384 break;
34385 case 5:
34386 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34387 args[2].op, args[3].op, args[4].op);
34388 break;
34389 case 6:
34390 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34391 args[2].op, args[3].op, args[4].op,
34392 args[5].op);
34393 break;
34394 default:
34395 gcc_unreachable ();
34398 if (! pat)
34399 return 0;
34401 emit_insn (pat);
34402 return target;
34405 /* Transform a pattern of the following layout:
34406 (set A
34407 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34409 into:
34410 (set A B)  */
34412 static rtx
34413 ix86_erase_embedded_rounding (rtx pat)
34415 if (GET_CODE (pat) == INSN)
34416 pat = PATTERN (pat);
34418 gcc_assert (GET_CODE (pat) == SET);
34419 rtx src = SET_SRC (pat);
34420 gcc_assert (XVECLEN (src, 0) == 2);
34421 rtx p0 = XVECEXP (src, 0, 0);
34422 gcc_assert (GET_CODE (src) == UNSPEC
34423 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34424 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34425 return res;
34428 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34429 with rounding. */
34430 static rtx
34431 ix86_expand_sse_comi_round (const struct builtin_description *d,
34432 tree exp, rtx target)
34434 rtx pat, set_dst;
34435 tree arg0 = CALL_EXPR_ARG (exp, 0);
34436 tree arg1 = CALL_EXPR_ARG (exp, 1);
34437 tree arg2 = CALL_EXPR_ARG (exp, 2);
34438 tree arg3 = CALL_EXPR_ARG (exp, 3);
34439 rtx op0 = expand_normal (arg0);
34440 rtx op1 = expand_normal (arg1);
34441 rtx op2 = expand_normal (arg2);
34442 rtx op3 = expand_normal (arg3);
34443 enum insn_code icode = d->icode;
34444 const struct insn_data_d *insn_p = &insn_data[icode];
34445 machine_mode mode0 = insn_p->operand[0].mode;
34446 machine_mode mode1 = insn_p->operand[1].mode;
34447 enum rtx_code comparison = UNEQ;
34448 bool need_ucomi = false;
34450 /* See avxintrin.h for values. */
34451 enum rtx_code comi_comparisons[32] =
34453 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34454 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34455 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34457 bool need_ucomi_values[32] =
34459 true, false, false, true, true, false, false, true,
34460 true, false, false, true, true, false, false, true,
34461 false, true, true, false, false, true, true, false,
34462 false, true, true, false, false, true, true, false
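/* Note on the two tables above: COMI_COMPARISONS maps each of the 32
   _CMP_* predicate values (see avxintrin.h) to the RTL comparison code
   applied to the comi/ucomi flags result below, and NEED_UCOMI_VALUES
   marks the predicates for which the non-signalling ucomi form of the
   instruction is used instead of comi.  */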
34465 if (!CONST_INT_P (op2))
34467 error ("the third argument must be a comparison constant");
34468 return const0_rtx;
34470 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34472 error ("incorrect comparison mode");
34473 return const0_rtx;
34476 if (!insn_p->operand[2].predicate (op3, SImode))
34478 error ("incorrect rounding operand");
34479 return const0_rtx;
34482 comparison = comi_comparisons[INTVAL (op2)];
34483 need_ucomi = need_ucomi_values[INTVAL (op2)];
34485 if (VECTOR_MODE_P (mode0))
34486 op0 = safe_vector_operand (op0, mode0);
34487 if (VECTOR_MODE_P (mode1))
34488 op1 = safe_vector_operand (op1, mode1);
34490 target = gen_reg_rtx (SImode);
34491 emit_move_insn (target, const0_rtx);
34492 target = gen_rtx_SUBREG (QImode, target, 0);
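/* The comparison result is produced as a single byte: TARGET is the
   QImode low part of a zeroed SImode register, and the STRICT_LOW_PART
   set emitted at the end of this function writes only that byte, so
   SUBREG_REG (target) is returned already zero-extended to SImode.  */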
34494 if ((optimize && !register_operand (op0, mode0))
34495 || !insn_p->operand[0].predicate (op0, mode0))
34496 op0 = copy_to_mode_reg (mode0, op0);
34497 if ((optimize && !register_operand (op1, mode1))
34498 || !insn_p->operand[1].predicate (op1, mode1))
34499 op1 = copy_to_mode_reg (mode1, op1);
34501 if (need_ucomi)
34502 icode = icode == CODE_FOR_sse_comi_round
34503 ? CODE_FOR_sse_ucomi_round
34504 : CODE_FOR_sse2_ucomi_round;
34506 pat = GEN_FCN (icode) (op0, op1, op3);
34507 if (! pat)
34508 return 0;
34510 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34511 if (INTVAL (op3) == NO_ROUND)
34513 pat = ix86_erase_embedded_rounding (pat);
34514 if (! pat)
34515 return 0;
34517 set_dst = SET_DEST (pat);
34519 else
34521 gcc_assert (GET_CODE (pat) == SET);
34522 set_dst = SET_DEST (pat);
34525 emit_insn (pat);
34526 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34527 gen_rtx_fmt_ee (comparison, QImode,
34528 set_dst,
34529 const0_rtx)));
34531 return SUBREG_REG (target);
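/* Subroutine of ix86_expand_builtin to take care of insns with an
   embedded rounding/SAE operand: the last builtin argument is the
   rounding immediate, and when it is NO_ROUND the embedded-rounding
   unspec is stripped again via ix86_erase_embedded_rounding.  */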
34534 static rtx
34535 ix86_expand_round_builtin (const struct builtin_description *d,
34536 tree exp, rtx target)
34538 rtx pat;
34539 unsigned int i, nargs;
34540 struct
34542 rtx op;
34543 machine_mode mode;
34544 } args[6];
34545 enum insn_code icode = d->icode;
34546 const struct insn_data_d *insn_p = &insn_data[icode];
34547 machine_mode tmode = insn_p->operand[0].mode;
34548 unsigned int nargs_constant = 0;
34549 unsigned int redundant_embed_rnd = 0;
34551 switch ((enum ix86_builtin_func_type) d->flag)
34553 case UINT64_FTYPE_V2DF_INT:
34554 case UINT64_FTYPE_V4SF_INT:
34555 case UINT_FTYPE_V2DF_INT:
34556 case UINT_FTYPE_V4SF_INT:
34557 case INT64_FTYPE_V2DF_INT:
34558 case INT64_FTYPE_V4SF_INT:
34559 case INT_FTYPE_V2DF_INT:
34560 case INT_FTYPE_V4SF_INT:
34561 nargs = 2;
34562 break;
34563 case V4SF_FTYPE_V4SF_UINT_INT:
34564 case V4SF_FTYPE_V4SF_UINT64_INT:
34565 case V2DF_FTYPE_V2DF_UINT64_INT:
34566 case V4SF_FTYPE_V4SF_INT_INT:
34567 case V4SF_FTYPE_V4SF_INT64_INT:
34568 case V2DF_FTYPE_V2DF_INT64_INT:
34569 case V4SF_FTYPE_V4SF_V4SF_INT:
34570 case V2DF_FTYPE_V2DF_V2DF_INT:
34571 case V4SF_FTYPE_V4SF_V2DF_INT:
34572 case V2DF_FTYPE_V2DF_V4SF_INT:
34573 nargs = 3;
34574 break;
34575 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34576 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34577 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34578 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34579 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34580 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34581 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34582 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34583 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34584 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34585 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34586 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34587 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34588 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34589 nargs = 4;
34590 break;
34591 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34592 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34593 nargs_constant = 2;
34594 nargs = 4;
34595 break;
34596 case INT_FTYPE_V4SF_V4SF_INT_INT:
34597 case INT_FTYPE_V2DF_V2DF_INT_INT:
34598 return ix86_expand_sse_comi_round (d, exp, target);
34599 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34600 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34601 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34602 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34603 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34604 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34605 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34606 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34607 nargs = 5;
34608 break;
34609 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34610 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34611 nargs_constant = 4;
34612 nargs = 5;
34613 break;
34614 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34615 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34616 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34617 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34618 nargs_constant = 3;
34619 nargs = 5;
34620 break;
34621 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34622 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34623 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34624 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34625 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34626 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34627 nargs = 6;
34628 nargs_constant = 4;
34629 break;
34630 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34631 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34632 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34633 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34634 nargs = 6;
34635 nargs_constant = 3;
34636 break;
34637 default:
34638 gcc_unreachable ();
34640 gcc_assert (nargs <= ARRAY_SIZE (args));
34642 if (optimize
34643 || target == 0
34644 || GET_MODE (target) != tmode
34645 || !insn_p->operand[0].predicate (target, tmode))
34646 target = gen_reg_rtx (tmode);
34648 for (i = 0; i < nargs; i++)
34650 tree arg = CALL_EXPR_ARG (exp, i);
34651 rtx op = expand_normal (arg);
34652 machine_mode mode = insn_p->operand[i + 1].mode;
34653 bool match = insn_p->operand[i + 1].predicate (op, mode);
34655 if (i == nargs - nargs_constant)
34657 if (!match)
34659 switch (icode)
34661 case CODE_FOR_avx512f_getmantv8df_mask_round:
34662 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34663 case CODE_FOR_avx512f_vgetmantv2df_round:
34664 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34665 case CODE_FOR_avx512f_vgetmantv4sf_round:
34666 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34667 error ("the immediate argument must be a 4-bit immediate");
34668 return const0_rtx;
34669 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34670 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34671 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34672 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34673 error ("the immediate argument must be a 5-bit immediate");
34674 return const0_rtx;
34675 default:
34676 error ("the immediate argument must be an 8-bit immediate");
34677 return const0_rtx;
34681 else if (i == nargs-1)
34683 if (!insn_p->operand[nargs].predicate (op, SImode))
34685 error ("incorrect rounding operand");
34686 return const0_rtx;
34689 /* If there is no rounding, use the normal version of the pattern. */
34690 if (INTVAL (op) == NO_ROUND)
34691 redundant_embed_rnd = 1;
34693 else
34695 if (VECTOR_MODE_P (mode))
34696 op = safe_vector_operand (op, mode);
34698 op = fixup_modeless_constant (op, mode);
34700 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34702 if (optimize || !match)
34703 op = copy_to_mode_reg (mode, op);
34705 else
34707 op = copy_to_reg (op);
34708 op = lowpart_subreg (mode, op, GET_MODE (op));
34712 args[i].op = op;
34713 args[i].mode = mode;
34716 switch (nargs)
34718 case 1:
34719 pat = GEN_FCN (icode) (target, args[0].op);
34720 break;
34721 case 2:
34722 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34723 break;
34724 case 3:
34725 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34726 args[2].op);
34727 break;
34728 case 4:
34729 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34730 args[2].op, args[3].op);
34731 break;
34732 case 5:
34733 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34734 args[2].op, args[3].op, args[4].op);
34735 break;
34736 case 6:
34737 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34738 args[2].op, args[3].op, args[4].op,
34739 args[5].op);
34740 break;
34741 default:
34742 gcc_unreachable ();
34745 if (!pat)
34746 return 0;
34748 if (redundant_embed_rnd)
34749 pat = ix86_erase_embedded_rounding (pat);
34751 emit_insn (pat);
34752 return target;
34755 /* Subroutine of ix86_expand_builtin to take care of special insns
34756 with variable number of operands. */
34758 static rtx
34759 ix86_expand_special_args_builtin (const struct builtin_description *d,
34760 tree exp, rtx target)
34762 tree arg;
34763 rtx pat, op;
34764 unsigned int i, nargs, arg_adjust, memory;
34765 bool aligned_mem = false;
34766 struct
34768 rtx op;
34769 machine_mode mode;
34770 } args[3];
34771 enum insn_code icode = d->icode;
34772 bool last_arg_constant = false;
34773 const struct insn_data_d *insn_p = &insn_data[icode];
34774 machine_mode tmode = insn_p->operand[0].mode;
34775 enum { load, store } klass;
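/* KLASS describes the direction of the builtin: for LOAD builtins the
   insn writes its result into TARGET, which is returned; for STORE
   builtins the first call argument supplies the destination operand
   and the expansion returns 0.  */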
34777 switch ((enum ix86_builtin_func_type) d->flag)
34779 case VOID_FTYPE_VOID:
34780 emit_insn (GEN_FCN (icode) (target));
34781 return 0;
34782 case VOID_FTYPE_UINT64:
34783 case VOID_FTYPE_UNSIGNED:
34784 nargs = 0;
34785 klass = store;
34786 memory = 0;
34787 break;
34789 case INT_FTYPE_VOID:
34790 case USHORT_FTYPE_VOID:
34791 case UINT64_FTYPE_VOID:
34792 case UNSIGNED_FTYPE_VOID:
34793 nargs = 0;
34794 klass = load;
34795 memory = 0;
34796 break;
34797 case UINT64_FTYPE_PUNSIGNED:
34798 case V2DI_FTYPE_PV2DI:
34799 case V4DI_FTYPE_PV4DI:
34800 case V32QI_FTYPE_PCCHAR:
34801 case V16QI_FTYPE_PCCHAR:
34802 case V8SF_FTYPE_PCV4SF:
34803 case V8SF_FTYPE_PCFLOAT:
34804 case V4SF_FTYPE_PCFLOAT:
34805 case V4DF_FTYPE_PCV2DF:
34806 case V4DF_FTYPE_PCDOUBLE:
34807 case V2DF_FTYPE_PCDOUBLE:
34808 case VOID_FTYPE_PVOID:
34809 case V8DI_FTYPE_PV8DI:
34810 nargs = 1;
34811 klass = load;
34812 memory = 0;
34813 switch (icode)
34815 case CODE_FOR_sse4_1_movntdqa:
34816 case CODE_FOR_avx2_movntdqa:
34817 case CODE_FOR_avx512f_movntdqa:
34818 aligned_mem = true;
34819 break;
34820 default:
34821 break;
34823 break;
34824 case VOID_FTYPE_PV2SF_V4SF:
34825 case VOID_FTYPE_PV8DI_V8DI:
34826 case VOID_FTYPE_PV4DI_V4DI:
34827 case VOID_FTYPE_PV2DI_V2DI:
34828 case VOID_FTYPE_PCHAR_V32QI:
34829 case VOID_FTYPE_PCHAR_V16QI:
34830 case VOID_FTYPE_PFLOAT_V16SF:
34831 case VOID_FTYPE_PFLOAT_V8SF:
34832 case VOID_FTYPE_PFLOAT_V4SF:
34833 case VOID_FTYPE_PDOUBLE_V8DF:
34834 case VOID_FTYPE_PDOUBLE_V4DF:
34835 case VOID_FTYPE_PDOUBLE_V2DF:
34836 case VOID_FTYPE_PLONGLONG_LONGLONG:
34837 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34838 case VOID_FTYPE_PINT_INT:
34839 nargs = 1;
34840 klass = store;
34841 /* Reserve memory operand for target. */
34842 memory = ARRAY_SIZE (args);
34843 switch (icode)
34845 /* These builtins and instructions require the memory
34846 to be properly aligned. */
34847 case CODE_FOR_avx_movntv4di:
34848 case CODE_FOR_sse2_movntv2di:
34849 case CODE_FOR_avx_movntv8sf:
34850 case CODE_FOR_sse_movntv4sf:
34851 case CODE_FOR_sse4a_vmmovntv4sf:
34852 case CODE_FOR_avx_movntv4df:
34853 case CODE_FOR_sse2_movntv2df:
34854 case CODE_FOR_sse4a_vmmovntv2df:
34855 case CODE_FOR_sse2_movntidi:
34856 case CODE_FOR_sse_movntq:
34857 case CODE_FOR_sse2_movntisi:
34858 case CODE_FOR_avx512f_movntv16sf:
34859 case CODE_FOR_avx512f_movntv8df:
34860 case CODE_FOR_avx512f_movntv8di:
34861 aligned_mem = true;
34862 break;
34863 default:
34864 break;
34866 break;
34867 case V4SF_FTYPE_V4SF_PCV2SF:
34868 case V2DF_FTYPE_V2DF_PCDOUBLE:
34869 nargs = 2;
34870 klass = load;
34871 memory = 1;
34872 break;
34873 case V8SF_FTYPE_PCV8SF_V8SI:
34874 case V4DF_FTYPE_PCV4DF_V4DI:
34875 case V4SF_FTYPE_PCV4SF_V4SI:
34876 case V2DF_FTYPE_PCV2DF_V2DI:
34877 case V8SI_FTYPE_PCV8SI_V8SI:
34878 case V4DI_FTYPE_PCV4DI_V4DI:
34879 case V4SI_FTYPE_PCV4SI_V4SI:
34880 case V2DI_FTYPE_PCV2DI_V2DI:
34881 case VOID_FTYPE_INT_INT64:
34882 nargs = 2;
34883 klass = load;
34884 memory = 0;
34885 break;
34886 case VOID_FTYPE_PV8DF_V8DF_UQI:
34887 case VOID_FTYPE_PV4DF_V4DF_UQI:
34888 case VOID_FTYPE_PV2DF_V2DF_UQI:
34889 case VOID_FTYPE_PV16SF_V16SF_UHI:
34890 case VOID_FTYPE_PV8SF_V8SF_UQI:
34891 case VOID_FTYPE_PV4SF_V4SF_UQI:
34892 case VOID_FTYPE_PV8DI_V8DI_UQI:
34893 case VOID_FTYPE_PV4DI_V4DI_UQI:
34894 case VOID_FTYPE_PV2DI_V2DI_UQI:
34895 case VOID_FTYPE_PV16SI_V16SI_UHI:
34896 case VOID_FTYPE_PV8SI_V8SI_UQI:
34897 case VOID_FTYPE_PV4SI_V4SI_UQI:
34898 case VOID_FTYPE_PV64QI_V64QI_UDI:
34899 case VOID_FTYPE_PV32HI_V32HI_USI:
34900 case VOID_FTYPE_PV32QI_V32QI_USI:
34901 case VOID_FTYPE_PV16QI_V16QI_UHI:
34902 case VOID_FTYPE_PV16HI_V16HI_UHI:
34903 case VOID_FTYPE_PV8HI_V8HI_UQI:
34904 switch (icode)
34906 /* These builtins and instructions require the memory
34907 to be properly aligned. */
34908 case CODE_FOR_avx512f_storev16sf_mask:
34909 case CODE_FOR_avx512f_storev16si_mask:
34910 case CODE_FOR_avx512f_storev8df_mask:
34911 case CODE_FOR_avx512f_storev8di_mask:
34912 case CODE_FOR_avx512vl_storev8sf_mask:
34913 case CODE_FOR_avx512vl_storev8si_mask:
34914 case CODE_FOR_avx512vl_storev4df_mask:
34915 case CODE_FOR_avx512vl_storev4di_mask:
34916 case CODE_FOR_avx512vl_storev4sf_mask:
34917 case CODE_FOR_avx512vl_storev4si_mask:
34918 case CODE_FOR_avx512vl_storev2df_mask:
34919 case CODE_FOR_avx512vl_storev2di_mask:
34920 aligned_mem = true;
34921 break;
34922 default:
34923 break;
34925 /* FALLTHRU */
34926 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34927 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34928 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34929 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34930 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34931 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34932 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34933 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34934 case VOID_FTYPE_PV8SI_V8DI_UQI:
34935 case VOID_FTYPE_PV8HI_V8DI_UQI:
34936 case VOID_FTYPE_PV16HI_V16SI_UHI:
34937 case VOID_FTYPE_PV16QI_V8DI_UQI:
34938 case VOID_FTYPE_PV16QI_V16SI_UHI:
34939 case VOID_FTYPE_PV4SI_V4DI_UQI:
34940 case VOID_FTYPE_PV4SI_V2DI_UQI:
34941 case VOID_FTYPE_PV8HI_V4DI_UQI:
34942 case VOID_FTYPE_PV8HI_V2DI_UQI:
34943 case VOID_FTYPE_PV8HI_V8SI_UQI:
34944 case VOID_FTYPE_PV8HI_V4SI_UQI:
34945 case VOID_FTYPE_PV16QI_V4DI_UQI:
34946 case VOID_FTYPE_PV16QI_V2DI_UQI:
34947 case VOID_FTYPE_PV16QI_V8SI_UQI:
34948 case VOID_FTYPE_PV16QI_V4SI_UQI:
34949 case VOID_FTYPE_PCHAR_V64QI_UDI:
34950 case VOID_FTYPE_PCHAR_V32QI_USI:
34951 case VOID_FTYPE_PCHAR_V16QI_UHI:
34952 case VOID_FTYPE_PSHORT_V32HI_USI:
34953 case VOID_FTYPE_PSHORT_V16HI_UHI:
34954 case VOID_FTYPE_PSHORT_V8HI_UQI:
34955 case VOID_FTYPE_PINT_V16SI_UHI:
34956 case VOID_FTYPE_PINT_V8SI_UQI:
34957 case VOID_FTYPE_PINT_V4SI_UQI:
34958 case VOID_FTYPE_PINT64_V8DI_UQI:
34959 case VOID_FTYPE_PINT64_V4DI_UQI:
34960 case VOID_FTYPE_PINT64_V2DI_UQI:
34961 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34962 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34963 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34964 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34965 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34966 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34967 case VOID_FTYPE_PV32QI_V32HI_USI:
34968 case VOID_FTYPE_PV16QI_V16HI_UHI:
34969 case VOID_FTYPE_PV8QI_V8HI_UQI:
34970 nargs = 2;
34971 klass = store;
34972 /* Reserve memory operand for target. */
34973 memory = ARRAY_SIZE (args);
34974 break;
34975 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34976 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34977 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34978 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34979 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34980 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34981 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34982 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34983 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34984 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34985 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34986 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34987 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
34988 case V32HI_FTYPE_PCV32HI_V32HI_USI:
34989 case V32QI_FTYPE_PCV32QI_V32QI_USI:
34990 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
34991 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
34992 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
34993 switch (icode)
34995 /* These builtins and instructions require the memory
34996 to be properly aligned. */
34997 case CODE_FOR_avx512f_loadv16sf_mask:
34998 case CODE_FOR_avx512f_loadv16si_mask:
34999 case CODE_FOR_avx512f_loadv8df_mask:
35000 case CODE_FOR_avx512f_loadv8di_mask:
35001 case CODE_FOR_avx512vl_loadv8sf_mask:
35002 case CODE_FOR_avx512vl_loadv8si_mask:
35003 case CODE_FOR_avx512vl_loadv4df_mask:
35004 case CODE_FOR_avx512vl_loadv4di_mask:
35005 case CODE_FOR_avx512vl_loadv4sf_mask:
35006 case CODE_FOR_avx512vl_loadv4si_mask:
35007 case CODE_FOR_avx512vl_loadv2df_mask:
35008 case CODE_FOR_avx512vl_loadv2di_mask:
35009 case CODE_FOR_avx512bw_loadv64qi_mask:
35010 case CODE_FOR_avx512vl_loadv32qi_mask:
35011 case CODE_FOR_avx512vl_loadv16qi_mask:
35012 case CODE_FOR_avx512bw_loadv32hi_mask:
35013 case CODE_FOR_avx512vl_loadv16hi_mask:
35014 case CODE_FOR_avx512vl_loadv8hi_mask:
35015 aligned_mem = true;
35016 break;
35017 default:
35018 break;
35020 /* FALLTHRU */
35021 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35022 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35023 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35024 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35025 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35026 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35027 case V16SI_FTYPE_PCINT_V16SI_UHI:
35028 case V8SI_FTYPE_PCINT_V8SI_UQI:
35029 case V4SI_FTYPE_PCINT_V4SI_UQI:
35030 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35031 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35032 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35033 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35034 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35035 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35036 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35037 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35038 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35039 nargs = 3;
35040 klass = load;
35041 memory = 0;
35042 break;
35043 case VOID_FTYPE_UINT_UINT_UINT:
35044 case VOID_FTYPE_UINT64_UINT_UINT:
35045 case UCHAR_FTYPE_UINT_UINT_UINT:
35046 case UCHAR_FTYPE_UINT64_UINT_UINT:
35047 nargs = 3;
35048 klass = load;
35049 memory = ARRAY_SIZE (args);
35050 last_arg_constant = true;
35051 break;
35052 default:
35053 gcc_unreachable ();
35056 gcc_assert (nargs <= ARRAY_SIZE (args));
35058 if (klass == store)
35060 arg = CALL_EXPR_ARG (exp, 0);
35061 op = expand_normal (arg);
35062 gcc_assert (target == 0);
35063 if (memory)
35065 op = ix86_zero_extend_to_Pmode (op);
35066 target = gen_rtx_MEM (tmode, op);
35067 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35068 on it. Try to improve it using get_pointer_alignment,
35069 and if the special builtin is one that requires strict
35070 mode alignment, also from its GET_MODE_ALIGNMENT.
35071 Failure to do so could lead to ix86_legitimate_combined_insn
35072 rejecting all changes to such insns. */
35073 unsigned int align = get_pointer_alignment (arg);
35074 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35075 align = GET_MODE_ALIGNMENT (tmode);
35076 if (MEM_ALIGN (target) < align)
35077 set_mem_align (target, align);
35079 else
35080 target = force_reg (tmode, op);
35081 arg_adjust = 1;
35083 else
35085 arg_adjust = 0;
35086 if (optimize
35087 || target == 0
35088 || !register_operand (target, tmode)
35089 || GET_MODE (target) != tmode)
35090 target = gen_reg_rtx (tmode);
35093 for (i = 0; i < nargs; i++)
35095 machine_mode mode = insn_p->operand[i + 1].mode;
35096 bool match;
35098 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35099 op = expand_normal (arg);
35100 match = insn_p->operand[i + 1].predicate (op, mode);
35102 if (last_arg_constant && (i + 1) == nargs)
35104 if (!match)
35106 if (icode == CODE_FOR_lwp_lwpvalsi3
35107 || icode == CODE_FOR_lwp_lwpinssi3
35108 || icode == CODE_FOR_lwp_lwpvaldi3
35109 || icode == CODE_FOR_lwp_lwpinsdi3)
35110 error ("the last argument must be a 32-bit immediate");
35111 else
35112 error ("the last argument must be an 8-bit immediate");
35113 return const0_rtx;
35116 else
35118 if (i == memory)
35120 /* This must be the memory operand. */
35121 op = ix86_zero_extend_to_Pmode (op);
35122 op = gen_rtx_MEM (mode, op);
35123 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35124 on it. Try to improve it using get_pointer_alignment,
35125 and if the special builtin is one that requires strict
35126 mode alignment, also from its GET_MODE_ALIGNMENT.
35127 Failure to do so could lead to ix86_legitimate_combined_insn
35128 rejecting all changes to such insns. */
35129 unsigned int align = get_pointer_alignment (arg);
35130 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35131 align = GET_MODE_ALIGNMENT (mode);
35132 if (MEM_ALIGN (op) < align)
35133 set_mem_align (op, align);
35135 else
35137 /* This must be a register. */
35138 if (VECTOR_MODE_P (mode))
35139 op = safe_vector_operand (op, mode);
35141 op = fixup_modeless_constant (op, mode);
35143 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35144 op = copy_to_mode_reg (mode, op);
35145 else
35147 op = copy_to_reg (op);
35148 op = lowpart_subreg (mode, op, GET_MODE (op));
35153 args[i].op = op;
35154 args[i].mode = mode;
35157 switch (nargs)
35159 case 0:
35160 pat = GEN_FCN (icode) (target);
35161 break;
35162 case 1:
35163 pat = GEN_FCN (icode) (target, args[0].op);
35164 break;
35165 case 2:
35166 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35167 break;
35168 case 3:
35169 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35170 break;
35171 default:
35172 gcc_unreachable ();
35175 if (! pat)
35176 return 0;
35177 emit_insn (pat);
35178 return klass == store ? 0 : target;
35181 /* Return the integer constant in ARG. Constrain it to be in the range
35182 of the subparts of VEC_TYPE; issue an error if not. */
35184 static int
35185 get_element_number (tree vec_type, tree arg)
35187 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35189 if (!tree_fits_uhwi_p (arg)
35190 || (elt = tree_to_uhwi (arg), elt > max))
35192 error ("selector must be an integer constant in the range 0..%wi", max);
35193 return 0;
35196 return elt;
35199 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35200 ix86_expand_vector_init. We DO have language-level syntax for this, in
35201 the form of (type){ init-list }. Except that since we can't place emms
35202 instructions from inside the compiler, we can't allow the use of MMX
35203 registers unless the user explicitly asks for it. So we do *not* define
35204 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
35205 we have builtins invoked by mmintrin.h that give us license to emit
35206 these sorts of instructions. */
35208 static rtx
35209 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35211 machine_mode tmode = TYPE_MODE (type);
35212 machine_mode inner_mode = GET_MODE_INNER (tmode);
35213 int i, n_elt = GET_MODE_NUNITS (tmode);
35214 rtvec v = rtvec_alloc (n_elt);
35216 gcc_assert (VECTOR_MODE_P (tmode));
35217 gcc_assert (call_expr_nargs (exp) == n_elt);
35219 for (i = 0; i < n_elt; ++i)
35221 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35222 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35225 if (!target || !register_operand (target, tmode))
35226 target = gen_reg_rtx (tmode);
35228 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35229 return target;
35232 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35233 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35234 had a language-level syntax for referencing vector elements. */
35236 static rtx
35237 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35239 machine_mode tmode, mode0;
35240 tree arg0, arg1;
35241 int elt;
35242 rtx op0;
35244 arg0 = CALL_EXPR_ARG (exp, 0);
35245 arg1 = CALL_EXPR_ARG (exp, 1);
35247 op0 = expand_normal (arg0);
35248 elt = get_element_number (TREE_TYPE (arg0), arg1);
35250 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35251 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35252 gcc_assert (VECTOR_MODE_P (mode0));
35254 op0 = force_reg (mode0, op0);
35256 if (optimize || !target || !register_operand (target, tmode))
35257 target = gen_reg_rtx (tmode);
35259 ix86_expand_vector_extract (true, target, op0, elt);
35261 return target;
35264 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35265 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35266 a language-level syntax for referencing vector elements. */
35268 static rtx
35269 ix86_expand_vec_set_builtin (tree exp)
35271 machine_mode tmode, mode1;
35272 tree arg0, arg1, arg2;
35273 int elt;
35274 rtx op0, op1, target;
35276 arg0 = CALL_EXPR_ARG (exp, 0);
35277 arg1 = CALL_EXPR_ARG (exp, 1);
35278 arg2 = CALL_EXPR_ARG (exp, 2);
35280 tmode = TYPE_MODE (TREE_TYPE (arg0));
35281 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35282 gcc_assert (VECTOR_MODE_P (tmode));
35284 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35285 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35286 elt = get_element_number (TREE_TYPE (arg0), arg2);
35288 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35289 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35291 op0 = force_reg (tmode, op0);
35292 op1 = force_reg (mode1, op1);
35294 /* OP0 is the source of these builtin functions and shouldn't be
35295 modified. Create a copy, use it and return it as target. */
35296 target = gen_reg_rtx (tmode);
35297 emit_move_insn (target, op0);
35298 ix86_expand_vector_set (true, target, op1, elt);
35300 return target;
35303 /* Emit conditional move of SRC to DST with condition
35304 OP1 CODE OP2. */
35305 static void
35306 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35308 rtx t;
35310 if (TARGET_CMOVE)
35312 t = ix86_expand_compare (code, op1, op2);
35313 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35314 src, dst)));
35316 else
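/* Without CMOV, branch around the move instead: jump to NOMOVE when
   the reversed condition holds, i.e. when OP1 CODE OP2 is false.  */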
35318 rtx_code_label *nomove = gen_label_rtx ();
35319 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35320 const0_rtx, GET_MODE (op1), 1, nomove);
35321 emit_move_insn (dst, src);
35322 emit_label (nomove);
35326 /* Choose the max of DST and SRC and put it in DST. */
35327 static void
35328 ix86_emit_move_max (rtx dst, rtx src)
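/* I.e. DST = umax (DST, SRC): SRC is moved into DST only when
   DST < SRC as an unsigned comparison.  */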
35330 ix86_emit_cmove (dst, src, LTU, dst, src);
35333 /* Expand an expression EXP that calls a built-in function,
35334 with result going to TARGET if that's convenient
35335 (and in mode MODE if that's convenient).
35336 SUBTARGET may be used as the target for computing one of EXP's operands.
35337 IGNORE is nonzero if the value is to be ignored. */
35339 static rtx
35340 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35341 machine_mode mode, int ignore)
35343 size_t i;
35344 enum insn_code icode, icode2;
35345 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35346 tree arg0, arg1, arg2, arg3, arg4;
35347 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35348 machine_mode mode0, mode1, mode2, mode3, mode4;
35349 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35351 /* For CPU builtins that can be folded, fold first and expand the fold. */
35352 switch (fcode)
35354 case IX86_BUILTIN_CPU_INIT:
35356 /* Make it call __cpu_indicator_init in libgcc. */
35357 tree call_expr, fndecl, type;
35358 type = build_function_type_list (integer_type_node, NULL_TREE);
35359 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35360 call_expr = build_call_expr (fndecl, 0);
35361 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35363 case IX86_BUILTIN_CPU_IS:
35364 case IX86_BUILTIN_CPU_SUPPORTS:
35366 tree arg0 = CALL_EXPR_ARG (exp, 0);
35367 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35368 gcc_assert (fold_expr != NULL_TREE);
35369 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35373 /* Determine whether the builtin function is available under the current ISA.
35374 Originally the builtin was not created if it wasn't applicable to the
35375 current ISA based on the command line switches. With function specific
35376 options, we need to check in the context of the function making the call
35377 whether it is supported. Treat AVX512VL and MMX specially. For other flags,
35378 if isa includes more than one ISA bit, treat those as requiring any
35379 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
35380 ISAs. Likewise for MMX, require both MMX and the non-MMX ISAs.
35381 Similarly for 64BIT, but we shouldn't be building such builtins
35382 at all, -m64 is a whole TU option. */
35383 if (((ix86_builtins_isa[fcode].isa
35384 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35385 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI))
35386 && !(ix86_builtins_isa[fcode].isa
35387 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35388 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI)
35389 & ix86_isa_flags))
35390 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35391 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35392 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_GFNI)
35393 && !(ix86_isa_flags & OPTION_MASK_ISA_GFNI))
35394 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35395 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35396 || (ix86_builtins_isa[fcode].isa2
35397 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35399 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35400 ix86_builtins_isa[fcode].isa2, 0, 0,
35401 NULL, NULL, (enum fpmath_unit) 0,
35402 false);
35403 if (!opts)
35404 error ("%qE needs unknown isa option", fndecl);
35405 else
35407 gcc_assert (opts != NULL);
35408 error ("%qE needs isa option %s", fndecl, opts);
35409 free (opts);
35411 return expand_call (exp, target, ignore);
35414 switch (fcode)
35416 case IX86_BUILTIN_BNDMK:
35417 if (!target
35418 || GET_MODE (target) != BNDmode
35419 || !register_operand (target, BNDmode))
35420 target = gen_reg_rtx (BNDmode);
35422 arg0 = CALL_EXPR_ARG (exp, 0);
35423 arg1 = CALL_EXPR_ARG (exp, 1);
35425 op0 = expand_normal (arg0);
35426 op1 = expand_normal (arg1);
35428 if (!register_operand (op0, Pmode))
35429 op0 = ix86_zero_extend_to_Pmode (op0);
35430 if (!register_operand (op1, Pmode))
35431 op1 = ix86_zero_extend_to_Pmode (op1);
35433 /* Builtin arg1 is size of block but instruction op1 should
35434 be (size - 1). */
35435 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35436 NULL_RTX, 1, OPTAB_DIRECT);
35438 emit_insn (BNDmode == BND64mode
35439 ? gen_bnd64_mk (target, op0, op1)
35440 : gen_bnd32_mk (target, op0, op1));
35441 return target;
35443 case IX86_BUILTIN_BNDSTX:
35444 arg0 = CALL_EXPR_ARG (exp, 0);
35445 arg1 = CALL_EXPR_ARG (exp, 1);
35446 arg2 = CALL_EXPR_ARG (exp, 2);
35448 op0 = expand_normal (arg0);
35449 op1 = expand_normal (arg1);
35450 op2 = expand_normal (arg2);
35452 if (!register_operand (op0, Pmode))
35453 op0 = ix86_zero_extend_to_Pmode (op0);
35454 if (!register_operand (op1, BNDmode))
35455 op1 = copy_to_mode_reg (BNDmode, op1);
35456 if (!register_operand (op2, Pmode))
35457 op2 = ix86_zero_extend_to_Pmode (op2);
35459 emit_insn (BNDmode == BND64mode
35460 ? gen_bnd64_stx (op2, op0, op1)
35461 : gen_bnd32_stx (op2, op0, op1));
35462 return 0;
35464 case IX86_BUILTIN_BNDLDX:
35465 if (!target
35466 || GET_MODE (target) != BNDmode
35467 || !register_operand (target, BNDmode))
35468 target = gen_reg_rtx (BNDmode);
35470 arg0 = CALL_EXPR_ARG (exp, 0);
35471 arg1 = CALL_EXPR_ARG (exp, 1);
35473 op0 = expand_normal (arg0);
35474 op1 = expand_normal (arg1);
35476 if (!register_operand (op0, Pmode))
35477 op0 = ix86_zero_extend_to_Pmode (op0);
35478 if (!register_operand (op1, Pmode))
35479 op1 = ix86_zero_extend_to_Pmode (op1);
35481 emit_insn (BNDmode == BND64mode
35482 ? gen_bnd64_ldx (target, op0, op1)
35483 : gen_bnd32_ldx (target, op0, op1));
35484 return target;
35486 case IX86_BUILTIN_BNDCL:
35487 arg0 = CALL_EXPR_ARG (exp, 0);
35488 arg1 = CALL_EXPR_ARG (exp, 1);
35490 op0 = expand_normal (arg0);
35491 op1 = expand_normal (arg1);
35493 if (!register_operand (op0, Pmode))
35494 op0 = ix86_zero_extend_to_Pmode (op0);
35495 if (!register_operand (op1, BNDmode))
35496 op1 = copy_to_mode_reg (BNDmode, op1);
35498 emit_insn (BNDmode == BND64mode
35499 ? gen_bnd64_cl (op1, op0)
35500 : gen_bnd32_cl (op1, op0));
35501 return 0;
35503 case IX86_BUILTIN_BNDCU:
35504 arg0 = CALL_EXPR_ARG (exp, 0);
35505 arg1 = CALL_EXPR_ARG (exp, 1);
35507 op0 = expand_normal (arg0);
35508 op1 = expand_normal (arg1);
35510 if (!register_operand (op0, Pmode))
35511 op0 = ix86_zero_extend_to_Pmode (op0);
35512 if (!register_operand (op1, BNDmode))
35513 op1 = copy_to_mode_reg (BNDmode, op1);
35515 emit_insn (BNDmode == BND64mode
35516 ? gen_bnd64_cu (op1, op0)
35517 : gen_bnd32_cu (op1, op0));
35518 return 0;
35520 case IX86_BUILTIN_BNDRET:
35521 arg0 = CALL_EXPR_ARG (exp, 0);
35522 target = chkp_get_rtl_bounds (arg0);
35524 /* If no bounds were specified for the returned value,
35525 then use INIT bounds.  This usually happens when
35526 some built-in function is expanded. */
35527 if (!target)
35529 rtx t1 = gen_reg_rtx (Pmode);
35530 rtx t2 = gen_reg_rtx (Pmode);
35531 target = gen_reg_rtx (BNDmode);
35532 emit_move_insn (t1, const0_rtx);
35533 emit_move_insn (t2, constm1_rtx);
35534 emit_insn (BNDmode == BND64mode
35535 ? gen_bnd64_mk (target, t1, t2)
35536 : gen_bnd32_mk (target, t1, t2));
35539 gcc_assert (target && REG_P (target));
35540 return target;
35542 case IX86_BUILTIN_BNDNARROW:
35544 rtx m1, m1h1, m1h2, lb, ub, t1;
35546 /* Return value and lb. */
35547 arg0 = CALL_EXPR_ARG (exp, 0);
35548 /* Bounds. */
35549 arg1 = CALL_EXPR_ARG (exp, 1);
35550 /* Size. */
35551 arg2 = CALL_EXPR_ARG (exp, 2);
35553 lb = expand_normal (arg0);
35554 op1 = expand_normal (arg1);
35555 op2 = expand_normal (arg2);
35557 /* Size was passed but we need to use (size - 1) as for bndmk. */
35558 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35559 NULL_RTX, 1, OPTAB_DIRECT);
35561 /* Add LB to size and invert to get UB. */
35562 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35563 op2, 1, OPTAB_DIRECT);
35564 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35566 if (!register_operand (lb, Pmode))
35567 lb = ix86_zero_extend_to_Pmode (lb);
35568 if (!register_operand (ub, Pmode))
35569 ub = ix86_zero_extend_to_Pmode (ub);
35571 /* We need to move bounds to memory before any computations. */
35572 if (MEM_P (op1))
35573 m1 = op1;
35574 else
35576 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35577 emit_move_insn (m1, op1);
35580 /* Generate mem expression to be used for access to LB and UB. */
35581 m1h1 = adjust_address (m1, Pmode, 0);
35582 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35584 t1 = gen_reg_rtx (Pmode);
35586 /* Compute LB. */
35587 emit_move_insn (t1, m1h1);
35588 ix86_emit_move_max (t1, lb);
35589 emit_move_insn (m1h1, t1);
35591 /* Compute UB. UB is stored in 1's complement form. Therefore
35592 we also use max here. */
35593 emit_move_insn (t1, m1h2);
35594 ix86_emit_move_max (t1, ub);
35595 emit_move_insn (m1h2, t1);
35597 op2 = gen_reg_rtx (BNDmode);
35598 emit_move_insn (op2, m1);
35600 return chkp_join_splitted_slot (lb, op2);
35603 case IX86_BUILTIN_BNDINT:
35605 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35607 if (!target
35608 || GET_MODE (target) != BNDmode
35609 || !register_operand (target, BNDmode))
35610 target = gen_reg_rtx (BNDmode);
35612 arg0 = CALL_EXPR_ARG (exp, 0);
35613 arg1 = CALL_EXPR_ARG (exp, 1);
35615 op0 = expand_normal (arg0);
35616 op1 = expand_normal (arg1);
35618 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35619 rh1 = adjust_address (res, Pmode, 0);
35620 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35622 /* Put first bounds to temporaries. */
35623 lb1 = gen_reg_rtx (Pmode);
35624 ub1 = gen_reg_rtx (Pmode);
35625 if (MEM_P (op0))
35627 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35628 emit_move_insn (ub1, adjust_address (op0, Pmode,
35629 GET_MODE_SIZE (Pmode)));
35631 else
35633 emit_move_insn (res, op0);
35634 emit_move_insn (lb1, rh1);
35635 emit_move_insn (ub1, rh2);
35638 /* Put second bounds to temporaries. */
35639 lb2 = gen_reg_rtx (Pmode);
35640 ub2 = gen_reg_rtx (Pmode);
35641 if (MEM_P (op1))
35643 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35644 emit_move_insn (ub2, adjust_address (op1, Pmode,
35645 GET_MODE_SIZE (Pmode)));
35647 else
35649 emit_move_insn (res, op1);
35650 emit_move_insn (lb2, rh1);
35651 emit_move_insn (ub2, rh2);
35654 /* Compute LB. */
35655 ix86_emit_move_max (lb1, lb2);
35656 emit_move_insn (rh1, lb1);
35658 /* Compute UB. UB is stored in 1's complement form. Therefore
35659 we also use max here. */
35660 ix86_emit_move_max (ub1, ub2);
35661 emit_move_insn (rh2, ub1);
35663 emit_move_insn (target, res);
35665 return target;
35668 case IX86_BUILTIN_SIZEOF:
35670 tree name;
35671 rtx symbol;
35673 if (!target
35674 || GET_MODE (target) != Pmode
35675 || !register_operand (target, Pmode))
35676 target = gen_reg_rtx (Pmode);
35678 arg0 = CALL_EXPR_ARG (exp, 0);
35679 gcc_assert (VAR_P (arg0));
35681 name = DECL_ASSEMBLER_NAME (arg0);
35682 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35684 emit_insn (Pmode == SImode
35685 ? gen_move_size_reloc_si (target, symbol)
35686 : gen_move_size_reloc_di (target, symbol));
35688 return target;
35691 case IX86_BUILTIN_BNDLOWER:
35693 rtx mem, hmem;
35695 if (!target
35696 || GET_MODE (target) != Pmode
35697 || !register_operand (target, Pmode))
35698 target = gen_reg_rtx (Pmode);
35700 arg0 = CALL_EXPR_ARG (exp, 0);
35701 op0 = expand_normal (arg0);
35703 /* We need to move bounds to memory first. */
35704 if (MEM_P (op0))
35705 mem = op0;
35706 else
35708 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35709 emit_move_insn (mem, op0);
35712 /* Generate mem expression to access LB and load it. */
35713 hmem = adjust_address (mem, Pmode, 0);
35714 emit_move_insn (target, hmem);
35716 return target;
35719 case IX86_BUILTIN_BNDUPPER:
35721 rtx mem, hmem, res;
35723 if (!target
35724 || GET_MODE (target) != Pmode
35725 || !register_operand (target, Pmode))
35726 target = gen_reg_rtx (Pmode);
35728 arg0 = CALL_EXPR_ARG (exp, 0);
35729 op0 = expand_normal (arg0);
35731 /* We need to move bounds to memory first. */
35732 if (MEM_P (op0))
35733 mem = op0;
35734 else
35736 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35737 emit_move_insn (mem, op0);
35740 /* Generate mem expression to access UB. */
35741 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35743 /* We need to invert all bits of UB. */
35744 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35746 if (res != target)
35747 emit_move_insn (target, res);
35749 return target;
35752 case IX86_BUILTIN_MASKMOVQ:
35753 case IX86_BUILTIN_MASKMOVDQU:
35754 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35755 ? CODE_FOR_mmx_maskmovq
35756 : CODE_FOR_sse2_maskmovdqu);
35757 /* Note the arg order is different from the operand order. */
35758 arg1 = CALL_EXPR_ARG (exp, 0);
35759 arg2 = CALL_EXPR_ARG (exp, 1);
35760 arg0 = CALL_EXPR_ARG (exp, 2);
35761 op0 = expand_normal (arg0);
35762 op1 = expand_normal (arg1);
35763 op2 = expand_normal (arg2);
35764 mode0 = insn_data[icode].operand[0].mode;
35765 mode1 = insn_data[icode].operand[1].mode;
35766 mode2 = insn_data[icode].operand[2].mode;
35768 op0 = ix86_zero_extend_to_Pmode (op0);
35769 op0 = gen_rtx_MEM (mode1, op0);
35771 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35772 op0 = copy_to_mode_reg (mode0, op0);
35773 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35774 op1 = copy_to_mode_reg (mode1, op1);
35775 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35776 op2 = copy_to_mode_reg (mode2, op2);
35777 pat = GEN_FCN (icode) (op0, op1, op2);
35778 if (! pat)
35779 return 0;
35780 emit_insn (pat);
35781 return 0;
35783 case IX86_BUILTIN_LDMXCSR:
35784 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35785 target = assign_386_stack_local (SImode, SLOT_TEMP);
35786 emit_move_insn (target, op0);
35787 emit_insn (gen_sse_ldmxcsr (target));
35788 return 0;
35790 case IX86_BUILTIN_STMXCSR:
35791 target = assign_386_stack_local (SImode, SLOT_TEMP);
35792 emit_insn (gen_sse_stmxcsr (target));
35793 return copy_to_mode_reg (SImode, target);
35795 case IX86_BUILTIN_CLFLUSH:
35796 arg0 = CALL_EXPR_ARG (exp, 0);
35797 op0 = expand_normal (arg0);
35798 icode = CODE_FOR_sse2_clflush;
35799 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35800 op0 = ix86_zero_extend_to_Pmode (op0);
35802 emit_insn (gen_sse2_clflush (op0));
35803 return 0;
35805 case IX86_BUILTIN_CLWB:
35806 arg0 = CALL_EXPR_ARG (exp, 0);
35807 op0 = expand_normal (arg0);
35808 icode = CODE_FOR_clwb;
35809 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35810 op0 = ix86_zero_extend_to_Pmode (op0);
35812 emit_insn (gen_clwb (op0));
35813 return 0;
35815 case IX86_BUILTIN_CLFLUSHOPT:
35816 arg0 = CALL_EXPR_ARG (exp, 0);
35817 op0 = expand_normal (arg0);
35818 icode = CODE_FOR_clflushopt;
35819 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35820 op0 = ix86_zero_extend_to_Pmode (op0);
35822 emit_insn (gen_clflushopt (op0));
35823 return 0;
35825 case IX86_BUILTIN_MONITOR:
35826 case IX86_BUILTIN_MONITORX:
35827 arg0 = CALL_EXPR_ARG (exp, 0);
35828 arg1 = CALL_EXPR_ARG (exp, 1);
35829 arg2 = CALL_EXPR_ARG (exp, 2);
35830 op0 = expand_normal (arg0);
35831 op1 = expand_normal (arg1);
35832 op2 = expand_normal (arg2);
35833 if (!REG_P (op0))
35834 op0 = ix86_zero_extend_to_Pmode (op0);
35835 if (!REG_P (op1))
35836 op1 = copy_to_mode_reg (SImode, op1);
35837 if (!REG_P (op2))
35838 op2 = copy_to_mode_reg (SImode, op2);
35840 emit_insn (fcode == IX86_BUILTIN_MONITOR
35841 ? ix86_gen_monitor (op0, op1, op2)
35842 : ix86_gen_monitorx (op0, op1, op2));
35843 return 0;
35845 case IX86_BUILTIN_MWAIT:
35846 arg0 = CALL_EXPR_ARG (exp, 0);
35847 arg1 = CALL_EXPR_ARG (exp, 1);
35848 op0 = expand_normal (arg0);
35849 op1 = expand_normal (arg1);
35850 if (!REG_P (op0))
35851 op0 = copy_to_mode_reg (SImode, op0);
35852 if (!REG_P (op1))
35853 op1 = copy_to_mode_reg (SImode, op1);
35854 emit_insn (gen_sse3_mwait (op0, op1));
35855 return 0;
35857 case IX86_BUILTIN_MWAITX:
35858 arg0 = CALL_EXPR_ARG (exp, 0);
35859 arg1 = CALL_EXPR_ARG (exp, 1);
35860 arg2 = CALL_EXPR_ARG (exp, 2);
35861 op0 = expand_normal (arg0);
35862 op1 = expand_normal (arg1);
35863 op2 = expand_normal (arg2);
35864 if (!REG_P (op0))
35865 op0 = copy_to_mode_reg (SImode, op0);
35866 if (!REG_P (op1))
35867 op1 = copy_to_mode_reg (SImode, op1);
35868 if (!REG_P (op2))
35869 op2 = copy_to_mode_reg (SImode, op2);
35870 emit_insn (gen_mwaitx (op0, op1, op2));
35871 return 0;
35873 case IX86_BUILTIN_CLZERO:
35874 arg0 = CALL_EXPR_ARG (exp, 0);
35875 op0 = expand_normal (arg0);
35876 if (!REG_P (op0))
35877 op0 = ix86_zero_extend_to_Pmode (op0);
35878 emit_insn (ix86_gen_clzero (op0));
35879 return 0;
35881 case IX86_BUILTIN_VEC_INIT_V2SI:
35882 case IX86_BUILTIN_VEC_INIT_V4HI:
35883 case IX86_BUILTIN_VEC_INIT_V8QI:
35884 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35886 case IX86_BUILTIN_VEC_EXT_V2DF:
35887 case IX86_BUILTIN_VEC_EXT_V2DI:
35888 case IX86_BUILTIN_VEC_EXT_V4SF:
35889 case IX86_BUILTIN_VEC_EXT_V4SI:
35890 case IX86_BUILTIN_VEC_EXT_V8HI:
35891 case IX86_BUILTIN_VEC_EXT_V2SI:
35892 case IX86_BUILTIN_VEC_EXT_V4HI:
35893 case IX86_BUILTIN_VEC_EXT_V16QI:
35894 return ix86_expand_vec_ext_builtin (exp, target);
35896 case IX86_BUILTIN_VEC_SET_V2DI:
35897 case IX86_BUILTIN_VEC_SET_V4SF:
35898 case IX86_BUILTIN_VEC_SET_V4SI:
35899 case IX86_BUILTIN_VEC_SET_V8HI:
35900 case IX86_BUILTIN_VEC_SET_V4HI:
35901 case IX86_BUILTIN_VEC_SET_V16QI:
35902 return ix86_expand_vec_set_builtin (exp);
35904 case IX86_BUILTIN_NANQ:
35905 case IX86_BUILTIN_NANSQ:
35906 return expand_call (exp, target, ignore);
35908 case IX86_BUILTIN_RDPMC:
35909 case IX86_BUILTIN_RDTSC:
35910 case IX86_BUILTIN_RDTSCP:
35911 case IX86_BUILTIN_XGETBV:
35913 op0 = gen_reg_rtx (DImode);
35914 op1 = gen_reg_rtx (DImode);
35916 if (fcode == IX86_BUILTIN_RDPMC)
35918 arg0 = CALL_EXPR_ARG (exp, 0);
35919 op2 = expand_normal (arg0);
35920 if (!register_operand (op2, SImode))
35921 op2 = copy_to_mode_reg (SImode, op2);
35923 insn = (TARGET_64BIT
35924 ? gen_rdpmc_rex64 (op0, op1, op2)
35925 : gen_rdpmc (op0, op2));
35926 emit_insn (insn);
35928 else if (fcode == IX86_BUILTIN_XGETBV)
35930 arg0 = CALL_EXPR_ARG (exp, 0);
35931 op2 = expand_normal (arg0);
35932 if (!register_operand (op2, SImode))
35933 op2 = copy_to_mode_reg (SImode, op2);
35935 insn = (TARGET_64BIT
35936 ? gen_xgetbv_rex64 (op0, op1, op2)
35937 : gen_xgetbv (op0, op2));
35938 emit_insn (insn);
35940 else if (fcode == IX86_BUILTIN_RDTSC)
35942 insn = (TARGET_64BIT
35943 ? gen_rdtsc_rex64 (op0, op1)
35944 : gen_rdtsc (op0));
35945 emit_insn (insn);
35947 else
35949 op2 = gen_reg_rtx (SImode);
35951 insn = (TARGET_64BIT
35952 ? gen_rdtscp_rex64 (op0, op1, op2)
35953 : gen_rdtscp (op0, op2));
35954 emit_insn (insn);
35956 arg0 = CALL_EXPR_ARG (exp, 0);
35957 op4 = expand_normal (arg0);
35958 if (!address_operand (op4, VOIDmode))
35960 op4 = convert_memory_address (Pmode, op4);
35961 op4 = copy_addr_to_reg (op4);
35963 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35966 if (target == 0)
35968 /* mode is VOIDmode if __builtin_rd* has been called
35969 without an lhs. */
35970 if (mode == VOIDmode)
35971 return target;
35972 target = gen_reg_rtx (mode);
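/* On 64-bit targets the result comes back as two 32-bit halves,
   OP0 (low) and OP1 (high), each zero-extended to DImode; combine
   them into a single 64-bit value before storing it in TARGET.  */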
35975 if (TARGET_64BIT)
35977 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35978 op1, 1, OPTAB_DIRECT);
35979 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35980 op0, 1, OPTAB_DIRECT);
35983 emit_move_insn (target, op0);
35984 return target;
35986 case IX86_BUILTIN_FXSAVE:
35987 case IX86_BUILTIN_FXRSTOR:
35988 case IX86_BUILTIN_FXSAVE64:
35989 case IX86_BUILTIN_FXRSTOR64:
35990 case IX86_BUILTIN_FNSTENV:
35991 case IX86_BUILTIN_FLDENV:
35992 mode0 = BLKmode;
35993 switch (fcode)
35995 case IX86_BUILTIN_FXSAVE:
35996 icode = CODE_FOR_fxsave;
35997 break;
35998 case IX86_BUILTIN_FXRSTOR:
35999 icode = CODE_FOR_fxrstor;
36000 break;
36001 case IX86_BUILTIN_FXSAVE64:
36002 icode = CODE_FOR_fxsave64;
36003 break;
36004 case IX86_BUILTIN_FXRSTOR64:
36005 icode = CODE_FOR_fxrstor64;
36006 break;
36007 case IX86_BUILTIN_FNSTENV:
36008 icode = CODE_FOR_fnstenv;
36009 break;
36010 case IX86_BUILTIN_FLDENV:
36011 icode = CODE_FOR_fldenv;
36012 break;
36013 default:
36014 gcc_unreachable ();
36017 arg0 = CALL_EXPR_ARG (exp, 0);
36018 op0 = expand_normal (arg0);
36020 if (!address_operand (op0, VOIDmode))
36022 op0 = convert_memory_address (Pmode, op0);
36023 op0 = copy_addr_to_reg (op0);
36025 op0 = gen_rtx_MEM (mode0, op0);
36027 pat = GEN_FCN (icode) (op0);
36028 if (pat)
36029 emit_insn (pat);
36030 return 0;
36032 case IX86_BUILTIN_XSETBV:
36033 arg0 = CALL_EXPR_ARG (exp, 0);
36034 arg1 = CALL_EXPR_ARG (exp, 1);
36035 op0 = expand_normal (arg0);
36036 op1 = expand_normal (arg1);
36038 if (!REG_P (op0))
36039 op0 = copy_to_mode_reg (SImode, op0);
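/* XSETBV expects the new XCR value in EDX:EAX.  On 64-bit targets
   split the 64-bit operand into its high (OP2) and low (OP1) SImode
   halves to match the xsetbv_rex64 pattern.  */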
36041 if (TARGET_64BIT)
36043 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36044 NULL, 1, OPTAB_DIRECT);
36046 op2 = gen_lowpart (SImode, op2);
36047 op1 = gen_lowpart (SImode, op1);
36048 if (!REG_P (op1))
36049 op1 = copy_to_mode_reg (SImode, op1);
36050 if (!REG_P (op2))
36051 op2 = copy_to_mode_reg (SImode, op2);
36052 icode = CODE_FOR_xsetbv_rex64;
36053 pat = GEN_FCN (icode) (op0, op1, op2);
36055 else
36057 if (!REG_P (op1))
36058 op1 = copy_to_mode_reg (DImode, op1);
36059 icode = CODE_FOR_xsetbv;
36060 pat = GEN_FCN (icode) (op0, op1);
36062 if (pat)
36063 emit_insn (pat);
36064 return 0;
36066 case IX86_BUILTIN_XSAVE:
36067 case IX86_BUILTIN_XRSTOR:
36068 case IX86_BUILTIN_XSAVE64:
36069 case IX86_BUILTIN_XRSTOR64:
36070 case IX86_BUILTIN_XSAVEOPT:
36071 case IX86_BUILTIN_XSAVEOPT64:
36072 case IX86_BUILTIN_XSAVES:
36073 case IX86_BUILTIN_XRSTORS:
36074 case IX86_BUILTIN_XSAVES64:
36075 case IX86_BUILTIN_XRSTORS64:
36076 case IX86_BUILTIN_XSAVEC:
36077 case IX86_BUILTIN_XSAVEC64:
36078 arg0 = CALL_EXPR_ARG (exp, 0);
36079 arg1 = CALL_EXPR_ARG (exp, 1);
36080 op0 = expand_normal (arg0);
36081 op1 = expand_normal (arg1);
36083 if (!address_operand (op0, VOIDmode))
36085 op0 = convert_memory_address (Pmode, op0);
36086 op0 = copy_addr_to_reg (op0);
36088 op0 = gen_rtx_MEM (BLKmode, op0);
36090 op1 = force_reg (DImode, op1);
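/* The XSAVE/XRSTOR family takes the requested-feature bitmap in
   EDX:EAX.  On 64-bit targets split the DImode mask into high (OP2)
   and low (OP1) SImode halves for the *_rex64 patterns; the 32-bit
   patterns below take the DImode mask directly.  */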
36092 if (TARGET_64BIT)
36094 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36095 NULL, 1, OPTAB_DIRECT);
36096 switch (fcode)
36098 case IX86_BUILTIN_XSAVE:
36099 icode = CODE_FOR_xsave_rex64;
36100 break;
36101 case IX86_BUILTIN_XRSTOR:
36102 icode = CODE_FOR_xrstor_rex64;
36103 break;
36104 case IX86_BUILTIN_XSAVE64:
36105 icode = CODE_FOR_xsave64;
36106 break;
36107 case IX86_BUILTIN_XRSTOR64:
36108 icode = CODE_FOR_xrstor64;
36109 break;
36110 case IX86_BUILTIN_XSAVEOPT:
36111 icode = CODE_FOR_xsaveopt_rex64;
36112 break;
36113 case IX86_BUILTIN_XSAVEOPT64:
36114 icode = CODE_FOR_xsaveopt64;
36115 break;
36116 case IX86_BUILTIN_XSAVES:
36117 icode = CODE_FOR_xsaves_rex64;
36118 break;
36119 case IX86_BUILTIN_XRSTORS:
36120 icode = CODE_FOR_xrstors_rex64;
36121 break;
36122 case IX86_BUILTIN_XSAVES64:
36123 icode = CODE_FOR_xsaves64;
36124 break;
36125 case IX86_BUILTIN_XRSTORS64:
36126 icode = CODE_FOR_xrstors64;
36127 break;
36128 case IX86_BUILTIN_XSAVEC:
36129 icode = CODE_FOR_xsavec_rex64;
36130 break;
36131 case IX86_BUILTIN_XSAVEC64:
36132 icode = CODE_FOR_xsavec64;
36133 break;
36134 default:
36135 gcc_unreachable ();
36138 op2 = gen_lowpart (SImode, op2);
36139 op1 = gen_lowpart (SImode, op1);
36140 pat = GEN_FCN (icode) (op0, op1, op2);
36142 else
36144 switch (fcode)
36146 case IX86_BUILTIN_XSAVE:
36147 icode = CODE_FOR_xsave;
36148 break;
36149 case IX86_BUILTIN_XRSTOR:
36150 icode = CODE_FOR_xrstor;
36151 break;
36152 case IX86_BUILTIN_XSAVEOPT:
36153 icode = CODE_FOR_xsaveopt;
36154 break;
36155 case IX86_BUILTIN_XSAVES:
36156 icode = CODE_FOR_xsaves;
36157 break;
36158 case IX86_BUILTIN_XRSTORS:
36159 icode = CODE_FOR_xrstors;
36160 break;
36161 case IX86_BUILTIN_XSAVEC:
36162 icode = CODE_FOR_xsavec;
36163 break;
36164 default:
36165 gcc_unreachable ();
36167 pat = GEN_FCN (icode) (op0, op1);
36170 if (pat)
36171 emit_insn (pat);
36172 return 0;
36174 case IX86_BUILTIN_LLWPCB:
36175 arg0 = CALL_EXPR_ARG (exp, 0);
36176 op0 = expand_normal (arg0);
36177 icode = CODE_FOR_lwp_llwpcb;
36178 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36179 op0 = ix86_zero_extend_to_Pmode (op0);
36180 emit_insn (gen_lwp_llwpcb (op0));
36181 return 0;
36183 case IX86_BUILTIN_SLWPCB:
36184 icode = CODE_FOR_lwp_slwpcb;
36185 if (!target
36186 || !insn_data[icode].operand[0].predicate (target, Pmode))
36187 target = gen_reg_rtx (Pmode);
36188 emit_insn (gen_lwp_slwpcb (target));
36189 return target;
36191 case IX86_BUILTIN_BEXTRI32:
36192 case IX86_BUILTIN_BEXTRI64:
36193 arg0 = CALL_EXPR_ARG (exp, 0);
36194 arg1 = CALL_EXPR_ARG (exp, 1);
36195 op0 = expand_normal (arg0);
36196 op1 = expand_normal (arg1);
36197 icode = (fcode == IX86_BUILTIN_BEXTRI32
36198 ? CODE_FOR_tbm_bextri_si
36199 : CODE_FOR_tbm_bextri_di);
36200 if (!CONST_INT_P (op1))
36202 error ("last argument must be an immediate");
36203 return const0_rtx;
36205 else
36207 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36208 unsigned char lsb_index = INTVAL (op1) & 0xFF;
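/* The TBM control word packs the field length in bits 15:8 and the
   start bit in bits 7:0; e.g. a control value of 0x0408 extracts
   4 bits starting at bit 8.  */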
36209 op1 = GEN_INT (length);
36210 op2 = GEN_INT (lsb_index);
36211 pat = GEN_FCN (icode) (target, op0, op1, op2);
36212 if (pat)
36213 emit_insn (pat);
36214 return target;
36217 case IX86_BUILTIN_RDRAND16_STEP:
36218 icode = CODE_FOR_rdrandhi_1;
36219 mode0 = HImode;
36220 goto rdrand_step;
36222 case IX86_BUILTIN_RDRAND32_STEP:
36223 icode = CODE_FOR_rdrandsi_1;
36224 mode0 = SImode;
36225 goto rdrand_step;
36227 case IX86_BUILTIN_RDRAND64_STEP:
36228 icode = CODE_FOR_rdranddi_1;
36229 mode0 = DImode;
36231 rdrand_step:
36232 arg0 = CALL_EXPR_ARG (exp, 0);
36233 op1 = expand_normal (arg0);
36234 if (!address_operand (op1, VOIDmode))
36236 op1 = convert_memory_address (Pmode, op1);
36237 op1 = copy_addr_to_reg (op1);
36240 op0 = gen_reg_rtx (mode0);
36241 emit_insn (GEN_FCN (icode) (op0));
36243 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36245 op1 = gen_reg_rtx (SImode);
36246 emit_move_insn (op1, CONST1_RTX (SImode));
36248 /* Emit SImode conditional move. */
36249 if (mode0 == HImode)
36251 if (TARGET_ZERO_EXTEND_WITH_AND
36252 && optimize_function_for_speed_p (cfun))
36254 op2 = force_reg (SImode, const0_rtx);
36256 emit_insn (gen_movstricthi
36257 (gen_lowpart (HImode, op2), op0));
36259 else
36261 op2 = gen_reg_rtx (SImode);
36263 emit_insn (gen_zero_extendhisi2 (op2, op0));
36266 else if (mode0 == SImode)
36267 op2 = op0;
36268 else
36269 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36271 if (target == 0
36272 || !register_operand (target, SImode))
36273 target = gen_reg_rtx (SImode);
36275 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36276 const0_rtx);
36277 emit_insn (gen_rtx_SET (target,
36278 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36279 return target;
36281 case IX86_BUILTIN_RDSEED16_STEP:
36282 icode = CODE_FOR_rdseedhi_1;
36283 mode0 = HImode;
36284 goto rdseed_step;
36286 case IX86_BUILTIN_RDSEED32_STEP:
36287 icode = CODE_FOR_rdseedsi_1;
36288 mode0 = SImode;
36289 goto rdseed_step;
36291 case IX86_BUILTIN_RDSEED64_STEP:
36292 icode = CODE_FOR_rdseeddi_1;
36293 mode0 = DImode;
36295 rdseed_step:
36296 arg0 = CALL_EXPR_ARG (exp, 0);
36297 op1 = expand_normal (arg0);
36298 if (!address_operand (op1, VOIDmode))
36300 op1 = convert_memory_address (Pmode, op1);
36301 op1 = copy_addr_to_reg (op1);
36304 op0 = gen_reg_rtx (mode0);
36305 emit_insn (GEN_FCN (icode) (op0));
36307 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
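/* The rdseed step builtins return the carry flag: 1 when the hardware
   delivered a value, 0 otherwise.  It is extracted below with an LTU
   test on the CCCmode flags register.  */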
36309 op2 = gen_reg_rtx (QImode);
36311 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36312 const0_rtx);
36313 emit_insn (gen_rtx_SET (op2, pat));
36315 if (target == 0
36316 || !register_operand (target, SImode))
36317 target = gen_reg_rtx (SImode);
36319 emit_insn (gen_zero_extendqisi2 (target, op2));
36320 return target;
36322 case IX86_BUILTIN_SBB32:
36323 icode = CODE_FOR_subborrowsi;
36324 icode2 = CODE_FOR_subborrowsi_0;
36325 mode0 = SImode;
36326 mode1 = DImode;
36327 mode2 = CCmode;
36328 goto handlecarry;
36330 case IX86_BUILTIN_SBB64:
36331 icode = CODE_FOR_subborrowdi;
36332 icode2 = CODE_FOR_subborrowdi_0;
36333 mode0 = DImode;
36334 mode1 = TImode;
36335 mode2 = CCmode;
36336 goto handlecarry;
36338 case IX86_BUILTIN_ADDCARRYX32:
36339 icode = CODE_FOR_addcarrysi;
36340 icode2 = CODE_FOR_addcarrysi_0;
36341 mode0 = SImode;
36342 mode1 = DImode;
36343 mode2 = CCCmode;
36344 goto handlecarry;
36346 case IX86_BUILTIN_ADDCARRYX64:
36347 icode = CODE_FOR_addcarrydi;
36348 icode2 = CODE_FOR_addcarrydi_0;
36349 mode0 = DImode;
36350 mode1 = TImode;
36351 mode2 = CCCmode;
36353 handlecarry:
36354 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36355 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36356 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36357 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36359 op1 = expand_normal (arg0);
36360 if (!integer_zerop (arg0))
36361 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36363 op2 = expand_normal (arg1);
36364 if (!register_operand (op2, mode0))
36365 op2 = copy_to_mode_reg (mode0, op2);
36367 op3 = expand_normal (arg2);
36368 if (!register_operand (op3, mode0))
36369 op3 = copy_to_mode_reg (mode0, op3);
36371 op4 = expand_normal (arg3);
36372 if (!address_operand (op4, VOIDmode))
36374 op4 = convert_memory_address (Pmode, op4);
36375 op4 = copy_addr_to_reg (op4);
36378 op0 = gen_reg_rtx (mode0);
36379 if (integer_zerop (arg0))
36381 /* If arg0 is 0, optimize right away into add or sub
36382 instruction that sets CCCmode flags. */
36383 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36384 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36386 else
36388 /* Generate CF from input operand. */
36389 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
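/* Adding -1 (0xff) to the QImode carry-in overflows exactly when the
   carry-in is nonzero, so this leaves CF equal to (c_in != 0).  */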
36391 /* Generate instruction that consumes CF. */
36392 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36393 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36394 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36395 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36398 /* Return current CF value. */
36399 if (target == 0)
36400 target = gen_reg_rtx (QImode);
36402 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36403 emit_insn (gen_rtx_SET (target, pat));
36405 /* Store the result. */
36406 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36408 return target;
36410 case IX86_BUILTIN_READ_FLAGS:
36411 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36413 if (optimize
36414 || target == NULL_RTX
36415 || !nonimmediate_operand (target, word_mode)
36416 || GET_MODE (target) != word_mode)
36417 target = gen_reg_rtx (word_mode);
36419 emit_insn (gen_pop (target));
36420 return target;
36422 case IX86_BUILTIN_WRITE_FLAGS:
36424 arg0 = CALL_EXPR_ARG (exp, 0);
36425 op0 = expand_normal (arg0);
36426 if (!general_no_elim_operand (op0, word_mode))
36427 op0 = copy_to_mode_reg (word_mode, op0);
36429 emit_insn (gen_push (op0));
36430 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36431 return 0;
36433 case IX86_BUILTIN_KTESTC8:
36434 icode = CODE_FOR_ktestqi;
36435 mode3 = CCCmode;
36436 goto kortest;
36438 case IX86_BUILTIN_KTESTZ8:
36439 icode = CODE_FOR_ktestqi;
36440 mode3 = CCZmode;
36441 goto kortest;
36443 case IX86_BUILTIN_KTESTC16:
36444 icode = CODE_FOR_ktesthi;
36445 mode3 = CCCmode;
36446 goto kortest;
36448 case IX86_BUILTIN_KTESTZ16:
36449 icode = CODE_FOR_ktesthi;
36450 mode3 = CCZmode;
36451 goto kortest;
36453 case IX86_BUILTIN_KTESTC32:
36454 icode = CODE_FOR_ktestsi;
36455 mode3 = CCCmode;
36456 goto kortest;
36458 case IX86_BUILTIN_KTESTZ32:
36459 icode = CODE_FOR_ktestsi;
36460 mode3 = CCZmode;
36461 goto kortest;
36463 case IX86_BUILTIN_KTESTC64:
36464 icode = CODE_FOR_ktestdi;
36465 mode3 = CCCmode;
36466 goto kortest;
36468 case IX86_BUILTIN_KTESTZ64:
36469 icode = CODE_FOR_ktestdi;
36470 mode3 = CCZmode;
36471 goto kortest;
36473 case IX86_BUILTIN_KORTESTC8:
36474 icode = CODE_FOR_kortestqi;
36475 mode3 = CCCmode;
36476 goto kortest;
36478 case IX86_BUILTIN_KORTESTZ8:
36479 icode = CODE_FOR_kortestqi;
36480 mode3 = CCZmode;
36481 goto kortest;
36483 case IX86_BUILTIN_KORTESTC16:
36484 icode = CODE_FOR_kortesthi;
36485 mode3 = CCCmode;
36486 goto kortest;
36488 case IX86_BUILTIN_KORTESTZ16:
36489 icode = CODE_FOR_kortesthi;
36490 mode3 = CCZmode;
36491 goto kortest;
36493 case IX86_BUILTIN_KORTESTC32:
36494 icode = CODE_FOR_kortestsi;
36495 mode3 = CCCmode;
36496 goto kortest;
36498 case IX86_BUILTIN_KORTESTZ32:
36499 icode = CODE_FOR_kortestsi;
36500 mode3 = CCZmode;
36501 goto kortest;
36503 case IX86_BUILTIN_KORTESTC64:
36504 icode = CODE_FOR_kortestdi;
36505 mode3 = CCCmode;
36506 goto kortest;
36508 case IX86_BUILTIN_KORTESTZ64:
36509 icode = CODE_FOR_kortestdi;
36510 mode3 = CCZmode;
36512 kortest:
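/* For the ...C variants mode3 is CCCmode (the carry flag is tested);
   for the ...Z variants it is CCZmode (the zero flag is tested).  The
   setcc at the end reads whichever flag mode3 selects.  */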
36513 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36514 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36515 op0 = expand_normal (arg0);
36516 op1 = expand_normal (arg1);
36518 mode0 = insn_data[icode].operand[0].mode;
36519 mode1 = insn_data[icode].operand[1].mode;
36521 if (GET_MODE (op0) != VOIDmode)
36522 op0 = force_reg (GET_MODE (op0), op0);
36524 op0 = gen_lowpart (mode0, op0);
36526 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36527 op0 = copy_to_mode_reg (mode0, op0);
36529 if (GET_MODE (op1) != VOIDmode)
36530 op1 = force_reg (GET_MODE (op1), op1);
36532 op1 = gen_lowpart (mode1, op1);
36534 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36535 op1 = copy_to_mode_reg (mode1, op1);
36537 target = gen_reg_rtx (QImode);
36539 /* Emit kortest. */
36540 emit_insn (GEN_FCN (icode) (op0, op1));
36541 /* And use setcc to return result from flags. */
36542 ix86_expand_setcc (target, EQ,
36543 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36544 return target;
36546 case IX86_BUILTIN_GATHERSIV2DF:
36547 icode = CODE_FOR_avx2_gathersiv2df;
36548 goto gather_gen;
36549 case IX86_BUILTIN_GATHERSIV4DF:
36550 icode = CODE_FOR_avx2_gathersiv4df;
36551 goto gather_gen;
36552 case IX86_BUILTIN_GATHERDIV2DF:
36553 icode = CODE_FOR_avx2_gatherdiv2df;
36554 goto gather_gen;
36555 case IX86_BUILTIN_GATHERDIV4DF:
36556 icode = CODE_FOR_avx2_gatherdiv4df;
36557 goto gather_gen;
36558 case IX86_BUILTIN_GATHERSIV4SF:
36559 icode = CODE_FOR_avx2_gathersiv4sf;
36560 goto gather_gen;
36561 case IX86_BUILTIN_GATHERSIV8SF:
36562 icode = CODE_FOR_avx2_gathersiv8sf;
36563 goto gather_gen;
36564 case IX86_BUILTIN_GATHERDIV4SF:
36565 icode = CODE_FOR_avx2_gatherdiv4sf;
36566 goto gather_gen;
36567 case IX86_BUILTIN_GATHERDIV8SF:
36568 icode = CODE_FOR_avx2_gatherdiv8sf;
36569 goto gather_gen;
36570 case IX86_BUILTIN_GATHERSIV2DI:
36571 icode = CODE_FOR_avx2_gathersiv2di;
36572 goto gather_gen;
36573 case IX86_BUILTIN_GATHERSIV4DI:
36574 icode = CODE_FOR_avx2_gathersiv4di;
36575 goto gather_gen;
36576 case IX86_BUILTIN_GATHERDIV2DI:
36577 icode = CODE_FOR_avx2_gatherdiv2di;
36578 goto gather_gen;
36579 case IX86_BUILTIN_GATHERDIV4DI:
36580 icode = CODE_FOR_avx2_gatherdiv4di;
36581 goto gather_gen;
36582 case IX86_BUILTIN_GATHERSIV4SI:
36583 icode = CODE_FOR_avx2_gathersiv4si;
36584 goto gather_gen;
36585 case IX86_BUILTIN_GATHERSIV8SI:
36586 icode = CODE_FOR_avx2_gathersiv8si;
36587 goto gather_gen;
36588 case IX86_BUILTIN_GATHERDIV4SI:
36589 icode = CODE_FOR_avx2_gatherdiv4si;
36590 goto gather_gen;
36591 case IX86_BUILTIN_GATHERDIV8SI:
36592 icode = CODE_FOR_avx2_gatherdiv8si;
36593 goto gather_gen;
36594 case IX86_BUILTIN_GATHERALTSIV4DF:
36595 icode = CODE_FOR_avx2_gathersiv4df;
36596 goto gather_gen;
36597 case IX86_BUILTIN_GATHERALTDIV8SF:
36598 icode = CODE_FOR_avx2_gatherdiv8sf;
36599 goto gather_gen;
36600 case IX86_BUILTIN_GATHERALTSIV4DI:
36601 icode = CODE_FOR_avx2_gathersiv4di;
36602 goto gather_gen;
36603 case IX86_BUILTIN_GATHERALTDIV8SI:
36604 icode = CODE_FOR_avx2_gatherdiv8si;
36605 goto gather_gen;
36606 case IX86_BUILTIN_GATHER3SIV16SF:
36607 icode = CODE_FOR_avx512f_gathersiv16sf;
36608 goto gather_gen;
36609 case IX86_BUILTIN_GATHER3SIV8DF:
36610 icode = CODE_FOR_avx512f_gathersiv8df;
36611 goto gather_gen;
36612 case IX86_BUILTIN_GATHER3DIV16SF:
36613 icode = CODE_FOR_avx512f_gatherdiv16sf;
36614 goto gather_gen;
36615 case IX86_BUILTIN_GATHER3DIV8DF:
36616 icode = CODE_FOR_avx512f_gatherdiv8df;
36617 goto gather_gen;
36618 case IX86_BUILTIN_GATHER3SIV16SI:
36619 icode = CODE_FOR_avx512f_gathersiv16si;
36620 goto gather_gen;
36621 case IX86_BUILTIN_GATHER3SIV8DI:
36622 icode = CODE_FOR_avx512f_gathersiv8di;
36623 goto gather_gen;
36624 case IX86_BUILTIN_GATHER3DIV16SI:
36625 icode = CODE_FOR_avx512f_gatherdiv16si;
36626 goto gather_gen;
36627 case IX86_BUILTIN_GATHER3DIV8DI:
36628 icode = CODE_FOR_avx512f_gatherdiv8di;
36629 goto gather_gen;
36630 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36631 icode = CODE_FOR_avx512f_gathersiv8df;
36632 goto gather_gen;
36633 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36634 icode = CODE_FOR_avx512f_gatherdiv16sf;
36635 goto gather_gen;
36636 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36637 icode = CODE_FOR_avx512f_gathersiv8di;
36638 goto gather_gen;
36639 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36640 icode = CODE_FOR_avx512f_gatherdiv16si;
36641 goto gather_gen;
36642 case IX86_BUILTIN_GATHER3SIV2DF:
36643 icode = CODE_FOR_avx512vl_gathersiv2df;
36644 goto gather_gen;
36645 case IX86_BUILTIN_GATHER3SIV4DF:
36646 icode = CODE_FOR_avx512vl_gathersiv4df;
36647 goto gather_gen;
36648 case IX86_BUILTIN_GATHER3DIV2DF:
36649 icode = CODE_FOR_avx512vl_gatherdiv2df;
36650 goto gather_gen;
36651 case IX86_BUILTIN_GATHER3DIV4DF:
36652 icode = CODE_FOR_avx512vl_gatherdiv4df;
36653 goto gather_gen;
36654 case IX86_BUILTIN_GATHER3SIV4SF:
36655 icode = CODE_FOR_avx512vl_gathersiv4sf;
36656 goto gather_gen;
36657 case IX86_BUILTIN_GATHER3SIV8SF:
36658 icode = CODE_FOR_avx512vl_gathersiv8sf;
36659 goto gather_gen;
36660 case IX86_BUILTIN_GATHER3DIV4SF:
36661 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36662 goto gather_gen;
36663 case IX86_BUILTIN_GATHER3DIV8SF:
36664 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36665 goto gather_gen;
36666 case IX86_BUILTIN_GATHER3SIV2DI:
36667 icode = CODE_FOR_avx512vl_gathersiv2di;
36668 goto gather_gen;
36669 case IX86_BUILTIN_GATHER3SIV4DI:
36670 icode = CODE_FOR_avx512vl_gathersiv4di;
36671 goto gather_gen;
36672 case IX86_BUILTIN_GATHER3DIV2DI:
36673 icode = CODE_FOR_avx512vl_gatherdiv2di;
36674 goto gather_gen;
36675 case IX86_BUILTIN_GATHER3DIV4DI:
36676 icode = CODE_FOR_avx512vl_gatherdiv4di;
36677 goto gather_gen;
36678 case IX86_BUILTIN_GATHER3SIV4SI:
36679 icode = CODE_FOR_avx512vl_gathersiv4si;
36680 goto gather_gen;
36681 case IX86_BUILTIN_GATHER3SIV8SI:
36682 icode = CODE_FOR_avx512vl_gathersiv8si;
36683 goto gather_gen;
36684 case IX86_BUILTIN_GATHER3DIV4SI:
36685 icode = CODE_FOR_avx512vl_gatherdiv4si;
36686 goto gather_gen;
36687 case IX86_BUILTIN_GATHER3DIV8SI:
36688 icode = CODE_FOR_avx512vl_gatherdiv8si;
36689 goto gather_gen;
36690 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36691 icode = CODE_FOR_avx512vl_gathersiv4df;
36692 goto gather_gen;
36693 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36694 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36695 goto gather_gen;
36696 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36697 icode = CODE_FOR_avx512vl_gathersiv4di;
36698 goto gather_gen;
36699 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36700 icode = CODE_FOR_avx512vl_gatherdiv8si;
36701 goto gather_gen;
36702 case IX86_BUILTIN_SCATTERSIV16SF:
36703 icode = CODE_FOR_avx512f_scattersiv16sf;
36704 goto scatter_gen;
36705 case IX86_BUILTIN_SCATTERSIV8DF:
36706 icode = CODE_FOR_avx512f_scattersiv8df;
36707 goto scatter_gen;
36708 case IX86_BUILTIN_SCATTERDIV16SF:
36709 icode = CODE_FOR_avx512f_scatterdiv16sf;
36710 goto scatter_gen;
36711 case IX86_BUILTIN_SCATTERDIV8DF:
36712 icode = CODE_FOR_avx512f_scatterdiv8df;
36713 goto scatter_gen;
36714 case IX86_BUILTIN_SCATTERSIV16SI:
36715 icode = CODE_FOR_avx512f_scattersiv16si;
36716 goto scatter_gen;
36717 case IX86_BUILTIN_SCATTERSIV8DI:
36718 icode = CODE_FOR_avx512f_scattersiv8di;
36719 goto scatter_gen;
36720 case IX86_BUILTIN_SCATTERDIV16SI:
36721 icode = CODE_FOR_avx512f_scatterdiv16si;
36722 goto scatter_gen;
36723 case IX86_BUILTIN_SCATTERDIV8DI:
36724 icode = CODE_FOR_avx512f_scatterdiv8di;
36725 goto scatter_gen;
36726 case IX86_BUILTIN_SCATTERSIV8SF:
36727 icode = CODE_FOR_avx512vl_scattersiv8sf;
36728 goto scatter_gen;
36729 case IX86_BUILTIN_SCATTERSIV4SF:
36730 icode = CODE_FOR_avx512vl_scattersiv4sf;
36731 goto scatter_gen;
36732 case IX86_BUILTIN_SCATTERSIV4DF:
36733 icode = CODE_FOR_avx512vl_scattersiv4df;
36734 goto scatter_gen;
36735 case IX86_BUILTIN_SCATTERSIV2DF:
36736 icode = CODE_FOR_avx512vl_scattersiv2df;
36737 goto scatter_gen;
36738 case IX86_BUILTIN_SCATTERDIV8SF:
36739 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36740 goto scatter_gen;
36741 case IX86_BUILTIN_SCATTERDIV4SF:
36742 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36743 goto scatter_gen;
36744 case IX86_BUILTIN_SCATTERDIV4DF:
36745 icode = CODE_FOR_avx512vl_scatterdiv4df;
36746 goto scatter_gen;
36747 case IX86_BUILTIN_SCATTERDIV2DF:
36748 icode = CODE_FOR_avx512vl_scatterdiv2df;
36749 goto scatter_gen;
36750 case IX86_BUILTIN_SCATTERSIV8SI:
36751 icode = CODE_FOR_avx512vl_scattersiv8si;
36752 goto scatter_gen;
36753 case IX86_BUILTIN_SCATTERSIV4SI:
36754 icode = CODE_FOR_avx512vl_scattersiv4si;
36755 goto scatter_gen;
36756 case IX86_BUILTIN_SCATTERSIV4DI:
36757 icode = CODE_FOR_avx512vl_scattersiv4di;
36758 goto scatter_gen;
36759 case IX86_BUILTIN_SCATTERSIV2DI:
36760 icode = CODE_FOR_avx512vl_scattersiv2di;
36761 goto scatter_gen;
36762 case IX86_BUILTIN_SCATTERDIV8SI:
36763 icode = CODE_FOR_avx512vl_scatterdiv8si;
36764 goto scatter_gen;
36765 case IX86_BUILTIN_SCATTERDIV4SI:
36766 icode = CODE_FOR_avx512vl_scatterdiv4si;
36767 goto scatter_gen;
36768 case IX86_BUILTIN_SCATTERDIV4DI:
36769 icode = CODE_FOR_avx512vl_scatterdiv4di;
36770 goto scatter_gen;
36771 case IX86_BUILTIN_SCATTERDIV2DI:
36772 icode = CODE_FOR_avx512vl_scatterdiv2di;
36773 goto scatter_gen;
36774 case IX86_BUILTIN_GATHERPFDPD:
36775 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36776 goto vec_prefetch_gen;
36777 case IX86_BUILTIN_SCATTERALTSIV8DF:
36778 icode = CODE_FOR_avx512f_scattersiv8df;
36779 goto scatter_gen;
36780 case IX86_BUILTIN_SCATTERALTDIV16SF:
36781 icode = CODE_FOR_avx512f_scatterdiv16sf;
36782 goto scatter_gen;
36783 case IX86_BUILTIN_SCATTERALTSIV8DI:
36784 icode = CODE_FOR_avx512f_scattersiv8di;
36785 goto scatter_gen;
36786 case IX86_BUILTIN_SCATTERALTDIV16SI:
36787 icode = CODE_FOR_avx512f_scatterdiv16si;
36788 goto scatter_gen;
36789 case IX86_BUILTIN_GATHERPFDPS:
36790 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36791 goto vec_prefetch_gen;
36792 case IX86_BUILTIN_GATHERPFQPD:
36793 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36794 goto vec_prefetch_gen;
36795 case IX86_BUILTIN_GATHERPFQPS:
36796 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36797 goto vec_prefetch_gen;
36798 case IX86_BUILTIN_SCATTERPFDPD:
36799 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36800 goto vec_prefetch_gen;
36801 case IX86_BUILTIN_SCATTERPFDPS:
36802 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36803 goto vec_prefetch_gen;
36804 case IX86_BUILTIN_SCATTERPFQPD:
36805 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36806 goto vec_prefetch_gen;
36807 case IX86_BUILTIN_SCATTERPFQPS:
36808 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36809 goto vec_prefetch_gen;
36811 gather_gen:
36812 rtx half;
36813 rtx (*gen) (rtx, rtx);
36815 arg0 = CALL_EXPR_ARG (exp, 0);
36816 arg1 = CALL_EXPR_ARG (exp, 1);
36817 arg2 = CALL_EXPR_ARG (exp, 2);
36818 arg3 = CALL_EXPR_ARG (exp, 3);
36819 arg4 = CALL_EXPR_ARG (exp, 4);
36820 op0 = expand_normal (arg0);
36821 op1 = expand_normal (arg1);
36822 op2 = expand_normal (arg2);
36823 op3 = expand_normal (arg3);
36824 op4 = expand_normal (arg4);
36825 /* Note the arg order is different from the operand order. */
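/* Concretely: operand 0 is the destination, operand 1 the source/merge
   vector (arg0), operand 2 the base address (arg1), operand 3 the
   index vector (arg2), operand 4 the mask (arg3) and operand 5 the
   scale (arg4).  */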
36826 mode0 = insn_data[icode].operand[1].mode;
36827 mode2 = insn_data[icode].operand[3].mode;
36828 mode3 = insn_data[icode].operand[4].mode;
36829 mode4 = insn_data[icode].operand[5].mode;
36831 if (target == NULL_RTX
36832 || GET_MODE (target) != insn_data[icode].operand[0].mode
36833 || !insn_data[icode].operand[0].predicate (target,
36834 GET_MODE (target)))
36835 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36836 else
36837 subtarget = target;
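/* For the ...ALT... variants the index vector and the data vector have
   different element counts, so the wider of the two operands is first
   narrowed to its low half (the scatter expansion below does the
   same).  */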
36839 switch (fcode)
36841 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36842 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36843 half = gen_reg_rtx (V8SImode);
36844 if (!nonimmediate_operand (op2, V16SImode))
36845 op2 = copy_to_mode_reg (V16SImode, op2);
36846 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36847 op2 = half;
36848 break;
36849 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36850 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36851 case IX86_BUILTIN_GATHERALTSIV4DF:
36852 case IX86_BUILTIN_GATHERALTSIV4DI:
36853 half = gen_reg_rtx (V4SImode);
36854 if (!nonimmediate_operand (op2, V8SImode))
36855 op2 = copy_to_mode_reg (V8SImode, op2);
36856 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36857 op2 = half;
36858 break;
36859 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36860 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36861 half = gen_reg_rtx (mode0);
36862 if (mode0 == V8SFmode)
36863 gen = gen_vec_extract_lo_v16sf;
36864 else
36865 gen = gen_vec_extract_lo_v16si;
36866 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36867 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36868 emit_insn (gen (half, op0));
36869 op0 = half;
36870 if (GET_MODE (op3) != VOIDmode)
36872 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36873 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36874 emit_insn (gen (half, op3));
36875 op3 = half;
36877 break;
36878 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36879 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36880 case IX86_BUILTIN_GATHERALTDIV8SF:
36881 case IX86_BUILTIN_GATHERALTDIV8SI:
36882 half = gen_reg_rtx (mode0);
36883 if (mode0 == V4SFmode)
36884 gen = gen_vec_extract_lo_v8sf;
36885 else
36886 gen = gen_vec_extract_lo_v8si;
36887 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36888 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36889 emit_insn (gen (half, op0));
36890 op0 = half;
36891 if (GET_MODE (op3) != VOIDmode)
36893 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36894 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36895 emit_insn (gen (half, op3));
36896 op3 = half;
36898 break;
36899 default:
36900 break;
36903 /* Force memory operand only with base register here. But we
36904 don't want to do it on memory operand for other builtin
36905 functions. */
36906 op1 = ix86_zero_extend_to_Pmode (op1);
36908 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36909 op0 = copy_to_mode_reg (mode0, op0);
36910 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36911 op1 = copy_to_mode_reg (Pmode, op1);
36912 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36913 op2 = copy_to_mode_reg (mode2, op2);
36915 op3 = fixup_modeless_constant (op3, mode3);
36917 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36919 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36920 op3 = copy_to_mode_reg (mode3, op3);
36922 else
36924 op3 = copy_to_reg (op3);
36925 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36927 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36929 error ("the last argument must be scale 1, 2, 4, 8");
36930 return const0_rtx;
36933 /* Optimize. If mask is known to have all high bits set,
36934 replace op0 with pc_rtx to signal that the instruction
36935 overwrites the whole destination and doesn't use its
36936 previous contents. */
36937 if (optimize)
36939 if (TREE_CODE (arg3) == INTEGER_CST)
36941 if (integer_all_onesp (arg3))
36942 op0 = pc_rtx;
36944 else if (TREE_CODE (arg3) == VECTOR_CST)
36946 unsigned int negative = 0;
36947 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36949 tree cst = VECTOR_CST_ELT (arg3, i);
36950 if (TREE_CODE (cst) == INTEGER_CST
36951 && tree_int_cst_sign_bit (cst))
36952 negative++;
36953 else if (TREE_CODE (cst) == REAL_CST
36954 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36955 negative++;
36957 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36958 op0 = pc_rtx;
36960 else if (TREE_CODE (arg3) == SSA_NAME
36961 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36963 /* Recognize also when mask is like:
36964 __v2df src = _mm_setzero_pd ();
36965 __v2df mask = _mm_cmpeq_pd (src, src);
36967 __v8sf src = _mm256_setzero_ps ();
36968 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36969 as that is a cheaper way to load all ones into
36970 a register than having to load a constant from
36971 memory. */
36972 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36973 if (is_gimple_call (def_stmt))
36975 tree fndecl = gimple_call_fndecl (def_stmt);
36976 if (fndecl
36977 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36978 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36980 case IX86_BUILTIN_CMPPD:
36981 case IX86_BUILTIN_CMPPS:
36982 case IX86_BUILTIN_CMPPD256:
36983 case IX86_BUILTIN_CMPPS256:
36984 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36985 break;
36986 /* FALLTHRU */
36987 case IX86_BUILTIN_CMPEQPD:
36988 case IX86_BUILTIN_CMPEQPS:
36989 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36990 && initializer_zerop (gimple_call_arg (def_stmt,
36991 1)))
36992 op0 = pc_rtx;
36993 break;
36994 default:
36995 break;
37001 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37002 if (! pat)
37003 return const0_rtx;
37004 emit_insn (pat);
37006 switch (fcode)
37008 case IX86_BUILTIN_GATHER3DIV16SF:
37009 if (target == NULL_RTX)
37010 target = gen_reg_rtx (V8SFmode);
37011 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37012 break;
37013 case IX86_BUILTIN_GATHER3DIV16SI:
37014 if (target == NULL_RTX)
37015 target = gen_reg_rtx (V8SImode);
37016 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37017 break;
37018 case IX86_BUILTIN_GATHER3DIV8SF:
37019 case IX86_BUILTIN_GATHERDIV8SF:
37020 if (target == NULL_RTX)
37021 target = gen_reg_rtx (V4SFmode);
37022 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37023 break;
37024 case IX86_BUILTIN_GATHER3DIV8SI:
37025 case IX86_BUILTIN_GATHERDIV8SI:
37026 if (target == NULL_RTX)
37027 target = gen_reg_rtx (V4SImode);
37028 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37029 break;
37030 default:
37031 target = subtarget;
37032 break;
37034 return target;
37036 scatter_gen:
37037 arg0 = CALL_EXPR_ARG (exp, 0);
37038 arg1 = CALL_EXPR_ARG (exp, 1);
37039 arg2 = CALL_EXPR_ARG (exp, 2);
37040 arg3 = CALL_EXPR_ARG (exp, 3);
37041 arg4 = CALL_EXPR_ARG (exp, 4);
37042 op0 = expand_normal (arg0);
37043 op1 = expand_normal (arg1);
37044 op2 = expand_normal (arg2);
37045 op3 = expand_normal (arg3);
37046 op4 = expand_normal (arg4);
37047 mode1 = insn_data[icode].operand[1].mode;
37048 mode2 = insn_data[icode].operand[2].mode;
37049 mode3 = insn_data[icode].operand[3].mode;
37050 mode4 = insn_data[icode].operand[4].mode;
37052 /* Scatter instruction stores operand op3 to memory with
37053 indices from op2 and scale from op4 under writemask op1.
37054 If index operand op2 has more elements than source operand
37055 op3, only its low half needs to be used. And vice versa. */
37056 switch (fcode)
37058 case IX86_BUILTIN_SCATTERALTSIV8DF:
37059 case IX86_BUILTIN_SCATTERALTSIV8DI:
37060 half = gen_reg_rtx (V8SImode);
37061 if (!nonimmediate_operand (op2, V16SImode))
37062 op2 = copy_to_mode_reg (V16SImode, op2);
37063 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37064 op2 = half;
37065 break;
37066 case IX86_BUILTIN_SCATTERALTDIV16SF:
37067 case IX86_BUILTIN_SCATTERALTDIV16SI:
37068 half = gen_reg_rtx (mode3);
37069 if (mode3 == V8SFmode)
37070 gen = gen_vec_extract_lo_v16sf;
37071 else
37072 gen = gen_vec_extract_lo_v16si;
37073 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37074 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37075 emit_insn (gen (half, op3));
37076 op3 = half;
37077 break;
37078 default:
37079 break;
37082 /* Force memory operand only with base register here. But we
37083 don't want to do it on memory operand for other builtin
37084 functions. */
37085 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37087 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37088 op0 = copy_to_mode_reg (Pmode, op0);
37090 op1 = fixup_modeless_constant (op1, mode1);
37092 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37094 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37095 op1 = copy_to_mode_reg (mode1, op1);
37097 else
37099 op1 = copy_to_reg (op1);
37100 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37103 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37104 op2 = copy_to_mode_reg (mode2, op2);
37106 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37107 op3 = copy_to_mode_reg (mode3, op3);
37109 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37111 error ("the last argument must be scale 1, 2, 4, 8");
37112 return const0_rtx;
37115 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37116 if (! pat)
37117 return const0_rtx;
37119 emit_insn (pat);
37120 return 0;
37122 vec_prefetch_gen:
37123 arg0 = CALL_EXPR_ARG (exp, 0);
37124 arg1 = CALL_EXPR_ARG (exp, 1);
37125 arg2 = CALL_EXPR_ARG (exp, 2);
37126 arg3 = CALL_EXPR_ARG (exp, 3);
37127 arg4 = CALL_EXPR_ARG (exp, 4);
37128 op0 = expand_normal (arg0);
37129 op1 = expand_normal (arg1);
37130 op2 = expand_normal (arg2);
37131 op3 = expand_normal (arg3);
37132 op4 = expand_normal (arg4);
37133 mode0 = insn_data[icode].operand[0].mode;
37134 mode1 = insn_data[icode].operand[1].mode;
37135 mode3 = insn_data[icode].operand[3].mode;
37136 mode4 = insn_data[icode].operand[4].mode;
37138 op0 = fixup_modeless_constant (op0, mode0);
37140 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37142 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37143 op0 = copy_to_mode_reg (mode0, op0);
37145 else
37147 op0 = copy_to_reg (op0);
37148 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37151 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37152 op1 = copy_to_mode_reg (mode1, op1);
37154 /* Force memory operand only with base register here. But we
37155 don't want to do it on memory operand for other builtin
37156 functions. */
37157 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37159 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37160 op2 = copy_to_mode_reg (Pmode, op2);
37162 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37164 error ("the forth argument must be scale 1, 2, 4, 8");
37165 return const0_rtx;
37168 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37170 error ("incorrect hint operand");
37171 return const0_rtx;
37174 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37175 if (! pat)
37176 return const0_rtx;
37178 emit_insn (pat);
37180 return 0;
37182 case IX86_BUILTIN_XABORT:
37183 icode = CODE_FOR_xabort;
37184 arg0 = CALL_EXPR_ARG (exp, 0);
37185 op0 = expand_normal (arg0);
37186 mode0 = insn_data[icode].operand[0].mode;
37187 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37189 error ("the xabort's argument must be an 8-bit immediate");
37190 return const0_rtx;
37192 emit_insn (gen_xabort (op0));
37193 return 0;
37195 case IX86_BUILTIN_RSTORSSP:
37196 case IX86_BUILTIN_CLRSSBSY:
37197 arg0 = CALL_EXPR_ARG (exp, 0);
37198 op0 = expand_normal (arg0);
37199 icode = (fcode == IX86_BUILTIN_RSTORSSP
37200 ? CODE_FOR_rstorssp
37201 : CODE_FOR_clrssbsy);
37202 if (!address_operand (op0, VOIDmode))
37204 op1 = convert_memory_address (Pmode, op0);
37205 op0 = copy_addr_to_reg (op1);
37207 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37208 return 0;
37210 case IX86_BUILTIN_WRSSD:
37211 case IX86_BUILTIN_WRSSQ:
37212 case IX86_BUILTIN_WRUSSD:
37213 case IX86_BUILTIN_WRUSSQ:
37214 arg0 = CALL_EXPR_ARG (exp, 0);
37215 op0 = expand_normal (arg0);
37216 arg1 = CALL_EXPR_ARG (exp, 1);
37217 op1 = expand_normal (arg1);
37218 switch (fcode)
37220 case IX86_BUILTIN_WRSSD:
37221 icode = CODE_FOR_wrsssi;
37222 mode = SImode;
37223 break;
37224 case IX86_BUILTIN_WRSSQ:
37225 icode = CODE_FOR_wrssdi;
37226 mode = DImode;
37227 break;
37228 case IX86_BUILTIN_WRUSSD:
37229 icode = CODE_FOR_wrusssi;
37230 mode = SImode;
37231 break;
37232 case IX86_BUILTIN_WRUSSQ:
37233 icode = CODE_FOR_wrussdi;
37234 mode = DImode;
37235 break;
37237 op0 = force_reg (mode, op0);
37238 if (!address_operand (op1, VOIDmode))
37240 op2 = convert_memory_address (Pmode, op1);
37241 op1 = copy_addr_to_reg (op2);
37243 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37244 return 0;
37246 default:
37247 break;
37250 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37251 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37253 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37254 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37255 target);
37258 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37259 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37261 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37262 switch (fcode)
37264 case IX86_BUILTIN_FABSQ:
37265 case IX86_BUILTIN_COPYSIGNQ:
37266 if (!TARGET_SSE)
37267 /* Emit a normal call if SSE isn't available. */
37268 return expand_call (exp, target, ignore);
37269 /* FALLTHRU */
37270 default:
37271 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37275 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37276 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37278 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37279 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37280 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37281 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37282 int masked = 1;
37283 machine_mode mode, wide_mode, nar_mode;
37285 nar_mode = V4SFmode;
37286 mode = V16SFmode;
37287 wide_mode = V64SFmode;
37288 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37289 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37291 switch (fcode)
37293 case IX86_BUILTIN_4FMAPS:
37294 fcn = gen_avx5124fmaddps_4fmaddps;
37295 masked = 0;
37296 goto v4fma_expand;
37298 case IX86_BUILTIN_4DPWSSD:
37299 nar_mode = V4SImode;
37300 mode = V16SImode;
37301 wide_mode = V64SImode;
37302 fcn = gen_avx5124vnniw_vp4dpwssd;
37303 masked = 0;
37304 goto v4fma_expand;
37306 case IX86_BUILTIN_4DPWSSDS:
37307 nar_mode = V4SImode;
37308 mode = V16SImode;
37309 wide_mode = V64SImode;
37310 fcn = gen_avx5124vnniw_vp4dpwssds;
37311 masked = 0;
37312 goto v4fma_expand;
37314 case IX86_BUILTIN_4FNMAPS:
37315 fcn = gen_avx5124fmaddps_4fnmaddps;
37316 masked = 0;
37317 goto v4fma_expand;
37319 case IX86_BUILTIN_4FNMAPS_MASK:
37320 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37321 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37322 goto v4fma_expand;
37324 case IX86_BUILTIN_4DPWSSD_MASK:
37325 nar_mode = V4SImode;
37326 mode = V16SImode;
37327 wide_mode = V64SImode;
37328 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37329 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37330 goto v4fma_expand;
37332 case IX86_BUILTIN_4DPWSSDS_MASK:
37333 nar_mode = V4SImode;
37334 mode = V16SImode;
37335 wide_mode = V64SImode;
37336 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37337 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37338 goto v4fma_expand;
37340 case IX86_BUILTIN_4FMAPS_MASK:
37342 tree args[4];
37343 rtx ops[4];
37344 rtx wide_reg;
37345 rtx accum;
37346 rtx addr;
37347 rtx mem;
37349 v4fma_expand:
37350 wide_reg = gen_reg_rtx (wide_mode);
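/* Pack the four narrow source operands into one wide register; each
   V16SF/V16SI chunk is 64 bytes, so operand I is placed at byte
   offset I * 64.  */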
37351 for (i = 0; i < 4; i++)
37353 args[i] = CALL_EXPR_ARG (exp, i);
37354 ops[i] = expand_normal (args[i]);
37356 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37357 ops[i]);
37360 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37361 accum = force_reg (mode, accum);
37363 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37364 addr = force_reg (Pmode, addr);
37366 mem = gen_rtx_MEM (nar_mode, addr);
37368 target = gen_reg_rtx (mode);
37370 emit_move_insn (target, accum);
37372 if (! masked)
37373 emit_insn (fcn (target, accum, wide_reg, mem));
37374 else
37376 rtx merge, mask;
37377 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37379 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37381 if (CONST_INT_P (mask))
37382 mask = fixup_modeless_constant (mask, HImode);
37384 mask = force_reg (HImode, mask);
37386 if (GET_MODE (mask) != HImode)
37387 mask = gen_rtx_SUBREG (HImode, mask, 0);
37389 /* If merge is 0 then we're about to emit z-masked variant. */
37390 if (const0_operand (merge, mode))
37391 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37392 /* If merge is the same as accum then emit merge-masked variant. */
37393 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37395 merge = force_reg (mode, merge);
37396 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37398 /* Merging with something unknown might happen if we z-mask with -O0. */
37399 else
37401 target = gen_reg_rtx (mode);
37402 emit_move_insn (target, merge);
37403 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37406 return target;
37409 case IX86_BUILTIN_4FNMASS:
37410 fcn = gen_avx5124fmaddps_4fnmaddss;
37411 masked = 0;
37412 goto s4fma_expand;
37414 case IX86_BUILTIN_4FMASS:
37415 fcn = gen_avx5124fmaddps_4fmaddss;
37416 masked = 0;
37417 goto s4fma_expand;
37419 case IX86_BUILTIN_4FNMASS_MASK:
37420 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37421 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37422 goto s4fma_expand;
37424 case IX86_BUILTIN_4FMASS_MASK:
37426 tree args[4];
37427 rtx ops[4];
37428 rtx wide_reg;
37429 rtx accum;
37430 rtx addr;
37431 rtx mem;
37433 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37434 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37436 s4fma_expand:
37437 mode = V4SFmode;
37438 wide_reg = gen_reg_rtx (V64SFmode);
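/* Only the low SF element of each V4SF argument is used by the scalar
   forms; it is moved (via a paradoxical V16SF subreg) into the
   64-byte chunk at offset I * 64 of the wide register.  */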
37439 for (i = 0; i < 4; i++)
37441 rtx tmp;
37442 args[i] = CALL_EXPR_ARG (exp, i);
37443 ops[i] = expand_normal (args[i]);
37445 tmp = gen_reg_rtx (SFmode);
37446 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37448 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37449 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37452 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37453 accum = force_reg (V4SFmode, accum);
37455 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37456 addr = force_reg (Pmode, addr);
37458 mem = gen_rtx_MEM (V4SFmode, addr);
37460 target = gen_reg_rtx (V4SFmode);
37462 emit_move_insn (target, accum);
37464 if (! masked)
37465 emit_insn (fcn (target, accum, wide_reg, mem));
37466 else
37468 rtx merge, mask;
37469 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37471 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37473 if (CONST_INT_P (mask))
37474 mask = fixup_modeless_constant (mask, QImode);
37476 mask = force_reg (QImode, mask);
37478 if (GET_MODE (mask) != QImode)
37479 mask = gen_rtx_SUBREG (QImode, mask, 0);
37481 /* If merge is 0 then we're about to emit z-masked variant. */
37482 if (const0_operand (merge, mode))
37483 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37484 /* If merge is the same as accum then emit merge-masked
37485 variant. */
37486 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37488 merge = force_reg (mode, merge);
37489 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37491 /* Merging with something unknown might happen if we z-mask
37492 with -O0. */
37493 else
37495 target = gen_reg_rtx (mode);
37496 emit_move_insn (target, merge);
37497 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37500 return target;
37502 case IX86_BUILTIN_RDPID:
37503 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37504 target);
37505 default:
37506 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37510 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
37511 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
37513 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
37514 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
37515 target);
37518 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37519 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37521 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37522 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37525 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37526 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37528 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37529 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37532 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37533 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37535 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37536 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37539 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37540 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37542 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37543 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37546 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37547 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37549 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37550 const struct builtin_description *d = bdesc_multi_arg + i;
37551 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37552 (enum ix86_builtin_func_type)
37553 d->flag, d->comparison);
37556 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37557 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37559 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37560 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37561 target);
37564 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37565 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37567 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37568 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37569 target);
37572 gcc_unreachable ();
37575 /* This returns the target-specific builtin with code CODE if
37576 current_function_decl has visibility on this builtin, which is checked
37577 using isa flags. Returns NULL_TREE otherwise. */
37579 static tree ix86_get_builtin (enum ix86_builtins code)
37581 struct cl_target_option *opts;
37582 tree target_tree = NULL_TREE;
37584 /* Determine the isa flags of current_function_decl. */
37586 if (current_function_decl)
37587 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37589 if (target_tree == NULL)
37590 target_tree = target_option_default_node;
37592 opts = TREE_TARGET_OPTION (target_tree);
37594 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37595 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37596 return ix86_builtin_decl (code, true);
37597 else
37598 return NULL_TREE;
37601 /* Return function decl for target specific builtin
37602 for the given MPX builtin passed in FCODE. */
37603 static tree
37604 ix86_builtin_mpx_function (unsigned fcode)
37606 switch (fcode)
37608 case BUILT_IN_CHKP_BNDMK:
37609 return ix86_builtins[IX86_BUILTIN_BNDMK];
37611 case BUILT_IN_CHKP_BNDSTX:
37612 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37614 case BUILT_IN_CHKP_BNDLDX:
37615 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37617 case BUILT_IN_CHKP_BNDCL:
37618 return ix86_builtins[IX86_BUILTIN_BNDCL];
37620 case BUILT_IN_CHKP_BNDCU:
37621 return ix86_builtins[IX86_BUILTIN_BNDCU];
37623 case BUILT_IN_CHKP_BNDRET:
37624 return ix86_builtins[IX86_BUILTIN_BNDRET];
37626 case BUILT_IN_CHKP_INTERSECT:
37627 return ix86_builtins[IX86_BUILTIN_BNDINT];
37629 case BUILT_IN_CHKP_NARROW:
37630 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37632 case BUILT_IN_CHKP_SIZEOF:
37633 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37635 case BUILT_IN_CHKP_EXTRACT_LOWER:
37636 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37638 case BUILT_IN_CHKP_EXTRACT_UPPER:
37639 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37641 default:
37642 return NULL_TREE;
37645 gcc_unreachable ();
37648 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37650 Return an address to be used to load/store bounds for pointer
37651 passed in SLOT.
37653 SLOT_NO is an integer constant holding the number of a
37654 target-dependent special slot to be used in case SLOT is not a memory.
37656 SPECIAL_BASE is a pointer to be used as a base of fake address
37657 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37658 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37660 static rtx
37661 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37663 rtx addr = NULL;
37665 /* A NULL slot means we pass bounds for a pointer not passed to the
37666 function at all. A register slot means we pass the pointer in a
37667 register. In both these cases bounds are passed via the Bounds
37668 Table. Since we do not have an actual pointer stored in memory,
37669 we have to use fake addresses to access the Bounds Table. We
37670 start with (special_base - sizeof (void*)) and decrease this
37671 address by pointer size to get addresses for other slots. */
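/* For example, with 64-bit pointers slot 0 is addressed at
   special_base - 8, slot 1 at special_base - 16, and so on.  */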
37672 if (!slot || REG_P (slot))
37674 gcc_assert (CONST_INT_P (slot_no));
37675 addr = plus_constant (Pmode, special_base,
37676 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37678 /* If the pointer is passed in memory, then its address is used to
37679 access the Bounds Table. */
37680 else if (MEM_P (slot))
37682 addr = XEXP (slot, 0);
37683 if (!register_operand (addr, Pmode))
37684 addr = copy_addr_to_reg (addr);
37686 else
37687 gcc_unreachable ();
37689 return addr;
37692 /* Expand pass uses this hook to load bounds for function parameter
37693 PTR passed in SLOT in case its bounds are not passed in a register.
37695 If SLOT is a memory, then bounds are loaded as for a regular pointer
37696 loaded from memory. PTR may be NULL in case SLOT is a memory;
37697 in that case the value of PTR (if required) may be loaded from SLOT.
37699 If SLOT is NULL or a register, then SLOT_NO is an integer constant
37700 holding the number of the target-dependent special slot which should be
37701 used to obtain bounds.
37703 Return loaded bounds. */
37705 static rtx
37706 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37708 rtx reg = gen_reg_rtx (BNDmode);
37709 rtx addr;
37711 /* Get address to be used to access Bounds Table. Special slots start
37712 at the location of return address of the current function. */
37713 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37715 /* Load pointer value from a memory if we don't have it. */
37716 if (!ptr)
37718 gcc_assert (MEM_P (slot));
37719 ptr = copy_addr_to_reg (slot);
37722 if (!register_operand (ptr, Pmode))
37723 ptr = ix86_zero_extend_to_Pmode (ptr);
37725 emit_insn (BNDmode == BND64mode
37726 ? gen_bnd64_ldx (reg, addr, ptr)
37727 : gen_bnd32_ldx (reg, addr, ptr));
37729 return reg;
37732 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37733 passed in SLOT in case BOUNDS are not passed in a register.
37735 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
37736 stored in memory. PTR may be NULL in case SLOT is a memory;
37737 in that case the value of PTR (if required) may be loaded from SLOT.
37739 If SLOT is NULL or a register, then SLOT_NO is an integer constant
37740 holding the number of the target-dependent special slot which should be
37741 used to store BOUNDS. */
37743 static void
37744 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37746 rtx addr;
37748 /* Get address to be used to access Bounds Table. Special slots start
37749 at the location of return address of a called function. */
37750 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37752 /* Load pointer value from a memory if we don't have it. */
37753 if (!ptr)
37755 gcc_assert (MEM_P (slot));
37756 ptr = copy_addr_to_reg (slot);
37759 if (!register_operand (ptr, Pmode))
37760 ptr = ix86_zero_extend_to_Pmode (ptr);
37762 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37763 if (!register_operand (bounds, BNDmode))
37764 bounds = copy_to_mode_reg (BNDmode, bounds);
37766 emit_insn (BNDmode == BND64mode
37767 ? gen_bnd64_stx (addr, ptr, bounds)
37768 : gen_bnd32_stx (addr, ptr, bounds));
37771 /* Load and return bounds returned by function in SLOT. */
37773 static rtx
37774 ix86_load_returned_bounds (rtx slot)
37776 rtx res;
37778 gcc_assert (REG_P (slot));
37779 res = gen_reg_rtx (BNDmode);
37780 emit_move_insn (res, slot);
37782 return res;
37785 /* Store BOUNDS returned by function into SLOT. */
37787 static void
37788 ix86_store_returned_bounds (rtx slot, rtx bounds)
37790 gcc_assert (REG_P (slot));
37791 emit_move_insn (slot, bounds);
37794 /* Returns a function decl for a vectorized version of the combined function
37795 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
37796 if it is not available. */
37798 static tree
37799 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37800 tree type_in)
37802 machine_mode in_mode, out_mode;
37803 int in_n, out_n;
37805 if (TREE_CODE (type_out) != VECTOR_TYPE
37806 || TREE_CODE (type_in) != VECTOR_TYPE)
37807 return NULL_TREE;
37809 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37810 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37811 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37812 in_n = TYPE_VECTOR_SUBPARTS (type_in);
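/* For example, a V4DF floor maps to IX86_BUILTIN_FLOORPD256 below,
   provided SSE4.1 is enabled and -fno-trapping-math is in effect.  */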
37814 switch (fn)
37816 CASE_CFN_EXP2:
37817 if (out_mode == SFmode && in_mode == SFmode)
37819 if (out_n == 16 && in_n == 16)
37820 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37822 break;
37824 CASE_CFN_IFLOOR:
37825 CASE_CFN_LFLOOR:
37826 CASE_CFN_LLFLOOR:
37827 /* The round insn does not trap on denormals. */
37828 if (flag_trapping_math || !TARGET_SSE4_1)
37829 break;
37831 if (out_mode == SImode && in_mode == DFmode)
37833 if (out_n == 4 && in_n == 2)
37834 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37835 else if (out_n == 8 && in_n == 4)
37836 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37837 else if (out_n == 16 && in_n == 8)
37838 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37840 if (out_mode == SImode && in_mode == SFmode)
37842 if (out_n == 4 && in_n == 4)
37843 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37844 else if (out_n == 8 && in_n == 8)
37845 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37846 else if (out_n == 16 && in_n == 16)
37847 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37849 break;
37851 CASE_CFN_ICEIL:
37852 CASE_CFN_LCEIL:
37853 CASE_CFN_LLCEIL:
37854 /* The round insn does not trap on denormals. */
37855 if (flag_trapping_math || !TARGET_SSE4_1)
37856 break;
37858 if (out_mode == SImode && in_mode == DFmode)
37860 if (out_n == 4 && in_n == 2)
37861 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37862 else if (out_n == 8 && in_n == 4)
37863 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37864 else if (out_n == 16 && in_n == 8)
37865 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37867 if (out_mode == SImode && in_mode == SFmode)
37869 if (out_n == 4 && in_n == 4)
37870 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37871 else if (out_n == 8 && in_n == 8)
37872 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37873 else if (out_n == 16 && in_n == 16)
37874 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37876 break;
37878 CASE_CFN_IRINT:
37879 CASE_CFN_LRINT:
37880 CASE_CFN_LLRINT:
37881 if (out_mode == SImode && in_mode == DFmode)
37883 if (out_n == 4 && in_n == 2)
37884 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37885 else if (out_n == 8 && in_n == 4)
37886 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37887 else if (out_n == 16 && in_n == 8)
37888 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37890 if (out_mode == SImode && in_mode == SFmode)
37892 if (out_n == 4 && in_n == 4)
37893 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37894 else if (out_n == 8 && in_n == 8)
37895 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37896 else if (out_n == 16 && in_n == 16)
37897 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37899 break;
37901 CASE_CFN_IROUND:
37902 CASE_CFN_LROUND:
37903 CASE_CFN_LLROUND:
37904 /* The round insn does not trap on denormals. */
37905 if (flag_trapping_math || !TARGET_SSE4_1)
37906 break;
37908 if (out_mode == SImode && in_mode == DFmode)
37910 if (out_n == 4 && in_n == 2)
37911 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37912 else if (out_n == 8 && in_n == 4)
37913 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37914 else if (out_n == 16 && in_n == 8)
37915 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37917 if (out_mode == SImode && in_mode == SFmode)
37919 if (out_n == 4 && in_n == 4)
37920 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37921 else if (out_n == 8 && in_n == 8)
37922 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37923 else if (out_n == 16 && in_n == 16)
37924 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37926 break;
37928 CASE_CFN_FLOOR:
37929 /* The round insn does not trap on denormals. */
37930 if (flag_trapping_math || !TARGET_SSE4_1)
37931 break;
37933 if (out_mode == DFmode && in_mode == DFmode)
37935 if (out_n == 2 && in_n == 2)
37936 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37937 else if (out_n == 4 && in_n == 4)
37938 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37939 else if (out_n == 8 && in_n == 8)
37940 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37942 if (out_mode == SFmode && in_mode == SFmode)
37944 if (out_n == 4 && in_n == 4)
37945 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37946 else if (out_n == 8 && in_n == 8)
37947 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37948 else if (out_n == 16 && in_n == 16)
37949 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37951 break;
37953 CASE_CFN_CEIL:
37954 /* The round insn does not trap on denormals. */
37955 if (flag_trapping_math || !TARGET_SSE4_1)
37956 break;
37958 if (out_mode == DFmode && in_mode == DFmode)
37960 if (out_n == 2 && in_n == 2)
37961 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37962 else if (out_n == 4 && in_n == 4)
37963 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37964 else if (out_n == 8 && in_n == 8)
37965 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37967 if (out_mode == SFmode && in_mode == SFmode)
37969 if (out_n == 4 && in_n == 4)
37970 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37971 else if (out_n == 8 && in_n == 8)
37972 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37973 else if (out_n == 16 && in_n == 16)
37974 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37976 break;
37978 CASE_CFN_TRUNC:
37979 /* The round insn does not trap on denormals. */
37980 if (flag_trapping_math || !TARGET_SSE4_1)
37981 break;
37983 if (out_mode == DFmode && in_mode == DFmode)
37985 if (out_n == 2 && in_n == 2)
37986 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37987 else if (out_n == 4 && in_n == 4)
37988 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37989 else if (out_n == 8 && in_n == 8)
37990 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37992 if (out_mode == SFmode && in_mode == SFmode)
37994 if (out_n == 4 && in_n == 4)
37995 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37996 else if (out_n == 8 && in_n == 8)
37997 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37998 else if (out_n == 16 && in_n == 16)
37999 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38001 break;
38003 CASE_CFN_RINT:
38004 /* The round insn does not trap on denormals. */
38005 if (flag_trapping_math || !TARGET_SSE4_1)
38006 break;
38008 if (out_mode == DFmode && in_mode == DFmode)
38010 if (out_n == 2 && in_n == 2)
38011 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38012 else if (out_n == 4 && in_n == 4)
38013 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38015 if (out_mode == SFmode && in_mode == SFmode)
38017 if (out_n == 4 && in_n == 4)
38018 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38019 else if (out_n == 8 && in_n == 8)
38020 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38022 break;
38024 CASE_CFN_FMA:
38025 if (out_mode == DFmode && in_mode == DFmode)
38027 if (out_n == 2 && in_n == 2)
38028 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38029 if (out_n == 4 && in_n == 4)
38030 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38032 if (out_mode == SFmode && in_mode == SFmode)
38034 if (out_n == 4 && in_n == 4)
38035 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38036 if (out_n == 8 && in_n == 8)
38037 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38039 break;
38041 default:
38042 break;
38045 /* Dispatch to a handler for a vectorization library. */
38046 if (ix86_veclib_handler)
38047 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38049 return NULL_TREE;
38052 /* Handler for an SVML-style interface to
38053 a library with vectorized intrinsics. */
38055 static tree
38056 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38058 char name[20];
38059 tree fntype, new_fndecl, args;
38060 unsigned arity;
38061 const char *bname;
38062 machine_mode el_mode, in_mode;
38063 int n, in_n;
38065 /* The SVML is suitable for unsafe math only. */
38066 if (!flag_unsafe_math_optimizations)
38067 return NULL_TREE;
38069 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38070 n = TYPE_VECTOR_SUBPARTS (type_out);
38071 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38072 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38073 if (el_mode != in_mode
38074 || n != in_n)
38075 return NULL_TREE;
38077 switch (fn)
38079 CASE_CFN_EXP:
38080 CASE_CFN_LOG:
38081 CASE_CFN_LOG10:
38082 CASE_CFN_POW:
38083 CASE_CFN_TANH:
38084 CASE_CFN_TAN:
38085 CASE_CFN_ATAN:
38086 CASE_CFN_ATAN2:
38087 CASE_CFN_ATANH:
38088 CASE_CFN_CBRT:
38089 CASE_CFN_SINH:
38090 CASE_CFN_SIN:
38091 CASE_CFN_ASINH:
38092 CASE_CFN_ASIN:
38093 CASE_CFN_COSH:
38094 CASE_CFN_COS:
38095 CASE_CFN_ACOSH:
38096 CASE_CFN_ACOS:
38097 if ((el_mode != DFmode || n != 2)
38098 && (el_mode != SFmode || n != 4))
38099 return NULL_TREE;
38100 break;
38102 default:
38103 return NULL_TREE;
38106 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38107 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38109 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38110 strcpy (name, "vmlsLn4");
38111 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38112 strcpy (name, "vmldLn2");
38113 else if (n == 4)
38115 sprintf (name, "vmls%s", bname+10);
38116 name[strlen (name)-1] = '4';
38118 else
38119 sprintf (name, "vmld%s2", bname+10);
38121 /* Convert to uppercase. */
38122 name[4] &= ~0x20;
38124 arity = 0;
38125 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38126 arity++;
38128 if (arity == 1)
38129 fntype = build_function_type_list (type_out, type_in, NULL);
38130 else
38131 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38133 /* Build a function declaration for the vectorized function. */
38134 new_fndecl = build_decl (BUILTINS_LOCATION,
38135 FUNCTION_DECL, get_identifier (name), fntype);
38136 TREE_PUBLIC (new_fndecl) = 1;
38137 DECL_EXTERNAL (new_fndecl) = 1;
38138 DECL_IS_NOVOPS (new_fndecl) = 1;
38139 TREE_READONLY (new_fndecl) = 1;
38141 return new_fndecl;
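/* For illustration (names derived from the mangling code above, not from
   SVML documentation): with -mveclibabi=svml, a 4-wide SFmode sinf call is
   renamed "vmlsSin4" and a 2-wide DFmode sin call "vmldSin2"; logf and log
   are special-cased to "vmlsLn4" and "vmldLn2".  */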
38144 /* Handler for an ACML-style interface to
38145 a library with vectorized intrinsics. */
38147 static tree
38148 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38150 char name[20] = "__vr.._";
38151 tree fntype, new_fndecl, args;
38152 unsigned arity;
38153 const char *bname;
38154 machine_mode el_mode, in_mode;
38155 int n, in_n;
38157 /* ACML is 64-bit only and suitable only for unsafe math, as it does
38158 not correctly support parts of IEEE arithmetic (such as denormals)
38159 with the required precision. */
38160 if (!TARGET_64BIT
38161 || !flag_unsafe_math_optimizations)
38162 return NULL_TREE;
38164 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38165 n = TYPE_VECTOR_SUBPARTS (type_out);
38166 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38167 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38168 if (el_mode != in_mode
38169 || n != in_n)
38170 return NULL_TREE;
38172 switch (fn)
38174 CASE_CFN_SIN:
38175 CASE_CFN_COS:
38176 CASE_CFN_EXP:
38177 CASE_CFN_LOG:
38178 CASE_CFN_LOG2:
38179 CASE_CFN_LOG10:
38180 if (el_mode == DFmode && n == 2)
38182 name[4] = 'd';
38183 name[5] = '2';
38185 else if (el_mode == SFmode && n == 4)
38187 name[4] = 's';
38188 name[5] = '4';
38190 else
38191 return NULL_TREE;
38192 break;
38194 default:
38195 return NULL_TREE;
38198 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38199 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38200 sprintf (name + 7, "%s", bname+10);
38202 arity = 0;
38203 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38204 arity++;
38206 if (arity == 1)
38207 fntype = build_function_type_list (type_out, type_in, NULL);
38208 else
38209 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38211 /* Build a function declaration for the vectorized function. */
38212 new_fndecl = build_decl (BUILTINS_LOCATION,
38213 FUNCTION_DECL, get_identifier (name), fntype);
38214 TREE_PUBLIC (new_fndecl) = 1;
38215 DECL_EXTERNAL (new_fndecl) = 1;
38216 DECL_IS_NOVOPS (new_fndecl) = 1;
38217 TREE_READONLY (new_fndecl) = 1;
38219 return new_fndecl;
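/* For illustration (derived from the "__vr.._" template and sprintf above):
   with -mveclibabi=acml, a 2-wide DFmode sin call becomes "__vrd2_sin" and a
   4-wide SFmode sinf call becomes "__vrs4_sinf".  */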
38222 /* Returns a decl of a function that implements gather load with
38223 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
38224 Return NULL_TREE if it is not available. */
38226 static tree
38227 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38228 const_tree index_type, int scale)
38230 bool si;
38231 enum ix86_builtins code;
38233 if (! TARGET_AVX2)
38234 return NULL_TREE;
38236 if ((TREE_CODE (index_type) != INTEGER_TYPE
38237 && !POINTER_TYPE_P (index_type))
38238 || (TYPE_MODE (index_type) != SImode
38239 && TYPE_MODE (index_type) != DImode))
38240 return NULL_TREE;
38242 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38243 return NULL_TREE;
38245 /* v*gather* insn sign extends index to pointer mode. */
38246 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38247 && TYPE_UNSIGNED (index_type))
38248 return NULL_TREE;
38250 if (scale <= 0
38251 || scale > 8
38252 || (scale & (scale - 1)) != 0)
38253 return NULL_TREE;
38255 si = TYPE_MODE (index_type) == SImode;
38256 switch (TYPE_MODE (mem_vectype))
38258 case E_V2DFmode:
38259 if (TARGET_AVX512VL)
38260 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38261 else
38262 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38263 break;
38264 case E_V4DFmode:
38265 if (TARGET_AVX512VL)
38266 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38267 else
38268 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38269 break;
38270 case E_V2DImode:
38271 if (TARGET_AVX512VL)
38272 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38273 else
38274 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38275 break;
38276 case E_V4DImode:
38277 if (TARGET_AVX512VL)
38278 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38279 else
38280 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38281 break;
38282 case E_V4SFmode:
38283 if (TARGET_AVX512VL)
38284 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38285 else
38286 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38287 break;
38288 case E_V8SFmode:
38289 if (TARGET_AVX512VL)
38290 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38291 else
38292 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38293 break;
38294 case E_V4SImode:
38295 if (TARGET_AVX512VL)
38296 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38297 else
38298 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38299 break;
38300 case E_V8SImode:
38301 if (TARGET_AVX512VL)
38302 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38303 else
38304 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38305 break;
38306 case E_V8DFmode:
38307 if (TARGET_AVX512F)
38308 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38309 else
38310 return NULL_TREE;
38311 break;
38312 case E_V8DImode:
38313 if (TARGET_AVX512F)
38314 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38315 else
38316 return NULL_TREE;
38317 break;
38318 case E_V16SFmode:
38319 if (TARGET_AVX512F)
38320 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38321 else
38322 return NULL_TREE;
38323 break;
38324 case E_V16SImode:
38325 if (TARGET_AVX512F)
38326 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38327 else
38328 return NULL_TREE;
38329 break;
38330 default:
38331 return NULL_TREE;
38334 return ix86_get_builtin (code);
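/* Illustrative use (a sketch; the builtin actually chosen depends on the
   vectorization factor the middle end picks): with -O3 -mavx2, an indexed
   load such as

     for (int i = 0; i < n; i++)
       out[i] = in[idx[i]];              float out[], in[]; int idx[]

   can be vectorized with V8SFmode and an SImode index, for which this hook
   returns IX86_BUILTIN_GATHERSIV8SF (vgatherdps) when AVX512VL is not
   enabled.  */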
38337 /* Returns a decl of a function that implements scatter store with
38338 register type VECTYPE and index type INDEX_TYPE and SCALE.
38339 Return NULL_TREE if it is not available. */
38341 static tree
38342 ix86_vectorize_builtin_scatter (const_tree vectype,
38343 const_tree index_type, int scale)
38345 bool si;
38346 enum ix86_builtins code;
38348 if (!TARGET_AVX512F)
38349 return NULL_TREE;
38351 if ((TREE_CODE (index_type) != INTEGER_TYPE
38352 && !POINTER_TYPE_P (index_type))
38353 || (TYPE_MODE (index_type) != SImode
38354 && TYPE_MODE (index_type) != DImode))
38355 return NULL_TREE;
38357 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38358 return NULL_TREE;
38360 /* v*scatter* insn sign extends index to pointer mode. */
38361 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38362 && TYPE_UNSIGNED (index_type))
38363 return NULL_TREE;
38365 /* Scale can be 1, 2, 4 or 8. */
38366 if (scale <= 0
38367 || scale > 8
38368 || (scale & (scale - 1)) != 0)
38369 return NULL_TREE;
38371 si = TYPE_MODE (index_type) == SImode;
38372 switch (TYPE_MODE (vectype))
38374 case E_V8DFmode:
38375 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38376 break;
38377 case E_V8DImode:
38378 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38379 break;
38380 case E_V16SFmode:
38381 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38382 break;
38383 case E_V16SImode:
38384 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38385 break;
38386 default:
38387 return NULL_TREE;
38390 return ix86_builtins[code];
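/* Illustrative use (a sketch, analogous to the gather case above): with
   -O3 -mavx512f, a scatter store such as

     for (int i = 0; i < n; i++)
       out[idx[i]] = in[i];              float out[], in[]; int idx[]

   can be vectorized with V16SFmode and an SImode index, for which this hook
   returns IX86_BUILTIN_SCATTERSIV16SF (vscatterdps).  */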
38393 /* Return true if it is safe to use the rsqrt optabs to optimize
38394 1.0/sqrt. */
38396 static bool
38397 use_rsqrt_p ()
38399 return (TARGET_SSE_MATH
38400 && flag_finite_math_only
38401 && !flag_trapping_math
38402 && flag_unsafe_math_optimizations);
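/* All of the conditions above are implied by -ffast-math together with SSE
   math (the default on x86-64), so a hedged summary is: under -ffast-math,
   1.0f / sqrtf (x) may be expanded through the rsqrt optab, i.e. rsqrtss or
   rsqrtps followed by a Newton-Raphson refinement step, instead of a full
   square root and divide.  */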
38405 /* Returns a code for a target-specific builtin that implements
38406 reciprocal of the function, or NULL_TREE if not available. */
38408 static tree
38409 ix86_builtin_reciprocal (tree fndecl)
38411 switch (DECL_FUNCTION_CODE (fndecl))
38413 /* Vectorized version of sqrt to rsqrt conversion. */
38414 case IX86_BUILTIN_SQRTPS_NR:
38415 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38417 case IX86_BUILTIN_SQRTPS_NR256:
38418 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38420 default:
38421 return NULL_TREE;
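/* A hedged reading of the mapping above: the reciprocal pass can use this
   hook to rewrite a vectorized a / sqrtps_nr (b) into a * rsqrtps_nr (b);
   whether the substitution actually happens also depends on -mrecip and the
   unsafe-math flags.  */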
38425 /* Helper for avx_vpermilps256_operand et al. This is also used by
38426 the expansion functions to turn the parallel back into a mask.
38427 The return value is 0 for no match and the imm8+1 for a match. */
38430 avx_vpermilp_parallel (rtx par, machine_mode mode)
38432 unsigned i, nelt = GET_MODE_NUNITS (mode);
38433 unsigned mask = 0;
38434 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38436 if (XVECLEN (par, 0) != (int) nelt)
38437 return 0;
38439 /* Validate that all of the elements are constants, and not totally
38440 out of range. Copy the data into an integral array to make the
38441 subsequent checks easier. */
38442 for (i = 0; i < nelt; ++i)
38444 rtx er = XVECEXP (par, 0, i);
38445 unsigned HOST_WIDE_INT ei;
38447 if (!CONST_INT_P (er))
38448 return 0;
38449 ei = INTVAL (er);
38450 if (ei >= nelt)
38451 return 0;
38452 ipar[i] = ei;
38455 switch (mode)
38457 case E_V8DFmode:
38458 /* In the 512-bit DFmode case, we can only move elements within
38459 a 128-bit lane. First fill the second part of the mask,
38460 then fallthru. */
38461 for (i = 4; i < 6; ++i)
38463 if (ipar[i] < 4 || ipar[i] >= 6)
38464 return 0;
38465 mask |= (ipar[i] - 4) << i;
38467 for (i = 6; i < 8; ++i)
38469 if (ipar[i] < 6)
38470 return 0;
38471 mask |= (ipar[i] - 6) << i;
38473 /* FALLTHRU */
38475 case E_V4DFmode:
38476 /* In the 256-bit DFmode case, we can only move elements within
38477 a 128-bit lane. */
38478 for (i = 0; i < 2; ++i)
38480 if (ipar[i] >= 2)
38481 return 0;
38482 mask |= ipar[i] << i;
38484 for (i = 2; i < 4; ++i)
38486 if (ipar[i] < 2)
38487 return 0;
38488 mask |= (ipar[i] - 2) << i;
38490 break;
38492 case E_V16SFmode:
38493 /* In 512 bit SFmode case, permutation in the upper 256 bits
38494 must mirror the permutation in the lower 256-bits. */
38495 for (i = 0; i < 8; ++i)
38496 if (ipar[i] + 8 != ipar[i + 8])
38497 return 0;
38498 /* FALLTHRU */
38500 case E_V8SFmode:
38501 /* In 256 bit SFmode case, we have full freedom of
38502 movement within the low 128-bit lane, but the high 128-bit
38503 lane must mirror the exact same pattern. */
38504 for (i = 0; i < 4; ++i)
38505 if (ipar[i] + 4 != ipar[i + 4])
38506 return 0;
38507 nelt = 4;
38508 /* FALLTHRU */
38510 case E_V2DFmode:
38511 case E_V4SFmode:
38512 /* In the 128-bit case, we've full freedom in the placement of
38513 the elements from the source operand. */
38514 for (i = 0; i < nelt; ++i)
38515 mask |= ipar[i] << (i * (nelt / 2));
38516 break;
38518 default:
38519 gcc_unreachable ();
38522 /* Make sure success has a non-zero value by adding one. */
38523 return mask + 1;
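/* Worked example (V4SFmode, following the mask construction above): the
   parallel [2 3 0 1] gives mask = 2<<0 | 3<<2 | 0<<4 | 1<<6 = 0x4e, so the
   function returns 0x4f, i.e. the vpermilps immediate 0x4e plus one.  */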
38526 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38527 the expansion functions to turn the parallel back into a mask.
38528 The return value is 0 for no match and the imm8+1 for a match. */
38531 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38533 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38534 unsigned mask = 0;
38535 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38537 if (XVECLEN (par, 0) != (int) nelt)
38538 return 0;
38540 /* Validate that all of the elements are constants, and not totally
38541 out of range. Copy the data into an integral array to make the
38542 subsequent checks easier. */
38543 for (i = 0; i < nelt; ++i)
38545 rtx er = XVECEXP (par, 0, i);
38546 unsigned HOST_WIDE_INT ei;
38548 if (!CONST_INT_P (er))
38549 return 0;
38550 ei = INTVAL (er);
38551 if (ei >= 2 * nelt)
38552 return 0;
38553 ipar[i] = ei;
38556 /* Validate that each half of the permute selects consecutive elements. */
38557 for (i = 0; i < nelt2 - 1; ++i)
38558 if (ipar[i] + 1 != ipar[i + 1])
38559 return 0;
38560 for (i = nelt2; i < nelt - 1; ++i)
38561 if (ipar[i] + 1 != ipar[i + 1])
38562 return 0;
38564 /* Reconstruct the mask. */
38565 for (i = 0; i < 2; ++i)
38567 unsigned e = ipar[i * nelt2];
38568 if (e % nelt2)
38569 return 0;
38570 e /= nelt2;
38571 mask |= e << (i * 4);
38574 /* Make sure success has a non-zero value by adding one. */
38575 return mask + 1;
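/* Worked example (V4DFmode, following the reconstruction above): the
   parallel [2 3 0 1] selects the two 128-bit halves in swapped order;
   e = 2/2 = 1 for the low half and e = 0/2 = 0 for the high half, so
   mask = 0x01 and the function returns 0x02 (immediate 0x01 plus one).  */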
38578 /* Return a register priority for hard reg REGNO. */
38579 static int
38580 ix86_register_priority (int hard_regno)
38582 /* ebp and r13 as the base always want a displacement, and r12 as the
38583 base always wants an index, so discourage their use in an
38584 address. */
38585 if (hard_regno == R12_REG || hard_regno == R13_REG)
38586 return 0;
38587 if (hard_regno == BP_REG)
38588 return 1;
38589 /* New x86-64 int registers result in bigger code size. Discourage
38590 them. */
38591 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38592 return 2;
38593 /* New x86-64 SSE registers result in bigger code size. Discourage
38594 them. */
38595 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38596 return 2;
38597 /* Usage of AX register results in smaller code. Prefer it. */
38598 if (hard_regno == AX_REG)
38599 return 4;
38600 return 3;
38603 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38605 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38606 QImode must go into class Q_REGS.
38607 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38608 movdf to do mem-to-mem moves through integer regs. */
38610 static reg_class_t
38611 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38613 machine_mode mode = GET_MODE (x);
38615 /* We're only allowed to return a subclass of CLASS. Many of the
38616 following checks fail for NO_REGS, so eliminate that early. */
38617 if (regclass == NO_REGS)
38618 return NO_REGS;
38620 /* All classes can load zeros. */
38621 if (x == CONST0_RTX (mode))
38622 return regclass;
38624 /* Force constants into memory if we are loading a (nonzero) constant into
38625 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38626 instructions to load from a constant. */
38627 if (CONSTANT_P (x)
38628 && (MAYBE_MMX_CLASS_P (regclass)
38629 || MAYBE_SSE_CLASS_P (regclass)
38630 || MAYBE_MASK_CLASS_P (regclass)))
38631 return NO_REGS;
38633 /* Floating-point constants need more complex checks. */
38634 if (CONST_DOUBLE_P (x))
38636 /* General regs can load everything. */
38637 if (INTEGER_CLASS_P (regclass))
38638 return regclass;
38640 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38641 zero above. We only want to wind up preferring 80387 registers if
38642 we plan on doing computation with them. */
38643 if (IS_STACK_MODE (mode)
38644 && standard_80387_constant_p (x) > 0)
38646 /* Limit class to FP regs. */
38647 if (FLOAT_CLASS_P (regclass))
38648 return FLOAT_REGS;
38649 else if (regclass == FP_TOP_SSE_REGS)
38650 return FP_TOP_REG;
38651 else if (regclass == FP_SECOND_SSE_REGS)
38652 return FP_SECOND_REG;
38655 return NO_REGS;
38658 /* Prefer SSE regs only, if we can use them for math. */
38659 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38660 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38662 /* Generally when we see PLUS here, it's the function invariant
38663 (plus soft-fp const_int), which can only be computed into general
38664 regs. */
38665 if (GET_CODE (x) == PLUS)
38666 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38668 /* QImode constants are easy to load, but non-constant QImode data
38669 must go into Q_REGS. */
38670 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38672 if (Q_CLASS_P (regclass))
38673 return regclass;
38674 else if (reg_class_subset_p (Q_REGS, regclass))
38675 return Q_REGS;
38676 else
38677 return NO_REGS;
38680 return regclass;
38683 /* Discourage putting floating-point values in SSE registers unless
38684 SSE math is being used, and likewise for the 387 registers. */
38685 static reg_class_t
38686 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38688 machine_mode mode = GET_MODE (x);
38690 /* Restrict the output reload class to the register bank that we are doing
38691 math on. If we would like not to return a subset of CLASS, reject this
38692 alternative: if reload cannot do this, it will still use its choice. */
38693 mode = GET_MODE (x);
38694 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38695 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38697 if (IS_STACK_MODE (mode))
38699 if (regclass == FP_TOP_SSE_REGS)
38700 return FP_TOP_REG;
38701 else if (regclass == FP_SECOND_SSE_REGS)
38702 return FP_SECOND_REG;
38703 else
38704 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38707 return regclass;
38710 static reg_class_t
38711 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38712 machine_mode mode, secondary_reload_info *sri)
38714 /* Double-word spills from general registers to non-offsettable memory
38715 references (zero-extended addresses) require special handling. */
38716 if (TARGET_64BIT
38717 && MEM_P (x)
38718 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38719 && INTEGER_CLASS_P (rclass)
38720 && !offsettable_memref_p (x))
38722 sri->icode = (in_p
38723 ? CODE_FOR_reload_noff_load
38724 : CODE_FOR_reload_noff_store);
38725 /* Add the cost of moving address to a temporary. */
38726 sri->extra_cost = 1;
38728 return NO_REGS;
38731 /* QImode spills from non-QI registers require an
38732 intermediate register on 32-bit targets. */
38733 if (mode == QImode
38734 && ((!TARGET_64BIT && !in_p
38735 && INTEGER_CLASS_P (rclass)
38736 && MAYBE_NON_Q_CLASS_P (rclass))
38737 || (!TARGET_AVX512DQ
38738 && MAYBE_MASK_CLASS_P (rclass))))
38740 int regno = true_regnum (x);
38742 /* Return Q_REGS if the operand is in memory. */
38743 if (regno == -1)
38744 return Q_REGS;
38746 return NO_REGS;
38749 /* This condition handles corner case where an expression involving
38750 pointers gets vectorized. We're trying to use the address of a
38751 stack slot as a vector initializer.
38753 (set (reg:V2DI 74 [ vect_cst_.2 ])
38754 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38756 Eventually frame gets turned into sp+offset like this:
38758 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38759 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38760 (const_int 392 [0x188]))))
38762 That later gets turned into:
38764 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38765 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38766 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38768 We'll have the following reload recorded:
38770 Reload 0: reload_in (DI) =
38771 (plus:DI (reg/f:DI 7 sp)
38772 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38773 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38774 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38775 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38776 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38777 reload_reg_rtx: (reg:V2DI 22 xmm1)
38779 Which isn't going to work since SSE instructions can't handle scalar
38780 additions. Returning GENERAL_REGS forces the addition into integer
38781 register and reload can handle subsequent reloads without problems. */
38783 if (in_p && GET_CODE (x) == PLUS
38784 && SSE_CLASS_P (rclass)
38785 && SCALAR_INT_MODE_P (mode))
38786 return GENERAL_REGS;
38788 return NO_REGS;
38791 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38793 static bool
38794 ix86_class_likely_spilled_p (reg_class_t rclass)
38796 switch (rclass)
38798 case AREG:
38799 case DREG:
38800 case CREG:
38801 case BREG:
38802 case AD_REGS:
38803 case SIREG:
38804 case DIREG:
38805 case SSE_FIRST_REG:
38806 case FP_TOP_REG:
38807 case FP_SECOND_REG:
38808 case BND_REGS:
38809 return true;
38811 default:
38812 break;
38815 return false;
38818 /* If we are copying between registers from different register sets
38819 (e.g. FP and integer), we may need a memory location.
38821 The function can't work reliably when one of the CLASSES is a class
38822 containing registers from multiple sets. We avoid this by never combining
38823 different sets in a single alternative in the machine description.
38824 Ensure that this constraint holds to avoid unexpected surprises.
38826 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38827 so do not enforce these sanity checks.
38829 To optimize register_move_cost performance, define an inline variant. */
38831 static inline bool
38832 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38833 reg_class_t class2, int strict)
38835 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38836 return false;
38838 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38839 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38840 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38841 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38842 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38843 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38844 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38845 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38847 gcc_assert (!strict || lra_in_progress);
38848 return true;
38851 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38852 return true;
38854 /* Between mask and general, we have moves no larger than word size. */
38855 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38856 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38857 return true;
38859 /* ??? This is a lie. We do have moves between mmx/general, and for
38860 mmx/sse2. But by saying we need secondary memory we discourage the
38861 register allocator from using the mmx registers unless needed. */
38862 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38863 return true;
38865 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38867 /* SSE1 doesn't have any direct moves from other classes. */
38868 if (!TARGET_SSE2)
38869 return true;
38871 /* If the target says that inter-unit moves are more expensive
38872 than moving through memory, then don't generate them. */
38873 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38874 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38875 return true;
38877 /* Between SSE and general, we have moves no larger than word size. */
38878 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38879 return true;
38882 return false;
38885 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38887 static bool
38888 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38889 reg_class_t class2)
38891 return inline_secondary_memory_needed (mode, class1, class2, true);
38894 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38896 get_secondary_mem widens integral modes to BITS_PER_WORD.
38897 There is no need to emit full 64 bit move on 64 bit targets
38898 for integral modes that can be moved using 32 bit move. */
38900 static machine_mode
38901 ix86_secondary_memory_needed_mode (machine_mode mode)
38903 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38904 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38905 return mode;
38908 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38910 On the 80386, this is the size of MODE in words,
38911 except in the FP regs, where a single reg is always enough. */
38913 static unsigned char
38914 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38916 if (MAYBE_INTEGER_CLASS_P (rclass))
38918 if (mode == XFmode)
38919 return (TARGET_64BIT ? 2 : 3);
38920 else if (mode == XCmode)
38921 return (TARGET_64BIT ? 4 : 6);
38922 else
38923 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38925 else
38927 if (COMPLEX_MODE_P (mode))
38928 return 2;
38929 else
38930 return 1;
38934 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38936 static bool
38937 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38938 reg_class_t regclass)
38940 if (from == to)
38941 return true;
38943 /* x87 registers can't do subreg at all, as all values are reformatted
38944 to extended precision. */
38945 if (MAYBE_FLOAT_CLASS_P (regclass))
38946 return false;
38948 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38950 /* Vector registers do not support QI or HImode loads. If we don't
38951 disallow a change to these modes, reload will assume it's ok to
38952 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38953 the vec_dupv4hi pattern. */
38954 if (GET_MODE_SIZE (from) < 4)
38955 return false;
38958 return true;
38961 /* Return index of MODE in the sse load/store tables. */
38963 static inline int
38964 sse_store_index (machine_mode mode)
38966 switch (GET_MODE_SIZE (mode))
38968 case 4:
38969 return 0;
38970 case 8:
38971 return 1;
38972 case 16:
38973 return 2;
38974 case 32:
38975 return 3;
38976 case 64:
38977 return 4;
38978 default:
38979 return -1;
38983 /* Return the cost of moving data of mode M between a
38984 register and memory. A value of 2 is the default; this cost is
38985 relative to those in `REGISTER_MOVE_COST'.
38987 This function is used extensively by register_move_cost that is used to
38988 build tables at startup. Make it inline in this case.
38989 When IN is 2, return maximum of in and out move cost.
38991 If moving between registers and memory is more expensive than
38992 between two registers, you should define this macro to express the
38993 relative cost.
38995 Also model the increased cost of moving QImode registers in non-Q_REGS
38996 classes. */
38998 static inline int
38999 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39000 int in)
39002 int cost;
39003 if (FLOAT_CLASS_P (regclass))
39005 int index;
39006 switch (mode)
39008 case E_SFmode:
39009 index = 0;
39010 break;
39011 case E_DFmode:
39012 index = 1;
39013 break;
39014 case E_XFmode:
39015 index = 2;
39016 break;
39017 default:
39018 return 100;
39020 if (in == 2)
39021 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39022 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39024 if (SSE_CLASS_P (regclass))
39026 int index = sse_store_index (mode);
39027 if (index == -1)
39028 return 100;
39029 if (in == 2)
39030 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39031 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39033 if (MMX_CLASS_P (regclass))
39035 int index;
39036 switch (GET_MODE_SIZE (mode))
39038 case 4:
39039 index = 0;
39040 break;
39041 case 8:
39042 index = 1;
39043 break;
39044 default:
39045 return 100;
39047 if (in == 2)
39048 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39049 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39051 switch (GET_MODE_SIZE (mode))
39053 case 1:
39054 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39056 if (!in)
39057 return ix86_cost->int_store[0];
39058 if (TARGET_PARTIAL_REG_DEPENDENCY
39059 && optimize_function_for_speed_p (cfun))
39060 cost = ix86_cost->movzbl_load;
39061 else
39062 cost = ix86_cost->int_load[0];
39063 if (in == 2)
39064 return MAX (cost, ix86_cost->int_store[0]);
39065 return cost;
39067 else
39069 if (in == 2)
39070 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39071 if (in)
39072 return ix86_cost->movzbl_load;
39073 else
39074 return ix86_cost->int_store[0] + 4;
39076 break;
39077 case 2:
39078 if (in == 2)
39079 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39080 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39081 default:
39082 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39083 if (mode == TFmode)
39084 mode = XFmode;
39085 if (in == 2)
39086 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39087 else if (in)
39088 cost = ix86_cost->int_load[2];
39089 else
39090 cost = ix86_cost->int_store[2];
39091 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39095 static int
39096 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39097 bool in)
39099 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39103 /* Return the cost of moving data from a register in class CLASS1 to
39104 one in class CLASS2.
39106 It is not required that the cost always equal 2 when FROM is the same as TO;
39107 on some machines it is expensive to move between registers if they are not
39108 general registers. */
39110 static int
39111 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39112 reg_class_t class2_i)
39114 enum reg_class class1 = (enum reg_class) class1_i;
39115 enum reg_class class2 = (enum reg_class) class2_i;
39117 /* In case we require secondary memory, compute cost of the store followed
39118 by load. In order to avoid bad register allocation choices, we need
39119 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39121 if (inline_secondary_memory_needed (mode, class1, class2, false))
39123 int cost = 1;
39125 cost += inline_memory_move_cost (mode, class1, 2);
39126 cost += inline_memory_move_cost (mode, class2, 2);
39128 /* When copying from a general purpose register we may emit multiple
39129 stores followed by a single load, causing a memory size mismatch stall.
39130 Count this as arbitrarily high cost of 20. */
39131 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39132 && TARGET_MEMORY_MISMATCH_STALL
39133 && targetm.class_max_nregs (class1, mode)
39134 > targetm.class_max_nregs (class2, mode))
39135 cost += 20;
39137 /* In the case of FP/MMX moves, the registers actually overlap, and we
39138 have to switch modes in order to treat them differently. */
39139 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39140 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39141 cost += 20;
39143 return cost;
39146 /* Moves between SSE/MMX and integer unit are expensive. */
39147 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39148 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39150 /* ??? By keeping returned value relatively high, we limit the number
39151 of moves between integer and MMX/SSE registers for all targets.
39152 Additionally, high value prevents problem with x86_modes_tieable_p(),
39153 where integer modes in MMX/SSE registers are not tieable
39154 because of missing QImode and HImode moves to, from or between
39155 MMX/SSE registers. */
39156 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39157 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39159 if (MAYBE_FLOAT_CLASS_P (class1))
39160 return ix86_cost->fp_move;
39161 if (MAYBE_SSE_CLASS_P (class1))
39163 if (GET_MODE_BITSIZE (mode) <= 128)
39164 return ix86_cost->xmm_move;
39165 if (GET_MODE_BITSIZE (mode) <= 256)
39166 return ix86_cost->ymm_move;
39167 return ix86_cost->zmm_move;
39169 if (MAYBE_MMX_CLASS_P (class1))
39170 return ix86_cost->mmx_move;
39171 return 2;
39174 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39175 words of a value of mode MODE but can be less for certain modes in
39176 special long registers.
39178 Actually there are no two word move instructions for consecutive
39179 registers. And only registers 0-3 may have mov byte instructions
39180 applied to them. */
39182 static unsigned int
39183 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39185 if (GENERAL_REGNO_P (regno))
39187 if (mode == XFmode)
39188 return TARGET_64BIT ? 2 : 3;
39189 if (mode == XCmode)
39190 return TARGET_64BIT ? 4 : 6;
39191 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39193 if (COMPLEX_MODE_P (mode))
39194 return 2;
39195 if (mode == V64SFmode || mode == V64SImode)
39196 return 4;
39197 return 1;
39200 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39202 static bool
39203 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39205 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
39206 if (CC_REGNO_P (regno))
39207 return GET_MODE_CLASS (mode) == MODE_CC;
39208 if (GET_MODE_CLASS (mode) == MODE_CC
39209 || GET_MODE_CLASS (mode) == MODE_RANDOM
39210 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39211 return false;
39212 if (STACK_REGNO_P (regno))
39213 return VALID_FP_MODE_P (mode);
39214 if (MASK_REGNO_P (regno))
39215 return (VALID_MASK_REG_MODE (mode)
39216 || (TARGET_AVX512BW
39217 && VALID_MASK_AVX512BW_MODE (mode)));
39218 if (BND_REGNO_P (regno))
39219 return VALID_BND_REG_MODE (mode);
39220 if (SSE_REGNO_P (regno))
39222 /* We implement the move patterns for all vector modes into and
39223 out of SSE registers, even when no operation instructions
39224 are available. */
39226 /* For AVX-512 we allow, regardless of regno:
39227 - XI mode
39228 - any of 512-bit wide vector mode
39229 - any scalar mode. */
39230 if (TARGET_AVX512F
39231 && (mode == XImode
39232 || VALID_AVX512F_REG_MODE (mode)
39233 || VALID_AVX512F_SCALAR_MODE (mode)))
39234 return true;
39236 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39237 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39238 && MOD4_SSE_REGNO_P (regno)
39239 && mode == V64SFmode)
39240 return true;
39242 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39243 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39244 && MOD4_SSE_REGNO_P (regno)
39245 && mode == V64SImode)
39246 return true;
39248 /* TODO check for QI/HI scalars. */
39249 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
39250 if (TARGET_AVX512VL
39251 && (mode == OImode
39252 || mode == TImode
39253 || VALID_AVX256_REG_MODE (mode)
39254 || VALID_AVX512VL_128_REG_MODE (mode)))
39255 return true;
39257 /* xmm16-xmm31 are only available for AVX-512. */
39258 if (EXT_REX_SSE_REGNO_P (regno))
39259 return false;
39261 /* OImode and AVX modes are available only when AVX is enabled. */
39262 return ((TARGET_AVX
39263 && VALID_AVX256_REG_OR_OI_MODE (mode))
39264 || VALID_SSE_REG_MODE (mode)
39265 || VALID_SSE2_REG_MODE (mode)
39266 || VALID_MMX_REG_MODE (mode)
39267 || VALID_MMX_REG_MODE_3DNOW (mode));
39269 if (MMX_REGNO_P (regno))
39271 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39272 so if the register is available at all, then we can move data of
39273 the given mode into or out of it. */
39274 return (VALID_MMX_REG_MODE (mode)
39275 || VALID_MMX_REG_MODE_3DNOW (mode));
39278 if (mode == QImode)
39280 /* Take care for QImode values - they can be in non-QI regs,
39281 but then they do cause partial register stalls. */
39282 if (ANY_QI_REGNO_P (regno))
39283 return true;
39284 if (!TARGET_PARTIAL_REG_STALL)
39285 return true;
39286 /* LRA checks if the hard register is OK for the given mode.
39287 QImode values can live in non-QI regs, so we allow all
39288 registers here. */
39289 if (lra_in_progress)
39290 return true;
39291 return !can_create_pseudo_p ();
39293 /* We handle both integer and floats in the general purpose registers. */
39294 else if (VALID_INT_MODE_P (mode))
39295 return true;
39296 else if (VALID_FP_MODE_P (mode))
39297 return true;
39298 else if (VALID_DFP_MODE_P (mode))
39299 return true;
39300 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39301 on to use that value in smaller contexts, this can easily force a
39302 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39303 supporting DImode, allow it. */
39304 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39305 return true;
39307 return false;
39310 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39311 saves SSE registers across calls is Win64 (thus no need to check the
39312 current ABI here), and with AVX enabled Win64 only guarantees that
39313 the low 16 bytes are saved. */
39315 static bool
39316 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39318 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39321 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39322 tieable integer mode. */
39324 static bool
39325 ix86_tieable_integer_mode_p (machine_mode mode)
39327 switch (mode)
39329 case E_HImode:
39330 case E_SImode:
39331 return true;
39333 case E_QImode:
39334 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39336 case E_DImode:
39337 return TARGET_64BIT;
39339 default:
39340 return false;
39344 /* Implement TARGET_MODES_TIEABLE_P.
39346 Return true if MODE1 is accessible in a register that can hold MODE2
39347 without copying. That is, all register classes that can hold MODE2
39348 can also hold MODE1. */
39350 static bool
39351 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39353 if (mode1 == mode2)
39354 return true;
39356 if (ix86_tieable_integer_mode_p (mode1)
39357 && ix86_tieable_integer_mode_p (mode2))
39358 return true;
39360 /* MODE2 being XFmode implies fp stack or general regs, which means we
39361 can tie any smaller floating point modes to it. Note that we do not
39362 tie this with TFmode. */
39363 if (mode2 == XFmode)
39364 return mode1 == SFmode || mode1 == DFmode;
39366 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39367 that we can tie it with SFmode. */
39368 if (mode2 == DFmode)
39369 return mode1 == SFmode;
39371 /* If MODE2 is only appropriate for an SSE register, then tie with
39372 any other mode acceptable to SSE registers. */
39373 if (GET_MODE_SIZE (mode2) == 32
39374 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39375 return (GET_MODE_SIZE (mode1) == 32
39376 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39377 if (GET_MODE_SIZE (mode2) == 16
39378 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39379 return (GET_MODE_SIZE (mode1) == 16
39380 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39382 /* If MODE2 is appropriate for an MMX register, then tie
39383 with any other mode acceptable to MMX registers. */
39384 if (GET_MODE_SIZE (mode2) == 8
39385 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39386 return (GET_MODE_SIZE (mode1) == 8
39387 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39389 return false;
39392 /* Return the cost of moving between two registers of mode MODE. */
39394 static int
39395 ix86_set_reg_reg_cost (machine_mode mode)
39397 unsigned int units = UNITS_PER_WORD;
39399 switch (GET_MODE_CLASS (mode))
39401 default:
39402 break;
39404 case MODE_CC:
39405 units = GET_MODE_SIZE (CCmode);
39406 break;
39408 case MODE_FLOAT:
39409 if ((TARGET_SSE && mode == TFmode)
39410 || (TARGET_80387 && mode == XFmode)
39411 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39412 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39413 units = GET_MODE_SIZE (mode);
39414 break;
39416 case MODE_COMPLEX_FLOAT:
39417 if ((TARGET_SSE && mode == TCmode)
39418 || (TARGET_80387 && mode == XCmode)
39419 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39420 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39421 units = GET_MODE_SIZE (mode);
39422 break;
39424 case MODE_VECTOR_INT:
39425 case MODE_VECTOR_FLOAT:
39426 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39427 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39428 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39429 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39430 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39431 units = GET_MODE_SIZE (mode);
39434 /* Return the cost of moving between two registers of mode MODE,
39435 assuming that the move will be in pieces of at most UNITS bytes. */
39436 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
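/* Worked example (following the formula above): a DImode register copy on
   a 32-bit target uses units = UNITS_PER_WORD = 4, giving
   COSTS_N_INSNS (CEIL (8, 4)) = COSTS_N_INSNS (2); on a 64-bit target
   units is 8 and the same copy costs COSTS_N_INSNS (1).  */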
39439 /* Return cost of vector operation in MODE given that scalar version has
39440 COST. If PARALLEL is true assume that CPU has more than one unit
39441 performing the operation. */
39443 static int
39444 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39446 if (!VECTOR_MODE_P (mode))
39447 return cost;
39449 if (!parallel)
39450 return cost * GET_MODE_NUNITS (mode);
39451 if (GET_MODE_BITSIZE (mode) == 128
39452 && TARGET_SSE_SPLIT_REGS)
39453 return cost * 2;
39454 if (GET_MODE_BITSIZE (mode) > 128
39455 && TARGET_AVX128_OPTIMAL)
39456 return cost * GET_MODE_BITSIZE (mode) / 128;
39457 return cost;
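/* Example reading of the rules above (an interpretation, not new policy):
   on tunings with X86_TUNE_AVX128_OPTIMAL (some AMD cores), a 256-bit
   vector op is charged 256/128 = 2 times the scalar COST, reflecting that
   the operation is internally split into two 128-bit halves.  */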
39460 /* Return cost of multiplication in MODE. */
39462 static int
39463 ix86_multiplication_cost (const struct processor_costs *cost,
39464 enum machine_mode mode)
39466 machine_mode inner_mode = mode;
39467 if (VECTOR_MODE_P (mode))
39468 inner_mode = GET_MODE_INNER (mode);
39470 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39471 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39472 else if (X87_FLOAT_MODE_P (mode))
39473 return cost->fmul;
39474 else if (FLOAT_MODE_P (mode))
39475 return ix86_vec_cost (mode,
39476 inner_mode == DFmode
39477 ? cost->mulsd : cost->mulss, true);
39478 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39480 /* V*QImode is emulated with 7-13 insns. */
39481 if (mode == V16QImode || mode == V32QImode)
39483 int extra = 11;
39484 if (TARGET_XOP && mode == V16QImode)
39485 extra = 5;
39486 else if (TARGET_SSSE3)
39487 extra = 6;
39488 return ix86_vec_cost (mode,
39489 cost->mulss * 2 + cost->sse_op * extra,
39490 true);
39492 /* V*DImode is emulated with 5-8 insns. */
39493 else if (mode == V2DImode || mode == V4DImode)
39495 if (TARGET_XOP && mode == V2DImode)
39496 return ix86_vec_cost (mode,
39497 cost->mulss * 2 + cost->sse_op * 3,
39498 true);
39499 else
39500 return ix86_vec_cost (mode,
39501 cost->mulss * 3 + cost->sse_op * 5,
39502 true);
39504 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39505 insns, including two PMULUDQ. */
39506 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39507 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39508 true);
39509 else
39510 return ix86_vec_cost (mode, cost->mulss, true);
39512 else
39513 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
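/* Worked example (taken directly from the branches above): a V16QImode
   multiply with neither XOP nor SSSE3 available is costed as
   cost->mulss * 2 + cost->sse_op * 11, then scaled by ix86_vec_cost; with
   SSSE3 the sse_op factor drops to 6, and with XOP to 5.  */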
39516 /* Return cost of division in MODE. */
39518 static int
39519 ix86_division_cost (const struct processor_costs *cost,
39520 enum machine_mode mode)
39522 machine_mode inner_mode = mode;
39523 if (VECTOR_MODE_P (mode))
39524 inner_mode = GET_MODE_INNER (mode);
39526 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39527 return inner_mode == DFmode ? cost->divsd : cost->divss;
39528 else if (X87_FLOAT_MODE_P (mode))
39529 return cost->fdiv;
39530 else if (FLOAT_MODE_P (mode))
39531 return ix86_vec_cost (mode,
39532 inner_mode == DFmode ? cost->divsd : cost->divss,
39533 true);
39534 else
39535 return cost->divide[MODE_INDEX (mode)];
39538 /* Return cost of shift in MODE.
39539 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
39540 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39541 SHIFT_AND_TRUNCATE whether op1 is such an AND wrapped in a SUBREG.
39543 SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
39545 static int
39546 ix86_shift_rotate_cost (const struct processor_costs *cost,
39547 enum machine_mode mode, bool constant_op1,
39548 HOST_WIDE_INT op1_val,
39549 bool speed,
39550 bool and_in_op1,
39551 bool shift_and_truncate,
39552 bool *skip_op0, bool *skip_op1)
39554 if (skip_op0)
39555 *skip_op0 = *skip_op1 = false;
39556 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39558 /* V*QImode is emulated with 1-11 insns. */
39559 if (mode == V16QImode || mode == V32QImode)
39561 int count = 11;
39562 if (TARGET_XOP && mode == V16QImode)
39564 /* For XOP we use vpshab, which requires a broadcast of the
39565 value to the variable shift insn. For constants this
39566 means a V16Q const in mem; even when we can perform the
39567 shift with one insn set the cost to prefer paddb. */
39568 if (constant_op1)
39570 if (skip_op1)
39571 *skip_op1 = true;
39572 return ix86_vec_cost (mode,
39573 cost->sse_op
39574 + (speed
39576 : COSTS_N_BYTES
39577 (GET_MODE_UNIT_SIZE (mode))), true);
39579 count = 3;
39581 else if (TARGET_SSSE3)
39582 count = 7;
39583 return ix86_vec_cost (mode, cost->sse_op * count, true);
39585 else
39586 return ix86_vec_cost (mode, cost->sse_op, true);
39588 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39590 if (constant_op1)
39592 if (op1_val > 32)
39593 return cost->shift_const + COSTS_N_INSNS (2);
39594 else
39595 return cost->shift_const * 2;
39597 else
39599 if (and_in_op1)
39600 return cost->shift_var * 2;
39601 else
39602 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39605 else
39607 if (constant_op1)
39608 return cost->shift_const;
39609 else if (shift_and_truncate)
39611 if (skip_op0)
39612 *skip_op0 = *skip_op1 = true;
39613 /* Return the cost after shift-and truncation. */
39614 return cost->shift_var;
39616 else
39617 return cost->shift_var;
39619 return cost->shift_const;
39622 /* Compute a (partial) cost for rtx X. Return true if the complete
39623 cost has been computed, and false if subexpressions should be
39624 scanned. In either case, *TOTAL contains the cost result. */
39626 static bool
39627 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39628 int *total, bool speed)
39630 rtx mask;
39631 enum rtx_code code = GET_CODE (x);
39632 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39633 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39634 int src_cost;
39636 switch (code)
39638 case SET:
39639 if (register_operand (SET_DEST (x), VOIDmode)
39640 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39642 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39643 return true;
39646 if (register_operand (SET_SRC (x), VOIDmode))
39647 /* Avoid potentially incorrect high cost from rtx_costs
39648 for non-tieable SUBREGs. */
39649 src_cost = 0;
39650 else
39652 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39654 if (CONSTANT_P (SET_SRC (x)))
39655 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39656 a small value, possibly zero for cheap constants. */
39657 src_cost += COSTS_N_INSNS (1);
39660 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39661 return true;
39663 case CONST_INT:
39664 case CONST:
39665 case LABEL_REF:
39666 case SYMBOL_REF:
39667 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39668 *total = 3;
39669 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39670 *total = 2;
39671 else if (flag_pic && SYMBOLIC_CONST (x)
39672 && !(TARGET_64BIT
39673 && (GET_CODE (x) == LABEL_REF
39674 || (GET_CODE (x) == SYMBOL_REF
39675 && SYMBOL_REF_LOCAL_P (x))))
39676 /* Use 0 cost for CONST to improve its propagation. */
39677 && (TARGET_64BIT || GET_CODE (x) != CONST))
39678 *total = 1;
39679 else
39680 *total = 0;
39681 return true;
39683 case CONST_DOUBLE:
39684 if (IS_STACK_MODE (mode))
39685 switch (standard_80387_constant_p (x))
39687 case -1:
39688 case 0:
39689 break;
39690 case 1: /* 0.0 */
39691 *total = 1;
39692 return true;
39693 default: /* Other constants */
39694 *total = 2;
39695 return true;
39697 /* FALLTHRU */
39699 case CONST_VECTOR:
39700 switch (standard_sse_constant_p (x, mode))
39702 case 0:
39703 break;
39704 case 1: /* 0: xor eliminates false dependency */
39705 *total = 0;
39706 return true;
39707 default: /* -1: cmp contains false dependency */
39708 *total = 1;
39709 return true;
39711 /* FALLTHRU */
39713 case CONST_WIDE_INT:
39714 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39715 it'll probably end up. Add a penalty for size. */
39716 *total = (COSTS_N_INSNS (1)
39717 + (!TARGET_64BIT && flag_pic)
39718 + (GET_MODE_SIZE (mode) <= 4
39719 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39720 return true;
39722 case ZERO_EXTEND:
39723 /* The zero extension is often completely free on x86_64, so make
39724 it as cheap as possible. */
39725 if (TARGET_64BIT && mode == DImode
39726 && GET_MODE (XEXP (x, 0)) == SImode)
39727 *total = 1;
39728 else if (TARGET_ZERO_EXTEND_WITH_AND)
39729 *total = cost->add;
39730 else
39731 *total = cost->movzx;
39732 return false;
39734 case SIGN_EXTEND:
39735 *total = cost->movsx;
39736 return false;
39738 case ASHIFT:
39739 if (SCALAR_INT_MODE_P (mode)
39740 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39741 && CONST_INT_P (XEXP (x, 1)))
39743 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39744 if (value == 1)
39746 *total = cost->add;
39747 return false;
39749 if ((value == 2 || value == 3)
39750 && cost->lea <= cost->shift_const)
39752 *total = cost->lea;
39753 return false;
39756 /* FALLTHRU */
39758 case ROTATE:
39759 case ASHIFTRT:
39760 case LSHIFTRT:
39761 case ROTATERT:
39762 bool skip_op0, skip_op1;
39763 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39764 CONST_INT_P (XEXP (x, 1))
39765 ? INTVAL (XEXP (x, 1)) : -1,
39766 speed,
39767 GET_CODE (XEXP (x, 1)) == AND,
39768 SUBREG_P (XEXP (x, 1))
39769 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39770 &skip_op0, &skip_op1);
39771 if (skip_op0 || skip_op1)
39773 if (!skip_op0)
39774 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39775 if (!skip_op1)
39776 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39777 return true;
39779 return false;
39781 case FMA:
39783 rtx sub;
39785 gcc_assert (FLOAT_MODE_P (mode));
39786 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39788 *total = ix86_vec_cost (mode,
39789 mode == SFmode ? cost->fmass : cost->fmasd,
39790 true);
39791 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39793 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39794 sub = XEXP (x, 0);
39795 if (GET_CODE (sub) == NEG)
39796 sub = XEXP (sub, 0);
39797 *total += rtx_cost (sub, mode, FMA, 0, speed);
39799 sub = XEXP (x, 2);
39800 if (GET_CODE (sub) == NEG)
39801 sub = XEXP (sub, 0);
39802 *total += rtx_cost (sub, mode, FMA, 2, speed);
39803 return true;
39806 case MULT:
39807 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39809 rtx op0 = XEXP (x, 0);
39810 rtx op1 = XEXP (x, 1);
39811 int nbits;
39812 if (CONST_INT_P (XEXP (x, 1)))
39814 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39815 for (nbits = 0; value != 0; value &= value - 1)
39816 nbits++;
39818 else
39819 /* This is arbitrary. */
39820 nbits = 7;
39822 /* Compute costs correctly for widening multiplication. */
39823 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39824 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39825 == GET_MODE_SIZE (mode))
39827 int is_mulwiden = 0;
39828 machine_mode inner_mode = GET_MODE (op0);
39830 if (GET_CODE (op0) == GET_CODE (op1))
39831 is_mulwiden = 1, op1 = XEXP (op1, 0);
39832 else if (CONST_INT_P (op1))
39834 if (GET_CODE (op0) == SIGN_EXTEND)
39835 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39836 == INTVAL (op1);
39837 else
39838 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39841 if (is_mulwiden)
39842 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39845 *total = (cost->mult_init[MODE_INDEX (mode)]
39846 + nbits * cost->mult_bit
39847 + rtx_cost (op0, mode, outer_code, opno, speed)
39848 + rtx_cost (op1, mode, outer_code, opno, speed));
39850 return true;
39852 *total = ix86_multiplication_cost (cost, mode);
39853 return false;
39855 case DIV:
39856 case UDIV:
39857 case MOD:
39858 case UMOD:
39859 *total = ix86_division_cost (cost, mode);
39860 return false;
39862 case PLUS:
39863 if (GET_MODE_CLASS (mode) == MODE_INT
39864 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39866 if (GET_CODE (XEXP (x, 0)) == PLUS
39867 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39868 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39869 && CONSTANT_P (XEXP (x, 1)))
39871 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39872 if (val == 2 || val == 4 || val == 8)
39874 *total = cost->lea;
39875 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39876 outer_code, opno, speed);
39877 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39878 outer_code, opno, speed);
39879 *total += rtx_cost (XEXP (x, 1), mode,
39880 outer_code, opno, speed);
39881 return true;
39884 else if (GET_CODE (XEXP (x, 0)) == MULT
39885 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39887 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39888 if (val == 2 || val == 4 || val == 8)
39890 *total = cost->lea;
39891 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39892 outer_code, opno, speed);
39893 *total += rtx_cost (XEXP (x, 1), mode,
39894 outer_code, opno, speed);
39895 return true;
39898 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39900 /* Add with carry, ignore the cost of adding a carry flag. */
39901 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39902 *total = cost->add;
39903 else
39905 *total = cost->lea;
39906 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39907 outer_code, opno, speed);
39910 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39911 outer_code, opno, speed);
39912 *total += rtx_cost (XEXP (x, 1), mode,
39913 outer_code, opno, speed);
39914 return true;
39917 /* FALLTHRU */
39919 case MINUS:
39920 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39921 if (GET_MODE_CLASS (mode) == MODE_INT
39922 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39923 && GET_CODE (XEXP (x, 0)) == MINUS
39924 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39926 *total = cost->add;
39927 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39928 outer_code, opno, speed);
39929 *total += rtx_cost (XEXP (x, 1), mode,
39930 outer_code, opno, speed);
39931 return true;
39934 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39936 *total = cost->addss;
39937 return false;
39939 else if (X87_FLOAT_MODE_P (mode))
39941 *total = cost->fadd;
39942 return false;
39944 else if (FLOAT_MODE_P (mode))
39946 *total = ix86_vec_cost (mode, cost->addss, true);
39947 return false;
39949 /* FALLTHRU */
39951 case AND:
39952 case IOR:
39953 case XOR:
39954 if (GET_MODE_CLASS (mode) == MODE_INT
39955 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39957 *total = (cost->add * 2
39958 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39959 << (GET_MODE (XEXP (x, 0)) != DImode))
39960 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39961 << (GET_MODE (XEXP (x, 1)) != DImode)));
39962 return true;
39964 /* FALLTHRU */
39966 case NEG:
39967 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39969 *total = cost->sse_op;
39970 return false;
39972 else if (X87_FLOAT_MODE_P (mode))
39974 *total = cost->fchs;
39975 return false;
39977 else if (FLOAT_MODE_P (mode))
39979 *total = ix86_vec_cost (mode, cost->sse_op, true);
39980 return false;
39982 /* FALLTHRU */
39984 case NOT:
39985 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39986 *total = ix86_vec_cost (mode, cost->sse_op, true);
39987 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39988 *total = cost->add * 2;
39989 else
39990 *total = cost->add;
39991 return false;
39993 case COMPARE:
39994 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39995 && XEXP (XEXP (x, 0), 1) == const1_rtx
39996 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39997 && XEXP (x, 1) == const0_rtx)
39999 /* This kind of construct is implemented using test[bwl].
40000 Treat it as if we had an AND. */
40001 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40002 *total = (cost->add
40003 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40004 opno, speed)
40005 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40006 return true;
40009 /* The embedded comparison operand is completely free. */
40010 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40011 && XEXP (x, 1) == const0_rtx)
40012 *total = 0;
40014 return false;
40016 case FLOAT_EXTEND:
40017 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40018 *total = 0;
40019 else
40020 *total = ix86_vec_cost (mode, cost->addss, true);
40021 return false;
40023 case FLOAT_TRUNCATE:
40024 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40025 *total = cost->fadd;
40026 else
40027 *total = ix86_vec_cost (mode, cost->addss, true);
40028 return false;
40030 case ABS:
40031 /* SSE requires memory load for the constant operand. It may make
40032 sense to account for this. Of course the constant operand may or
40033 may not be reused. */
40034 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40035 *total = cost->sse_op;
40036 else if (X87_FLOAT_MODE_P (mode))
40037 *total = cost->fabs;
40038 else if (FLOAT_MODE_P (mode))
40039 *total = ix86_vec_cost (mode, cost->sse_op, true);
40040 return false;
40042 case SQRT:
40043 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40044 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40045 else if (X87_FLOAT_MODE_P (mode))
40046 *total = cost->fsqrt;
40047 else if (FLOAT_MODE_P (mode))
40048 *total = ix86_vec_cost (mode,
40049 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40050 true);
40051 return false;
40053 case UNSPEC:
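/* Reading the thread pointer (UNSPEC_TP) is treated as free.  */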
40054 if (XINT (x, 1) == UNSPEC_TP)
40055 *total = 0;
40056 return false;
40058 case VEC_SELECT:
40059 case VEC_CONCAT:
40060 case VEC_DUPLICATE:
40061 /* ??? Assume all of these vector manipulation patterns are
40062 recognizable. In which case they all pretty much have the
40063 same cost. */
40064 *total = cost->sse_op;
40065 return true;
40066 case VEC_MERGE:
40067 mask = XEXP (x, 2);
40068 /* This is a masked instruction; assume the same cost
40069 as the non-masked variant.  */
40070 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40071 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40072 else
40073 *total = cost->sse_op;
40074 return true;
40076 default:
40077 return false;
40081 #if TARGET_MACHO
40083 static int current_machopic_label_num;
40085 /* Given a symbol name and its associated stub, write out the
40086 definition of the stub. */
40088 void
40089 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40091 unsigned int length;
40092 char *binder_name, *symbol_name, lazy_ptr_name[32];
40093 int label = ++current_machopic_label_num;
40095 /* For 64-bit we shouldn't get here. */
40096 gcc_assert (!TARGET_64BIT);
40098 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40099 symb = targetm.strip_name_encoding (symb);
40101 length = strlen (stub);
40102 binder_name = XALLOCAVEC (char, length + 32);
40103 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40105 length = strlen (symb);
40106 symbol_name = XALLOCAVEC (char, length + 32);
40107 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40109 sprintf (lazy_ptr_name, "L%d$lz", label);
40111 if (MACHOPIC_ATT_STUB)
40112 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40113 else if (MACHOPIC_PURE)
40114 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40115 else
40116 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40118 fprintf (file, "%s:\n", stub);
40119 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40121 if (MACHOPIC_ATT_STUB)
40123 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40125 else if (MACHOPIC_PURE)
40127 /* PIC stub. */
40128 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40129 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40130 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40131 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40132 label, lazy_ptr_name, label);
40133 fprintf (file, "\tjmp\t*%%ecx\n");
40135 else
40136 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40138 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40139 it needs no stub-binding-helper. */
40140 if (MACHOPIC_ATT_STUB)
40141 return;
40143 fprintf (file, "%s:\n", binder_name);
40145 if (MACHOPIC_PURE)
40147 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40148 fprintf (file, "\tpushl\t%%ecx\n");
40150 else
40151 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40153 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40155 /* N.B. Keep the correspondence of these
40156 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40157 old-pic/new-pic/non-pic stubs; altering this will break
40158 compatibility with existing dylibs. */
40159 if (MACHOPIC_PURE)
40161 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40162 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40164 else
40165 /* 16-byte -mdynamic-no-pic stub. */
40166 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40168 fprintf (file, "%s:\n", lazy_ptr_name);
40169 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40170 fprintf (file, ASM_LONG "%s\n", binder_name);
40172 #endif /* TARGET_MACHO */
40174 /* Order the registers for register allocator. */
40176 void
40177 x86_order_regs_for_local_alloc (void)
40179 int pos = 0;
40180 int i;
40182 /* First allocate the local general purpose registers. */
40183 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40184 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40185 reg_alloc_order [pos++] = i;
40187 /* Global general purpose registers. */
40188 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40189 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40190 reg_alloc_order [pos++] = i;
40192 /* x87 registers come first in case we are doing FP math
40193 using them. */
40194 if (!TARGET_SSE_MATH)
40195 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40196 reg_alloc_order [pos++] = i;
40198 /* SSE registers. */
40199 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40200 reg_alloc_order [pos++] = i;
40201 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40202 reg_alloc_order [pos++] = i;
40204 /* Extended REX SSE registers. */
40205 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40206 reg_alloc_order [pos++] = i;
40208 /* Mask register. */
40209 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40210 reg_alloc_order [pos++] = i;
40212 /* MPX bound registers. */
40213 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40214 reg_alloc_order [pos++] = i;
40216 /* x87 registers. */
40217 if (TARGET_SSE_MATH)
40218 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40219 reg_alloc_order [pos++] = i;
40221 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40222 reg_alloc_order [pos++] = i;
40224 /* Initialize the rest of the array, as we do not allocate some registers
40225 at all.  */
40226 while (pos < FIRST_PSEUDO_REGISTER)
40227 reg_alloc_order [pos++] = 0;
40230 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40231 in struct attribute_spec handler. */
40232 static tree
40233 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40234 tree args,
40235 int,
40236 bool *no_add_attrs)
40238 if (TREE_CODE (*node) != FUNCTION_TYPE
40239 && TREE_CODE (*node) != METHOD_TYPE
40240 && TREE_CODE (*node) != FIELD_DECL
40241 && TREE_CODE (*node) != TYPE_DECL)
40243 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40244 name);
40245 *no_add_attrs = true;
40246 return NULL_TREE;
40248 if (TARGET_64BIT)
40250 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40251 name);
40252 *no_add_attrs = true;
40253 return NULL_TREE;
40255 if (is_attribute_p ("callee_pop_aggregate_return", name))
40257 tree cst;
40259 cst = TREE_VALUE (args);
40260 if (TREE_CODE (cst) != INTEGER_CST)
40262 warning (OPT_Wattributes,
40263 "%qE attribute requires an integer constant argument",
40264 name);
40265 *no_add_attrs = true;
40267 else if (compare_tree_int (cst, 0) != 0
40268 && compare_tree_int (cst, 1) != 0)
40270 warning (OPT_Wattributes,
40271 "argument to %qE attribute is neither zero, nor one",
40272 name);
40273 *no_add_attrs = true;
40276 return NULL_TREE;
40279 return NULL_TREE;
40282 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40283 struct attribute_spec.handler. */
40284 static tree
40285 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40286 bool *no_add_attrs)
40288 if (TREE_CODE (*node) != FUNCTION_TYPE
40289 && TREE_CODE (*node) != METHOD_TYPE
40290 && TREE_CODE (*node) != FIELD_DECL
40291 && TREE_CODE (*node) != TYPE_DECL)
40293 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40294 name);
40295 *no_add_attrs = true;
40296 return NULL_TREE;
40299 /* Can combine regparm with all attributes but fastcall. */
40300 if (is_attribute_p ("ms_abi", name))
40302 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40304 error ("ms_abi and sysv_abi attributes are not compatible");
40307 return NULL_TREE;
40309 else if (is_attribute_p ("sysv_abi", name))
40311 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40313 error ("ms_abi and sysv_abi attributes are not compatible");
40316 return NULL_TREE;
40319 return NULL_TREE;
40322 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40323 struct attribute_spec.handler. */
40324 static tree
40325 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40326 bool *no_add_attrs)
40328 tree *type = NULL;
40329 if (DECL_P (*node))
40331 if (TREE_CODE (*node) == TYPE_DECL)
40332 type = &TREE_TYPE (*node);
40334 else
40335 type = node;
40337 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40339 warning (OPT_Wattributes, "%qE attribute ignored",
40340 name);
40341 *no_add_attrs = true;
40344 else if ((is_attribute_p ("ms_struct", name)
40345 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40346 || ((is_attribute_p ("gcc_struct", name)
40347 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40349 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40350 name);
40351 *no_add_attrs = true;
40354 return NULL_TREE;
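/* Handle attributes that may only be applied to function declarations;
   warn and drop the attribute when the decl is not a FUNCTION_DECL.  */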
40357 static tree
40358 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40359 bool *no_add_attrs)
40361 if (TREE_CODE (*node) != FUNCTION_DECL)
40363 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40364 name);
40365 *no_add_attrs = true;
40367 return NULL_TREE;
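/* Handle a "no_caller_saved_registers" attribute; it takes no arguments
   and needs no additional checking, so simply accept it.  */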
40370 static tree
40371 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40372 int, bool *)
40374 return NULL_TREE;
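/* Handle an "interrupt" attribute: check that the function takes a pointer
   as its first argument, an optional word-sized unsigned integer as its
   second argument, and returns void.  */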
40377 static tree
40378 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40380 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40381 but the function type contains args and return type data. */
40382 tree func_type = *node;
40383 tree return_type = TREE_TYPE (func_type);
40385 int nargs = 0;
40386 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40387 while (current_arg_type
40388 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40390 if (nargs == 0)
40392 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40393 error ("interrupt service routine should have a pointer "
40394 "as the first argument");
40396 else if (nargs == 1)
40398 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40399 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40400 error ("interrupt service routine should have unsigned %s"
40401 "int as the second argument",
40402 TARGET_64BIT
40403 ? (TARGET_X32 ? "long long " : "long ")
40404 : "");
40406 nargs++;
40407 current_arg_type = TREE_CHAIN (current_arg_type);
40409 if (!nargs || nargs > 2)
40410 error ("interrupt service routine can only have a pointer argument "
40411 "and an optional integer argument");
40412 if (! VOID_TYPE_P (return_type))
40413 error ("interrupt service routine can't have non-void return value");
40415 return NULL_TREE;
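/* Return true if RECORD_TYPE should use the Microsoft bitfield layout,
   either because -mms-bitfields is in effect (and the type is not marked
   gcc_struct) or because the type carries the ms_struct attribute.  */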
40418 static bool
40419 ix86_ms_bitfield_layout_p (const_tree record_type)
40421 return ((TARGET_MS_BITFIELD_LAYOUT
40422 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40423 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40426 /* Returns an expression indicating where the this parameter is
40427 located on entry to the FUNCTION. */
40429 static rtx
40430 x86_this_parameter (tree function)
40432 tree type = TREE_TYPE (function);
40433 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40434 int nregs;
40436 if (TARGET_64BIT)
40438 const int *parm_regs;
40440 if (ix86_function_type_abi (type) == MS_ABI)
40441 parm_regs = x86_64_ms_abi_int_parameter_registers;
40442 else
40443 parm_regs = x86_64_int_parameter_registers;
40444 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40447 nregs = ix86_function_regparm (type, function);
40449 if (nregs > 0 && !stdarg_p (type))
40451 int regno;
40452 unsigned int ccvt = ix86_get_callcvt (type);
40454 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40455 regno = aggr ? DX_REG : CX_REG;
40456 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40458 regno = CX_REG;
40459 if (aggr)
40460 return gen_rtx_MEM (SImode,
40461 plus_constant (Pmode, stack_pointer_rtx, 4));
40463 else
40465 regno = AX_REG;
40466 if (aggr)
40468 regno = DX_REG;
40469 if (nregs == 1)
40470 return gen_rtx_MEM (SImode,
40471 plus_constant (Pmode,
40472 stack_pointer_rtx, 4));
40475 return gen_rtx_REG (SImode, regno);
40478 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40479 aggr ? 8 : 4));
40482 /* Determine whether x86_output_mi_thunk can succeed. */
40484 static bool
40485 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40486 const_tree function)
40488 /* 64-bit can handle anything. */
40489 if (TARGET_64BIT)
40490 return true;
40492 /* For 32-bit, everything's fine if we have one free register. */
40493 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40494 return true;
40496 /* Need a free register for vcall_offset. */
40497 if (vcall_offset)
40498 return false;
40500 /* Need a free register for GOT references. */
40501 if (flag_pic && !targetm.binds_local_p (function))
40502 return false;
40504 /* Otherwise ok. */
40505 return true;
40508 /* Output the assembler code for a thunk function. THUNK_DECL is the
40509 declaration for the thunk function itself, FUNCTION is the decl for
40510 the target function. DELTA is an immediate constant offset to be
40511 added to THIS. If VCALL_OFFSET is nonzero, the word at
40512 *(*this + vcall_offset) should be added to THIS. */
40514 static void
40515 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40516 HOST_WIDE_INT vcall_offset, tree function)
40518 rtx this_param = x86_this_parameter (function);
40519 rtx this_reg, tmp, fnaddr;
40520 unsigned int tmp_regno;
40521 rtx_insn *insn;
40523 if (TARGET_64BIT)
40524 tmp_regno = R10_REG;
40525 else
40527 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40528 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40529 tmp_regno = AX_REG;
40530 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40531 tmp_regno = DX_REG;
40532 else
40533 tmp_regno = CX_REG;
40536 emit_note (NOTE_INSN_PROLOGUE_END);
40538 /* If CET is enabled, insert an ENDBR instruction.  */
40539 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40540 emit_insn (gen_nop_endbr ());
40542 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40543 pull it in now and let DELTA benefit. */
40544 if (REG_P (this_param))
40545 this_reg = this_param;
40546 else if (vcall_offset)
40548 /* Put the this parameter into %eax. */
40549 this_reg = gen_rtx_REG (Pmode, AX_REG);
40550 emit_move_insn (this_reg, this_param);
40552 else
40553 this_reg = NULL_RTX;
40555 /* Adjust the this parameter by a fixed constant. */
40556 if (delta)
40558 rtx delta_rtx = GEN_INT (delta);
40559 rtx delta_dst = this_reg ? this_reg : this_param;
40561 if (TARGET_64BIT)
40563 if (!x86_64_general_operand (delta_rtx, Pmode))
40565 tmp = gen_rtx_REG (Pmode, tmp_regno);
40566 emit_move_insn (tmp, delta_rtx);
40567 delta_rtx = tmp;
40571 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40574 /* Adjust the this parameter by a value stored in the vtable. */
40575 if (vcall_offset)
40577 rtx vcall_addr, vcall_mem, this_mem;
40579 tmp = gen_rtx_REG (Pmode, tmp_regno);
40581 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40582 if (Pmode != ptr_mode)
40583 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40584 emit_move_insn (tmp, this_mem);
40586 /* Adjust the this parameter. */
40587 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40588 if (TARGET_64BIT
40589 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40591 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40592 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40593 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40596 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40597 if (Pmode != ptr_mode)
40598 emit_insn (gen_addsi_1_zext (this_reg,
40599 gen_rtx_REG (ptr_mode,
40600 REGNO (this_reg)),
40601 vcall_mem));
40602 else
40603 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40606 /* If necessary, drop THIS back to its stack slot. */
40607 if (this_reg && this_reg != this_param)
40608 emit_move_insn (this_param, this_reg);
40610 fnaddr = XEXP (DECL_RTL (function), 0);
40611 if (TARGET_64BIT)
40613 if (!flag_pic || targetm.binds_local_p (function)
40614 || TARGET_PECOFF)
40616 else
40618 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40619 tmp = gen_rtx_CONST (Pmode, tmp);
40620 fnaddr = gen_const_mem (Pmode, tmp);
40623 else
40625 if (!flag_pic || targetm.binds_local_p (function))
40627 #if TARGET_MACHO
40628 else if (TARGET_MACHO)
40630 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40631 fnaddr = XEXP (fnaddr, 0);
40633 #endif /* TARGET_MACHO */
40634 else
40636 tmp = gen_rtx_REG (Pmode, CX_REG);
40637 output_set_got (tmp, NULL_RTX);
40639 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40640 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40641 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40642 fnaddr = gen_const_mem (Pmode, fnaddr);
40646 /* Our sibling call patterns do not allow memories, because we have no
40647 predicate that can distinguish between frame and non-frame memory.
40648 For our purposes here, we can get away with (ab)using a jump pattern,
40649 because we're going to do no optimization. */
40650 if (MEM_P (fnaddr))
40652 if (sibcall_insn_operand (fnaddr, word_mode))
40654 fnaddr = XEXP (DECL_RTL (function), 0);
40655 tmp = gen_rtx_MEM (QImode, fnaddr);
40656 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40657 tmp = emit_call_insn (tmp);
40658 SIBLING_CALL_P (tmp) = 1;
40660 else
40661 emit_jump_insn (gen_indirect_jump (fnaddr));
40663 else
40665 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40667 // CM_LARGE_PIC always uses pseudo PIC register which is
40668 // uninitialized. Since FUNCTION is local and calling it
40669 // doesn't go through PLT, we use scratch register %r11 as
40670 // PIC register and initialize it here.
40671 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40672 ix86_init_large_pic_reg (tmp_regno);
40673 fnaddr = legitimize_pic_address (fnaddr,
40674 gen_rtx_REG (Pmode, tmp_regno));
40677 if (!sibcall_insn_operand (fnaddr, word_mode))
40679 tmp = gen_rtx_REG (word_mode, tmp_regno);
40680 if (GET_MODE (fnaddr) != word_mode)
40681 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40682 emit_move_insn (tmp, fnaddr);
40683 fnaddr = tmp;
40686 tmp = gen_rtx_MEM (QImode, fnaddr);
40687 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40688 tmp = emit_call_insn (tmp);
40689 SIBLING_CALL_P (tmp) = 1;
40691 emit_barrier ();
40693 /* Emit just enough of rest_of_compilation to get the insns emitted.
40694 Note that use_thunk calls assemble_start_function et al. */
40695 insn = get_insns ();
40696 shorten_branches (insn);
40697 final_start_function (insn, file, 1);
40698 final (insn, file, 1);
40699 final_end_function ();
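/* Emit the target-specific directives at the start of the assembly file.  */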
40702 static void
40703 x86_file_start (void)
40705 default_file_start ();
40706 if (TARGET_16BIT)
40707 fputs ("\t.code16gcc\n", asm_out_file);
40708 #if TARGET_MACHO
40709 darwin_file_start ();
40710 #endif
40711 if (X86_FILE_START_VERSION_DIRECTIVE)
40712 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40713 if (X86_FILE_START_FLTUSED)
40714 fputs ("\t.global\t__fltused\n", asm_out_file);
40715 if (ix86_asm_dialect == ASM_INTEL)
40716 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
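/* Compute the alignment to use for a field of TYPE, given the alignment
   COMPUTED so far; on 32-bit targets without -malign-double, cap integer,
   double and complex fields at 32 bits.  */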
40719 int
40720 x86_field_alignment (tree type, int computed)
40722 machine_mode mode;
40724 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40725 return computed;
40726 if (TARGET_IAMCU)
40727 return iamcu_alignment (type, computed);
40728 mode = TYPE_MODE (strip_array_types (type));
40729 if (mode == DFmode || mode == DCmode
40730 || GET_MODE_CLASS (mode) == MODE_INT
40731 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40732 return MIN (32, computed);
40733 return computed;
40736 /* Print call to TARGET to FILE. */
40738 static void
40739 x86_print_call_or_nop (FILE *file, const char *target)
40741 if (flag_nop_mcount)
40742 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40743 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40744 else
40745 fprintf (file, "1:\tcall\t%s\n", target);
40748 /* Output assembler code to FILE to increment profiler label # LABELNO
40749 for profiling a function entry. */
40750 void
40751 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40753 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40754 : MCOUNT_NAME);
40755 if (TARGET_64BIT)
40757 #ifndef NO_PROFILE_COUNTERS
40758 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40759 #endif
40761 if (!TARGET_PECOFF && flag_pic)
40762 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40763 else
40764 x86_print_call_or_nop (file, mcount_name);
40766 else if (flag_pic)
40768 #ifndef NO_PROFILE_COUNTERS
40769 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40770 LPREFIX, labelno);
40771 #endif
40772 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40774 else
40776 #ifndef NO_PROFILE_COUNTERS
40777 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40778 LPREFIX, labelno);
40779 #endif
40780 x86_print_call_or_nop (file, mcount_name);
40783 if (flag_record_mcount)
40785 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40786 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40787 fprintf (file, "\t.previous\n");
40791 /* We don't have exact information about the insn sizes, but we may assume
40792 quite safely that we are informed about all 1 byte insns and memory
40793 address sizes. This is enough to eliminate unnecessary padding in
40794 99% of cases. */
40796 static int
40797 ix86_min_insn_size (rtx_insn *insn)
40799 int l = 0, len;
40801 if (!INSN_P (insn) || !active_insn_p (insn))
40802 return 0;
40804 /* Discard alignments we've emitted and jump instructions.  */
40805 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40806 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40807 return 0;
40809 /* Important case - calls are always 5 bytes.
40810 It is common to have many calls in a row.  */
40811 if (CALL_P (insn)
40812 && symbolic_reference_mentioned_p (PATTERN (insn))
40813 && !SIBLING_CALL_P (insn))
40814 return 5;
40815 len = get_attr_length (insn);
40816 if (len <= 1)
40817 return 1;
40819 /* For normal instructions we rely on get_attr_length being exact,
40820 with a few exceptions. */
40821 if (!JUMP_P (insn))
40823 enum attr_type type = get_attr_type (insn);
40825 switch (type)
40827 case TYPE_MULTI:
40828 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40829 || asm_noperands (PATTERN (insn)) >= 0)
40830 return 0;
40831 break;
40832 case TYPE_OTHER:
40833 case TYPE_FCMP:
40834 break;
40835 default:
40836 /* Otherwise trust get_attr_length. */
40837 return len;
40840 l = get_attr_length_address (insn);
40841 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40842 l = 4;
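/* Assume one opcode byte on top of the estimated address length; with no
   known address bytes, assume a 2 byte instruction.  */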
40844 if (l)
40845 return 1+l;
40846 else
40847 return 2;
40850 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40852 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40853 16 byte window.  */
40855 static void
40856 ix86_avoid_jump_mispredicts (void)
40858 rtx_insn *insn, *start = get_insns ();
40859 int nbytes = 0, njumps = 0;
40860 bool isjump = false;
40862 /* Look for all minimal intervals of instructions containing 4 jumps.
40863 The intervals are bounded by START and INSN. NBYTES is the total
40864 size of instructions in the interval including INSN and not including
40865 START. When the NBYTES is smaller than 16 bytes, it is possible
40866 that the end of START and INSN ends up in the same 16byte page.
40868 The smallest offset in the page INSN can start is the case where START
40869 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
40870 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
40872 Don't consider asm goto as jump, while it can contain a jump, it doesn't
40873 have to, control transfer to label(s) can be performed through other
40874 means, and also we estimate minimum length of all asm stmts as 0. */
40875 for (insn = start; insn; insn = NEXT_INSN (insn))
40877 int min_size;
40879 if (LABEL_P (insn))
40881 int align = label_to_alignment (insn);
40882 int max_skip = label_to_max_skip (insn);
40884 if (max_skip > 15)
40885 max_skip = 15;
40886 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40887 already in the current 16 byte page, because otherwise
40888 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40889 bytes to reach 16 byte boundary. */
40890 if (align <= 0
40891 || (align <= 3 && max_skip != (1 << align) - 1))
40892 max_skip = 0;
40893 if (dump_file)
40894 fprintf (dump_file, "Label %i with max_skip %i\n",
40895 INSN_UID (insn), max_skip);
40896 if (max_skip)
40898 while (nbytes + max_skip >= 16)
40900 start = NEXT_INSN (start);
40901 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40902 || CALL_P (start))
40903 njumps--, isjump = true;
40904 else
40905 isjump = false;
40906 nbytes -= ix86_min_insn_size (start);
40909 continue;
40912 min_size = ix86_min_insn_size (insn);
40913 nbytes += min_size;
40914 if (dump_file)
40915 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40916 INSN_UID (insn), min_size);
40917 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40918 || CALL_P (insn))
40919 njumps++;
40920 else
40921 continue;
40923 while (njumps > 3)
40925 start = NEXT_INSN (start);
40926 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40927 || CALL_P (start))
40928 njumps--, isjump = true;
40929 else
40930 isjump = false;
40931 nbytes -= ix86_min_insn_size (start);
40933 gcc_assert (njumps >= 0);
40934 if (dump_file)
40935 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40936 INSN_UID (start), INSN_UID (insn), nbytes);
40938 if (njumps == 3 && isjump && nbytes < 16)
40940 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40942 if (dump_file)
40943 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40944 INSN_UID (insn), padsize);
40945 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40949 #endif
40951 /* AMD Athlon works faster
40952 when RET is not the destination of a conditional jump or directly preceded
40953 by another jump instruction.  We avoid the penalty by inserting a NOP just
40954 before the RET instructions in such cases.  */
40955 static void
40956 ix86_pad_returns (void)
40958 edge e;
40959 edge_iterator ei;
40961 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40963 basic_block bb = e->src;
40964 rtx_insn *ret = BB_END (bb);
40965 rtx_insn *prev;
40966 bool replace = false;
40968 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40969 || optimize_bb_for_size_p (bb))
40970 continue;
40971 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40972 if (active_insn_p (prev) || LABEL_P (prev))
40973 break;
40974 if (prev && LABEL_P (prev))
40976 edge e;
40977 edge_iterator ei;
40979 FOR_EACH_EDGE (e, ei, bb->preds)
40980 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40981 && !(e->flags & EDGE_FALLTHRU))
40983 replace = true;
40984 break;
40987 if (!replace)
40989 prev = prev_active_insn (ret);
40990 if (prev
40991 && ((JUMP_P (prev) && any_condjump_p (prev))
40992 || CALL_P (prev)))
40993 replace = true;
40994 /* Empty functions get a branch mispredict even when
40995 the jump destination is not visible to us.  */
40996 if (!prev && !optimize_function_for_size_p (cfun))
40997 replace = true;
40999 if (replace)
41001 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41002 delete_insn (ret);
41007 /* Count the minimum number of instructions in BB. Return 4 if the
41008 number of instructions >= 4. */
41010 static int
41011 ix86_count_insn_bb (basic_block bb)
41013 rtx_insn *insn;
41014 int insn_count = 0;
41016 /* Count number of instructions in this block. Return 4 if the number
41017 of instructions >= 4. */
41018 FOR_BB_INSNS (bb, insn)
41020 /* This only happens in exit blocks.  */
41021 if (JUMP_P (insn)
41022 && ANY_RETURN_P (PATTERN (insn)))
41023 break;
41025 if (NONDEBUG_INSN_P (insn)
41026 && GET_CODE (PATTERN (insn)) != USE
41027 && GET_CODE (PATTERN (insn)) != CLOBBER)
41029 insn_count++;
41030 if (insn_count >= 4)
41031 return insn_count;
41035 return insn_count;
41039 /* Count the minimum number of instructions in code path in BB.
41040 Return 4 if the number of instructions >= 4. */
41042 static int
41043 ix86_count_insn (basic_block bb)
41045 edge e;
41046 edge_iterator ei;
41047 int min_prev_count;
41049 /* Only bother counting instructions along paths with no
41050 more than 2 basic blocks between entry and exit. Given
41051 that BB has an edge to exit, determine if a predecessor
41052 of BB has an edge from entry. If so, compute the number
41053 of instructions in the predecessor block. If there
41054 happen to be multiple such blocks, compute the minimum. */
41055 min_prev_count = 4;
41056 FOR_EACH_EDGE (e, ei, bb->preds)
41058 edge prev_e;
41059 edge_iterator prev_ei;
41061 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41063 min_prev_count = 0;
41064 break;
41066 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41068 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41070 int count = ix86_count_insn_bb (e->src);
41071 if (count < min_prev_count)
41072 min_prev_count = count;
41073 break;
41078 if (min_prev_count < 4)
41079 min_prev_count += ix86_count_insn_bb (bb);
41081 return min_prev_count;
41084 /* Pad short function to 4 instructions. */
41086 static void
41087 ix86_pad_short_function (void)
41089 edge e;
41090 edge_iterator ei;
41092 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41094 rtx_insn *ret = BB_END (e->src);
41095 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41097 int insn_count = ix86_count_insn (e->src);
41099 /* Pad short function. */
41100 if (insn_count < 4)
41102 rtx_insn *insn = ret;
41104 /* Find epilogue. */
41105 while (insn
41106 && (!NOTE_P (insn)
41107 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41108 insn = PREV_INSN (insn);
41110 if (!insn)
41111 insn = ret;
41113 /* Two NOPs count as one instruction. */
41114 insn_count = 2 * (4 - insn_count);
41115 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41121 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41122 the epilogue, the Windows system unwinder will apply epilogue logic and
41123 produce incorrect offsets. This can be avoided by adding a nop between
41124 the last insn that can throw and the first insn of the epilogue. */
41126 static void
41127 ix86_seh_fixup_eh_fallthru (void)
41129 edge e;
41130 edge_iterator ei;
41132 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41134 rtx_insn *insn, *next;
41136 /* Find the beginning of the epilogue. */
41137 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41138 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41139 break;
41140 if (insn == NULL)
41141 continue;
41143 /* We only care about preceding insns that can throw. */
41144 insn = prev_active_insn (insn);
41145 if (insn == NULL || !can_throw_internal (insn))
41146 continue;
41148 /* Do not separate calls from their debug information. */
41149 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41150 if (NOTE_P (next)
41151 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41152 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41153 insn = next;
41154 else
41155 break;
41157 emit_insn_after (gen_nops (const1_rtx), insn);
41161 /* Given a register number BASE, the lowest of a group of registers, update
41162 regsets IN and OUT with the registers that should be avoided in input
41163 and output operands respectively when trying to avoid generating a modr/m
41164 byte for -fmitigate-rop. */
41166 static void
41167 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41169 SET_HARD_REG_BIT (out, base);
41170 SET_HARD_REG_BIT (out, base + 1);
41171 SET_HARD_REG_BIT (in, base + 2);
41172 SET_HARD_REG_BIT (in, base + 3);
41175 /* Called if -fmitigate_rop is in effect. Try to rewrite instructions so
41176 that certain encodings of modr/m bytes do not occur. */
41177 static void
41178 ix86_mitigate_rop (void)
41180 HARD_REG_SET input_risky;
41181 HARD_REG_SET output_risky;
41182 HARD_REG_SET inout_risky;
41184 CLEAR_HARD_REG_SET (output_risky);
41185 CLEAR_HARD_REG_SET (input_risky);
41186 SET_HARD_REG_BIT (output_risky, AX_REG);
41187 SET_HARD_REG_BIT (output_risky, CX_REG);
41188 SET_HARD_REG_BIT (input_risky, BX_REG);
41189 SET_HARD_REG_BIT (input_risky, DX_REG);
41190 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41191 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41192 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41193 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41194 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41195 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41196 COPY_HARD_REG_SET (inout_risky, input_risky);
41197 IOR_HARD_REG_SET (inout_risky, output_risky);
41199 df_note_add_problem ();
41200 /* Fix up what stack-regs did. */
41201 df_insn_rescan_all ();
41202 df_analyze ();
41204 regrename_init (true);
41205 regrename_analyze (NULL);
41207 auto_vec<du_head_p> cands;
41209 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41211 if (!NONDEBUG_INSN_P (insn))
41212 continue;
41214 if (GET_CODE (PATTERN (insn)) == USE
41215 || GET_CODE (PATTERN (insn)) == CLOBBER)
41216 continue;
41218 extract_insn (insn);
41220 int opno0, opno1;
41221 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41222 recog_data.n_operands, &opno0,
41223 &opno1);
41225 if (!ix86_rop_should_change_byte_p (modrm))
41226 continue;
41228 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41230 /* This happens when regrename has to fail a block. */
41231 if (!info->op_info)
41232 continue;
41234 if (info->op_info[opno0].n_chains != 0)
41236 gcc_assert (info->op_info[opno0].n_chains == 1);
41237 du_head_p op0c;
41238 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41239 if (op0c->target_data_1 + op0c->target_data_2 == 0
41240 && !op0c->cannot_rename)
41241 cands.safe_push (op0c);
41243 op0c->target_data_1++;
41245 if (info->op_info[opno1].n_chains != 0)
41247 gcc_assert (info->op_info[opno1].n_chains == 1);
41248 du_head_p op1c;
41249 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41250 if (op1c->target_data_1 + op1c->target_data_2 == 0
41251 && !op1c->cannot_rename)
41252 cands.safe_push (op1c);
41254 op1c->target_data_2++;
41258 int i;
41259 du_head_p head;
41260 FOR_EACH_VEC_ELT (cands, i, head)
41262 int old_reg, best_reg;
41263 HARD_REG_SET unavailable;
41265 CLEAR_HARD_REG_SET (unavailable);
41266 if (head->target_data_1)
41267 IOR_HARD_REG_SET (unavailable, output_risky);
41268 if (head->target_data_2)
41269 IOR_HARD_REG_SET (unavailable, input_risky);
41271 int n_uses;
41272 reg_class superclass = regrename_find_superclass (head, &n_uses,
41273 &unavailable);
41274 old_reg = head->regno;
41275 best_reg = find_rename_reg (head, superclass, &unavailable,
41276 old_reg, false);
41277 bool ok = regrename_do_replace (head, best_reg);
41278 gcc_assert (ok);
41279 if (dump_file)
41280 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41281 reg_names[best_reg], reg_class_names[superclass]);
41285 regrename_finish ();
41287 df_analyze ();
41289 basic_block bb;
41290 regset_head live;
41292 INIT_REG_SET (&live);
41294 FOR_EACH_BB_FN (bb, cfun)
41296 rtx_insn *insn;
41298 COPY_REG_SET (&live, DF_LR_OUT (bb));
41299 df_simulate_initialize_backwards (bb, &live);
41301 FOR_BB_INSNS_REVERSE (bb, insn)
41303 if (!NONDEBUG_INSN_P (insn))
41304 continue;
41306 df_simulate_one_insn_backwards (bb, insn, &live);
41308 if (GET_CODE (PATTERN (insn)) == USE
41309 || GET_CODE (PATTERN (insn)) == CLOBBER)
41310 continue;
41312 extract_insn (insn);
41313 constrain_operands_cached (insn, reload_completed);
41314 int opno0, opno1;
41315 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41316 recog_data.n_operands, &opno0,
41317 &opno1);
41318 if (modrm < 0
41319 || !ix86_rop_should_change_byte_p (modrm)
41320 || opno0 == opno1)
41321 continue;
41323 rtx oldreg = recog_data.operand[opno1];
41324 preprocess_constraints (insn);
41325 const operand_alternative *alt = which_op_alt ();
41327 int i;
41328 for (i = 0; i < recog_data.n_operands; i++)
41329 if (i != opno1
41330 && alt[i].earlyclobber
41331 && reg_overlap_mentioned_p (recog_data.operand[i],
41332 oldreg))
41333 break;
41335 if (i < recog_data.n_operands)
41336 continue;
41338 if (dump_file)
41339 fprintf (dump_file,
41340 "attempting to fix modrm byte in insn %d:"
41341 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41342 reg_class_names[alt[opno1].cl]);
41344 HARD_REG_SET unavailable;
41345 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41346 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41347 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41348 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41349 IOR_HARD_REG_SET (unavailable, output_risky);
41350 IOR_COMPL_HARD_REG_SET (unavailable,
41351 reg_class_contents[alt[opno1].cl]);
41353 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41354 if (!TEST_HARD_REG_BIT (unavailable, i))
41355 break;
41356 if (i == FIRST_PSEUDO_REGISTER)
41358 if (dump_file)
41359 fprintf (dump_file, ", none available\n");
41360 continue;
41362 if (dump_file)
41363 fprintf (dump_file, " -> %d\n", i);
41364 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41365 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41366 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41371 /* Implement machine specific optimizations.  We implement padding of returns
41372 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window.  */
41373 static void
41374 ix86_reorg (void)
41376 /* We are freeing block_for_insn in the toplev to keep compatibility
41377 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41378 compute_bb_for_insn ();
41380 if (flag_mitigate_rop)
41381 ix86_mitigate_rop ();
41383 if (TARGET_SEH && current_function_has_exception_handlers ())
41384 ix86_seh_fixup_eh_fallthru ();
41386 if (optimize && optimize_function_for_speed_p (cfun))
41388 if (TARGET_PAD_SHORT_FUNCTION)
41389 ix86_pad_short_function ();
41390 else if (TARGET_PAD_RETURNS)
41391 ix86_pad_returns ();
41392 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41393 if (TARGET_FOUR_JUMP_LIMIT)
41394 ix86_avoid_jump_mispredicts ();
41395 #endif
41399 /* Return nonzero when a QImode register that must be represented via a REX
41400 prefix is used.  */
41401 bool
41402 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41404 int i;
41405 extract_insn_cached (insn);
41406 for (i = 0; i < recog_data.n_operands; i++)
41407 if (GENERAL_REG_P (recog_data.operand[i])
41408 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41409 return true;
41410 return false;
41413 /* Return true when INSN mentions a register that must be encoded using a REX
41414 prefix.  */
41415 bool
41416 x86_extended_reg_mentioned_p (rtx insn)
41418 subrtx_iterator::array_type array;
41419 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41421 const_rtx x = *iter;
41422 if (REG_P (x)
41423 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41424 return true;
41426 return false;
41429 /* If profitable, negate (without causing overflow) integer constant
41430 of mode MODE at location LOC. Return true in this case. */
41431 bool
41432 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41434 HOST_WIDE_INT val;
41436 if (!CONST_INT_P (*loc))
41437 return false;
41439 switch (mode)
41441 case E_DImode:
41442 /* DImode x86_64 constants must fit in 32 bits. */
41443 gcc_assert (x86_64_immediate_operand (*loc, mode));
41445 mode = SImode;
41446 break;
41448 case E_SImode:
41449 case E_HImode:
41450 case E_QImode:
41451 break;
41453 default:
41454 gcc_unreachable ();
41457 /* Avoid overflows. */
41458 if (mode_signbit_p (mode, *loc))
41459 return false;
41461 val = INTVAL (*loc);
41463 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41464 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
41465 if ((val < 0 && val != -128)
41466 || val == 128)
41468 *loc = GEN_INT (-val);
41469 return true;
41472 return false;
41475 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41476 optabs would emit if we didn't have TFmode patterns. */
41478 void
41479 x86_emit_floatuns (rtx operands[2])
41481 rtx_code_label *neglab, *donelab;
41482 rtx i0, i1, f0, in, out;
41483 machine_mode mode, inmode;
41485 inmode = GET_MODE (operands[1]);
41486 gcc_assert (inmode == SImode || inmode == DImode);
41488 out = operands[0];
41489 in = force_reg (inmode, operands[1]);
41490 mode = GET_MODE (out);
41491 neglab = gen_label_rtx ();
41492 donelab = gen_label_rtx ();
41493 f0 = gen_reg_rtx (mode);
41495 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41497 expand_float (out, in, 0);
41499 emit_jump_insn (gen_jump (donelab));
41500 emit_barrier ();
41502 emit_label (neglab);
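/* The input does not fit the signed conversion: halve it, keeping the low
   bit sticky so rounding is preserved, convert, and double the result.  */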
41504 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41505 1, OPTAB_DIRECT);
41506 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41507 1, OPTAB_DIRECT);
41508 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41510 expand_float (f0, i0, 0);
41512 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41514 emit_label (donelab);
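/* Forward declarations for the vector permutation helpers used by the
   vector initialization code below.  */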
41517 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41518 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41519 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41520 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41522 /* Get a vector mode of the same size as the original but with elements
41523 twice as wide. This is only guaranteed to apply to integral vectors. */
41525 static inline machine_mode
41526 get_mode_wider_vector (machine_mode o)
41528 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41529 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41530 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41531 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41532 return n;
41535 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41536 fill target with val via vec_duplicate. */
41538 static bool
41539 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41541 bool ok;
41542 rtx_insn *insn;
41543 rtx dup;
41545 /* First attempt to recognize VAL as-is. */
41546 dup = gen_vec_duplicate (mode, val);
41547 insn = emit_insn (gen_rtx_SET (target, dup));
41548 if (recog_memoized (insn) < 0)
41550 rtx_insn *seq;
41551 machine_mode innermode = GET_MODE_INNER (mode);
41552 rtx reg;
41554 /* If that fails, force VAL into a register. */
41556 start_sequence ();
41557 reg = force_reg (innermode, val);
41558 if (GET_MODE (reg) != innermode)
41559 reg = gen_lowpart (innermode, reg);
41560 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41561 seq = get_insns ();
41562 end_sequence ();
41563 if (seq)
41564 emit_insn_before (seq, insn);
41566 ok = recog_memoized (insn) >= 0;
41567 gcc_assert (ok);
41569 return true;
41572 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41573 with all elements equal to VAR. Return true if successful. */
41575 static bool
41576 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41577 rtx target, rtx val)
41579 bool ok;
41581 switch (mode)
41583 case E_V2SImode:
41584 case E_V2SFmode:
41585 if (!mmx_ok)
41586 return false;
41587 /* FALLTHRU */
41589 case E_V4DFmode:
41590 case E_V4DImode:
41591 case E_V8SFmode:
41592 case E_V8SImode:
41593 case E_V2DFmode:
41594 case E_V2DImode:
41595 case E_V4SFmode:
41596 case E_V4SImode:
41597 case E_V16SImode:
41598 case E_V8DImode:
41599 case E_V16SFmode:
41600 case E_V8DFmode:
41601 return ix86_vector_duplicate_value (mode, target, val);
41603 case E_V4HImode:
41604 if (!mmx_ok)
41605 return false;
41606 if (TARGET_SSE || TARGET_3DNOW_A)
41608 rtx x;
41610 val = gen_lowpart (SImode, val);
41611 x = gen_rtx_TRUNCATE (HImode, val);
41612 x = gen_rtx_VEC_DUPLICATE (mode, x);
41613 emit_insn (gen_rtx_SET (target, x));
41614 return true;
41616 goto widen;
41618 case E_V8QImode:
41619 if (!mmx_ok)
41620 return false;
41621 goto widen;
41623 case E_V8HImode:
41624 if (TARGET_AVX2)
41625 return ix86_vector_duplicate_value (mode, target, val);
41627 if (TARGET_SSE2)
41629 struct expand_vec_perm_d dperm;
41630 rtx tmp1, tmp2;
41632 permute:
41633 memset (&dperm, 0, sizeof (dperm));
41634 dperm.target = target;
41635 dperm.vmode = mode;
41636 dperm.nelt = GET_MODE_NUNITS (mode);
41637 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41638 dperm.one_operand_p = true;
41640 /* Extend to SImode using a paradoxical SUBREG. */
41641 tmp1 = gen_reg_rtx (SImode);
41642 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41644 /* Insert the SImode value as low element of a V4SImode vector. */
41645 tmp2 = gen_reg_rtx (V4SImode);
41646 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41647 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41649 ok = (expand_vec_perm_1 (&dperm)
41650 || expand_vec_perm_broadcast_1 (&dperm));
41651 gcc_assert (ok);
41652 return ok;
41654 goto widen;
41656 case E_V16QImode:
41657 if (TARGET_AVX2)
41658 return ix86_vector_duplicate_value (mode, target, val);
41660 if (TARGET_SSE2)
41661 goto permute;
41662 goto widen;
41664 widen:
41665 /* Replicate the value once into the next wider mode and recurse. */
41667 machine_mode smode, wsmode, wvmode;
41668 rtx x;
41670 smode = GET_MODE_INNER (mode);
41671 wvmode = get_mode_wider_vector (mode);
41672 wsmode = GET_MODE_INNER (wvmode);
41674 val = convert_modes (wsmode, smode, val, true);
41675 x = expand_simple_binop (wsmode, ASHIFT, val,
41676 GEN_INT (GET_MODE_BITSIZE (smode)),
41677 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41678 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41680 x = gen_reg_rtx (wvmode);
41681 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41682 gcc_assert (ok);
41683 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41684 return ok;
41687 case E_V16HImode:
41688 case E_V32QImode:
41689 if (TARGET_AVX2)
41690 return ix86_vector_duplicate_value (mode, target, val);
41691 else
41693 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41694 rtx x = gen_reg_rtx (hvmode);
41696 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41697 gcc_assert (ok);
41699 x = gen_rtx_VEC_CONCAT (mode, x, x);
41700 emit_insn (gen_rtx_SET (target, x));
41702 return true;
41704 case E_V64QImode:
41705 case E_V32HImode:
41706 if (TARGET_AVX512BW)
41707 return ix86_vector_duplicate_value (mode, target, val);
41708 else
41710 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41711 rtx x = gen_reg_rtx (hvmode);
41713 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41714 gcc_assert (ok);
41716 x = gen_rtx_VEC_CONCAT (mode, x, x);
41717 emit_insn (gen_rtx_SET (target, x));
41719 return true;
41721 default:
41722 return false;
41726 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41727 whose ONE_VAR element is VAR, and other elements are zero. Return true
41728 if successful. */
41730 static bool
41731 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41732 rtx target, rtx var, int one_var)
41734 machine_mode vsimode;
41735 rtx new_target;
41736 rtx x, tmp;
41737 bool use_vector_set = false;
41739 switch (mode)
41741 case E_V2DImode:
41742 /* For SSE4.1, we normally use vector set. But if the second
41743 element is zero and inter-unit moves are OK, we use movq
41744 instead. */
41745 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41746 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41747 && one_var == 0));
41748 break;
41749 case E_V16QImode:
41750 case E_V4SImode:
41751 case E_V4SFmode:
41752 use_vector_set = TARGET_SSE4_1;
41753 break;
41754 case E_V8HImode:
41755 use_vector_set = TARGET_SSE2;
41756 break;
41757 case E_V4HImode:
41758 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41759 break;
41760 case E_V32QImode:
41761 case E_V16HImode:
41762 case E_V8SImode:
41763 case E_V8SFmode:
41764 case E_V4DFmode:
41765 use_vector_set = TARGET_AVX;
41766 break;
41767 case E_V4DImode:
41768 /* Use ix86_expand_vector_set in 64bit mode only. */
41769 use_vector_set = TARGET_AVX && TARGET_64BIT;
41770 break;
41771 default:
41772 break;
41775 if (use_vector_set)
41777 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41778 var = force_reg (GET_MODE_INNER (mode), var);
41779 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41780 return true;
41783 switch (mode)
41785 case E_V2SFmode:
41786 case E_V2SImode:
41787 if (!mmx_ok)
41788 return false;
41789 /* FALLTHRU */
41791 case E_V2DFmode:
41792 case E_V2DImode:
41793 if (one_var != 0)
41794 return false;
41795 var = force_reg (GET_MODE_INNER (mode), var);
41796 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41797 emit_insn (gen_rtx_SET (target, x));
41798 return true;
41800 case E_V4SFmode:
41801 case E_V4SImode:
41802 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41803 new_target = gen_reg_rtx (mode);
41804 else
41805 new_target = target;
41806 var = force_reg (GET_MODE_INNER (mode), var);
41807 x = gen_rtx_VEC_DUPLICATE (mode, var);
41808 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41809 emit_insn (gen_rtx_SET (new_target, x));
41810 if (one_var != 0)
41812 /* We need to shuffle the value to the correct position, so
41813 create a new pseudo to store the intermediate result. */
41815 /* With SSE2, we can use the integer shuffle insns. */
41816 if (mode != V4SFmode && TARGET_SSE2)
41818 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41819 const1_rtx,
41820 GEN_INT (one_var == 1 ? 0 : 1),
41821 GEN_INT (one_var == 2 ? 0 : 1),
41822 GEN_INT (one_var == 3 ? 0 : 1)));
41823 if (target != new_target)
41824 emit_move_insn (target, new_target);
41825 return true;
41828 /* Otherwise convert the intermediate result to V4SFmode and
41829 use the SSE1 shuffle instructions. */
41830 if (mode != V4SFmode)
41832 tmp = gen_reg_rtx (V4SFmode);
41833 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41835 else
41836 tmp = new_target;
41838 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41839 const1_rtx,
41840 GEN_INT (one_var == 1 ? 0 : 1),
41841 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41842 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41844 if (mode != V4SFmode)
41845 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41846 else if (tmp != target)
41847 emit_move_insn (target, tmp);
41849 else if (target != new_target)
41850 emit_move_insn (target, new_target);
41851 return true;
41853 case E_V8HImode:
41854 case E_V16QImode:
41855 vsimode = V4SImode;
41856 goto widen;
41857 case E_V4HImode:
41858 case E_V8QImode:
41859 if (!mmx_ok)
41860 return false;
41861 vsimode = V2SImode;
41862 goto widen;
41863 widen:
41864 if (one_var != 0)
41865 return false;
41867 /* Zero extend the variable element to SImode and recurse. */
41868 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41870 x = gen_reg_rtx (vsimode);
41871 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41872 var, one_var))
41873 gcc_unreachable ();
41875 emit_move_insn (target, gen_lowpart (mode, x));
41876 return true;
41878 default:
41879 return false;
41883 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41884 consisting of the values in VALS. It is known that all elements
41885 except ONE_VAR are constants. Return true if successful. */
41887 static bool
41888 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41889 rtx target, rtx vals, int one_var)
41891 rtx var = XVECEXP (vals, 0, one_var);
41892 machine_mode wmode;
41893 rtx const_vec, x;
41895 const_vec = copy_rtx (vals);
41896 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41897 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41899 switch (mode)
41901 case E_V2DFmode:
41902 case E_V2DImode:
41903 case E_V2SFmode:
41904 case E_V2SImode:
41905 /* For the two element vectors, it's just as easy to use
41906 the general case. */
41907 return false;
41909 case E_V4DImode:
41910 /* Use ix86_expand_vector_set in 64bit mode only. */
41911 if (!TARGET_64BIT)
41912 return false;
41913 /* FALLTHRU */
41914 case E_V4DFmode:
41915 case E_V8SFmode:
41916 case E_V8SImode:
41917 case E_V16HImode:
41918 case E_V32QImode:
41919 case E_V4SFmode:
41920 case E_V4SImode:
41921 case E_V8HImode:
41922 case E_V4HImode:
41923 break;
41925 case E_V16QImode:
41926 if (TARGET_SSE4_1)
41927 break;
41928 wmode = V8HImode;
41929 goto widen;
41930 case E_V8QImode:
41931 wmode = V4HImode;
41932 goto widen;
41933 widen:
41934 /* There's no way to set one QImode entry easily. Combine
41935 the variable value with its adjacent constant value, and
41936 promote to an HImode set. */
41937 x = XVECEXP (vals, 0, one_var ^ 1);
41938 if (one_var & 1)
41940 var = convert_modes (HImode, QImode, var, true);
41941 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41942 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41943 x = GEN_INT (INTVAL (x) & 0xff);
41945 else
41947 var = convert_modes (HImode, QImode, var, true);
41948 x = gen_int_mode (INTVAL (x) << 8, HImode);
41950 if (x != const0_rtx)
41951 var = expand_simple_binop (HImode, IOR, var, x, var,
41952 1, OPTAB_LIB_WIDEN);
41954 x = gen_reg_rtx (wmode);
41955 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41956 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41958 emit_move_insn (target, gen_lowpart (mode, x));
41959 return true;
41961 default:
41962 return false;
41965 emit_move_insn (target, const_vec);
41966 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41967 return true;
41970 /* A subroutine of ix86_expand_vector_init_general. Use vector
41971 concatenate to handle the most general case: all values variable,
41972 and none identical. */
41974 static void
41975 ix86_expand_vector_init_concat (machine_mode mode,
41976 rtx target, rtx *ops, int n)
41978 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41979 rtx first[16], second[8], third[4];
41980 rtvec v;
41981 int i, j;
41983 switch (n)
41985 case 2:
41986 switch (mode)
41988 case E_V16SImode:
41989 cmode = V8SImode;
41990 break;
41991 case E_V16SFmode:
41992 cmode = V8SFmode;
41993 break;
41994 case E_V8DImode:
41995 cmode = V4DImode;
41996 break;
41997 case E_V8DFmode:
41998 cmode = V4DFmode;
41999 break;
42000 case E_V8SImode:
42001 cmode = V4SImode;
42002 break;
42003 case E_V8SFmode:
42004 cmode = V4SFmode;
42005 break;
42006 case E_V4DImode:
42007 cmode = V2DImode;
42008 break;
42009 case E_V4DFmode:
42010 cmode = V2DFmode;
42011 break;
42012 case E_V4SImode:
42013 cmode = V2SImode;
42014 break;
42015 case E_V4SFmode:
42016 cmode = V2SFmode;
42017 break;
42018 case E_V2DImode:
42019 cmode = DImode;
42020 break;
42021 case E_V2SImode:
42022 cmode = SImode;
42023 break;
42024 case E_V2DFmode:
42025 cmode = DFmode;
42026 break;
42027 case E_V2SFmode:
42028 cmode = SFmode;
42029 break;
42030 default:
42031 gcc_unreachable ();
42034 if (!register_operand (ops[1], cmode))
42035 ops[1] = force_reg (cmode, ops[1]);
42036 if (!register_operand (ops[0], cmode))
42037 ops[0] = force_reg (cmode, ops[0]);
42038 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42039 ops[1])));
42040 break;
42042 case 4:
42043 switch (mode)
42045 case E_V4DImode:
42046 cmode = V2DImode;
42047 break;
42048 case E_V4DFmode:
42049 cmode = V2DFmode;
42050 break;
42051 case E_V4SImode:
42052 cmode = V2SImode;
42053 break;
42054 case E_V4SFmode:
42055 cmode = V2SFmode;
42056 break;
42057 default:
42058 gcc_unreachable ();
42060 goto half;
42062 case 8:
42063 switch (mode)
42065 case E_V8DImode:
42066 cmode = V2DImode;
42067 hmode = V4DImode;
42068 break;
42069 case E_V8DFmode:
42070 cmode = V2DFmode;
42071 hmode = V4DFmode;
42072 break;
42073 case E_V8SImode:
42074 cmode = V2SImode;
42075 hmode = V4SImode;
42076 break;
42077 case E_V8SFmode:
42078 cmode = V2SFmode;
42079 hmode = V4SFmode;
42080 break;
42081 default:
42082 gcc_unreachable ();
42084 goto half;
42086 case 16:
42087 switch (mode)
42089 case E_V16SImode:
42090 cmode = V2SImode;
42091 hmode = V4SImode;
42092 gmode = V8SImode;
42093 break;
42094 case E_V16SFmode:
42095 cmode = V2SFmode;
42096 hmode = V4SFmode;
42097 gmode = V8SFmode;
42098 break;
42099 default:
42100 gcc_unreachable ();
42102 goto half;
42104 half:
42105 /* FIXME: We process inputs backward to help RA. PR 36222. */
42106 i = n - 1;
42107 j = (n >> 1) - 1;
42108 for (; i > 0; i -= 2, j--)
42110 first[j] = gen_reg_rtx (cmode);
42111 v = gen_rtvec (2, ops[i - 1], ops[i]);
42112 ix86_expand_vector_init (false, first[j],
42113 gen_rtx_PARALLEL (cmode, v));
42116 n >>= 1;
42117 if (n > 4)
42119 gcc_assert (hmode != VOIDmode);
42120 gcc_assert (gmode != VOIDmode);
42121 for (i = j = 0; i < n; i += 2, j++)
42123 second[j] = gen_reg_rtx (hmode);
42124 ix86_expand_vector_init_concat (hmode, second [j],
42125 &first [i], 2);
42127 n >>= 1;
42128 for (i = j = 0; i < n; i += 2, j++)
42130 third[j] = gen_reg_rtx (gmode);
42131 ix86_expand_vector_init_concat (gmode, third[j],
42132 &second[i], 2);
42134 n >>= 1;
42135 ix86_expand_vector_init_concat (mode, target, third, n);
42137 else if (n > 2)
42139 gcc_assert (hmode != VOIDmode);
42140 for (i = j = 0; i < n; i += 2, j++)
42142 second[j] = gen_reg_rtx (hmode);
42143 ix86_expand_vector_init_concat (hmode, second [j],
42144 &first [i], 2);
42146 n >>= 1;
42147 ix86_expand_vector_init_concat (mode, target, second, n);
42149 else
42150 ix86_expand_vector_init_concat (mode, target, first, n);
42151 break;
42153 default:
42154 gcc_unreachable ();
42158 /* A subroutine of ix86_expand_vector_init_general. Use vector
42159 interleave to handle the most general case: all values variable,
42160 and none identical. */
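/* For V16QImode, for example, each pair of scalar elements is first
   placed in the low lanes of its own temporary vector; the
   interleave-low patterns then merge those temporaries pairwise,
   reinterpreted as V8HImode, then V4SImode, then V2DImode, until a
   single full-width vector remains.  */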
42162 static void
42163 ix86_expand_vector_init_interleave (machine_mode mode,
42164 rtx target, rtx *ops, int n)
42166 machine_mode first_imode, second_imode, third_imode, inner_mode;
42167 int i, j;
42168 rtx op0, op1;
42169 rtx (*gen_load_even) (rtx, rtx, rtx);
42170 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42171 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42173 switch (mode)
42175 case E_V8HImode:
42176 gen_load_even = gen_vec_setv8hi;
42177 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42178 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42179 inner_mode = HImode;
42180 first_imode = V4SImode;
42181 second_imode = V2DImode;
42182 third_imode = VOIDmode;
42183 break;
42184 case E_V16QImode:
42185 gen_load_even = gen_vec_setv16qi;
42186 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42187 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42188 inner_mode = QImode;
42189 first_imode = V8HImode;
42190 second_imode = V4SImode;
42191 third_imode = V2DImode;
42192 break;
42193 default:
42194 gcc_unreachable ();
42197 for (i = 0; i < n; i++)
42199 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42200 op0 = gen_reg_rtx (SImode);
42201 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42203 /* Insert the SImode value as low element of V4SImode vector. */
42204 op1 = gen_reg_rtx (V4SImode);
42205 op0 = gen_rtx_VEC_MERGE (V4SImode,
42206 gen_rtx_VEC_DUPLICATE (V4SImode,
42207 op0),
42208 CONST0_RTX (V4SImode),
42209 const1_rtx);
42210 emit_insn (gen_rtx_SET (op1, op0));
42212 /* Cast the V4SImode vector back to a vector in original mode. */
42213 op0 = gen_reg_rtx (mode);
42214 emit_move_insn (op0, gen_lowpart (mode, op1));
42216 /* Load even elements into the second position. */
42217 emit_insn (gen_load_even (op0,
42218 force_reg (inner_mode,
42219 ops [i + i + 1]),
42220 const1_rtx));
42222 /* Cast vector to FIRST_IMODE vector. */
42223 ops[i] = gen_reg_rtx (first_imode);
42224 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42227 /* Interleave low FIRST_IMODE vectors. */
42228 for (i = j = 0; i < n; i += 2, j++)
42230 op0 = gen_reg_rtx (first_imode);
42231 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42233 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42234 ops[j] = gen_reg_rtx (second_imode);
42235 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42238 /* Interleave low SECOND_IMODE vectors. */
42239 switch (second_imode)
42241 case E_V4SImode:
42242 for (i = j = 0; i < n / 2; i += 2, j++)
42244 op0 = gen_reg_rtx (second_imode);
42245 emit_insn (gen_interleave_second_low (op0, ops[i],
42246 ops[i + 1]));
42248 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42249 vector. */
42250 ops[j] = gen_reg_rtx (third_imode);
42251 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42253 second_imode = V2DImode;
42254 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42255 /* FALLTHRU */
42257 case E_V2DImode:
42258 op0 = gen_reg_rtx (second_imode);
42259 emit_insn (gen_interleave_second_low (op0, ops[0],
42260 ops[1]));
42262 /* Cast the SECOND_IMODE vector back to a vector in the original
42263 mode. */
42264 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42265 break;
42267 default:
42268 gcc_unreachable ();
42272 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42273 all values variable, and none identical. */
42275 static void
42276 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42277 rtx target, rtx vals)
42279 rtx ops[64], op0, op1, op2, op3, op4, op5;
42280 machine_mode half_mode = VOIDmode;
42281 machine_mode quarter_mode = VOIDmode;
42282 int n, i;
42284 switch (mode)
42286 case E_V2SFmode:
42287 case E_V2SImode:
42288 if (!mmx_ok && !TARGET_SSE)
42289 break;
42290 /* FALLTHRU */
42292 case E_V16SImode:
42293 case E_V16SFmode:
42294 case E_V8DFmode:
42295 case E_V8DImode:
42296 case E_V8SFmode:
42297 case E_V8SImode:
42298 case E_V4DFmode:
42299 case E_V4DImode:
42300 case E_V4SFmode:
42301 case E_V4SImode:
42302 case E_V2DFmode:
42303 case E_V2DImode:
42304 n = GET_MODE_NUNITS (mode);
42305 for (i = 0; i < n; i++)
42306 ops[i] = XVECEXP (vals, 0, i);
42307 ix86_expand_vector_init_concat (mode, target, ops, n);
42308 return;
42310 case E_V2TImode:
42311 for (i = 0; i < 2; i++)
42312 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42313 op0 = gen_reg_rtx (V4DImode);
42314 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42315 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42316 return;
42318 case E_V4TImode:
42319 for (i = 0; i < 4; i++)
42320 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42321 ops[4] = gen_reg_rtx (V4DImode);
42322 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42323 ops[5] = gen_reg_rtx (V4DImode);
42324 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42325 op0 = gen_reg_rtx (V8DImode);
42326 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42327 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42328 return;
42330 case E_V32QImode:
42331 half_mode = V16QImode;
42332 goto half;
42334 case E_V16HImode:
42335 half_mode = V8HImode;
42336 goto half;
42338 half:
42339 n = GET_MODE_NUNITS (mode);
42340 for (i = 0; i < n; i++)
42341 ops[i] = XVECEXP (vals, 0, i);
42342 op0 = gen_reg_rtx (half_mode);
42343 op1 = gen_reg_rtx (half_mode);
42344 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42345 n >> 2);
42346 ix86_expand_vector_init_interleave (half_mode, op1,
42347 &ops [n >> 1], n >> 2);
42348 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42349 return;
42351 case E_V64QImode:
42352 quarter_mode = V16QImode;
42353 half_mode = V32QImode;
42354 goto quarter;
42356 case E_V32HImode:
42357 quarter_mode = V8HImode;
42358 half_mode = V16HImode;
42359 goto quarter;
42361 quarter:
42362 n = GET_MODE_NUNITS (mode);
42363 for (i = 0; i < n; i++)
42364 ops[i] = XVECEXP (vals, 0, i);
42365 op0 = gen_reg_rtx (quarter_mode);
42366 op1 = gen_reg_rtx (quarter_mode);
42367 op2 = gen_reg_rtx (quarter_mode);
42368 op3 = gen_reg_rtx (quarter_mode);
42369 op4 = gen_reg_rtx (half_mode);
42370 op5 = gen_reg_rtx (half_mode);
42371 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42372 n >> 3);
42373 ix86_expand_vector_init_interleave (quarter_mode, op1,
42374 &ops [n >> 2], n >> 3);
42375 ix86_expand_vector_init_interleave (quarter_mode, op2,
42376 &ops [n >> 1], n >> 3);
42377 ix86_expand_vector_init_interleave (quarter_mode, op3,
42378 &ops [(n >> 1) | (n >> 2)], n >> 3);
42379 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42380 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42381 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42382 return;
42384 case E_V16QImode:
42385 if (!TARGET_SSE4_1)
42386 break;
42387 /* FALLTHRU */
42389 case E_V8HImode:
42390 if (!TARGET_SSE2)
42391 break;
42393 /* Don't use ix86_expand_vector_init_interleave if we can't
42394 move from GPR to SSE register directly. */
42395 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42396 break;
42398 n = GET_MODE_NUNITS (mode);
42399 for (i = 0; i < n; i++)
42400 ops[i] = XVECEXP (vals, 0, i);
42401 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42402 return;
42404 case E_V4HImode:
42405 case E_V8QImode:
42406 break;
42408 default:
42409 gcc_unreachable ();
42413 int i, j, n_elts, n_words, n_elt_per_word;
42414 machine_mode inner_mode;
42415 rtx words[4], shift;
42417 inner_mode = GET_MODE_INNER (mode);
42418 n_elts = GET_MODE_NUNITS (mode);
42419 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42420 n_elt_per_word = n_elts / n_words;
42421 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42423 for (i = 0; i < n_words; ++i)
42425 rtx word = NULL_RTX;
42427 for (j = 0; j < n_elt_per_word; ++j)
42429 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42430 elt = convert_modes (word_mode, inner_mode, elt, true);
42432 if (j == 0)
42433 word = elt;
42434 else
42436 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42437 word, 1, OPTAB_LIB_WIDEN);
42438 word = expand_simple_binop (word_mode, IOR, word, elt,
42439 word, 1, OPTAB_LIB_WIDEN);
42443 words[i] = word;
42446 if (n_words == 1)
42447 emit_move_insn (target, gen_lowpart (mode, words[0]));
42448 else if (n_words == 2)
42450 rtx tmp = gen_reg_rtx (mode);
42451 emit_clobber (tmp);
42452 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42453 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42454 emit_move_insn (target, tmp);
42456 else if (n_words == 4)
42458 rtx tmp = gen_reg_rtx (V4SImode);
42459 gcc_assert (word_mode == SImode);
42460 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42461 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42462 emit_move_insn (target, gen_lowpart (mode, tmp));
42464 else
42465 gcc_unreachable ();
42469 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42470 instructions unless MMX_OK is true. */
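/* Strategy: fully constant VALS are loaded from the constant pool,
   an all-equal VALS becomes a broadcast, a single variable element
   uses the one_nonzero/one_var helpers, and anything else falls
   through to the fully general expansion.  */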
42472 void
42473 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42475 machine_mode mode = GET_MODE (target);
42476 machine_mode inner_mode = GET_MODE_INNER (mode);
42477 int n_elts = GET_MODE_NUNITS (mode);
42478 int n_var = 0, one_var = -1;
42479 bool all_same = true, all_const_zero = true;
42480 int i;
42481 rtx x;
42483 /* First, handle initialization from vector elts. */
42484 if (n_elts != XVECLEN (vals, 0))
42486 rtx subtarget = target;
42487 x = XVECEXP (vals, 0, 0);
42488 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42489 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42491 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42492 if (inner_mode == QImode || inner_mode == HImode)
42494 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42495 mode = mode_for_vector (SImode, n_bits / 4).require ();
42496 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42497 ops[0] = gen_lowpart (inner_mode, ops[0]);
42498 ops[1] = gen_lowpart (inner_mode, ops[1]);
42499 subtarget = gen_reg_rtx (mode);
42501 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42502 if (subtarget != target)
42503 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42504 return;
42506 gcc_unreachable ();
42509 for (i = 0; i < n_elts; ++i)
42511 x = XVECEXP (vals, 0, i);
42512 if (!(CONST_SCALAR_INT_P (x)
42513 || CONST_DOUBLE_P (x)
42514 || CONST_FIXED_P (x)))
42515 n_var++, one_var = i;
42516 else if (x != CONST0_RTX (inner_mode))
42517 all_const_zero = false;
42518 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42519 all_same = false;
42522 /* Constants are best loaded from the constant pool. */
42523 if (n_var == 0)
42525 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42526 return;
42529 /* If all values are identical, broadcast the value. */
42530 if (all_same
42531 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42532 XVECEXP (vals, 0, 0)))
42533 return;
42535 /* Values where only one field is non-constant are best loaded from
42536 the pool and overwritten via move later. */
42537 if (n_var == 1)
42539 if (all_const_zero
42540 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42541 XVECEXP (vals, 0, one_var),
42542 one_var))
42543 return;
42545 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42546 return;
42549 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42552 void
42553 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42555 machine_mode mode = GET_MODE (target);
42556 machine_mode inner_mode = GET_MODE_INNER (mode);
42557 machine_mode half_mode;
42558 bool use_vec_merge = false;
42559 rtx tmp;
42560 static rtx (*gen_extract[6][2]) (rtx, rtx)
42562 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42563 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42564 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42565 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42566 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42567 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42569 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42571 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42572 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42573 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42574 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42575 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42576 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42578 int i, j, n;
42579 machine_mode mmode = VOIDmode;
42580 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42582 switch (mode)
42584 case E_V2SFmode:
42585 case E_V2SImode:
42586 if (mmx_ok)
42588 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42589 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42590 if (elt == 0)
42591 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42592 else
42593 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42594 emit_insn (gen_rtx_SET (target, tmp));
42595 return;
42597 break;
42599 case E_V2DImode:
42600 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42601 if (use_vec_merge)
42602 break;
42604 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42605 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42606 if (elt == 0)
42607 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42608 else
42609 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42610 emit_insn (gen_rtx_SET (target, tmp));
42611 return;
42613 case E_V2DFmode:
42615 rtx op0, op1;
42617 /* For the two element vectors, we implement a VEC_CONCAT with
42618 the extraction of the other element. */
42620 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42621 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42623 if (elt == 0)
42624 op0 = val, op1 = tmp;
42625 else
42626 op0 = tmp, op1 = val;
42628 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42629 emit_insn (gen_rtx_SET (target, tmp));
42631 return;
42633 case E_V4SFmode:
42634 use_vec_merge = TARGET_SSE4_1;
42635 if (use_vec_merge)
42636 break;
42638 switch (elt)
42640 case 0:
42641 use_vec_merge = true;
42642 break;
42644 case 1:
42645 /* tmp = target = A B C D */
42646 tmp = copy_to_reg (target);
42647 /* target = A A B B */
42648 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42649 /* target = X A B B */
42650 ix86_expand_vector_set (false, target, val, 0);
42651 /* target = A X C D */
42652 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42653 const1_rtx, const0_rtx,
42654 GEN_INT (2+4), GEN_INT (3+4)));
42655 return;
42657 case 2:
42658 /* tmp = target = A B C D */
42659 tmp = copy_to_reg (target);
42660 /* tmp = X B C D */
42661 ix86_expand_vector_set (false, tmp, val, 0);
42662 /* target = A B X D */
42663 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42664 const0_rtx, const1_rtx,
42665 GEN_INT (0+4), GEN_INT (3+4)));
42666 return;
42668 case 3:
42669 /* tmp = target = A B C D */
42670 tmp = copy_to_reg (target);
42671 /* tmp = X B C D */
42672 ix86_expand_vector_set (false, tmp, val, 0);
42673 /* target = A B C X */
42674 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42675 const0_rtx, const1_rtx,
42676 GEN_INT (2+4), GEN_INT (0+4)));
42677 return;
42679 default:
42680 gcc_unreachable ();
42682 break;
42684 case E_V4SImode:
42685 use_vec_merge = TARGET_SSE4_1;
42686 if (use_vec_merge)
42687 break;
42689 /* Element 0 handled by vec_merge below. */
42690 if (elt == 0)
42692 use_vec_merge = true;
42693 break;
42696 if (TARGET_SSE2)
42698 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42699 store into element 0, then shuffle them back. */
42701 rtx order[4];
42703 order[0] = GEN_INT (elt);
42704 order[1] = const1_rtx;
42705 order[2] = const2_rtx;
42706 order[3] = GEN_INT (3);
42707 order[elt] = const0_rtx;
42709 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42710 order[1], order[2], order[3]));
42712 ix86_expand_vector_set (false, target, val, 0);
42714 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42715 order[1], order[2], order[3]));
42717 else
42719 /* For SSE1, we have to reuse the V4SF code. */
42720 rtx t = gen_reg_rtx (V4SFmode);
42721 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42722 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42723 emit_move_insn (target, gen_lowpart (mode, t));
42725 return;
42727 case E_V8HImode:
42728 use_vec_merge = TARGET_SSE2;
42729 break;
42730 case E_V4HImode:
42731 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42732 break;
42734 case E_V16QImode:
42735 use_vec_merge = TARGET_SSE4_1;
42736 break;
42738 case E_V8QImode:
42739 break;
42741 case E_V32QImode:
42742 half_mode = V16QImode;
42743 j = 0;
42744 n = 16;
42745 goto half;
42747 case E_V16HImode:
42748 half_mode = V8HImode;
42749 j = 1;
42750 n = 8;
42751 goto half;
42753 case E_V8SImode:
42754 half_mode = V4SImode;
42755 j = 2;
42756 n = 4;
42757 goto half;
42759 case E_V4DImode:
42760 half_mode = V2DImode;
42761 j = 3;
42762 n = 2;
42763 goto half;
42765 case E_V8SFmode:
42766 half_mode = V4SFmode;
42767 j = 4;
42768 n = 4;
42769 goto half;
42771 case E_V4DFmode:
42772 half_mode = V2DFmode;
42773 j = 5;
42774 n = 2;
42775 goto half;
42777 half:
42778 /* Compute offset. */
42779 i = elt / n;
42780 elt %= n;
42782 gcc_assert (i <= 1);
42784 /* Extract the half. */
42785 tmp = gen_reg_rtx (half_mode);
42786 emit_insn (gen_extract[j][i] (tmp, target));
42788 /* Put val in tmp at elt. */
42789 ix86_expand_vector_set (false, tmp, val, elt);
42791 /* Put it back. */
42792 emit_insn (gen_insert[j][i] (target, target, tmp));
42793 return;
42795 case E_V8DFmode:
42796 if (TARGET_AVX512F)
42798 mmode = QImode;
42799 gen_blendm = gen_avx512f_blendmv8df;
42801 break;
42803 case E_V8DImode:
42804 if (TARGET_AVX512F)
42806 mmode = QImode;
42807 gen_blendm = gen_avx512f_blendmv8di;
42809 break;
42811 case E_V16SFmode:
42812 if (TARGET_AVX512F)
42814 mmode = HImode;
42815 gen_blendm = gen_avx512f_blendmv16sf;
42817 break;
42819 case E_V16SImode:
42820 if (TARGET_AVX512F)
42822 mmode = HImode;
42823 gen_blendm = gen_avx512f_blendmv16si;
42825 break;
42827 case E_V32HImode:
42828 if (TARGET_AVX512F && TARGET_AVX512BW)
42830 mmode = SImode;
42831 gen_blendm = gen_avx512bw_blendmv32hi;
42833 break;
42835 case E_V64QImode:
42836 if (TARGET_AVX512F && TARGET_AVX512BW)
42838 mmode = DImode;
42839 gen_blendm = gen_avx512bw_blendmv64qi;
42841 break;
42843 default:
42844 break;
42847 if (mmode != VOIDmode)
42849 tmp = gen_reg_rtx (mode);
42850 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42851 /* The avx512*_blendm<mode> expanders have different operand order
42852 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42853 elements where the mask is set and second input operand otherwise,
42854 in {sse,avx}*_*blend* the first input operand is used for elements
42855 where the mask is clear and second input operand otherwise. */
42856 emit_insn (gen_blendm (target, target, tmp,
42857 force_reg (mmode,
42858 gen_int_mode (1 << elt, mmode))));
42860 else if (use_vec_merge)
42862 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42863 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42864 emit_insn (gen_rtx_SET (target, tmp));
42866 else
42868 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42870 emit_move_insn (mem, target);
42872 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42873 emit_move_insn (tmp, val);
42875 emit_move_insn (target, mem);
42879 void
42880 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42882 machine_mode mode = GET_MODE (vec);
42883 machine_mode inner_mode = GET_MODE_INNER (mode);
42884 bool use_vec_extr = false;
42885 rtx tmp;
42887 switch (mode)
42889 case E_V2SImode:
42890 case E_V2SFmode:
42891 if (!mmx_ok)
42892 break;
42893 /* FALLTHRU */
42895 case E_V2DFmode:
42896 case E_V2DImode:
42897 case E_V2TImode:
42898 case E_V4TImode:
42899 use_vec_extr = true;
42900 break;
42902 case E_V4SFmode:
42903 use_vec_extr = TARGET_SSE4_1;
42904 if (use_vec_extr)
42905 break;
42907 switch (elt)
42909 case 0:
42910 tmp = vec;
42911 break;
42913 case 1:
42914 case 3:
42915 tmp = gen_reg_rtx (mode);
42916 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42917 GEN_INT (elt), GEN_INT (elt),
42918 GEN_INT (elt+4), GEN_INT (elt+4)));
42919 break;
42921 case 2:
42922 tmp = gen_reg_rtx (mode);
42923 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42924 break;
42926 default:
42927 gcc_unreachable ();
42929 vec = tmp;
42930 use_vec_extr = true;
42931 elt = 0;
42932 break;
42934 case E_V4SImode:
42935 use_vec_extr = TARGET_SSE4_1;
42936 if (use_vec_extr)
42937 break;
42939 if (TARGET_SSE2)
42941 switch (elt)
42943 case 0:
42944 tmp = vec;
42945 break;
42947 case 1:
42948 case 3:
42949 tmp = gen_reg_rtx (mode);
42950 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42951 GEN_INT (elt), GEN_INT (elt),
42952 GEN_INT (elt), GEN_INT (elt)));
42953 break;
42955 case 2:
42956 tmp = gen_reg_rtx (mode);
42957 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42958 break;
42960 default:
42961 gcc_unreachable ();
42963 vec = tmp;
42964 use_vec_extr = true;
42965 elt = 0;
42967 else
42969 /* For SSE1, we have to reuse the V4SF code. */
42970 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42971 gen_lowpart (V4SFmode, vec), elt);
42972 return;
42974 break;
42976 case E_V8HImode:
42977 use_vec_extr = TARGET_SSE2;
42978 break;
42979 case E_V4HImode:
42980 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42981 break;
42983 case E_V16QImode:
42984 use_vec_extr = TARGET_SSE4_1;
42985 break;
42987 case E_V8SFmode:
42988 if (TARGET_AVX)
42990 tmp = gen_reg_rtx (V4SFmode);
42991 if (elt < 4)
42992 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42993 else
42994 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42995 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42996 return;
42998 break;
43000 case E_V4DFmode:
43001 if (TARGET_AVX)
43003 tmp = gen_reg_rtx (V2DFmode);
43004 if (elt < 2)
43005 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43006 else
43007 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43008 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43009 return;
43011 break;
43013 case E_V32QImode:
43014 if (TARGET_AVX)
43016 tmp = gen_reg_rtx (V16QImode);
43017 if (elt < 16)
43018 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43019 else
43020 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43021 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43022 return;
43024 break;
43026 case E_V16HImode:
43027 if (TARGET_AVX)
43029 tmp = gen_reg_rtx (V8HImode);
43030 if (elt < 8)
43031 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43032 else
43033 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43034 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43035 return;
43037 break;
43039 case E_V8SImode:
43040 if (TARGET_AVX)
43042 tmp = gen_reg_rtx (V4SImode);
43043 if (elt < 4)
43044 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43045 else
43046 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43047 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43048 return;
43050 break;
43052 case E_V4DImode:
43053 if (TARGET_AVX)
43055 tmp = gen_reg_rtx (V2DImode);
43056 if (elt < 2)
43057 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43058 else
43059 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43060 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43061 return;
43063 break;
43065 case E_V32HImode:
43066 if (TARGET_AVX512BW)
43068 tmp = gen_reg_rtx (V16HImode);
43069 if (elt < 16)
43070 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43071 else
43072 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43073 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43074 return;
43076 break;
43078 case E_V64QImode:
43079 if (TARGET_AVX512BW)
43081 tmp = gen_reg_rtx (V32QImode);
43082 if (elt < 32)
43083 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43084 else
43085 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43086 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43087 return;
43089 break;
43091 case E_V16SFmode:
43092 tmp = gen_reg_rtx (V8SFmode);
43093 if (elt < 8)
43094 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43095 else
43096 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43097 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43098 return;
43100 case E_V8DFmode:
43101 tmp = gen_reg_rtx (V4DFmode);
43102 if (elt < 4)
43103 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43104 else
43105 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43106 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43107 return;
43109 case E_V16SImode:
43110 tmp = gen_reg_rtx (V8SImode);
43111 if (elt < 8)
43112 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43113 else
43114 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43115 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43116 return;
43118 case E_V8DImode:
43119 tmp = gen_reg_rtx (V4DImode);
43120 if (elt < 4)
43121 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43122 else
43123 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43124 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43125 return;
43127 case E_V8QImode:
43128 /* ??? Could extract the appropriate HImode element and shift. */
43129 default:
43130 break;
43133 if (use_vec_extr)
43135 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43136 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43138 /* Let the rtl optimizers know about the zero extension performed. */
43139 if (inner_mode == QImode || inner_mode == HImode)
43141 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43142 target = gen_lowpart (SImode, target);
43145 emit_insn (gen_rtx_SET (target, tmp));
43147 else
43149 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43151 emit_move_insn (mem, vec);
43153 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43154 emit_move_insn (target, tmp);
43158 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43159 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43160 The upper bits of DEST are undefined, though they shouldn't cause
43161 exceptions (some bits from src or all zeros are ok). */
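/* For a V4SImode SRC, for instance, i == 128 shifts the upper 64 bits
   (elements 2 and 3) down into the low half via a V1TImode logical
   shift, and i == 64 then moves element 1 down to element 0, so a
   following binary operation pairs the remaining lanes at each
   step.  */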
43163 static void
43164 emit_reduc_half (rtx dest, rtx src, int i)
43166 rtx tem, d = dest;
43167 switch (GET_MODE (src))
43169 case E_V4SFmode:
43170 if (i == 128)
43171 tem = gen_sse_movhlps (dest, src, src);
43172 else
43173 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43174 GEN_INT (1 + 4), GEN_INT (1 + 4));
43175 break;
43176 case E_V2DFmode:
43177 tem = gen_vec_interleave_highv2df (dest, src, src);
43178 break;
43179 case E_V16QImode:
43180 case E_V8HImode:
43181 case E_V4SImode:
43182 case E_V2DImode:
43183 d = gen_reg_rtx (V1TImode);
43184 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43185 GEN_INT (i / 2));
43186 break;
43187 case E_V8SFmode:
43188 if (i == 256)
43189 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43190 else
43191 tem = gen_avx_shufps256 (dest, src, src,
43192 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43193 break;
43194 case E_V4DFmode:
43195 if (i == 256)
43196 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43197 else
43198 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43199 break;
43200 case E_V32QImode:
43201 case E_V16HImode:
43202 case E_V8SImode:
43203 case E_V4DImode:
43204 if (i == 256)
43206 if (GET_MODE (dest) != V4DImode)
43207 d = gen_reg_rtx (V4DImode);
43208 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43209 gen_lowpart (V4DImode, src),
43210 const1_rtx);
43212 else
43214 d = gen_reg_rtx (V2TImode);
43215 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43216 GEN_INT (i / 2));
43218 break;
43219 case E_V64QImode:
43220 case E_V32HImode:
43221 case E_V16SImode:
43222 case E_V16SFmode:
43223 case E_V8DImode:
43224 case E_V8DFmode:
43225 if (i > 128)
43226 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43227 gen_lowpart (V16SImode, src),
43228 gen_lowpart (V16SImode, src),
43229 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43230 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43231 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43232 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43233 GEN_INT (0xC), GEN_INT (0xD),
43234 GEN_INT (0xE), GEN_INT (0xF),
43235 GEN_INT (0x10), GEN_INT (0x11),
43236 GEN_INT (0x12), GEN_INT (0x13),
43237 GEN_INT (0x14), GEN_INT (0x15),
43238 GEN_INT (0x16), GEN_INT (0x17));
43239 else
43240 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43241 gen_lowpart (V16SImode, src),
43242 GEN_INT (i == 128 ? 0x2 : 0x1),
43243 GEN_INT (0x3),
43244 GEN_INT (0x3),
43245 GEN_INT (0x3),
43246 GEN_INT (i == 128 ? 0x6 : 0x5),
43247 GEN_INT (0x7),
43248 GEN_INT (0x7),
43249 GEN_INT (0x7),
43250 GEN_INT (i == 128 ? 0xA : 0x9),
43251 GEN_INT (0xB),
43252 GEN_INT (0xB),
43253 GEN_INT (0xB),
43254 GEN_INT (i == 128 ? 0xE : 0xD),
43255 GEN_INT (0xF),
43256 GEN_INT (0xF),
43257 GEN_INT (0xF));
43258 break;
43259 default:
43260 gcc_unreachable ();
43262 emit_insn (tem);
43263 if (d != dest)
43264 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43267 /* Expand a vector reduction. FN is the binary pattern to reduce;
43268 DEST is the destination; IN is the input vector. */
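/* For example, reducing a V8SFmode vector with an smax pattern takes
   three iterations (i = 256, 128, 64): each step folds the upper half
   of the running vector onto its lower half and applies FN, so after
   the final step element 0 of DEST holds the reduced value.  */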
43270 void
43271 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43273 rtx half, dst, vec = in;
43274 machine_mode mode = GET_MODE (in);
43275 int i;
43277 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43278 if (TARGET_SSE4_1
43279 && mode == V8HImode
43280 && fn == gen_uminv8hi3)
43282 emit_insn (gen_sse4_1_phminposuw (dest, in));
43283 return;
43286 for (i = GET_MODE_BITSIZE (mode);
43287 i > GET_MODE_UNIT_BITSIZE (mode);
43288 i >>= 1)
43290 half = gen_reg_rtx (mode);
43291 emit_reduc_half (half, vec, i);
43292 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43293 dst = dest;
43294 else
43295 dst = gen_reg_rtx (mode);
43296 emit_insn (fn (dst, half, vec));
43297 vec = dst;
43301 /* Target hook for scalar_mode_supported_p. */
43302 static bool
43303 ix86_scalar_mode_supported_p (scalar_mode mode)
43305 if (DECIMAL_FLOAT_MODE_P (mode))
43306 return default_decimal_float_supported_p ();
43307 else if (mode == TFmode)
43308 return true;
43309 else
43310 return default_scalar_mode_supported_p (mode);
43313 /* Implements target hook vector_mode_supported_p. */
43314 static bool
43315 ix86_vector_mode_supported_p (machine_mode mode)
43317 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43318 return true;
43319 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43320 return true;
43321 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43322 return true;
43323 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43324 return true;
43325 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43326 return true;
43327 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43328 return true;
43329 return false;
43332 /* Target hook for c_mode_for_suffix. */
43333 static machine_mode
43334 ix86_c_mode_for_suffix (char suffix)
43336 if (suffix == 'q')
43337 return TFmode;
43338 if (suffix == 'w')
43339 return XFmode;
43341 return VOIDmode;
43344 /* Worker function for TARGET_MD_ASM_ADJUST.
43346 We implement asm flag outputs, and maintain source compatibility
43347 with the old cc0-based compiler. */
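/* Illustrative use of the flag output constraints handled below (lt,
   a and b are just example variables):

     asm ("cmp %2, %1" : "=@ccl" (lt) : "r" (a), "r" (b));

   The "l" condition maps to a CCGCmode LT test of the flags register,
   so LT receives a signed a < b result without a setcc in the asm
   template itself.  */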
43349 static rtx_insn *
43350 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43351 vec<const char *> &constraints,
43352 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43354 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43355 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43357 bool saw_asm_flag = false;
43359 start_sequence ();
43360 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43362 const char *con = constraints[i];
43363 if (strncmp (con, "=@cc", 4) != 0)
43364 continue;
43365 con += 4;
43366 if (strchr (con, ',') != NULL)
43368 error ("alternatives not allowed in asm flag output");
43369 continue;
43372 bool invert = false;
43373 if (con[0] == 'n')
43374 invert = true, con++;
43376 machine_mode mode = CCmode;
43377 rtx_code code = UNKNOWN;
43379 switch (con[0])
43381 case 'a':
43382 if (con[1] == 0)
43383 mode = CCAmode, code = EQ;
43384 else if (con[1] == 'e' && con[2] == 0)
43385 mode = CCCmode, code = NE;
43386 break;
43387 case 'b':
43388 if (con[1] == 0)
43389 mode = CCCmode, code = EQ;
43390 else if (con[1] == 'e' && con[2] == 0)
43391 mode = CCAmode, code = NE;
43392 break;
43393 case 'c':
43394 if (con[1] == 0)
43395 mode = CCCmode, code = EQ;
43396 break;
43397 case 'e':
43398 if (con[1] == 0)
43399 mode = CCZmode, code = EQ;
43400 break;
43401 case 'g':
43402 if (con[1] == 0)
43403 mode = CCGCmode, code = GT;
43404 else if (con[1] == 'e' && con[2] == 0)
43405 mode = CCGCmode, code = GE;
43406 break;
43407 case 'l':
43408 if (con[1] == 0)
43409 mode = CCGCmode, code = LT;
43410 else if (con[1] == 'e' && con[2] == 0)
43411 mode = CCGCmode, code = LE;
43412 break;
43413 case 'o':
43414 if (con[1] == 0)
43415 mode = CCOmode, code = EQ;
43416 break;
43417 case 'p':
43418 if (con[1] == 0)
43419 mode = CCPmode, code = EQ;
43420 break;
43421 case 's':
43422 if (con[1] == 0)
43423 mode = CCSmode, code = EQ;
43424 break;
43425 case 'z':
43426 if (con[1] == 0)
43427 mode = CCZmode, code = EQ;
43428 break;
43430 if (code == UNKNOWN)
43432 error ("unknown asm flag output %qs", constraints[i]);
43433 continue;
43435 if (invert)
43436 code = reverse_condition (code);
43438 rtx dest = outputs[i];
43439 if (!saw_asm_flag)
43441 /* This is the first asm flag output. Here we put the flags
43442 register in as the real output and adjust the condition to
43443 allow it. */
43444 constraints[i] = "=Bf";
43445 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43446 saw_asm_flag = true;
43448 else
43450 /* We don't need the flags register as output twice. */
43451 constraints[i] = "=X";
43452 outputs[i] = gen_rtx_SCRATCH (SImode);
43455 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43456 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43458 machine_mode dest_mode = GET_MODE (dest);
43459 if (!SCALAR_INT_MODE_P (dest_mode))
43461 error ("invalid type for asm flag output");
43462 continue;
43465 if (dest_mode == DImode && !TARGET_64BIT)
43466 dest_mode = SImode;
43468 if (dest_mode != QImode)
43470 rtx destqi = gen_reg_rtx (QImode);
43471 emit_insn (gen_rtx_SET (destqi, x));
43473 if (TARGET_ZERO_EXTEND_WITH_AND
43474 && optimize_function_for_speed_p (cfun))
43476 x = force_reg (dest_mode, const0_rtx);
43478 emit_insn (gen_movstrictqi
43479 (gen_lowpart (QImode, x), destqi));
43481 else
43482 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43485 if (dest_mode != GET_MODE (dest))
43487 rtx tmp = gen_reg_rtx (SImode);
43489 emit_insn (gen_rtx_SET (tmp, x));
43490 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43492 else
43493 emit_insn (gen_rtx_SET (dest, x));
43495 rtx_insn *seq = get_insns ();
43496 end_sequence ();
43498 if (saw_asm_flag)
43499 return seq;
43500 else
43502 /* If we had no asm flag outputs, clobber the flags. */
43503 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43504 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43505 return NULL;
43509 /* Implements the target hook targetm.asm.encode_section_info. */
43511 static void ATTRIBUTE_UNUSED
43512 ix86_encode_section_info (tree decl, rtx rtl, int first)
43514 default_encode_section_info (decl, rtl, first);
43516 if (ix86_in_large_data_p (decl))
43517 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43520 /* Worker function for REVERSE_CONDITION. */
43522 enum rtx_code
43523 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43525 return (mode == CCFPmode
43526 ? reverse_condition_maybe_unordered (code)
43527 : reverse_condition (code));
43530 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43531 to OPERANDS[0]. */
43533 const char *
43534 output_387_reg_move (rtx_insn *insn, rtx *operands)
43536 if (REG_P (operands[0]))
43538 if (REG_P (operands[1])
43539 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43541 if (REGNO (operands[0]) == FIRST_STACK_REG)
43542 return output_387_ffreep (operands, 0);
43543 return "fstp\t%y0";
43545 if (STACK_TOP_P (operands[0]))
43546 return "fld%Z1\t%y1";
43547 return "fst\t%y0";
43549 else if (MEM_P (operands[0]))
43551 gcc_assert (REG_P (operands[1]));
43552 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43553 return "fstp%Z0\t%y0";
43554 else
43556 /* There is no non-popping store to memory for XFmode.
43557 So if we need one, follow the store with a load. */
43558 if (GET_MODE (operands[0]) == XFmode)
43559 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43560 else
43561 return "fst%Z0\t%y0";
43564 else
43565 gcc_unreachable();
43568 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43569 FP status register is set. */
43571 void
43572 ix86_emit_fp_unordered_jump (rtx label)
43574 rtx reg = gen_reg_rtx (HImode);
43575 rtx temp;
43577 emit_insn (gen_x86_fnstsw_1 (reg));
43579 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43581 emit_insn (gen_x86_sahf_1 (reg));
43583 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43584 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43586 else
43588 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43590 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43591 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43594 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43595 gen_rtx_LABEL_REF (VOIDmode, label),
43596 pc_rtx);
43597 temp = gen_rtx_SET (pc_rtx, temp);
43599 emit_jump_insn (temp);
43600 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43603 /* Output code to perform a log1p XFmode calculation. */
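/* The i387 fyl2xp1 instruction is only specified for |x| smaller than
   1 - sqrt(2)/2 (about 0.2928932), which is the threshold compared
   against below; larger arguments fall back to computing 1 + x
   explicitly and using fyl2x.  Both paths use fldln2 as the Y operand
   so the base-2 logarithm becomes a natural logarithm.  */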
43605 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43607 rtx_code_label *label1 = gen_label_rtx ();
43608 rtx_code_label *label2 = gen_label_rtx ();
43610 rtx tmp = gen_reg_rtx (XFmode);
43611 rtx tmp2 = gen_reg_rtx (XFmode);
43612 rtx test;
43614 emit_insn (gen_absxf2 (tmp, op1));
43615 test = gen_rtx_GE (VOIDmode, tmp,
43616 const_double_from_real_value (
43617 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43618 XFmode));
43619 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43621 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43622 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43623 emit_jump (label2);
43625 emit_label (label1);
43626 emit_move_insn (tmp, CONST1_RTX (XFmode));
43627 emit_insn (gen_addxf3 (tmp, op1, tmp));
43628 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43629 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43631 emit_label (label2);
43634 /* Emit code for round calculation. */
43635 void ix86_emit_i387_round (rtx op0, rtx op1)
43637 machine_mode inmode = GET_MODE (op1);
43638 machine_mode outmode = GET_MODE (op0);
43639 rtx e1, e2, res, tmp, tmp1, half;
43640 rtx scratch = gen_reg_rtx (HImode);
43641 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43642 rtx_code_label *jump_label = gen_label_rtx ();
43643 rtx insn;
43644 rtx (*gen_abs) (rtx, rtx);
43645 rtx (*gen_neg) (rtx, rtx);
43647 switch (inmode)
43649 case E_SFmode:
43650 gen_abs = gen_abssf2;
43651 break;
43652 case E_DFmode:
43653 gen_abs = gen_absdf2;
43654 break;
43655 case E_XFmode:
43656 gen_abs = gen_absxf2;
43657 break;
43658 default:
43659 gcc_unreachable ();
43662 switch (outmode)
43664 case E_SFmode:
43665 gen_neg = gen_negsf2;
43666 break;
43667 case E_DFmode:
43668 gen_neg = gen_negdf2;
43669 break;
43670 case E_XFmode:
43671 gen_neg = gen_negxf2;
43672 break;
43673 case E_HImode:
43674 gen_neg = gen_neghi2;
43675 break;
43676 case E_SImode:
43677 gen_neg = gen_negsi2;
43678 break;
43679 case E_DImode:
43680 gen_neg = gen_negdi2;
43681 break;
43682 default:
43683 gcc_unreachable ();
43686 e1 = gen_reg_rtx (inmode);
43687 e2 = gen_reg_rtx (inmode);
43688 res = gen_reg_rtx (outmode);
43690 half = const_double_from_real_value (dconsthalf, inmode);
43692 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
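/* Worked example: round (-2.5) takes fabs = 2.5, adds 0.5 giving 3.0,
   floors to 3, and the fxam sign bit restores the sign, yielding -3;
   halfway cases therefore round away from zero, matching C round().  */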
43694 /* scratch = fxam(op1) */
43695 emit_insn (gen_rtx_SET (scratch,
43696 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43697 UNSPEC_FXAM)));
43698 /* e1 = fabs(op1) */
43699 emit_insn (gen_abs (e1, op1));
43701 /* e2 = e1 + 0.5 */
43702 half = force_reg (inmode, half);
43703 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43705 /* res = floor(e2) */
43706 if (inmode != XFmode)
43708 tmp1 = gen_reg_rtx (XFmode);
43710 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43712 else
43713 tmp1 = e2;
43715 switch (outmode)
43717 case E_SFmode:
43718 case E_DFmode:
43720 rtx tmp0 = gen_reg_rtx (XFmode);
43722 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43724 emit_insn (gen_rtx_SET (res,
43725 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43726 UNSPEC_TRUNC_NOOP)));
43728 break;
43729 case E_XFmode:
43730 emit_insn (gen_frndintxf2_floor (res, tmp1));
43731 break;
43732 case E_HImode:
43733 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43734 break;
43735 case E_SImode:
43736 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43737 break;
43738 case E_DImode:
43739 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43740 break;
43741 default:
43742 gcc_unreachable ();
43745 /* flags = signbit(a) */
43746 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43748 /* if (flags) then res = -res */
43749 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43750 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43751 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43752 pc_rtx);
43753 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43754 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43755 JUMP_LABEL (insn) = jump_label;
43757 emit_insn (gen_neg (res, res));
43759 emit_label (jump_label);
43760 LABEL_NUSES (jump_label) = 1;
43762 emit_move_insn (op0, res);
43765 /* Output code to perform a Newton-Raphson approximation of a single precision
43766 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43768 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43770 rtx x0, x1, e0, e1;
43772 x0 = gen_reg_rtx (mode);
43773 e0 = gen_reg_rtx (mode);
43774 e1 = gen_reg_rtx (mode);
43775 x1 = gen_reg_rtx (mode);
43777 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
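/* This is one Newton-Raphson step for f(x) = 1/x - b applied to the
   hardware estimate x0 = rcp(b):

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which roughly doubles the number of correct bits of the estimate.  */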
43779 b = force_reg (mode, b);
43781 /* x0 = rcp(b) estimate */
43782 if (mode == V16SFmode || mode == V8DFmode)
43784 if (TARGET_AVX512ER)
43786 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43787 UNSPEC_RCP28)));
43788 /* res = a * x0 */
43789 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43790 return;
43792 else
43793 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43794 UNSPEC_RCP14)));
43796 else
43797 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43798 UNSPEC_RCP)));
43800 /* e0 = x0 * b */
43801 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43803 /* e0 = x0 * e0 */
43804 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43806 /* e1 = x0 + x0 */
43807 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43809 /* x1 = e1 - e0 */
43810 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43812 /* res = a * x1 */
43813 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43816 /* Output code to perform a Newton-Raphson approximation of a
43817 single precision floating point [reciprocal] square root. */
43819 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43821 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43822 REAL_VALUE_TYPE r;
43823 int unspec;
43825 x0 = gen_reg_rtx (mode);
43826 e0 = gen_reg_rtx (mode);
43827 e1 = gen_reg_rtx (mode);
43828 e2 = gen_reg_rtx (mode);
43829 e3 = gen_reg_rtx (mode);
43831 if (TARGET_AVX512ER && mode == V16SFmode)
43833 if (recip)
43834 /* res = rsqrt28(a) estimate */
43835 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43836 UNSPEC_RSQRT28)));
43837 else
43839 /* x0 = rsqrt28(a) estimate */
43840 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43841 UNSPEC_RSQRT28)));
43842 /* res = rcp28(x0) estimate */
43843 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43844 UNSPEC_RCP28)));
43846 return;
43849 real_from_integer (&r, VOIDmode, -3, SIGNED);
43850 mthree = const_double_from_real_value (r, SFmode);
43852 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43853 mhalf = const_double_from_real_value (r, SFmode);
43854 unspec = UNSPEC_RSQRT;
43856 if (VECTOR_MODE_P (mode))
43858 mthree = ix86_build_const_vector (mode, true, mthree);
43859 mhalf = ix86_build_const_vector (mode, true, mhalf);
43860 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43861 if (GET_MODE_SIZE (mode) == 64)
43862 unspec = UNSPEC_RSQRT14;
43865 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43866 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
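/* Both formulas are one Newton-Raphson step for f(x) = 1/x^2 - a on
   the estimate x0 = rsqrtss(a):

     x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3)

   and sqrt(a) = a * rsqrt(a), which accounts for the extra factor of
   a (the e0 term below) in the sqrt variant.  */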
43868 a = force_reg (mode, a);
43870 /* x0 = rsqrt(a) estimate */
43871 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43872 unspec)));
43874 /* If a == 0.0, filter out the infinite estimate to prevent NaN for sqrt (0.0). */
43875 if (!recip)
43877 rtx zero = force_reg (mode, CONST0_RTX(mode));
43878 rtx mask;
43880 /* Handle masked compare. */
43881 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43883 mask = gen_reg_rtx (HImode);
43884 /* Imm value 0x4 corresponds to not-equal comparison. */
43885 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43886 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43888 else
43890 mask = gen_reg_rtx (mode);
43891 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43892 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43896 /* e0 = x0 * a */
43897 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43898 /* e1 = e0 * x0 */
43899 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43901 /* e2 = e1 - 3. */
43902 mthree = force_reg (mode, mthree);
43903 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43905 mhalf = force_reg (mode, mhalf);
43906 if (recip)
43907 /* e3 = -.5 * x0 */
43908 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43909 else
43910 /* e3 = -.5 * e0 */
43911 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43912 /* ret = e2 * e3 */
43913 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43916 #ifdef TARGET_SOLARIS
43917 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43919 static void
43920 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43921 tree decl)
43923 /* With Binutils 2.15, the "@unwind" marker must be specified on
43924 every occurrence of the ".eh_frame" section, not just the first
43925 one. */
43926 if (TARGET_64BIT
43927 && strcmp (name, ".eh_frame") == 0)
43929 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43930 flags & SECTION_WRITE ? "aw" : "a");
43931 return;
43934 #ifndef USE_GAS
43935 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43937 solaris_elf_asm_comdat_section (name, flags, decl);
43938 return;
43940 #endif
43942 default_elf_asm_named_section (name, flags, decl);
43944 #endif /* TARGET_SOLARIS */
43946 /* Return the mangling of TYPE if it is an extended fundamental type. */
43948 static const char *
43949 ix86_mangle_type (const_tree type)
43951 type = TYPE_MAIN_VARIANT (type);
43953 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43954 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43955 return NULL;
43957 switch (TYPE_MODE (type))
43959 case E_TFmode:
43960 /* __float128 is "g". */
43961 return "g";
43962 case E_XFmode:
43963 /* "long double" or __float80 is "e". */
43964 return "e";
43965 default:
43966 return NULL;
43970 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43972 static tree
43973 ix86_stack_protect_guard (void)
43975 if (TARGET_SSP_TLS_GUARD)
43977 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43978 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43979 tree type = build_qualified_type (type_node, qual);
43980 tree t;
43982 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43984 t = ix86_tls_stack_chk_guard_decl;
43986 if (t == NULL)
43988 rtx x;
43990 t = build_decl
43991 (UNKNOWN_LOCATION, VAR_DECL,
43992 get_identifier (ix86_stack_protector_guard_symbol_str),
43993 type);
43994 TREE_STATIC (t) = 1;
43995 TREE_PUBLIC (t) = 1;
43996 DECL_EXTERNAL (t) = 1;
43997 TREE_USED (t) = 1;
43998 TREE_THIS_VOLATILE (t) = 1;
43999 DECL_ARTIFICIAL (t) = 1;
44000 DECL_IGNORED_P (t) = 1;
44002 /* Do not share RTL as the declaration is visible outside of
44003 current function. */
44004 x = DECL_RTL (t);
44005 RTX_FLAG (x, used) = 1;
44007 ix86_tls_stack_chk_guard_decl = t;
44010 else
44012 tree asptrtype = build_pointer_type (type);
44014 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44015 t = build2 (MEM_REF, asptrtype, t,
44016 build_int_cst (asptrtype, 0));
44019 return t;
44022 return default_stack_protect_guard ();
44025 /* For 32-bit code we can save PIC register setup by using
44026 __stack_chk_fail_local hidden function instead of calling
44027 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44028 register, so it is better to call __stack_chk_fail directly. */
44030 static tree ATTRIBUTE_UNUSED
44031 ix86_stack_protect_fail (void)
44033 return TARGET_64BIT
44034 ? default_external_stack_protect_fail ()
44035 : default_hidden_stack_protect_fail ();
44038 /* Select a format to encode pointers in exception handling data. CODE
44039 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44040 true if the symbol may be affected by dynamic relocations.
44042 ??? All x86 object file formats are capable of representing this.
44043 After all, the relocation needed is the same as for the call insn.
44044 Whether or not a particular assembler allows us to enter such, I
44045 guess we'll have to see. */
44046 int
44047 asm_preferred_eh_data_format (int code, int global)
44049 if (flag_pic)
44051 int type = DW_EH_PE_sdata8;
44052 if (!TARGET_64BIT
44053 || ix86_cmodel == CM_SMALL_PIC
44054 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44055 type = DW_EH_PE_sdata4;
44056 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44058 if (ix86_cmodel == CM_SMALL
44059 || (ix86_cmodel == CM_MEDIUM && code))
44060 return DW_EH_PE_udata4;
44061 return DW_EH_PE_absptr;
44064 /* Expand copysign from SIGN to the positive value ABS_VALUE
44065 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44066 the sign-bit. */
44067 static void
44068 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44070 machine_mode mode = GET_MODE (sign);
44071 rtx sgn = gen_reg_rtx (mode);
44072 if (mask == NULL_RTX)
44074 machine_mode vmode;
44076 if (mode == SFmode)
44077 vmode = V4SFmode;
44078 else if (mode == DFmode)
44079 vmode = V2DFmode;
44080 else
44081 vmode = mode;
44083 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44084 if (!VECTOR_MODE_P (mode))
44086 /* We need to generate a scalar mode mask in this case. */
44087 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44088 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44089 mask = gen_reg_rtx (mode);
44090 emit_insn (gen_rtx_SET (mask, tmp));
44093 else
44094 mask = gen_rtx_NOT (mode, mask);
44095 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44096 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44099 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44100 mask for masking out the sign-bit is stored in *SMASK, if that is
44101 non-null. */
44102 static rtx
44103 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44105 machine_mode vmode, mode = GET_MODE (op0);
44106 rtx xa, mask;
44108 xa = gen_reg_rtx (mode);
44109 if (mode == SFmode)
44110 vmode = V4SFmode;
44111 else if (mode == DFmode)
44112 vmode = V2DFmode;
44113 else
44114 vmode = mode;
44115 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44116 if (!VECTOR_MODE_P (mode))
44118 /* We need to generate a scalar mode mask in this case. */
44119 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44120 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44121 mask = gen_reg_rtx (mode);
44122 emit_insn (gen_rtx_SET (mask, tmp));
44124 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44126 if (smask)
44127 *smask = mask;
44129 return xa;
44132 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44133 swapping the operands if SWAP_OPERANDS is true. The expanded
44134 code is a forward jump to a newly created label in case the
44135 comparison is true. The generated label rtx is returned. */
44136 static rtx_code_label *
44137 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44138 bool swap_operands)
44140 bool unordered_compare = ix86_unordered_fp_compare (code);
44141 rtx_code_label *label;
44142 rtx tmp, reg;
44144 if (swap_operands)
44145 std::swap (op0, op1);
44147 label = gen_label_rtx ();
44148 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44149 if (unordered_compare)
44150 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44151 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44152 emit_insn (gen_rtx_SET (reg, tmp));
44153 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44154 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44155 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44156 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44157 JUMP_LABEL (tmp) = label;
44159 return label;
44162 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44163 using comparison code CODE. Operands are swapped for the comparison if
44164 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44165 static rtx
44166 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44167 bool swap_operands)
44169 rtx (*insn)(rtx, rtx, rtx, rtx);
44170 machine_mode mode = GET_MODE (op0);
44171 rtx mask = gen_reg_rtx (mode);
44173 if (swap_operands)
44174 std::swap (op0, op1);
44176 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44178 emit_insn (insn (mask, op0, op1,
44179 gen_rtx_fmt_ee (code, mode, op0, op1)));
44180 return mask;
44183 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44184 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44185 static rtx
44186 ix86_gen_TWO52 (machine_mode mode)
44188 REAL_VALUE_TYPE TWO52r;
44189 rtx TWO52;
44191 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44192 TWO52 = const_double_from_real_value (TWO52r, mode);
44193 TWO52 = force_reg (mode, TWO52);
44195 return TWO52;
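/* Illustrative sketch (not emitted code): the rounding sequences below
   rely on the classic 2**p trick.  For |x| < 2**p, where p is the number
   of explicit mantissa bits (52 for DFmode, 23 for SFmode), adding and
   then subtracting 2**p pushes the fraction bits out of the mantissa, so
   the intermediate sum is rounded to an integer in the current rounding
   mode.  A minimal C model, assuming DFmode and that the arithmetic is
   not constant-folded away:

     static double
     round_via_two52 (double xa)        // assumes xa = fabs (x), xa < 2**52
     {
       const double two52 = 0x1p52;     // C99 hexadecimal float constant
       volatile double t = xa + two52;  // fraction bits are rounded off here
       return t - two52;                // integral-valued result
     }
*/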
44198 /* Expand SSE sequence for computing lround from OP1 storing
44199 into OP0. */
44200 void
44201 ix86_expand_lround (rtx op0, rtx op1)
44203 /* C code for the stuff we're doing below:
44204 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44205 return (long)tmp;
44207 machine_mode mode = GET_MODE (op1);
44208 const struct real_format *fmt;
44209 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44210 rtx adj;
44212 /* load nextafter (0.5, 0.0) */
44213 fmt = REAL_MODE_FORMAT (mode);
44214 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44215 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44217 /* adj = copysign (0.5, op1) */
44218 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44219 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44221 /* adj = op1 + adj */
44222 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44224 /* op0 = (imode)adj */
44225 expand_fix (op0, adj, 0);
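/* Worked example (illustration only) of why the bias above is
   nextafter (0.5, 0.0) rather than 0.5 itself.  Take the largest double
   below 0.5, x = 0.5 - 2**-54; lround (x) must be 0.  With an exact 0.5
   bias, x + 0.5 = 1 - 2**-54 lies exactly halfway between the
   representable values 1 - 2**-53 and 1.0, so round-to-nearest-even
   yields 1.0 and the final truncation would wrongly return 1.  With
   pred_half = 0.5 - 2**-54 the sum is 1 - 2**-53, which is representable,
   stays below 1.0 and truncates to the correct 0.  A C model of the
   whole sequence:

     #include <math.h>

     static long
     lround_sse_model (double x)
     {
       double pred_half = nextafter (0.5, 0.0);  // 0.5 - 2**-54
       double adj = copysign (pred_half, x);
       return (long) (x + adj);                  // truncating conversion
     }
*/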
44228 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
44229 into OP0. */
44230 void
44231 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44233 /* C code for the stuff we're doing below (for do_floor):
44234 xi = (long)op1;
44235 xi -= (double)xi > op1 ? 1 : 0;
44236 return xi;
44238 machine_mode fmode = GET_MODE (op1);
44239 machine_mode imode = GET_MODE (op0);
44240 rtx ireg, freg, tmp;
44241 rtx_code_label *label;
44243 /* reg = (long)op1 */
44244 ireg = gen_reg_rtx (imode);
44245 expand_fix (ireg, op1, 0);
44247 /* freg = (double)reg */
44248 freg = gen_reg_rtx (fmode);
44249 expand_float (freg, ireg, 0);
44251 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44252 label = ix86_expand_sse_compare_and_jump (UNLE,
44253 freg, op1, !do_floor);
44254 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44255 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44256 emit_move_insn (ireg, tmp);
44258 emit_label (label);
44259 LABEL_NUSES (label) = 1;
44261 emit_move_insn (op0, ireg);
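/* For reference (illustration only), the do_floor == false (lceil)
   variant of the sequence above compensates in the other direction.
   A C model of both cases, mirroring the comment at the top of the
   function:

     static long
     lfloorceil_model (double x, int do_floor)
     {
       long xi = (long) x;          // truncating conversion
       double back = (double) xi;
       if (do_floor)
         xi -= back > x ? 1 : 0;    // truncation overshot x: step down
       else
         xi += back < x ? 1 : 0;    // truncation undershot x: step up
       return xi;
     }
*/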
44264 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
44265 void
44266 ix86_expand_rint (rtx operand0, rtx operand1)
44268 /* C code for the stuff we're doing below:
44269 xa = fabs (operand1);
44270 if (!isless (xa, 2**52))
44271 return operand1;
44272 two52 = 2**52;
44273 if (flag_rounding_math)
44275 two52 = copysign (two52, operand1);
44276 xa = operand1;
44278 xa = xa + two52 - two52;
44279 return copysign (xa, operand1);
44281 machine_mode mode = GET_MODE (operand0);
44282 rtx res, xa, TWO52, two52, mask;
44283 rtx_code_label *label;
44285 res = gen_reg_rtx (mode);
44286 emit_move_insn (res, operand1);
44288 /* xa = abs (operand1) */
44289 xa = ix86_expand_sse_fabs (res, &mask);
44291 /* if (!isless (xa, TWO52)) goto label; */
44292 TWO52 = ix86_gen_TWO52 (mode);
44293 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44295 two52 = TWO52;
44296 if (flag_rounding_math)
44298 two52 = gen_reg_rtx (mode);
44299 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
44300 xa = res;
44303 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
44304 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
44306 ix86_sse_copysign_to_positive (res, xa, res, mask);
44308 emit_label (label);
44309 LABEL_NUSES (label) = 1;
44311 emit_move_insn (operand0, res);
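/* Note on the flag_rounding_math branch above (illustration only).  When
   the dynamic rounding mode must be honoured, the 2**52 bias gets the
   sign of the input and is added to the signed value itself, so the
   emitted sequence behaves roughly like

     static double
     rint_model (double x)              // assumes fabs (x) < 2**52
     {
       double two52 = copysign (0x1p52, x);
       volatile double t = x + two52;   // rounded in the current mode
       t -= two52;
       return copysign (t, x);          // fix the sign of a zero result
     }

   so that, e.g., rounding toward -inf is applied to the negative value
   itself rather than to fabs (x). */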
44314 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44315 into OPERAND0. */
44316 void
44317 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44319 /* C code for the stuff we expand below.
44320 double xa = fabs (x), x2;
44321 if (!isless (xa, TWO52))
44322 return x;
44323 xa = xa + TWO52 - TWO52;
44324 x2 = copysign (xa, x);
44325 Compensate. Floor:
44326 if (x2 > x)
44327 x2 -= 1;
44328 Compensate. Ceil:
44329 if (x2 < x)
44330 x2 -= -1;
44331 return x2;
44333 machine_mode mode = GET_MODE (operand0);
44334 rtx xa, TWO52, tmp, one, res, mask;
44335 rtx_code_label *label;
44337 TWO52 = ix86_gen_TWO52 (mode);
44339 /* Temporary for holding the result, initialized to the input
44340 operand to ease control flow. */
44341 res = gen_reg_rtx (mode);
44342 emit_move_insn (res, operand1);
44344 /* xa = abs (operand1) */
44345 xa = ix86_expand_sse_fabs (res, &mask);
44347 /* if (!isless (xa, TWO52)) goto label; */
44348 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44350 /* xa = xa + TWO52 - TWO52; */
44351 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44352 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44354 /* xa = copysign (xa, operand1) */
44355 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44357 /* generate 1.0 or -1.0 */
44358 one = force_reg (mode,
44359 const_double_from_real_value (do_floor
44360 ? dconst1 : dconstm1, mode));
44362 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44363 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44364 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44365 /* We always need to subtract here to preserve signed zero. */
44366 tmp = expand_simple_binop (mode, MINUS,
44367 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44368 emit_move_insn (res, tmp);
44370 emit_label (label);
44371 LABEL_NUSES (label) = 1;
44373 emit_move_insn (operand0, res);
44376 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44377 into OPERAND0. */
44378 void
44379 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44381 /* C code for the stuff we expand below.
44382 double xa = fabs (x), x2;
44383 if (!isless (xa, TWO52))
44384 return x;
44385 x2 = (double)(long)x;
44386 Compensate. Floor:
44387 if (x2 > x)
44388 x2 -= 1;
44389 Compensate. Ceil:
44390 if (x2 < x)
44391 x2 += 1;
44392 if (HONOR_SIGNED_ZEROS (mode))
44393 return copysign (x2, x);
44394 return x2;
44396 machine_mode mode = GET_MODE (operand0);
44397 rtx xa, xi, TWO52, tmp, one, res, mask;
44398 rtx_code_label *label;
44400 TWO52 = ix86_gen_TWO52 (mode);
44402 /* Temporary for holding the result, initialized to the input
44403 operand to ease control flow. */
44404 res = gen_reg_rtx (mode);
44405 emit_move_insn (res, operand1);
44407 /* xa = abs (operand1) */
44408 xa = ix86_expand_sse_fabs (res, &mask);
44410 /* if (!isless (xa, TWO52)) goto label; */
44411 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44413 /* xa = (double)(long)x */
44414 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44415 expand_fix (xi, res, 0);
44416 expand_float (xa, xi, 0);
44418 /* generate 1.0 */
44419 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44421 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44422 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44423 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44424 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44425 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44426 emit_move_insn (res, tmp);
44428 if (HONOR_SIGNED_ZEROS (mode))
44429 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44431 emit_label (label);
44432 LABEL_NUSES (label) = 1;
44434 emit_move_insn (operand0, res);
44437 /* Expand SSE sequence for computing round from OPERAND1 storing
44438 into OPERAND0. This sequence works without relying on DImode truncation
44439 via cvttsd2siq, which is only available on 64-bit targets. */
44440 void
44441 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44443 /* C code for the stuff we expand below.
44444 double xa = fabs (x), xa2, x2;
44445 if (!isless (xa, TWO52))
44446 return x;
44447 Using the absolute value and copying back sign makes
44448 -0.0 -> -0.0 correct.
44449 xa2 = xa + TWO52 - TWO52;
44450 Compensate.
44451 dxa = xa2 - xa;
44452 if (dxa <= -0.5)
44453 xa2 += 1;
44454 else if (dxa > 0.5)
44455 xa2 -= 1;
44456 x2 = copysign (xa2, x);
44457 return x2;
44459 machine_mode mode = GET_MODE (operand0);
44460 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44461 rtx_code_label *label;
44463 TWO52 = ix86_gen_TWO52 (mode);
44465 /* Temporary for holding the result, initialized to the input
44466 operand to ease control flow. */
44467 res = gen_reg_rtx (mode);
44468 emit_move_insn (res, operand1);
44470 /* xa = abs (operand1) */
44471 xa = ix86_expand_sse_fabs (res, &mask);
44473 /* if (!isless (xa, TWO52)) goto label; */
44474 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44476 /* xa2 = xa + TWO52 - TWO52; */
44477 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44478 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44480 /* dxa = xa2 - xa; */
44481 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44483 /* generate 0.5, 1.0 and -0.5 */
44484 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44485 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44486 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44487 0, OPTAB_DIRECT);
44489 /* Compensate. */
44490 tmp = gen_reg_rtx (mode);
44491 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44492 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44493 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44494 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44495 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44496 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44497 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44498 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44500 /* res = copysign (xa2, operand1) */
44501 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44503 emit_label (label);
44504 LABEL_NUSES (label) = 1;
44506 emit_move_insn (operand0, res);
44509 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44510 into OPERAND0. */
44511 void
44512 ix86_expand_trunc (rtx operand0, rtx operand1)
44514 /* C code for SSE variant we expand below.
44515 double xa = fabs (x), x2;
44516 if (!isless (xa, TWO52))
44517 return x;
44518 x2 = (double)(long)x;
44519 if (HONOR_SIGNED_ZEROS (mode))
44520 return copysign (x2, x);
44521 return x2;
44523 machine_mode mode = GET_MODE (operand0);
44524 rtx xa, xi, TWO52, res, mask;
44525 rtx_code_label *label;
44527 TWO52 = ix86_gen_TWO52 (mode);
44529 /* Temporary for holding the result, initialized to the input
44530 operand to ease control flow. */
44531 res = gen_reg_rtx (mode);
44532 emit_move_insn (res, operand1);
44534 /* xa = abs (operand1) */
44535 xa = ix86_expand_sse_fabs (res, &mask);
44537 /* if (!isless (xa, TWO52)) goto label; */
44538 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44540 /* x = (double)(long)x */
44541 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44542 expand_fix (xi, res, 0);
44543 expand_float (res, xi, 0);
44545 if (HONOR_SIGNED_ZEROS (mode))
44546 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44548 emit_label (label);
44549 LABEL_NUSES (label) = 1;
44551 emit_move_insn (operand0, res);
44554 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44555 into OPERAND0. */
44556 void
44557 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44559 machine_mode mode = GET_MODE (operand0);
44560 rtx xa, mask, TWO52, one, res, smask, tmp;
44561 rtx_code_label *label;
44563 /* C code for SSE variant we expand below.
44564 double xa = fabs (x), x2;
44565 if (!isless (xa, TWO52))
44566 return x;
44567 xa2 = xa + TWO52 - TWO52;
44568 Compensate:
44569 if (xa2 > xa)
44570 xa2 -= 1.0;
44571 x2 = copysign (xa2, x);
44572 return x2;
44575 TWO52 = ix86_gen_TWO52 (mode);
44577 /* Temporary for holding the result, initialized to the input
44578 operand to ease control flow. */
44579 res = gen_reg_rtx (mode);
44580 emit_move_insn (res, operand1);
44582 /* xa = abs (operand1) */
44583 xa = ix86_expand_sse_fabs (res, &smask);
44585 /* if (!isless (xa, TWO52)) goto label; */
44586 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44588 /* res = xa + TWO52 - TWO52; */
44589 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44590 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44591 emit_move_insn (res, tmp);
44593 /* generate 1.0 */
44594 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44596 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44597 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44598 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44599 tmp = expand_simple_binop (mode, MINUS,
44600 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44601 emit_move_insn (res, tmp);
44603 /* res = copysign (res, operand1) */
44604 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44606 emit_label (label);
44607 LABEL_NUSES (label) = 1;
44609 emit_move_insn (operand0, res);
44612 /* Expand SSE sequence for computing round from OPERAND1 storing
44613 into OPERAND0. */
44614 void
44615 ix86_expand_round (rtx operand0, rtx operand1)
44617 /* C code for the stuff we're doing below:
44618 double xa = fabs (x);
44619 if (!isless (xa, TWO52))
44620 return x;
44621 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44622 return copysign (xa, x);
44624 machine_mode mode = GET_MODE (operand0);
44625 rtx res, TWO52, xa, xi, half, mask;
44626 rtx_code_label *label;
44627 const struct real_format *fmt;
44628 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44630 /* Temporary for holding the result, initialized to the input
44631 operand to ease control flow. */
44632 res = gen_reg_rtx (mode);
44633 emit_move_insn (res, operand1);
44635 TWO52 = ix86_gen_TWO52 (mode);
44636 xa = ix86_expand_sse_fabs (res, &mask);
44637 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44639 /* load nextafter (0.5, 0.0) */
44640 fmt = REAL_MODE_FORMAT (mode);
44641 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44642 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44644 /* xa = xa + 0.5 */
44645 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44646 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44648 /* xa = (double)(int64_t)xa */
44649 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44650 expand_fix (xi, xa, 0);
44651 expand_float (xa, xi, 0);
44653 /* res = copysign (xa, operand1) */
44654 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44656 emit_label (label);
44657 LABEL_NUSES (label) = 1;
44659 emit_move_insn (operand0, res);
44662 /* Expand SSE sequence for computing round
44663 from OP1 storing into OP0 using sse4 round insn. */
44664 void
44665 ix86_expand_round_sse4 (rtx op0, rtx op1)
44667 machine_mode mode = GET_MODE (op0);
44668 rtx e1, e2, res, half;
44669 const struct real_format *fmt;
44670 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44671 rtx (*gen_copysign) (rtx, rtx, rtx);
44672 rtx (*gen_round) (rtx, rtx, rtx);
44674 switch (mode)
44676 case E_SFmode:
44677 gen_copysign = gen_copysignsf3;
44678 gen_round = gen_sse4_1_roundsf2;
44679 break;
44680 case E_DFmode:
44681 gen_copysign = gen_copysigndf3;
44682 gen_round = gen_sse4_1_rounddf2;
44683 break;
44684 default:
44685 gcc_unreachable ();
44688 /* round (a) = trunc (a + copysign (0.5, a)) */
44690 /* load nextafter (0.5, 0.0) */
44691 fmt = REAL_MODE_FORMAT (mode);
44692 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44693 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44694 half = const_double_from_real_value (pred_half, mode);
44696 /* e1 = copysign (0.5, op1) */
44697 e1 = gen_reg_rtx (mode);
44698 emit_insn (gen_copysign (e1, half, op1));
44700 /* e2 = op1 + e1 */
44701 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44703 /* res = trunc (e2) */
44704 res = gen_reg_rtx (mode);
44705 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44707 emit_move_insn (op0, res);
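/* The SSE4.1 expansion above behaves like the following C model
   (illustration only), using the same nextafter (0.5, 0.0) bias as
   ix86_expand_round:

     #include <math.h>

     static double
     round_sse4_model (double a)
     {
       double pred_half = nextafter (0.5, 0.0);
       double e1 = copysign (pred_half, a);   // gen_copysign{sf,df}3
       double e2 = a + e1;
       return trunc (e2);                     // sse4_1_round with ROUND_TRUNC
     }
*/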
44711 /* Table of valid machine attributes. */
44712 static const struct attribute_spec ix86_attribute_table[] =
44714 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44715 affects_type_identity, exclusions } */
44716 /* Stdcall attribute says callee is responsible for popping arguments
44717 if they are not variable. */
44718 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44719 true, NULL },
44720 /* Fastcall attribute says callee is responsible for popping arguments
44721 if they are not variable. */
44722 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44723 true, NULL },
44724 /* Thiscall attribute says callee is responsible for popping arguments
44725 if they are not variable. */
44726 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44727 true, NULL },
44728 /* Cdecl attribute says the callee is a normal C declaration. */
44729 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44730 true, NULL },
44731 /* Regparm attribute specifies how many integer arguments are to be
44732 passed in registers. */
44733 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44734 true, NULL },
44735 /* Sseregparm attribute says we are using x86_64 calling conventions
44736 for FP arguments. */
44737 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44738 true, NULL },
44739 /* The transactional memory builtins are implicitly regparm or fastcall
44740 depending on the ABI. Override the generic do-nothing attribute that
44741 these builtins were declared with. */
44742 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44743 true, NULL },
44744 /* force_align_arg_pointer says this function realigns the stack at entry. */
44745 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44746 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false,
44747 NULL },
44748 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44749 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false, NULL },
44750 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false, NULL },
44751 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44752 false, NULL },
44753 #endif
44754 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44755 false, NULL },
44756 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44757 false, NULL },
44758 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44759 SUBTARGET_ATTRIBUTE_TABLE,
44760 #endif
44761 /* ms_abi and sysv_abi calling convention function attributes. */
44762 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true, NULL },
44763 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true,
44764 NULL },
44765 { "ms_abi va_list", 0, 0, false, false, false, NULL, false, NULL },
44766 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false, NULL },
44767 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44768 false, NULL },
44769 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44770 ix86_handle_callee_pop_aggregate_return, true, NULL },
44771 { "interrupt", 0, 0, false, true, true,
44772 ix86_handle_interrupt_attribute, false, NULL },
44773 { "no_caller_saved_registers", 0, 0, false, true, true,
44774 ix86_handle_no_caller_saved_registers_attribute, false, NULL },
44775 { "naked", 0, 0, true, false, false,
44776 ix86_handle_fndecl_attribute, false, NULL },
44778 /* End element. */
44779 { NULL, 0, 0, false, false, false, NULL, false, NULL }
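/* User-level view of a few of the attributes registered above (usage
   sketch only; the GCC manual documents the full semantics):

     // Pass the first two integer arguments in registers.
     int __attribute__ ((regparm (2))) add2 (int a, int b);

     // Callee pops its (non-variadic) stack arguments.
     int __attribute__ ((stdcall)) winapi_like (int a, int b);

     // Use the Microsoft x86-64 calling convention for this function.
     long __attribute__ ((ms_abi)) ms_callback (long a);

     // Emit no prologue/epilogue; the body must consist of basic asm.
     void __attribute__ ((naked)) bare_entry (void);
*/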
44782 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44783 static int
44784 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44785 tree vectype, int)
44787 bool fp = false;
44788 machine_mode mode = TImode;
44789 int index;
44790 if (vectype != NULL)
44792 fp = FLOAT_TYPE_P (vectype);
44793 mode = TYPE_MODE (vectype);
44796 switch (type_of_cost)
44798 case scalar_stmt:
44799 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44801 case scalar_load:
44802 /* Load/store costs are relative to a register move, which costs 2. Recompute
44803 them in COSTS_N_INSNS units so everything has the same base. */
44804 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44805 : ix86_cost->int_load [2]) / 2;
44807 case scalar_store:
44808 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44809 : ix86_cost->int_store [2]) / 2;
44811 case vector_stmt:
44812 return ix86_vec_cost (mode,
44813 fp ? ix86_cost->addss : ix86_cost->sse_op,
44814 true);
44816 case vector_load:
44817 index = sse_store_index (mode);
44818 /* See PR82713 - we may end up being called on non-vector type. */
44819 if (index < 0)
44820 index = 2;
44821 return ix86_vec_cost (mode,
44822 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44823 true);
44825 case vector_store:
44826 index = sse_store_index (mode);
44827 /* See PR82713 - we may end up being called on non-vector type. */
44828 if (index < 0)
44829 index = 2;
44830 return ix86_vec_cost (mode,
44831 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44832 true);
44834 case vec_to_scalar:
44835 case scalar_to_vec:
44836 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44838 /* We should have separate costs for unaligned loads and gather/scatter.
44839 Do that incrementally. */
44840 case unaligned_load:
44841 index = sse_store_index (mode);
44842 /* See PR82713 - we may end up being called on non-vector type. */
44843 if (index < 0)
44844 index = 2;
44845 return ix86_vec_cost (mode,
44846 COSTS_N_INSNS
44847 (ix86_cost->sse_unaligned_load[index]) / 2,
44848 true);
44850 case unaligned_store:
44851 index = sse_store_index (mode);
44852 /* See PR82713 - we may end up being called on non-vector type. */
44853 if (index < 0)
44854 index = 2;
44855 return ix86_vec_cost (mode,
44856 COSTS_N_INSNS
44857 (ix86_cost->sse_unaligned_store[index]) / 2,
44858 true);
44860 case vector_gather_load:
44861 return ix86_vec_cost (mode,
44862 COSTS_N_INSNS
44863 (ix86_cost->gather_static
44864 + ix86_cost->gather_per_elt
44865 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44866 true);
44868 case vector_scatter_store:
44869 return ix86_vec_cost (mode,
44870 COSTS_N_INSNS
44871 (ix86_cost->scatter_static
44872 + ix86_cost->scatter_per_elt
44873 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44874 true);
44876 case cond_branch_taken:
44877 return ix86_cost->cond_taken_branch_cost;
44879 case cond_branch_not_taken:
44880 return ix86_cost->cond_not_taken_branch_cost;
44882 case vec_perm:
44883 case vec_promote_demote:
44884 return ix86_vec_cost (mode,
44885 ix86_cost->sse_op, true);
44887 case vec_construct:
44888 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44890 default:
44891 gcc_unreachable ();
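/* Arithmetic behind the COSTS_N_INSNS (...) / 2 rescaling above
   (illustration only).  The processor cost tables express load/store
   costs relative to a register-to-register move, which is 2, while
   COSTS_N_INSNS (N) expands to N * 4.  Dividing by 2 therefore maps a
   cost of 2 units (one move) to COSTS_N_INSNS (1):

     table units   COSTS_N_INSNS (units) / 2   equivalent
          2                    4               COSTS_N_INSNS (1)
          4                    8               COSTS_N_INSNS (2)
          6                   12               COSTS_N_INSNS (3)
*/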
44895 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44896 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44897 insn every time. */
44899 static GTY(()) rtx_insn *vselect_insn;
44901 /* Initialize vselect_insn. */
44903 static void
44904 init_vselect_insn (void)
44906 unsigned i;
44907 rtx x;
44909 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44910 for (i = 0; i < MAX_VECT_LEN; ++i)
44911 XVECEXP (x, 0, i) = const0_rtx;
44912 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44913 const0_rtx), x);
44914 x = gen_rtx_SET (const0_rtx, x);
44915 start_sequence ();
44916 vselect_insn = emit_insn (x);
44917 end_sequence ();
44920 /* Construct (set target (vec_select op0 (parallel perm))) and
44921 return true if that's a valid instruction in the active ISA. */
44923 static bool
44924 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44925 unsigned nelt, bool testing_p)
44927 unsigned int i;
44928 rtx x, save_vconcat;
44929 int icode;
44931 if (vselect_insn == NULL_RTX)
44932 init_vselect_insn ();
44934 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44935 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44936 for (i = 0; i < nelt; ++i)
44937 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44938 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44939 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44940 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44941 SET_DEST (PATTERN (vselect_insn)) = target;
44942 icode = recog_memoized (vselect_insn);
44944 if (icode >= 0 && !testing_p)
44945 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44947 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44948 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44949 INSN_CODE (vselect_insn) = -1;
44951 return icode >= 0;
44954 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44956 static bool
44957 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44958 const unsigned char *perm, unsigned nelt,
44959 bool testing_p)
44961 machine_mode v2mode;
44962 rtx x;
44963 bool ok;
44965 if (vselect_insn == NULL_RTX)
44966 init_vselect_insn ();
44968 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44969 return false;
44970 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44971 PUT_MODE (x, v2mode);
44972 XEXP (x, 0) = op0;
44973 XEXP (x, 1) = op1;
44974 ok = expand_vselect (target, x, perm, nelt, testing_p);
44975 XEXP (x, 0) = const0_rtx;
44976 XEXP (x, 1) = const0_rtx;
44977 return ok;
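/* Shape of the RTL the two helpers above construct (sketch).  For a
   two-operand V4SImode interleave-low, expand_vselect_vconcat builds

     (set (reg:V4SI target)
          (vec_select:V4SI
            (vec_concat:V8SI (reg:V4SI op0) (reg:V4SI op1))
            (parallel [(const_int 0) (const_int 4)
                       (const_int 1) (const_int 5)])))

   which recog_memoized can match against the punpckldq pattern in
   sse.md; the single-operand expand_vselect form puts op0 directly
   under the vec_select instead of the vec_concat. */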
44980 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44981 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44983 static bool
44984 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44986 machine_mode mmode, vmode = d->vmode;
44987 unsigned i, mask, nelt = d->nelt;
44988 rtx target, op0, op1, maskop, x;
44989 rtx rperm[32], vperm;
44991 if (d->one_operand_p)
44992 return false;
44993 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44994 && (TARGET_AVX512BW
44995 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44997 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44999 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45001 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45003 else
45004 return false;
45006 /* This is a blend, not a permute. Elements must stay in their
45007 respective lanes. */
45008 for (i = 0; i < nelt; ++i)
45010 unsigned e = d->perm[i];
45011 if (!(e == i || e == i + nelt))
45012 return false;
45015 if (d->testing_p)
45016 return true;
45018 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45019 decision should be extracted elsewhere, so that we only try that
45020 sequence once all budget==3 options have been tried. */
45021 target = d->target;
45022 op0 = d->op0;
45023 op1 = d->op1;
45024 mask = 0;
45026 switch (vmode)
45028 case E_V8DFmode:
45029 case E_V16SFmode:
45030 case E_V4DFmode:
45031 case E_V8SFmode:
45032 case E_V2DFmode:
45033 case E_V4SFmode:
45034 case E_V8HImode:
45035 case E_V8SImode:
45036 case E_V32HImode:
45037 case E_V64QImode:
45038 case E_V16SImode:
45039 case E_V8DImode:
45040 for (i = 0; i < nelt; ++i)
45041 mask |= (d->perm[i] >= nelt) << i;
45042 break;
45044 case E_V2DImode:
45045 for (i = 0; i < 2; ++i)
45046 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45047 vmode = V8HImode;
45048 goto do_subreg;
45050 case E_V4SImode:
45051 for (i = 0; i < 4; ++i)
45052 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45053 vmode = V8HImode;
45054 goto do_subreg;
45056 case E_V16QImode:
45057 /* See if bytes move in pairs so we can use pblendw with
45058 an immediate argument, rather than pblendvb with a vector
45059 argument. */
45060 for (i = 0; i < 16; i += 2)
45061 if (d->perm[i] + 1 != d->perm[i + 1])
45063 use_pblendvb:
45064 for (i = 0; i < nelt; ++i)
45065 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45067 finish_pblendvb:
45068 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45069 vperm = force_reg (vmode, vperm);
45071 if (GET_MODE_SIZE (vmode) == 16)
45072 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45073 else
45074 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45075 if (target != d->target)
45076 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45077 return true;
45080 for (i = 0; i < 8; ++i)
45081 mask |= (d->perm[i * 2] >= 16) << i;
45082 vmode = V8HImode;
45083 /* FALLTHRU */
45085 do_subreg:
45086 target = gen_reg_rtx (vmode);
45087 op0 = gen_lowpart (vmode, op0);
45088 op1 = gen_lowpart (vmode, op1);
45089 break;
45091 case E_V32QImode:
45092 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45093 for (i = 0; i < 32; i += 2)
45094 if (d->perm[i] + 1 != d->perm[i + 1])
45095 goto use_pblendvb;
45096 /* See if bytes move in quadruplets. If yes, vpblendd
45097 with immediate can be used. */
45098 for (i = 0; i < 32; i += 4)
45099 if (d->perm[i] + 2 != d->perm[i + 2])
45100 break;
45101 if (i < 32)
45103 /* See if bytes move the same in both lanes. If yes,
45104 vpblendw with immediate can be used. */
45105 for (i = 0; i < 16; i += 2)
45106 if (d->perm[i] + 16 != d->perm[i + 16])
45107 goto use_pblendvb;
45109 /* Use vpblendw. */
45110 for (i = 0; i < 16; ++i)
45111 mask |= (d->perm[i * 2] >= 32) << i;
45112 vmode = V16HImode;
45113 goto do_subreg;
45116 /* Use vpblendd. */
45117 for (i = 0; i < 8; ++i)
45118 mask |= (d->perm[i * 4] >= 32) << i;
45119 vmode = V8SImode;
45120 goto do_subreg;
45122 case E_V16HImode:
45123 /* See if words move in pairs. If yes, vpblendd can be used. */
45124 for (i = 0; i < 16; i += 2)
45125 if (d->perm[i] + 1 != d->perm[i + 1])
45126 break;
45127 if (i < 16)
45129 /* See if words move the same in both lanes. If not,
45130 vpblendvb must be used. */
45131 for (i = 0; i < 8; i++)
45132 if (d->perm[i] + 8 != d->perm[i + 8])
45134 /* Use vpblendvb. */
45135 for (i = 0; i < 32; ++i)
45136 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45138 vmode = V32QImode;
45139 nelt = 32;
45140 target = gen_reg_rtx (vmode);
45141 op0 = gen_lowpart (vmode, op0);
45142 op1 = gen_lowpart (vmode, op1);
45143 goto finish_pblendvb;
45146 /* Use vpblendw. */
45147 for (i = 0; i < 16; ++i)
45148 mask |= (d->perm[i] >= 16) << i;
45149 break;
45152 /* Use vpblendd. */
45153 for (i = 0; i < 8; ++i)
45154 mask |= (d->perm[i * 2] >= 16) << i;
45155 vmode = V8SImode;
45156 goto do_subreg;
45158 case E_V4DImode:
45159 /* Use vpblendd. */
45160 for (i = 0; i < 4; ++i)
45161 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45162 vmode = V8SImode;
45163 goto do_subreg;
45165 default:
45166 gcc_unreachable ();
45169 switch (vmode)
45171 case E_V8DFmode:
45172 case E_V8DImode:
45173 mmode = QImode;
45174 break;
45175 case E_V16SFmode:
45176 case E_V16SImode:
45177 mmode = HImode;
45178 break;
45179 case E_V32HImode:
45180 mmode = SImode;
45181 break;
45182 case E_V64QImode:
45183 mmode = DImode;
45184 break;
45185 default:
45186 mmode = VOIDmode;
45189 if (mmode != VOIDmode)
45190 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45191 else
45192 maskop = GEN_INT (mask);
45194 /* This matches five different patterns with the different modes. */
45195 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45196 x = gen_rtx_SET (target, x);
45197 emit_insn (x);
45198 if (target != d->target)
45199 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45201 return true;
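/* Example of the immediate-mask computation above (illustration).  For a
   V8SFmode blend with perm = {0, 9, 2, 11, 4, 13, 6, 15}, each element
   either stays in place (perm[i] == i) or is taken from op1
   (perm[i] == i + 8), so the loop sets mask |= (perm[i] >= 8) << i,
   giving mask = 0xaa, and the vec_merge pattern is emitted with that
   immediate. */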
45204 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45205 in terms of the variable form of vpermilps.
45207 Note that we will have already failed the immediate input vpermilps,
45208 which requires that the high and low part shuffle be identical; the
45209 variable form doesn't require that. */
45211 static bool
45212 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45214 rtx rperm[8], vperm;
45215 unsigned i;
45217 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45218 return false;
45220 /* We can only permute within the 128-bit lane. */
45221 for (i = 0; i < 8; ++i)
45223 unsigned e = d->perm[i];
45224 if (i < 4 ? e >= 4 : e < 4)
45225 return false;
45228 if (d->testing_p)
45229 return true;
45231 for (i = 0; i < 8; ++i)
45233 unsigned e = d->perm[i];
45235 /* Within each 128-bit lane, the elements of op0 are numbered
45236 from 0 and the elements of op1 are numbered from 4. */
45237 if (e >= 8 + 4)
45238 e -= 8;
45239 else if (e >= 4)
45240 e -= 4;
45242 rperm[i] = GEN_INT (e);
45245 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45246 vperm = force_reg (V8SImode, vperm);
45247 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45249 return true;
45252 /* Return true if permutation D can be performed as VMODE permutation
45253 instead. */
45255 static bool
45256 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45258 unsigned int i, j, chunk;
45260 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45261 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45262 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45263 return false;
45265 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45266 return true;
45268 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45269 for (i = 0; i < d->nelt; i += chunk)
45270 if (d->perm[i] & (chunk - 1))
45271 return false;
45272 else
45273 for (j = 1; j < chunk; ++j)
45274 if (d->perm[i] + j != d->perm[i + j])
45275 return false;
45277 return true;
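/* Example of the check above (illustration).  A V16QImode permutation
   that moves bytes in aligned groups of four, e.g.

     { 4 5 6 7  0 1 2 3  12 13 14 15  8 9 10 11 }

   is accepted for vmode == V4SImode (chunk == 4: each perm[i] at a chunk
   boundary is a multiple of 4 and the next three entries are
   consecutive), so the caller may retry it as the V4SImode permutation
   { 1 0 3 2 }. */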
45280 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45281 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45283 static bool
45284 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45286 unsigned i, nelt, eltsz, mask;
45287 unsigned char perm[64];
45288 machine_mode vmode = V16QImode;
45289 rtx rperm[64], vperm, target, op0, op1;
45291 nelt = d->nelt;
45293 if (!d->one_operand_p)
45295 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45297 if (TARGET_AVX2
45298 && valid_perm_using_mode_p (V2TImode, d))
45300 if (d->testing_p)
45301 return true;
45303 /* Use vperm2i128 insn. The pattern uses
45304 V4DImode instead of V2TImode. */
45305 target = d->target;
45306 if (d->vmode != V4DImode)
45307 target = gen_reg_rtx (V4DImode);
45308 op0 = gen_lowpart (V4DImode, d->op0);
45309 op1 = gen_lowpart (V4DImode, d->op1);
45310 rperm[0]
45311 = GEN_INT ((d->perm[0] / (nelt / 2))
45312 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45313 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45314 if (target != d->target)
45315 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45316 return true;
45318 return false;
45321 else
45323 if (GET_MODE_SIZE (d->vmode) == 16)
45325 if (!TARGET_SSSE3)
45326 return false;
45328 else if (GET_MODE_SIZE (d->vmode) == 32)
45330 if (!TARGET_AVX2)
45331 return false;
45333 /* V4DImode should already be handled through
45334 expand_vselect by the vpermq instruction. */
45335 gcc_assert (d->vmode != V4DImode);
45337 vmode = V32QImode;
45338 if (d->vmode == V8SImode
45339 || d->vmode == V16HImode
45340 || d->vmode == V32QImode)
45342 /* First see if vpermq can be used for
45343 V8SImode/V16HImode/V32QImode. */
45344 if (valid_perm_using_mode_p (V4DImode, d))
45346 for (i = 0; i < 4; i++)
45347 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45348 if (d->testing_p)
45349 return true;
45350 target = gen_reg_rtx (V4DImode);
45351 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45352 perm, 4, false))
45354 emit_move_insn (d->target,
45355 gen_lowpart (d->vmode, target));
45356 return true;
45358 return false;
45361 /* Next see if vpermd can be used. */
45362 if (valid_perm_using_mode_p (V8SImode, d))
45363 vmode = V8SImode;
45365 /* Or if vpermps can be used. */
45366 else if (d->vmode == V8SFmode)
45367 vmode = V8SImode;
45369 if (vmode == V32QImode)
45371 /* vpshufb only works within 128-bit lanes; it is not
45372 possible to shuffle bytes between the lanes. */
45373 for (i = 0; i < nelt; ++i)
45374 if ((d->perm[i] ^ i) & (nelt / 2))
45375 return false;
45378 else if (GET_MODE_SIZE (d->vmode) == 64)
45380 if (!TARGET_AVX512BW)
45381 return false;
45383 /* If vpermq didn't work, vpshufb won't work either. */
45384 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45385 return false;
45387 vmode = V64QImode;
45388 if (d->vmode == V16SImode
45389 || d->vmode == V32HImode
45390 || d->vmode == V64QImode)
45392 /* First see if vpermq can be used for
45393 V16SImode/V32HImode/V64QImode. */
45394 if (valid_perm_using_mode_p (V8DImode, d))
45396 for (i = 0; i < 8; i++)
45397 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45398 if (d->testing_p)
45399 return true;
45400 target = gen_reg_rtx (V8DImode);
45401 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45402 perm, 8, false))
45404 emit_move_insn (d->target,
45405 gen_lowpart (d->vmode, target));
45406 return true;
45408 return false;
45411 /* Next see if vpermd can be used. */
45412 if (valid_perm_using_mode_p (V16SImode, d))
45413 vmode = V16SImode;
45415 /* Or if vpermps can be used. */
45416 else if (d->vmode == V16SFmode)
45417 vmode = V16SImode;
45418 if (vmode == V64QImode)
45420 /* vpshufb only works within 128-bit lanes; it is not
45421 possible to shuffle bytes between the lanes. */
45422 for (i = 0; i < nelt; ++i)
45423 if ((d->perm[i] ^ i) & (nelt / 4))
45424 return false;
45427 else
45428 return false;
45431 if (d->testing_p)
45432 return true;
45434 if (vmode == V8SImode)
45435 for (i = 0; i < 8; ++i)
45436 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45437 else if (vmode == V16SImode)
45438 for (i = 0; i < 16; ++i)
45439 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45440 else
45442 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45443 if (!d->one_operand_p)
45444 mask = 2 * nelt - 1;
45445 else if (vmode == V16QImode)
45446 mask = nelt - 1;
45447 else if (vmode == V64QImode)
45448 mask = nelt / 4 - 1;
45449 else
45450 mask = nelt / 2 - 1;
45452 for (i = 0; i < nelt; ++i)
45454 unsigned j, e = d->perm[i] & mask;
45455 for (j = 0; j < eltsz; ++j)
45456 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45460 vperm = gen_rtx_CONST_VECTOR (vmode,
45461 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45462 vperm = force_reg (vmode, vperm);
45464 target = d->target;
45465 if (d->vmode != vmode)
45466 target = gen_reg_rtx (vmode);
45467 op0 = gen_lowpart (vmode, d->op0);
45468 if (d->one_operand_p)
45470 if (vmode == V16QImode)
45471 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45472 else if (vmode == V32QImode)
45473 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45474 else if (vmode == V64QImode)
45475 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45476 else if (vmode == V8SFmode)
45477 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45478 else if (vmode == V8SImode)
45479 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45480 else if (vmode == V16SFmode)
45481 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45482 else if (vmode == V16SImode)
45483 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45484 else
45485 gcc_unreachable ();
45487 else
45489 op1 = gen_lowpart (vmode, d->op1);
45490 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45492 if (target != d->target)
45493 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45495 return true;
45498 /* For V*[QHS]Imode permutations, check whether the same permutation
45499 can be performed in a 2x, 4x or 8x wider inner mode. */
45501 static bool
45502 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45503 struct expand_vec_perm_d *nd)
45505 int i;
45506 machine_mode mode = VOIDmode;
45508 switch (d->vmode)
45510 case E_V16QImode: mode = V8HImode; break;
45511 case E_V32QImode: mode = V16HImode; break;
45512 case E_V64QImode: mode = V32HImode; break;
45513 case E_V8HImode: mode = V4SImode; break;
45514 case E_V16HImode: mode = V8SImode; break;
45515 case E_V32HImode: mode = V16SImode; break;
45516 case E_V4SImode: mode = V2DImode; break;
45517 case E_V8SImode: mode = V4DImode; break;
45518 case E_V16SImode: mode = V8DImode; break;
45519 default: return false;
45521 for (i = 0; i < d->nelt; i += 2)
45522 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45523 return false;
45524 nd->vmode = mode;
45525 nd->nelt = d->nelt / 2;
45526 for (i = 0; i < nd->nelt; i++)
45527 nd->perm[i] = d->perm[2 * i] / 2;
45528 if (GET_MODE_INNER (mode) != DImode)
45529 canonicalize_vector_int_perm (nd, nd);
45530 if (nd != d)
45532 nd->one_operand_p = d->one_operand_p;
45533 nd->testing_p = d->testing_p;
45534 if (d->op0 == d->op1)
45535 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45536 else
45538 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45539 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45541 if (d->testing_p)
45542 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45543 else
45544 nd->target = gen_reg_rtx (nd->vmode);
45546 return true;
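/* Example of the canonicalization above (illustration).  The V16QImode
   permutation { 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13 } moves bytes in
   aligned pairs, so it is narrowed to the V8HImode permutation
   { 1 0 3 2 5 4 7 6 }; that one no longer moves words in pairs, so the
   recursion stops and expand_vec_perm_1 is retried with the narrowed
   descriptor. */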
45549 /* Try to expand one-operand permutation with constant mask. */
45551 static bool
45552 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45554 machine_mode mode = GET_MODE (d->op0);
45555 machine_mode maskmode = mode;
45556 rtx (*gen) (rtx, rtx, rtx) = NULL;
45557 rtx target, op0, mask;
45558 rtx vec[64];
45560 if (!rtx_equal_p (d->op0, d->op1))
45561 return false;
45563 if (!TARGET_AVX512F)
45564 return false;
45566 switch (mode)
45568 case E_V16SImode:
45569 gen = gen_avx512f_permvarv16si;
45570 break;
45571 case E_V16SFmode:
45572 gen = gen_avx512f_permvarv16sf;
45573 maskmode = V16SImode;
45574 break;
45575 case E_V8DImode:
45576 gen = gen_avx512f_permvarv8di;
45577 break;
45578 case E_V8DFmode:
45579 gen = gen_avx512f_permvarv8df;
45580 maskmode = V8DImode;
45581 break;
45582 default:
45583 return false;
45586 target = d->target;
45587 op0 = d->op0;
45588 for (int i = 0; i < d->nelt; ++i)
45589 vec[i] = GEN_INT (d->perm[i]);
45590 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45591 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45592 return true;
45595 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45596 in a single instruction. */
45598 static bool
45599 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45601 unsigned i, nelt = d->nelt;
45602 struct expand_vec_perm_d nd;
45604 /* Check plain VEC_SELECT first, because AVX has instructions that could
45605 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45606 input where SEL+CONCAT may not. */
45607 if (d->one_operand_p)
45609 int mask = nelt - 1;
45610 bool identity_perm = true;
45611 bool broadcast_perm = true;
45613 for (i = 0; i < nelt; i++)
45615 nd.perm[i] = d->perm[i] & mask;
45616 if (nd.perm[i] != i)
45617 identity_perm = false;
45618 if (nd.perm[i])
45619 broadcast_perm = false;
45622 if (identity_perm)
45624 if (!d->testing_p)
45625 emit_move_insn (d->target, d->op0);
45626 return true;
45628 else if (broadcast_perm && TARGET_AVX2)
45630 /* Use vpbroadcast{b,w,d}. */
45631 rtx (*gen) (rtx, rtx) = NULL;
45632 switch (d->vmode)
45634 case E_V64QImode:
45635 if (TARGET_AVX512BW)
45636 gen = gen_avx512bw_vec_dupv64qi_1;
45637 break;
45638 case E_V32QImode:
45639 gen = gen_avx2_pbroadcastv32qi_1;
45640 break;
45641 case E_V32HImode:
45642 if (TARGET_AVX512BW)
45643 gen = gen_avx512bw_vec_dupv32hi_1;
45644 break;
45645 case E_V16HImode:
45646 gen = gen_avx2_pbroadcastv16hi_1;
45647 break;
45648 case E_V16SImode:
45649 if (TARGET_AVX512F)
45650 gen = gen_avx512f_vec_dupv16si_1;
45651 break;
45652 case E_V8SImode:
45653 gen = gen_avx2_pbroadcastv8si_1;
45654 break;
45655 case E_V16QImode:
45656 gen = gen_avx2_pbroadcastv16qi;
45657 break;
45658 case E_V8HImode:
45659 gen = gen_avx2_pbroadcastv8hi;
45660 break;
45661 case E_V16SFmode:
45662 if (TARGET_AVX512F)
45663 gen = gen_avx512f_vec_dupv16sf_1;
45664 break;
45665 case E_V8SFmode:
45666 gen = gen_avx2_vec_dupv8sf_1;
45667 break;
45668 case E_V8DFmode:
45669 if (TARGET_AVX512F)
45670 gen = gen_avx512f_vec_dupv8df_1;
45671 break;
45672 case E_V8DImode:
45673 if (TARGET_AVX512F)
45674 gen = gen_avx512f_vec_dupv8di_1;
45675 break;
45676 /* For other modes prefer other shuffles this function creates. */
45677 default: break;
45679 if (gen != NULL)
45681 if (!d->testing_p)
45682 emit_insn (gen (d->target, d->op0));
45683 return true;
45687 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45688 return true;
45690 /* There are plenty of patterns in sse.md that are written for
45691 SEL+CONCAT and are not replicated for a single op. Perhaps
45692 that should be changed, to avoid the nastiness here. */
45694 /* Recognize interleave style patterns, which means incrementing
45695 every other permutation operand. */
45696 for (i = 0; i < nelt; i += 2)
45698 nd.perm[i] = d->perm[i] & mask;
45699 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45701 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45702 d->testing_p))
45703 return true;
45705 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45706 if (nelt >= 4)
45708 for (i = 0; i < nelt; i += 4)
45710 nd.perm[i + 0] = d->perm[i + 0] & mask;
45711 nd.perm[i + 1] = d->perm[i + 1] & mask;
45712 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45713 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45716 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45717 d->testing_p))
45718 return true;
45722 /* Finally, try the fully general two operand permute. */
45723 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45724 d->testing_p))
45725 return true;
45727 /* Recognize interleave style patterns with reversed operands. */
45728 if (!d->one_operand_p)
45730 for (i = 0; i < nelt; ++i)
45732 unsigned e = d->perm[i];
45733 if (e >= nelt)
45734 e -= nelt;
45735 else
45736 e += nelt;
45737 nd.perm[i] = e;
45740 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45741 d->testing_p))
45742 return true;
45745 /* Try the SSE4.1 blend variable merge instructions. */
45746 if (expand_vec_perm_blend (d))
45747 return true;
45749 /* Try one of the AVX vpermil variable permutations. */
45750 if (expand_vec_perm_vpermil (d))
45751 return true;
45753 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45754 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45755 if (expand_vec_perm_pshufb (d))
45756 return true;
45758 /* Try the AVX2 vpalignr instruction. */
45759 if (expand_vec_perm_palignr (d, true))
45760 return true;
45762 /* Try the AVX512F vperm{s,d} instructions. */
45763 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45764 return true;
45766 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45767 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45768 return true;
45770 /* See if we can get the same permutation in different vector integer
45771 mode. */
45772 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45774 if (!d->testing_p)
45775 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45776 return true;
45778 return false;
45781 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45782 in terms of a pair of pshuflw + pshufhw instructions. */
45784 static bool
45785 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45787 unsigned char perm2[MAX_VECT_LEN];
45788 unsigned i;
45789 bool ok;
45791 if (d->vmode != V8HImode || !d->one_operand_p)
45792 return false;
45794 /* The two permutations only operate in 64-bit lanes. */
45795 for (i = 0; i < 4; ++i)
45796 if (d->perm[i] >= 4)
45797 return false;
45798 for (i = 4; i < 8; ++i)
45799 if (d->perm[i] < 4)
45800 return false;
45802 if (d->testing_p)
45803 return true;
45805 /* Emit the pshuflw. */
45806 memcpy (perm2, d->perm, 4);
45807 for (i = 4; i < 8; ++i)
45808 perm2[i] = i;
45809 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45810 gcc_assert (ok);
45812 /* Emit the pshufhw. */
45813 memcpy (perm2 + 4, d->perm + 4, 4);
45814 for (i = 0; i < 4; ++i)
45815 perm2[i] = i;
45816 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45817 gcc_assert (ok);
45819 return true;
45822 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45823 the permutation using the SSSE3 palignr instruction. This succeeds
45824 when all of the elements in PERM fit within one vector and we merely
45825 need to shift them down so that a single vector permutation has a
45826 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45827 the palignr instruction itself can perform the requested permutation. */
45829 static bool
45830 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45832 unsigned i, nelt = d->nelt;
45833 unsigned min, max, minswap, maxswap;
45834 bool in_order, ok, swap = false;
45835 rtx shift, target;
45836 struct expand_vec_perm_d dcopy;
45838 /* Even with AVX, palignr only operates on 128-bit vectors;
45839 with AVX2, palignr operates within each of the two 128-bit lanes. */
45840 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45841 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45842 return false;
45844 min = 2 * nelt;
45845 max = 0;
45846 minswap = 2 * nelt;
45847 maxswap = 0;
45848 for (i = 0; i < nelt; ++i)
45850 unsigned e = d->perm[i];
45851 unsigned eswap = d->perm[i] ^ nelt;
45852 if (GET_MODE_SIZE (d->vmode) == 32)
45854 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45855 eswap = e ^ (nelt / 2);
45857 if (e < min)
45858 min = e;
45859 if (e > max)
45860 max = e;
45861 if (eswap < minswap)
45862 minswap = eswap;
45863 if (eswap > maxswap)
45864 maxswap = eswap;
45866 if (min == 0
45867 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45869 if (d->one_operand_p
45870 || minswap == 0
45871 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45872 ? nelt / 2 : nelt))
45873 return false;
45874 swap = true;
45875 min = minswap;
45876 max = maxswap;
45879 /* Given that we have SSSE3, we know we'll be able to implement the
45880 single operand permutation after the palignr with pshufb for
45881 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45882 first. */
45883 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45884 return true;
45886 dcopy = *d;
45887 if (swap)
45889 dcopy.op0 = d->op1;
45890 dcopy.op1 = d->op0;
45891 for (i = 0; i < nelt; ++i)
45892 dcopy.perm[i] ^= nelt;
45895 in_order = true;
45896 for (i = 0; i < nelt; ++i)
45898 unsigned e = dcopy.perm[i];
45899 if (GET_MODE_SIZE (d->vmode) == 32
45900 && e >= nelt
45901 && (e & (nelt / 2 - 1)) < min)
45902 e = e - min - (nelt / 2);
45903 else
45904 e = e - min;
45905 if (e != i)
45906 in_order = false;
45907 dcopy.perm[i] = e;
45909 dcopy.one_operand_p = true;
45911 if (single_insn_only_p && !in_order)
45912 return false;
45914 /* For AVX2, test whether we can permute the result in one instruction. */
45915 if (d->testing_p)
45917 if (in_order)
45918 return true;
45919 dcopy.op1 = dcopy.op0;
45920 return expand_vec_perm_1 (&dcopy);
45923 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45924 if (GET_MODE_SIZE (d->vmode) == 16)
45926 target = gen_reg_rtx (TImode);
45927 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45928 gen_lowpart (TImode, dcopy.op0), shift));
45930 else
45932 target = gen_reg_rtx (V2TImode);
45933 emit_insn (gen_avx2_palignrv2ti (target,
45934 gen_lowpart (V2TImode, dcopy.op1),
45935 gen_lowpart (V2TImode, dcopy.op0),
45936 shift));
45939 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45941 /* Test for the degenerate case where the alignment by itself
45942 produces the desired permutation. */
45943 if (in_order)
45945 emit_move_insn (d->target, dcopy.op0);
45946 return true;
45949 ok = expand_vec_perm_1 (&dcopy);
45950 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45952 return ok;
45955 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45956 the permutation using the SSE4_1 pblendv instruction. This potentially
45957 reduces the permutation from 2 pshufb and an or to 1 pshufb and a pblendv. */
45959 static bool
45960 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45962 unsigned i, which, nelt = d->nelt;
45963 struct expand_vec_perm_d dcopy, dcopy1;
45964 machine_mode vmode = d->vmode;
45965 bool ok;
45967 /* Use the same checks as in expand_vec_perm_blend. */
45968 if (d->one_operand_p)
45969 return false;
45970 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45972 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45974 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45976 else
45977 return false;
45979 /* Figure out which permutation elements do not stay in their
45980 respective lanes. */
45981 for (i = 0, which = 0; i < nelt; ++i)
45983 unsigned e = d->perm[i];
45984 if (e != i)
45985 which |= (e < nelt ? 1 : 2);
45987 /* We can pblend the part whose elements do not stay in their
45988 respective lanes only when these misplaced elements all come from
45989 the same operand (equivalently, the same half of the index space).
45990 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
45991 lanes, but both 8 and 9 are >= 8.
45992 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
45993 respective lanes, and while 8 is >= 8, 2 is not. */
45994 if (which != 1 && which != 2)
45995 return false;
45996 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45997 return true;
45999 /* First apply a one-operand permutation to the elements that do
46000 not stay in their respective lanes. */
46001 dcopy = *d;
46002 if (which == 2)
46003 dcopy.op0 = dcopy.op1 = d->op1;
46004 else
46005 dcopy.op0 = dcopy.op1 = d->op0;
46006 if (!d->testing_p)
46007 dcopy.target = gen_reg_rtx (vmode);
46008 dcopy.one_operand_p = true;
46010 for (i = 0; i < nelt; ++i)
46011 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46013 ok = expand_vec_perm_1 (&dcopy);
46014 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46015 return false;
46016 else
46017 gcc_assert (ok);
46018 if (d->testing_p)
46019 return true;
46021 /* Next we put permuted elements into their positions. */
46022 dcopy1 = *d;
46023 if (which == 2)
46024 dcopy1.op1 = dcopy.target;
46025 else
46026 dcopy1.op0 = dcopy.target;
46028 for (i = 0; i < nelt; ++i)
46029 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46031 ok = expand_vec_perm_blend (&dcopy1);
46032 gcc_assert (ok);
46034 return true;
46037 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46039 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46040 a two vector permutation into a single vector permutation by using
46041 an interleave operation to merge the vectors. */
46043 static bool
46044 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46046 struct expand_vec_perm_d dremap, dfinal;
46047 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46048 unsigned HOST_WIDE_INT contents;
46049 unsigned char remap[2 * MAX_VECT_LEN];
46050 rtx_insn *seq;
46051 bool ok, same_halves = false;
46053 if (GET_MODE_SIZE (d->vmode) == 16)
46055 if (d->one_operand_p)
46056 return false;
46058 else if (GET_MODE_SIZE (d->vmode) == 32)
46060 if (!TARGET_AVX)
46061 return false;
46062 /* For 32-byte modes allow even d->one_operand_p.
46063 The lack of cross-lane shuffling in some instructions
46064 might prevent a single insn shuffle. */
46065 dfinal = *d;
46066 dfinal.testing_p = true;
46067 /* If expand_vec_perm_interleave3 can expand this into
46068 a 3 insn sequence, give up and let it be expanded as
46069 a 3 insn sequence. While that is one insn longer,
46070 it doesn't need a memory operand, and in the common
46071 case that the interleave low and high permutations
46072 with the same operands are adjacent it needs only
46073 4 insns for both after CSE. */
46074 if (expand_vec_perm_interleave3 (&dfinal))
46075 return false;
46077 else
46078 return false;
46080 /* Examine from whence the elements come. */
46081 contents = 0;
46082 for (i = 0; i < nelt; ++i)
46083 contents |= HOST_WIDE_INT_1U << d->perm[i];
46085 memset (remap, 0xff, sizeof (remap));
46086 dremap = *d;
46088 if (GET_MODE_SIZE (d->vmode) == 16)
46090 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46092 /* Split the two input vectors into 4 halves. */
46093 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46094 h2 = h1 << nelt2;
46095 h3 = h2 << nelt2;
46096 h4 = h3 << nelt2;
46098 /* If the elements come only from the low halves, use interleave low;
46099 similarly for interleave high. If the elements come from mismatched
46100 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46101 if ((contents & (h1 | h3)) == contents)
46103 /* punpckl* */
46104 for (i = 0; i < nelt2; ++i)
46106 remap[i] = i * 2;
46107 remap[i + nelt] = i * 2 + 1;
46108 dremap.perm[i * 2] = i;
46109 dremap.perm[i * 2 + 1] = i + nelt;
46111 if (!TARGET_SSE2 && d->vmode == V4SImode)
46112 dremap.vmode = V4SFmode;
46114 else if ((contents & (h2 | h4)) == contents)
46116 /* punpckh* */
46117 for (i = 0; i < nelt2; ++i)
46119 remap[i + nelt2] = i * 2;
46120 remap[i + nelt + nelt2] = i * 2 + 1;
46121 dremap.perm[i * 2] = i + nelt2;
46122 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46124 if (!TARGET_SSE2 && d->vmode == V4SImode)
46125 dremap.vmode = V4SFmode;
46127 else if ((contents & (h1 | h4)) == contents)
46129 /* shufps */
46130 for (i = 0; i < nelt2; ++i)
46132 remap[i] = i;
46133 remap[i + nelt + nelt2] = i + nelt2;
46134 dremap.perm[i] = i;
46135 dremap.perm[i + nelt2] = i + nelt + nelt2;
46137 if (nelt != 4)
46139 /* shufpd */
46140 dremap.vmode = V2DImode;
46141 dremap.nelt = 2;
46142 dremap.perm[0] = 0;
46143 dremap.perm[1] = 3;
46146 else if ((contents & (h2 | h3)) == contents)
46148 /* shufps */
46149 for (i = 0; i < nelt2; ++i)
46151 remap[i + nelt2] = i;
46152 remap[i + nelt] = i + nelt2;
46153 dremap.perm[i] = i + nelt2;
46154 dremap.perm[i + nelt2] = i + nelt;
46156 if (nelt != 4)
46158 /* shufpd */
46159 dremap.vmode = V2DImode;
46160 dremap.nelt = 2;
46161 dremap.perm[0] = 1;
46162 dremap.perm[1] = 2;
46165 else
46166 return false;
46168 else
46170 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46171 unsigned HOST_WIDE_INT q[8];
46172 unsigned int nonzero_halves[4];
46174 /* Split the two input vectors into 8 quarters. */
46175 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46176 for (i = 1; i < 8; ++i)
46177 q[i] = q[0] << (nelt4 * i);
46178 for (i = 0; i < 4; ++i)
46179 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46181 nonzero_halves[nzcnt] = i;
46182 ++nzcnt;
46185 if (nzcnt == 1)
46187 gcc_assert (d->one_operand_p);
46188 nonzero_halves[1] = nonzero_halves[0];
46189 same_halves = true;
46191 else if (d->one_operand_p)
46193 gcc_assert (nonzero_halves[0] == 0);
46194 gcc_assert (nonzero_halves[1] == 1);
46197 if (nzcnt <= 2)
46199 if (d->perm[0] / nelt2 == nonzero_halves[1])
46201 /* Attempt to increase the likelihood that dfinal
46202 shuffle will be intra-lane. */
46203 std::swap (nonzero_halves[0], nonzero_halves[1]);
46206 /* vperm2f128 or vperm2i128. */
46207 for (i = 0; i < nelt2; ++i)
46209 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46210 remap[i + nonzero_halves[0] * nelt2] = i;
46211 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46212 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46215 if (d->vmode != V8SFmode
46216 && d->vmode != V4DFmode
46217 && d->vmode != V8SImode)
46219 dremap.vmode = V8SImode;
46220 dremap.nelt = 8;
46221 for (i = 0; i < 4; ++i)
46223 dremap.perm[i] = i + nonzero_halves[0] * 4;
46224 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46228 else if (d->one_operand_p)
46229 return false;
46230 else if (TARGET_AVX2
46231 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46233 /* vpunpckl* */
46234 for (i = 0; i < nelt4; ++i)
46236 remap[i] = i * 2;
46237 remap[i + nelt] = i * 2 + 1;
46238 remap[i + nelt2] = i * 2 + nelt2;
46239 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46240 dremap.perm[i * 2] = i;
46241 dremap.perm[i * 2 + 1] = i + nelt;
46242 dremap.perm[i * 2 + nelt2] = i + nelt2;
46243 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46246 else if (TARGET_AVX2
46247 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46249 /* vpunpckh* */
46250 for (i = 0; i < nelt4; ++i)
46252 remap[i + nelt4] = i * 2;
46253 remap[i + nelt + nelt4] = i * 2 + 1;
46254 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46255 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46256 dremap.perm[i * 2] = i + nelt4;
46257 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46258 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46259 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46262 else
46263 return false;
46266 /* Use the remapping array set up above to move the elements from their
46267 swizzled locations into their final destinations. */
46268 dfinal = *d;
46269 for (i = 0; i < nelt; ++i)
46271 unsigned e = remap[d->perm[i]];
46272 gcc_assert (e < nelt);
46273 /* If same_halves is true, both halves of the remapped vector are the
46274 same. Avoid cross-lane accesses if possible. */
46275 if (same_halves && i >= nelt2)
46277 gcc_assert (e < nelt2);
46278 dfinal.perm[i] = e + nelt2;
46280 else
46281 dfinal.perm[i] = e;
46283 if (!d->testing_p)
46285 dremap.target = gen_reg_rtx (dremap.vmode);
46286 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46288 dfinal.op1 = dfinal.op0;
46289 dfinal.one_operand_p = true;
46291 /* Test if the final remap can be done with a single insn. For V4SFmode or
46292 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46293 start_sequence ();
46294 ok = expand_vec_perm_1 (&dfinal);
46295 seq = get_insns ();
46296 end_sequence ();
46298 if (!ok)
46299 return false;
46301 if (d->testing_p)
46302 return true;
46304 if (dremap.vmode != dfinal.vmode)
46306 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46307 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46310 ok = expand_vec_perm_1 (&dremap);
46311 gcc_assert (ok);
46313 emit_insn (seq);
46314 return true;
46317 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46318 a single vector cross-lane permutation into vpermq followed
46319 by any of the single insn permutations. */
46321 static bool
46322 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46324 struct expand_vec_perm_d dremap, dfinal;
46325 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46326 unsigned contents[2];
46327 bool ok;
46329 if (!(TARGET_AVX2
46330 && (d->vmode == V32QImode || d->vmode == V16HImode)
46331 && d->one_operand_p))
46332 return false;
46334 contents[0] = 0;
46335 contents[1] = 0;
46336 for (i = 0; i < nelt2; ++i)
46338 contents[0] |= 1u << (d->perm[i] / nelt4);
46339 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46342 for (i = 0; i < 2; ++i)
46344 unsigned int cnt = 0;
46345 for (j = 0; j < 4; ++j)
46346 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46347 return false;
46350 if (d->testing_p)
46351 return true;
46353 dremap = *d;
46354 dremap.vmode = V4DImode;
46355 dremap.nelt = 4;
46356 dremap.target = gen_reg_rtx (V4DImode);
46357 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46358 dremap.op1 = dremap.op0;
46359 dremap.one_operand_p = true;
46360 for (i = 0; i < 2; ++i)
46362 unsigned int cnt = 0;
46363 for (j = 0; j < 4; ++j)
46364 if ((contents[i] & (1u << j)) != 0)
46365 dremap.perm[2 * i + cnt++] = j;
46366 for (; cnt < 2; ++cnt)
46367 dremap.perm[2 * i + cnt] = 0;
46370 dfinal = *d;
46371 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46372 dfinal.op1 = dfinal.op0;
46373 dfinal.one_operand_p = true;
46374 for (i = 0, j = 0; i < nelt; ++i)
46376 if (i == nelt2)
46377 j = 2;
46378 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46379 if ((d->perm[i] / nelt4) == dremap.perm[j])
46381 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46382 dfinal.perm[i] |= nelt4;
46383 else
46384 gcc_unreachable ();
46387 ok = expand_vec_perm_1 (&dremap);
46388 gcc_assert (ok);
46390 ok = expand_vec_perm_1 (&dfinal);
46391 gcc_assert (ok);
46393 return true;
46396 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46397 a vector permutation using two instructions, vperm2f128 resp.
46398 vperm2i128 followed by any single in-lane permutation. */
46400 static bool
46401 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46403 struct expand_vec_perm_d dfirst, dsecond;
46404 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46405 bool ok;
46407 if (!TARGET_AVX
46408 || GET_MODE_SIZE (d->vmode) != 32
46409 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46410 return false;
46412 dsecond = *d;
46413 dsecond.one_operand_p = false;
46414 dsecond.testing_p = true;
46416 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46417 immediate. For perm < 16 the second permutation uses
46418 d->op0 as first operand, for perm >= 16 it uses d->op1
46419 as first operand. The second operand is the result of
46420 vperm2[fi]128. */
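/* For example, perm == 6 gives the immediate ((6 << 2) | 6) & 0x33 == 0x12;
   for V4DFmode the dfirst permutation built below is then { 4 5 2 3 },
   i.e. the low lane of d->op1 followed by the high lane of d->op0, and
   since perm < 16 the second shuffle uses d->op0 as its first operand.  */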
46421 for (perm = 0; perm < 32; perm++)
46423 /* Ignore permutations which do not move anything cross-lane. */
46424 if (perm < 16)
46426 /* The second shuffle for e.g. V4DFmode has
46427 0123 and ABCD operands.
46428 Ignore AB23, as 23 is already in the second lane
46429 of the first operand. */
46430 if ((perm & 0xc) == (1 << 2)) continue;
46431 /* And 01CD, as 01 is in the first lane of the first
46432 operand. */
46433 if ((perm & 3) == 0) continue;
46434 /* And 4567, as then the vperm2[fi]128 doesn't change
46435 anything on the original 4567 second operand. */
46436 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46438 else
46440 /* The second shuffle for e.g. V4DFmode has
46441 4567 and ABCD operands.
46442 Ignore AB67, as 67 is already in the second lane
46443 of the first operand. */
46444 if ((perm & 0xc) == (3 << 2)) continue;
46445 /* And 45CD, as 45 is in the first lane of the first
46446 operand. */
46447 if ((perm & 3) == 2) continue;
46448 /* And 0123, as then the vperm2[fi]128 doesn't change
46449 anything on the original 0123 first operand. */
46450 if ((perm & 0xf) == (1 << 2)) continue;
46453 for (i = 0; i < nelt; i++)
46455 j = d->perm[i] / nelt2;
46456 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46457 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46458 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46459 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46460 else
46461 break;
46464 if (i == nelt)
46466 start_sequence ();
46467 ok = expand_vec_perm_1 (&dsecond);
46468 end_sequence ();
46470 else
46471 ok = false;
46473 if (ok)
46475 if (d->testing_p)
46476 return true;
46478 /* Found a usable second shuffle. dfirst will be
46479 vperm2f128 on d->op0 and d->op1. */
46480 dsecond.testing_p = false;
46481 dfirst = *d;
46482 dfirst.target = gen_reg_rtx (d->vmode);
46483 for (i = 0; i < nelt; i++)
46484 dfirst.perm[i] = (i & (nelt2 - 1))
46485 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46487 canonicalize_perm (&dfirst);
46488 ok = expand_vec_perm_1 (&dfirst);
46489 gcc_assert (ok);
46491 /* And dsecond is some single insn shuffle, taking
46492 d->op0 and result of vperm2f128 (if perm < 16) or
46493 d->op1 and result of vperm2f128 (otherwise). */
46494 if (perm >= 16)
46495 dsecond.op0 = dsecond.op1;
46496 dsecond.op1 = dfirst.target;
46498 ok = expand_vec_perm_1 (&dsecond);
46499 gcc_assert (ok);
46501 return true;
46504 /* For one operand, the only useful vperm2f128 permutation is 0x01
46505 aka lanes swap. */
46506 if (d->one_operand_p)
46507 return false;
46510 return false;
46513 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46514 a two vector permutation using 2 intra-lane interleave insns
46515 and cross-lane shuffle for 32-byte vectors. */
46517 static bool
46518 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46520 unsigned i, nelt;
46521 rtx (*gen) (rtx, rtx, rtx);
46523 if (d->one_operand_p)
46524 return false;
46525 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46527 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46529 else
46530 return false;
46532 nelt = d->nelt;
46533 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46534 return false;
46535 for (i = 0; i < nelt; i += 2)
46536 if (d->perm[i] != d->perm[0] + i / 2
46537 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46538 return false;
46540 if (d->testing_p)
46541 return true;
46543 switch (d->vmode)
46545 case E_V32QImode:
46546 if (d->perm[0])
46547 gen = gen_vec_interleave_highv32qi;
46548 else
46549 gen = gen_vec_interleave_lowv32qi;
46550 break;
46551 case E_V16HImode:
46552 if (d->perm[0])
46553 gen = gen_vec_interleave_highv16hi;
46554 else
46555 gen = gen_vec_interleave_lowv16hi;
46556 break;
46557 case E_V8SImode:
46558 if (d->perm[0])
46559 gen = gen_vec_interleave_highv8si;
46560 else
46561 gen = gen_vec_interleave_lowv8si;
46562 break;
46563 case E_V4DImode:
46564 if (d->perm[0])
46565 gen = gen_vec_interleave_highv4di;
46566 else
46567 gen = gen_vec_interleave_lowv4di;
46568 break;
46569 case E_V8SFmode:
46570 if (d->perm[0])
46571 gen = gen_vec_interleave_highv8sf;
46572 else
46573 gen = gen_vec_interleave_lowv8sf;
46574 break;
46575 case E_V4DFmode:
46576 if (d->perm[0])
46577 gen = gen_vec_interleave_highv4df;
46578 else
46579 gen = gen_vec_interleave_lowv4df;
46580 break;
46581 default:
46582 gcc_unreachable ();
46585 emit_insn (gen (d->target, d->op0, d->op1));
46586 return true;
46589 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46590 a single vector permutation using a single intra-lane vector
46591 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46592 the non-swapped and swapped vectors together. */
46594 static bool
46595 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46597 struct expand_vec_perm_d dfirst, dsecond;
46598 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46599 rtx_insn *seq;
46600 bool ok;
46601 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46603 if (!TARGET_AVX
46604 || TARGET_AVX2
46605 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46606 || !d->one_operand_p)
46607 return false;
46609 dfirst = *d;
46610 for (i = 0; i < nelt; i++)
46611 dfirst.perm[i] = 0xff;
46612 for (i = 0, msk = 0; i < nelt; i++)
46614 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46615 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46616 return false;
46617 dfirst.perm[j] = d->perm[i];
46618 if (j != i)
46619 msk |= (1 << i);
46621 for (i = 0; i < nelt; i++)
46622 if (dfirst.perm[i] == 0xff)
46623 dfirst.perm[i] = i;
46625 if (!d->testing_p)
46626 dfirst.target = gen_reg_rtx (dfirst.vmode);
46628 start_sequence ();
46629 ok = expand_vec_perm_1 (&dfirst);
46630 seq = get_insns ();
46631 end_sequence ();
46633 if (!ok)
46634 return false;
46636 if (d->testing_p)
46637 return true;
46639 emit_insn (seq);
46641 dsecond = *d;
46642 dsecond.op0 = dfirst.target;
46643 dsecond.op1 = dfirst.target;
46644 dsecond.one_operand_p = true;
46645 dsecond.target = gen_reg_rtx (dsecond.vmode);
46646 for (i = 0; i < nelt; i++)
46647 dsecond.perm[i] = i ^ nelt2;
46649 ok = expand_vec_perm_1 (&dsecond);
46650 gcc_assert (ok);
46652 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46653 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46654 return true;
46657 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46658 permutation using two vperm2f128, followed by a vshufpd insn blending
46659 the two vectors together. */
46661 static bool
46662 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46664 struct expand_vec_perm_d dfirst, dsecond, dthird;
46665 bool ok;
46667 if (!TARGET_AVX || (d->vmode != V4DFmode))
46668 return false;
46670 if (d->testing_p)
46671 return true;
46673 dfirst = *d;
46674 dsecond = *d;
46675 dthird = *d;
46677 dfirst.perm[0] = (d->perm[0] & ~1);
46678 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46679 dfirst.perm[2] = (d->perm[2] & ~1);
46680 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46681 dsecond.perm[0] = (d->perm[1] & ~1);
46682 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46683 dsecond.perm[2] = (d->perm[3] & ~1);
46684 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46685 dthird.perm[0] = (d->perm[0] % 2);
46686 dthird.perm[1] = (d->perm[1] % 2) + 4;
46687 dthird.perm[2] = (d->perm[2] % 2) + 2;
46688 dthird.perm[3] = (d->perm[3] % 2) + 6;
46690 dfirst.target = gen_reg_rtx (dfirst.vmode);
46691 dsecond.target = gen_reg_rtx (dsecond.vmode);
46692 dthird.op0 = dfirst.target;
46693 dthird.op1 = dsecond.target;
46694 dthird.one_operand_p = false;
46696 canonicalize_perm (&dfirst);
46697 canonicalize_perm (&dsecond);
46699 ok = expand_vec_perm_1 (&dfirst)
46700 && expand_vec_perm_1 (&dsecond)
46701 && expand_vec_perm_1 (&dthird);
46703 gcc_assert (ok);
46705 return true;
46708 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46709 permutation with two pshufb insns and an ior. We should have already
46710 failed all two instruction sequences. */
46712 static bool
46713 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46715 rtx rperm[2][16], vperm, l, h, op, m128;
46716 unsigned int i, nelt, eltsz;
46718 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46719 return false;
46720 gcc_assert (!d->one_operand_p);
46722 if (d->testing_p)
46723 return true;
46725 nelt = d->nelt;
46726 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46728 /* Generate two permutation masks. If the required element is within
46729 the given vector it is shuffled into the proper lane. If the required
46730 element is in the other vector, force a zero into the lane by setting
46731 bit 7 in the permutation mask. */
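/* For instance, with V16QImode and d->perm[0] == 18, element 18 is byte 2
   of d->op1, so rperm[1][0] becomes 2 while rperm[0][0] becomes -128;
   the pshufb of d->op1 then supplies byte 0 and the pshufb of d->op0
   contributes a zero there, letting the final ior merge the two.  */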
46732 m128 = GEN_INT (-128);
46733 for (i = 0; i < nelt; ++i)
46735 unsigned j, e = d->perm[i];
46736 unsigned which = (e >= nelt);
46737 if (e >= nelt)
46738 e -= nelt;
46740 for (j = 0; j < eltsz; ++j)
46742 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46743 rperm[1-which][i*eltsz + j] = m128;
46747 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46748 vperm = force_reg (V16QImode, vperm);
46750 l = gen_reg_rtx (V16QImode);
46751 op = gen_lowpart (V16QImode, d->op0);
46752 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46754 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46755 vperm = force_reg (V16QImode, vperm);
46757 h = gen_reg_rtx (V16QImode);
46758 op = gen_lowpart (V16QImode, d->op1);
46759 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46761 op = d->target;
46762 if (d->vmode != V16QImode)
46763 op = gen_reg_rtx (V16QImode);
46764 emit_insn (gen_iorv16qi3 (op, l, h));
46765 if (op != d->target)
46766 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46768 return true;
46771 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46772 with two vpshufb insns, vpermq and vpor.  We should have already failed
46773 all two or three instruction sequences.  */
46775 static bool
46776 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46778 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46779 unsigned int i, nelt, eltsz;
46781 if (!TARGET_AVX2
46782 || !d->one_operand_p
46783 || (d->vmode != V32QImode && d->vmode != V16HImode))
46784 return false;
46786 if (d->testing_p)
46787 return true;
46789 nelt = d->nelt;
46790 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46792 /* Generate two permutation masks.  If the required element is within
46793 the same lane, it is shuffled in.  If the required element is from the
46794 other lane, force a zero by setting bit 7 in the permutation mask.
46795 The other mask has a non-negative element whenever an element
46796 is requested from the other lane; that element is also moved to the
46797 other lane, so that the result of vpshufb can have its two V2TImode
46798 halves swapped. */
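/* For example, with V32QImode and d->perm[0] == 20 (a cross-lane request
   for position 0): e == 4 and which == 16, so rperm[1][16] becomes 4 and
   rperm[0][0] becomes -128; the vpshufb using rperm[1] places the byte in
   the opposite lane and the subsequent lane swap moves it to position 0.  */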
46799 m128 = GEN_INT (-128);
46800 for (i = 0; i < nelt; ++i)
46802 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46803 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46805 for (j = 0; j < eltsz; ++j)
46807 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46808 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46812 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46813 vperm = force_reg (V32QImode, vperm);
46815 h = gen_reg_rtx (V32QImode);
46816 op = gen_lowpart (V32QImode, d->op0);
46817 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46819 /* Swap the 128-bit lanes of h into hp. */
46820 hp = gen_reg_rtx (V4DImode);
46821 op = gen_lowpart (V4DImode, h);
46822 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46823 const1_rtx));
46825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46826 vperm = force_reg (V32QImode, vperm);
46828 l = gen_reg_rtx (V32QImode);
46829 op = gen_lowpart (V32QImode, d->op0);
46830 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46832 op = d->target;
46833 if (d->vmode != V32QImode)
46834 op = gen_reg_rtx (V32QImode);
46835 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46836 if (op != d->target)
46837 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46839 return true;
46842 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
46843 and extract-odd permutations of two V32QImode or V16HImode operands
46844 with two vpshufb insns, vpor and vpermq.  We should have already
46845 failed all two or three instruction sequences.  */
46847 static bool
46848 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46850 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46851 unsigned int i, nelt, eltsz;
46853 if (!TARGET_AVX2
46854 || d->one_operand_p
46855 || (d->vmode != V32QImode && d->vmode != V16HImode))
46856 return false;
46858 for (i = 0; i < d->nelt; ++i)
46859 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46860 return false;
46862 if (d->testing_p)
46863 return true;
46865 nelt = d->nelt;
46866 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46868 /* Generate two permutation masks. In the first permutation mask
46869 the first quarter will contain indexes for the first half
46870 of the op0, the second quarter will contain bit 7 set, third quarter
46871 will contain indexes for the second half of the op0 and the
46872 last quarter bit 7 set. In the second permutation mask
46873 the first quarter will contain bit 7 set, the second quarter
46874 indexes for the first half of the op1, the third quarter bit 7 set
46875 and last quarter indexes for the second half of the op1.
46876 I.e. the first mask e.g. for V32QImode extract even will be:
46877 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46878 (all values masked with 0xf except for -128) and second mask
46879 for extract even will be
46880 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46881 m128 = GEN_INT (-128);
46882 for (i = 0; i < nelt; ++i)
46884 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46885 unsigned which = d->perm[i] >= nelt;
46886 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46888 for (j = 0; j < eltsz; ++j)
46890 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46891 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46895 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46896 vperm = force_reg (V32QImode, vperm);
46898 l = gen_reg_rtx (V32QImode);
46899 op = gen_lowpart (V32QImode, d->op0);
46900 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46902 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46903 vperm = force_reg (V32QImode, vperm);
46905 h = gen_reg_rtx (V32QImode);
46906 op = gen_lowpart (V32QImode, d->op1);
46907 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46909 ior = gen_reg_rtx (V32QImode);
46910 emit_insn (gen_iorv32qi3 (ior, l, h));
46912 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46913 op = gen_reg_rtx (V4DImode);
46914 ior = gen_lowpart (V4DImode, ior);
46915 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46916 const1_rtx, GEN_INT (3)));
46917 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46919 return true;
46922 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46923 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46924 with two "and" and "pack" or two "shift" and "pack" insns. We should
46925 have already failed all two instruction sequences. */
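/* For instance, extract-even of two V8HImode operands: both operands are
   viewed as V4SImode, ANDed with 0x0000ffff per element to keep the even
   words, and packusdw packs the surviving words into one vector; the
   extract-odd case uses a logical right shift by 16 instead of the AND.  */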
46927 static bool
46928 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46930 rtx op, dop0, dop1, t;
46931 unsigned i, odd, c, s, nelt = d->nelt;
46932 bool end_perm = false;
46933 machine_mode half_mode;
46934 rtx (*gen_and) (rtx, rtx, rtx);
46935 rtx (*gen_pack) (rtx, rtx, rtx);
46936 rtx (*gen_shift) (rtx, rtx, rtx);
46938 if (d->one_operand_p)
46939 return false;
46941 switch (d->vmode)
46943 case E_V8HImode:
46944 /* Required for "pack". */
46945 if (!TARGET_SSE4_1)
46946 return false;
46947 c = 0xffff;
46948 s = 16;
46949 half_mode = V4SImode;
46950 gen_and = gen_andv4si3;
46951 gen_pack = gen_sse4_1_packusdw;
46952 gen_shift = gen_lshrv4si3;
46953 break;
46954 case E_V16QImode:
46955 /* No check as all instructions are SSE2. */
46956 c = 0xff;
46957 s = 8;
46958 half_mode = V8HImode;
46959 gen_and = gen_andv8hi3;
46960 gen_pack = gen_sse2_packuswb;
46961 gen_shift = gen_lshrv8hi3;
46962 break;
46963 case E_V16HImode:
46964 if (!TARGET_AVX2)
46965 return false;
46966 c = 0xffff;
46967 s = 16;
46968 half_mode = V8SImode;
46969 gen_and = gen_andv8si3;
46970 gen_pack = gen_avx2_packusdw;
46971 gen_shift = gen_lshrv8si3;
46972 end_perm = true;
46973 break;
46974 case E_V32QImode:
46975 if (!TARGET_AVX2)
46976 return false;
46977 c = 0xff;
46978 s = 8;
46979 half_mode = V16HImode;
46980 gen_and = gen_andv16hi3;
46981 gen_pack = gen_avx2_packuswb;
46982 gen_shift = gen_lshrv16hi3;
46983 end_perm = true;
46984 break;
46985 default:
46986 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46987 general shuffles. */
46988 return false;
46991 /* Check that permutation is even or odd. */
46992 odd = d->perm[0];
46993 if (odd > 1)
46994 return false;
46996 for (i = 1; i < nelt; ++i)
46997 if (d->perm[i] != 2 * i + odd)
46998 return false;
47000 if (d->testing_p)
47001 return true;
47003 dop0 = gen_reg_rtx (half_mode);
47004 dop1 = gen_reg_rtx (half_mode);
47005 if (odd == 0)
47007 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47008 t = force_reg (half_mode, t);
47009 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47010 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47012 else
47014 emit_insn (gen_shift (dop0,
47015 gen_lowpart (half_mode, d->op0),
47016 GEN_INT (s)));
47017 emit_insn (gen_shift (dop1,
47018 gen_lowpart (half_mode, d->op1),
47019 GEN_INT (s)));
47021 /* In the AVX2 256-bit case we need to permute the pack result. */
47022 if (TARGET_AVX2 && end_perm)
47024 op = gen_reg_rtx (d->vmode);
47025 t = gen_reg_rtx (V4DImode);
47026 emit_insn (gen_pack (op, dop0, dop1));
47027 emit_insn (gen_avx2_permv4di_1 (t,
47028 gen_lowpart (V4DImode, op),
47029 const0_rtx,
47030 const2_rtx,
47031 const1_rtx,
47032 GEN_INT (3)));
47033 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47035 else
47036 emit_insn (gen_pack (d->target, dop0, dop1));
47038 return true;
47041 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
47042 and extract-odd permutations of two V64QImode operands
47043 with two "shift", two "trunc" and one "concat" insns for "odd"
47044 and two "trunc" and one "concat" insn for "even".
47045 We should have already failed all two instruction sequences. */
47047 static bool
47048 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47050 rtx t1, t2, t3, t4;
47051 unsigned i, odd, nelt = d->nelt;
47053 if (!TARGET_AVX512BW
47054 || d->one_operand_p
47055 || d->vmode != V64QImode)
47056 return false;
47058 /* Check that permutation is even or odd. */
47059 odd = d->perm[0];
47060 if (odd > 1)
47061 return false;
47063 for (i = 1; i < nelt; ++i)
47064 if (d->perm[i] != 2 * i + odd)
47065 return false;
47067 if (d->testing_p)
47068 return true;
47071 if (odd)
47073 t1 = gen_reg_rtx (V32HImode);
47074 t2 = gen_reg_rtx (V32HImode);
47075 emit_insn (gen_lshrv32hi3 (t1,
47076 gen_lowpart (V32HImode, d->op0),
47077 GEN_INT (8)));
47078 emit_insn (gen_lshrv32hi3 (t2,
47079 gen_lowpart (V32HImode, d->op1),
47080 GEN_INT (8)));
47082 else
47084 t1 = gen_lowpart (V32HImode, d->op0);
47085 t2 = gen_lowpart (V32HImode, d->op1);
47088 t3 = gen_reg_rtx (V32QImode);
47089 t4 = gen_reg_rtx (V32QImode);
47090 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47091 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47092 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47094 return true;
47097 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47098 and extract-odd permutations. */
47100 static bool
47101 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47103 rtx t1, t2, t3, t4, t5;
47105 switch (d->vmode)
47107 case E_V4DFmode:
47108 if (d->testing_p)
47109 break;
47110 t1 = gen_reg_rtx (V4DFmode);
47111 t2 = gen_reg_rtx (V4DFmode);
47113 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47114 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47115 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47117 /* Now an unpck[lh]pd will produce the result required. */
47118 if (odd)
47119 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47120 else
47121 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47122 emit_insn (t3);
47123 break;
47125 case E_V8SFmode:
47127 int mask = odd ? 0xdd : 0x88;
47129 if (d->testing_p)
47130 break;
47131 t1 = gen_reg_rtx (V8SFmode);
47132 t2 = gen_reg_rtx (V8SFmode);
47133 t3 = gen_reg_rtx (V8SFmode);
47135 /* Shuffle within the 128-bit lanes to produce:
47136 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47137 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47138 GEN_INT (mask)));
47140 /* Shuffle the lanes around to produce:
47141 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47142 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47143 GEN_INT (0x3)));
47145 /* Shuffle within the 128-bit lanes to produce:
47146 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47147 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47149 /* Shuffle within the 128-bit lanes to produce:
47150 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47151 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47153 /* Shuffle the lanes around to produce:
47154 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47155 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47156 GEN_INT (0x20)));
47158 break;
47160 case E_V2DFmode:
47161 case E_V4SFmode:
47162 case E_V2DImode:
47163 case E_V4SImode:
47164 /* These are always directly implementable by expand_vec_perm_1. */
47165 gcc_unreachable ();
47167 case E_V8HImode:
47168 if (TARGET_SSE4_1)
47169 return expand_vec_perm_even_odd_pack (d);
47170 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47171 return expand_vec_perm_pshufb2 (d);
47172 else
47174 if (d->testing_p)
47175 break;
47176 /* We need 2*log2(N)-1 operations to achieve odd/even
47177 with interleave. */
47178 t1 = gen_reg_rtx (V8HImode);
47179 t2 = gen_reg_rtx (V8HImode);
47180 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47181 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47182 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47183 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47184 if (odd)
47185 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47186 else
47187 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47188 emit_insn (t3);
47190 break;
47192 case E_V16QImode:
47193 return expand_vec_perm_even_odd_pack (d);
47195 case E_V16HImode:
47196 case E_V32QImode:
47197 return expand_vec_perm_even_odd_pack (d);
47199 case E_V64QImode:
47200 return expand_vec_perm_even_odd_trunc (d);
47202 case E_V4DImode:
47203 if (!TARGET_AVX2)
47205 struct expand_vec_perm_d d_copy = *d;
47206 d_copy.vmode = V4DFmode;
47207 if (d->testing_p)
47208 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47209 else
47210 d_copy.target = gen_reg_rtx (V4DFmode);
47211 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47212 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47213 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47215 if (!d->testing_p)
47216 emit_move_insn (d->target,
47217 gen_lowpart (V4DImode, d_copy.target));
47218 return true;
47220 return false;
47223 if (d->testing_p)
47224 break;
47226 t1 = gen_reg_rtx (V4DImode);
47227 t2 = gen_reg_rtx (V4DImode);
47229 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47230 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47231 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47233 /* Now a vpunpck[lh]qdq will produce the result required. */
47234 if (odd)
47235 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47236 else
47237 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47238 emit_insn (t3);
47239 break;
47241 case E_V8SImode:
47242 if (!TARGET_AVX2)
47244 struct expand_vec_perm_d d_copy = *d;
47245 d_copy.vmode = V8SFmode;
47246 if (d->testing_p)
47247 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47248 else
47249 d_copy.target = gen_reg_rtx (V8SFmode);
47250 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47251 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47252 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47254 if (!d->testing_p)
47255 emit_move_insn (d->target,
47256 gen_lowpart (V8SImode, d_copy.target));
47257 return true;
47259 return false;
47262 if (d->testing_p)
47263 break;
47265 t1 = gen_reg_rtx (V8SImode);
47266 t2 = gen_reg_rtx (V8SImode);
47267 t3 = gen_reg_rtx (V4DImode);
47268 t4 = gen_reg_rtx (V4DImode);
47269 t5 = gen_reg_rtx (V4DImode);
47271 /* Shuffle the lanes around into
47272 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47273 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47274 gen_lowpart (V4DImode, d->op1),
47275 GEN_INT (0x20)));
47276 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47277 gen_lowpart (V4DImode, d->op1),
47278 GEN_INT (0x31)));
47280 /* Swap the 2nd and 3rd position in each lane into
47281 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47282 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47283 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47284 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47285 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47287 /* Now a vpunpck[lh]qdq will produce
47288 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47289 if (odd)
47290 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47291 gen_lowpart (V4DImode, t2));
47292 else
47293 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47294 gen_lowpart (V4DImode, t2));
47295 emit_insn (t3);
47296 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47297 break;
47299 default:
47300 gcc_unreachable ();
47303 return true;
47306 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47307 extract-even and extract-odd permutations. */
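/* E.g. for V8SImode, { 0 2 4 6 8 10 12 14 } is the extract-even case
   (odd == 0) and { 1 3 5 7 9 11 13 15 } the extract-odd case (odd == 1).  */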
47309 static bool
47310 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47312 unsigned i, odd, nelt = d->nelt;
47314 odd = d->perm[0];
47315 if (odd != 0 && odd != 1)
47316 return false;
47318 for (i = 1; i < nelt; ++i)
47319 if (d->perm[i] != 2 * i + odd)
47320 return false;
47322 return expand_vec_perm_even_odd_1 (d, odd);
47325 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47326 permutations. We assume that expand_vec_perm_1 has already failed. */
47328 static bool
47329 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47331 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47332 machine_mode vmode = d->vmode;
47333 unsigned char perm2[4];
47334 rtx op0 = d->op0, dest;
47335 bool ok;
47337 switch (vmode)
47339 case E_V4DFmode:
47340 case E_V8SFmode:
47341 /* These are special-cased in sse.md so that we can optionally
47342 use the vbroadcast instruction. They expand to two insns
47343 if the input happens to be in a register. */
47344 gcc_unreachable ();
47346 case E_V2DFmode:
47347 case E_V2DImode:
47348 case E_V4SFmode:
47349 case E_V4SImode:
47350 /* These are always implementable using standard shuffle patterns. */
47351 gcc_unreachable ();
47353 case E_V8HImode:
47354 case E_V16QImode:
47355 /* These can be implemented via interleave. We save one insn by
47356 stopping once we have promoted to V4SImode and then use pshufd. */
47357 if (d->testing_p)
47358 return true;
47361 rtx dest;
47362 rtx (*gen) (rtx, rtx, rtx)
47363 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47364 : gen_vec_interleave_lowv8hi;
47366 if (elt >= nelt2)
47368 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47369 : gen_vec_interleave_highv8hi;
47370 elt -= nelt2;
47372 nelt2 /= 2;
47374 dest = gen_reg_rtx (vmode);
47375 emit_insn (gen (dest, op0, op0));
47376 vmode = get_mode_wider_vector (vmode);
47377 op0 = gen_lowpart (vmode, dest);
47379 while (vmode != V4SImode);
47381 memset (perm2, elt, 4);
47382 dest = gen_reg_rtx (V4SImode);
47383 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47384 gcc_assert (ok);
47385 if (!d->testing_p)
47386 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47387 return true;
47389 case E_V64QImode:
47390 case E_V32QImode:
47391 case E_V16HImode:
47392 case E_V8SImode:
47393 case E_V4DImode:
47394 /* For AVX2 broadcasts of the first element vpbroadcast* or
47395 vpermq should be used by expand_vec_perm_1. */
47396 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47397 return false;
47399 default:
47400 gcc_unreachable ();
47404 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47405 broadcast permutations. */
47407 static bool
47408 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47410 unsigned i, elt, nelt = d->nelt;
47412 if (!d->one_operand_p)
47413 return false;
47415 elt = d->perm[0];
47416 for (i = 1; i < nelt; ++i)
47417 if (d->perm[i] != elt)
47418 return false;
47420 return expand_vec_perm_broadcast_1 (d);
47423 /* Implement arbitrary permutations of two V64QImode operands
47424 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47425 static bool
47426 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47428 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47429 return false;
47431 if (d->testing_p)
47432 return true;
47434 struct expand_vec_perm_d ds[2];
47435 rtx rperm[128], vperm, target0, target1;
47436 unsigned int i, nelt;
47437 machine_mode vmode;
47439 nelt = d->nelt;
47440 vmode = V64QImode;
47442 for (i = 0; i < 2; i++)
47444 ds[i] = *d;
47445 ds[i].vmode = V32HImode;
47446 ds[i].nelt = 32;
47447 ds[i].target = gen_reg_rtx (V32HImode);
47448 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47449 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47452 /* Prepare permutations such that the first one takes care of
47453 putting the even bytes into the right positions or one position
47454 higher (ds[0]) and the second one takes care of
47455 putting the odd bytes into the right positions or one position
47456 lower (ds[1]). */
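/* As an example, for i == 0 and d->perm[0] == 37: ds[0].perm[0] becomes 18
   (the 16-bit word holding bytes 36 and 37), rperm[0] becomes 1 so the
   first vpshufb picks the odd byte of that word, and rperm[64] is -1 so
   the second vpshufb writes a zero at that position.  */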
47458 for (i = 0; i < nelt; i++)
47460 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47461 if (i & 1)
47463 rperm[i] = constm1_rtx;
47464 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47466 else
47468 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47469 rperm[i + 64] = constm1_rtx;
47473 bool ok = expand_vec_perm_1 (&ds[0]);
47474 gcc_assert (ok);
47475 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47477 ok = expand_vec_perm_1 (&ds[1]);
47478 gcc_assert (ok);
47479 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47481 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47482 vperm = force_reg (vmode, vperm);
47483 target0 = gen_reg_rtx (V64QImode);
47484 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47486 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47487 vperm = force_reg (vmode, vperm);
47488 target1 = gen_reg_rtx (V64QImode);
47489 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47491 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47492 return true;
47495 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47496 with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
47497 all the shorter instruction sequences.  */
47499 static bool
47500 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47502 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47503 unsigned int i, nelt, eltsz;
47504 bool used[4];
47506 if (!TARGET_AVX2
47507 || d->one_operand_p
47508 || (d->vmode != V32QImode && d->vmode != V16HImode))
47509 return false;
47511 if (d->testing_p)
47512 return true;
47514 nelt = d->nelt;
47515 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47517 /* Generate 4 permutation masks.  If the required element is within
47518 the same lane, it is shuffled in.  If the required element is from the
47519 other lane, force a zero by setting bit 7 in the permutation mask.
47520 The other mask has a non-negative element whenever an element
47521 is requested from the other lane; that element is also moved to the
47522 other lane, so that the result of vpshufb can have its two V2TImode
47523 halves swapped. */
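/* For instance, with V32QImode and d->perm[0] == 52: e == 4, xlane == 16
   and which == 3, so rperm[3][16] becomes 4; mask 3 is applied to d->op1
   and the later lane swap brings the selected byte down to position 0.  */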
47524 m128 = GEN_INT (-128);
47525 for (i = 0; i < 32; ++i)
47527 rperm[0][i] = m128;
47528 rperm[1][i] = m128;
47529 rperm[2][i] = m128;
47530 rperm[3][i] = m128;
47532 used[0] = false;
47533 used[1] = false;
47534 used[2] = false;
47535 used[3] = false;
47536 for (i = 0; i < nelt; ++i)
47538 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47539 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47540 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47542 for (j = 0; j < eltsz; ++j)
47543 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47544 used[which] = true;
47547 for (i = 0; i < 2; ++i)
47549 if (!used[2 * i + 1])
47551 h[i] = NULL_RTX;
47552 continue;
47554 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47555 gen_rtvec_v (32, rperm[2 * i + 1]));
47556 vperm = force_reg (V32QImode, vperm);
47557 h[i] = gen_reg_rtx (V32QImode);
47558 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47559 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47562 /* Swap the 128-bit lanes of h[X]. */
47563 for (i = 0; i < 2; ++i)
47565 if (h[i] == NULL_RTX)
47566 continue;
47567 op = gen_reg_rtx (V4DImode);
47568 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47569 const2_rtx, GEN_INT (3), const0_rtx,
47570 const1_rtx));
47571 h[i] = gen_lowpart (V32QImode, op);
47574 for (i = 0; i < 2; ++i)
47576 if (!used[2 * i])
47578 l[i] = NULL_RTX;
47579 continue;
47581 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47582 vperm = force_reg (V32QImode, vperm);
47583 l[i] = gen_reg_rtx (V32QImode);
47584 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47585 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47588 for (i = 0; i < 2; ++i)
47590 if (h[i] && l[i])
47592 op = gen_reg_rtx (V32QImode);
47593 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47594 l[i] = op;
47596 else if (h[i])
47597 l[i] = h[i];
47600 gcc_assert (l[0] && l[1]);
47601 op = d->target;
47602 if (d->vmode != V32QImode)
47603 op = gen_reg_rtx (V32QImode);
47604 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47605 if (op != d->target)
47606 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47607 return true;
47610 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47611 With all of the interface bits taken care of, perform the expansion
47612 in D and return true on success. */
47614 static bool
47615 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47617 /* Try a single instruction expansion. */
47618 if (expand_vec_perm_1 (d))
47619 return true;
47621 /* Try sequences of two instructions. */
47623 if (expand_vec_perm_pshuflw_pshufhw (d))
47624 return true;
47626 if (expand_vec_perm_palignr (d, false))
47627 return true;
47629 if (expand_vec_perm_interleave2 (d))
47630 return true;
47632 if (expand_vec_perm_broadcast (d))
47633 return true;
47635 if (expand_vec_perm_vpermq_perm_1 (d))
47636 return true;
47638 if (expand_vec_perm_vperm2f128 (d))
47639 return true;
47641 if (expand_vec_perm_pblendv (d))
47642 return true;
47644 /* Try sequences of three instructions. */
47646 if (expand_vec_perm_even_odd_pack (d))
47647 return true;
47649 if (expand_vec_perm_2vperm2f128_vshuf (d))
47650 return true;
47652 if (expand_vec_perm_pshufb2 (d))
47653 return true;
47655 if (expand_vec_perm_interleave3 (d))
47656 return true;
47658 if (expand_vec_perm_vperm2f128_vblend (d))
47659 return true;
47661 /* Try sequences of four instructions. */
47663 if (expand_vec_perm_even_odd_trunc (d))
47664 return true;
47665 if (expand_vec_perm_vpshufb2_vpermq (d))
47666 return true;
47668 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47669 return true;
47671 if (expand_vec_perm_vpermt2_vpshub2 (d))
47672 return true;
47674 /* ??? Look for narrow permutations whose element orderings would
47675 allow the promotion to a wider mode. */
47677 /* ??? Look for sequences of interleave or a wider permute that place
47678 the data into the correct lanes for a half-vector shuffle like
47679 pshuf[lh]w or vpermilps. */
47681 /* ??? Look for sequences of interleave that produce the desired results.
47682 The combinatorics of punpck[lh] get pretty ugly... */
47684 if (expand_vec_perm_even_odd (d))
47685 return true;
47687 /* Even longer sequences. */
47688 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47689 return true;
47691 /* See if we can get the same permutation in different vector integer
47692 mode. */
47693 struct expand_vec_perm_d nd;
47694 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47696 if (!d->testing_p)
47697 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47698 return true;
47701 return false;
47704 /* If a permutation only uses one operand, make it clear. Returns true
47705 if the permutation references both operands. */
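/* For example, a V4SImode selector of { 4 5 6 7 } references only the
   second operand (which == 2), so it is folded to { 0 1 2 3 } with
   d->op0 set to d->op1, and the function returns false.  */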
47707 static bool
47708 canonicalize_perm (struct expand_vec_perm_d *d)
47710 int i, which, nelt = d->nelt;
47712 for (i = which = 0; i < nelt; ++i)
47713 which |= (d->perm[i] < nelt ? 1 : 2);
47715 d->one_operand_p = true;
47716 switch (which)
47718 default:
47719 gcc_unreachable();
47721 case 3:
47722 if (!rtx_equal_p (d->op0, d->op1))
47724 d->one_operand_p = false;
47725 break;
47727 /* The elements of PERM do not suggest that only the first operand
47728 is used, but both operands are identical. Allow easier matching
47729 of the permutation by folding the permutation into the single
47730 input vector. */
47731 /* FALLTHRU */
47733 case 2:
47734 for (i = 0; i < nelt; ++i)
47735 d->perm[i] &= nelt - 1;
47736 d->op0 = d->op1;
47737 break;
47739 case 1:
47740 d->op1 = d->op0;
47741 break;
47744 return (which == 3);
47747 bool
47748 ix86_expand_vec_perm_const (rtx operands[4])
47750 struct expand_vec_perm_d d;
47751 unsigned char perm[MAX_VECT_LEN];
47752 int i, nelt;
47753 bool two_args;
47754 rtx sel;
47756 d.target = operands[0];
47757 d.op0 = operands[1];
47758 d.op1 = operands[2];
47759 sel = operands[3];
47761 d.vmode = GET_MODE (d.target);
47762 gcc_assert (VECTOR_MODE_P (d.vmode));
47763 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47764 d.testing_p = false;
47766 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47767 gcc_assert (XVECLEN (sel, 0) == nelt);
47768 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47770 for (i = 0; i < nelt; ++i)
47772 rtx e = XVECEXP (sel, 0, i);
47773 int ei = INTVAL (e) & (2 * nelt - 1);
47774 d.perm[i] = ei;
47775 perm[i] = ei;
47778 two_args = canonicalize_perm (&d);
47780 if (ix86_expand_vec_perm_const_1 (&d))
47781 return true;
47783 /* If the selector says both arguments are needed, but the operands are the
47784 same, the above tried to expand with one_operand_p and flattened selector.
47785 If that didn't work, retry without one_operand_p; we succeeded with that
47786 during testing. */
47787 if (two_args && d.one_operand_p)
47789 d.one_operand_p = false;
47790 memcpy (d.perm, perm, sizeof (perm));
47791 return ix86_expand_vec_perm_const_1 (&d);
47794 return false;
47797 /* Implement targetm.vectorize.vec_perm_const_ok. */
47799 static bool
47800 ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
47802 struct expand_vec_perm_d d;
47803 unsigned int i, nelt, which;
47804 bool ret;
47806 d.vmode = vmode;
47807 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47808 d.testing_p = true;
47810 /* Given sufficient ISA support we can just return true here
47811 for selected vector modes. */
47812 switch (d.vmode)
47814 case E_V16SFmode:
47815 case E_V16SImode:
47816 case E_V8DImode:
47817 case E_V8DFmode:
47818 if (TARGET_AVX512F)
47819 /* All implementable with a single vperm[it]2 insn. */
47820 return true;
47821 break;
47822 case E_V32HImode:
47823 if (TARGET_AVX512BW)
47824 /* All implementable with a single vperm[it]2 insn. */
47825 return true;
47826 break;
47827 case E_V64QImode:
47828 if (TARGET_AVX512BW)
47829 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47830 return true;
47831 break;
47832 case E_V8SImode:
47833 case E_V8SFmode:
47834 case E_V4DFmode:
47835 case E_V4DImode:
47836 if (TARGET_AVX512VL)
47837 /* All implementable with a single vperm[it]2 insn. */
47838 return true;
47839 break;
47840 case E_V16HImode:
47841 if (TARGET_AVX2)
47842 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47843 return true;
47844 break;
47845 case E_V32QImode:
47846 if (TARGET_AVX2)
47847 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47848 return true;
47849 break;
47850 case E_V4SImode:
47851 case E_V4SFmode:
47852 case E_V8HImode:
47853 case E_V16QImode:
47854 /* All implementable with a single vpperm insn. */
47855 if (TARGET_XOP)
47856 return true;
47857 /* All implementable with 2 pshufb + 1 ior. */
47858 if (TARGET_SSSE3)
47859 return true;
47860 break;
47861 case E_V2DImode:
47862 case E_V2DFmode:
47863 /* All implementable with shufpd or unpck[lh]pd. */
47864 return true;
47865 default:
47866 return false;
47869 /* Extract the values from the vector CST into the permutation
47870 array in D. */
47871 for (i = which = 0; i < nelt; ++i)
47873 unsigned char e = sel[i];
47874 gcc_assert (e < 2 * nelt);
47875 d.perm[i] = e;
47876 which |= (e < nelt ? 1 : 2);
47879 /* For all elements from second vector, fold the elements to first. */
47880 if (which == 2)
47881 for (i = 0; i < nelt; ++i)
47882 d.perm[i] -= nelt;
47884 /* Check whether the mask can be applied to the vector type. */
47885 d.one_operand_p = (which != 3);
47887 /* Implementable with shufps or pshufd. */
47888 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47889 return true;
47891 /* Otherwise we have to go through the motions and see if we can
47892 figure out how to generate the requested permutation. */
47893 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47894 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47895 if (!d.one_operand_p)
47896 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47898 start_sequence ();
47899 ret = ix86_expand_vec_perm_const_1 (&d);
47900 end_sequence ();
47902 return ret;
47905 void
47906 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47908 struct expand_vec_perm_d d;
47909 unsigned i, nelt;
47911 d.target = targ;
47912 d.op0 = op0;
47913 d.op1 = op1;
47914 d.vmode = GET_MODE (targ);
47915 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47916 d.one_operand_p = false;
47917 d.testing_p = false;
47919 for (i = 0; i < nelt; ++i)
47920 d.perm[i] = i * 2 + odd;
47922 /* We'll either be able to implement the permutation directly... */
47923 if (expand_vec_perm_1 (&d))
47924 return;
47926 /* ... or we use the special-case patterns. */
47927 expand_vec_perm_even_odd_1 (&d, odd);
47930 static void
47931 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47933 struct expand_vec_perm_d d;
47934 unsigned i, nelt, base;
47935 bool ok;
47937 d.target = targ;
47938 d.op0 = op0;
47939 d.op1 = op1;
47940 d.vmode = GET_MODE (targ);
47941 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47942 d.one_operand_p = false;
47943 d.testing_p = false;
47945 base = high_p ? nelt / 2 : 0;
47946 for (i = 0; i < nelt / 2; ++i)
47948 d.perm[i * 2] = i + base;
47949 d.perm[i * 2 + 1] = i + base + nelt;
47952 /* Note that for AVX this isn't one instruction. */
47953 ok = ix86_expand_vec_perm_const_1 (&d);
47954 gcc_assert (ok);
47958 /* Expand a vector operation CODE for a V*QImode in terms of the
47959 same operation on V*HImode. */
47961 void
47962 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47964 machine_mode qimode = GET_MODE (dest);
47965 machine_mode himode;
47966 rtx (*gen_il) (rtx, rtx, rtx);
47967 rtx (*gen_ih) (rtx, rtx, rtx);
47968 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47969 struct expand_vec_perm_d d;
47970 bool ok, full_interleave;
47971 bool uns_p = false;
47972 int i;
47974 switch (qimode)
47976 case E_V16QImode:
47977 himode = V8HImode;
47978 gen_il = gen_vec_interleave_lowv16qi;
47979 gen_ih = gen_vec_interleave_highv16qi;
47980 break;
47981 case E_V32QImode:
47982 himode = V16HImode;
47983 gen_il = gen_avx2_interleave_lowv32qi;
47984 gen_ih = gen_avx2_interleave_highv32qi;
47985 break;
47986 case E_V64QImode:
47987 himode = V32HImode;
47988 gen_il = gen_avx512bw_interleave_lowv64qi;
47989 gen_ih = gen_avx512bw_interleave_highv64qi;
47990 break;
47991 default:
47992 gcc_unreachable ();
47995 op2_l = op2_h = op2;
47996 switch (code)
47998 case MULT:
47999 /* Unpack data such that we've got a source byte in each low byte of
48000 each word. We don't care what goes into the high byte of each word.
48001 Rather than trying to get zero in there, most convenient is to let
48002 it be a copy of the low byte. */
48003 op2_l = gen_reg_rtx (qimode);
48004 op2_h = gen_reg_rtx (qimode);
48005 emit_insn (gen_il (op2_l, op2, op2));
48006 emit_insn (gen_ih (op2_h, op2, op2));
48008 op1_l = gen_reg_rtx (qimode);
48009 op1_h = gen_reg_rtx (qimode);
48010 emit_insn (gen_il (op1_l, op1, op1));
48011 emit_insn (gen_ih (op1_h, op1, op1));
48012 full_interleave = qimode == V16QImode;
48013 break;
48015 case ASHIFT:
48016 case LSHIFTRT:
48017 uns_p = true;
48018 /* FALLTHRU */
48019 case ASHIFTRT:
48020 op1_l = gen_reg_rtx (himode);
48021 op1_h = gen_reg_rtx (himode);
48022 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48023 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48024 full_interleave = true;
48025 break;
48026 default:
48027 gcc_unreachable ();
48030 /* Perform the operation. */
48031 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48032 1, OPTAB_DIRECT);
48033 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48034 1, OPTAB_DIRECT);
48035 gcc_assert (res_l && res_h);
48037 /* Merge the data back into the right place. */
48038 d.target = dest;
48039 d.op0 = gen_lowpart (qimode, res_l);
48040 d.op1 = gen_lowpart (qimode, res_h);
48041 d.vmode = qimode;
48042 d.nelt = GET_MODE_NUNITS (qimode);
48043 d.one_operand_p = false;
48044 d.testing_p = false;
48046 if (full_interleave)
48048 /* For SSE2, we used a full interleave, so the desired
48049 results are in the even elements. */
48050 for (i = 0; i < d.nelt; ++i)
48051 d.perm[i] = i * 2;
48053 else
48055 /* For AVX, the interleave used above was not cross-lane.  So the
48056 extraction takes the even elements, but with the second and third quarters swapped.
48057 Happily, that is even one insn shorter than even extraction.
48058 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
48059 always first from the first and then from the second source operand;
48060 the index bits above the low 4 bits remain the same.
48061 Thus, for d.nelt == 32 we want permutation
48062 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48063 and for d.nelt == 64 we want permutation
48064 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48065 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
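/* E.g. for d.nelt == 32 the formula below gives d.perm[8] == (16 & 14) + 32 + 0 == 32
   and d.perm[16] == 0 + 0 + 16 == 16, matching the permutation above.  */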
48066 for (i = 0; i < d.nelt; ++i)
48067 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48070 ok = ix86_expand_vec_perm_const_1 (&d);
48071 gcc_assert (ok);
48073 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48074 gen_rtx_fmt_ee (code, qimode, op1, op2));
48077 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48078 if op is CONST_VECTOR with all odd elements equal to their
48079 preceding element. */
48081 static bool
48082 const_vector_equal_evenodd_p (rtx op)
48084 machine_mode mode = GET_MODE (op);
48085 int i, nunits = GET_MODE_NUNITS (mode);
48086 if (GET_CODE (op) != CONST_VECTOR
48087 || nunits != CONST_VECTOR_NUNITS (op))
48088 return false;
48089 for (i = 0; i < nunits; i += 2)
48090 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48091 return false;
48092 return true;
48095 void
48096 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48097 bool uns_p, bool odd_p)
48099 machine_mode mode = GET_MODE (op1);
48100 machine_mode wmode = GET_MODE (dest);
48101 rtx x;
48102 rtx orig_op1 = op1, orig_op2 = op2;
48104 if (!nonimmediate_operand (op1, mode))
48105 op1 = force_reg (mode, op1);
48106 if (!nonimmediate_operand (op2, mode))
48107 op2 = force_reg (mode, op2);
48109 /* We only play even/odd games with vectors of SImode. */
48110 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48112 /* If we're looking for the odd results, shift those members down to
48113 the even slots. For some cpus this is faster than a PSHUFD. */
48114 if (odd_p)
48116 /* For XOP use vpmacsdqh, but only for smult, as it is only
48117 signed. */
48118 if (TARGET_XOP && mode == V4SImode && !uns_p)
48120 x = force_reg (wmode, CONST0_RTX (wmode));
48121 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48122 return;
48125 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48126 if (!const_vector_equal_evenodd_p (orig_op1))
48127 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48128 x, NULL, 1, OPTAB_DIRECT);
48129 if (!const_vector_equal_evenodd_p (orig_op2))
48130 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48131 x, NULL, 1, OPTAB_DIRECT);
48132 op1 = gen_lowpart (mode, op1);
48133 op2 = gen_lowpart (mode, op2);
48136 if (mode == V16SImode)
48138 if (uns_p)
48139 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48140 else
48141 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48143 else if (mode == V8SImode)
48145 if (uns_p)
48146 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48147 else
48148 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48150 else if (uns_p)
48151 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48152 else if (TARGET_SSE4_1)
48153 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48154 else
48156 rtx s1, s2, t0, t1, t2;
48158 /* The easiest way to implement this without PMULDQ is to go through
48159 the motions as if we are performing a full 64-bit multiply, except
48160 that we need to do less shuffling of the elements. */
48162 /* Compute the sign-extension, aka highparts, of the two operands. */
48163 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48164 op1, pc_rtx, pc_rtx);
48165 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48166 op2, pc_rtx, pc_rtx);
48168 /* Multiply LO(A) * HI(B), and vice-versa. */
48169 t1 = gen_reg_rtx (wmode);
48170 t2 = gen_reg_rtx (wmode);
48171 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48172 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48174 /* Multiply LO(A) * LO(B). */
48175 t0 = gen_reg_rtx (wmode);
48176 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48178 /* Combine and shift the highparts into place. */
48179 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48180 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48181 1, OPTAB_DIRECT);
48183 /* Combine high and low parts. */
48184 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48185 return;
48187 emit_insn (x);
48190 void
48191 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48192 bool uns_p, bool high_p)
48194 machine_mode wmode = GET_MODE (dest);
48195 machine_mode mode = GET_MODE (op1);
48196 rtx t1, t2, t3, t4, mask;
48198 switch (mode)
48200 case E_V4SImode:
48201 t1 = gen_reg_rtx (mode);
48202 t2 = gen_reg_rtx (mode);
48203 if (TARGET_XOP && !uns_p)
48205 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48206 shuffle the elements once so that all elements are in the right
48207 place for immediate use: { A C B D }. */
48208 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48209 const1_rtx, GEN_INT (3)));
48210 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48211 const1_rtx, GEN_INT (3)));
48213 else
48215 /* Put the elements into place for the multiply. */
48216 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48217 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48218 high_p = false;
48220 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48221 break;
48223 case E_V8SImode:
48224 /* Shuffle the elements between the lanes. After this we
48225 have { A B E F | C D G H } for each operand. */
48226 t1 = gen_reg_rtx (V4DImode);
48227 t2 = gen_reg_rtx (V4DImode);
48228 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48229 const0_rtx, const2_rtx,
48230 const1_rtx, GEN_INT (3)));
48231 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48232 const0_rtx, const2_rtx,
48233 const1_rtx, GEN_INT (3)));
48235 /* Shuffle the elements within the lanes. After this we
48236 have { A A B B | C C D D } or { E E F F | G G H H }. */
48237 t3 = gen_reg_rtx (V8SImode);
48238 t4 = gen_reg_rtx (V8SImode);
48239 mask = GEN_INT (high_p
48240 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48241 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48242 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48243 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48245 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48246 break;
48248 case E_V8HImode:
48249 case E_V16HImode:
48250 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48251 uns_p, OPTAB_DIRECT);
48252 t2 = expand_binop (mode,
48253 uns_p ? umul_highpart_optab : smul_highpart_optab,
48254 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48255 gcc_assert (t1 && t2);
48257 t3 = gen_reg_rtx (mode);
48258 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48259 emit_move_insn (dest, gen_lowpart (wmode, t3));
48260 break;
48262 case E_V16QImode:
48263 case E_V32QImode:
48264 case E_V32HImode:
48265 case E_V16SImode:
48266 case E_V64QImode:
48267 t1 = gen_reg_rtx (wmode);
48268 t2 = gen_reg_rtx (wmode);
48269 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48270 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48272 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48273 break;
48275 default:
48276 gcc_unreachable ();
48280 void
48281 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48283 rtx res_1, res_2, res_3, res_4;
48285 res_1 = gen_reg_rtx (V4SImode);
48286 res_2 = gen_reg_rtx (V4SImode);
48287 res_3 = gen_reg_rtx (V2DImode);
48288 res_4 = gen_reg_rtx (V2DImode);
48289 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48290 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48292 /* Move the results in element 2 down to element 1; we don't care
48293 what goes in elements 2 and 3. Then we can merge the parts
48294 back together with an interleave.
48296 Note that two other sequences were tried:
48297 (1) Use interleaves at the start instead of psrldq, which allows
48298 us to use a single shufps to merge things back at the end.
48299 (2) Use shufps here to combine the two vectors, then pshufd to
48300 put the elements in the correct order.
48301 In both cases the cost of the reformatting stall was too high
48302 and the overall sequence slower. */
48304 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48305 const0_rtx, const2_rtx,
48306 const0_rtx, const0_rtx));
48307 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48308 const0_rtx, const2_rtx,
48309 const0_rtx, const0_rtx));
48310 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48312 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48315 void
48316 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48318 machine_mode mode = GET_MODE (op0);
48319 rtx t1, t2, t3, t4, t5, t6;
48321 if (TARGET_AVX512DQ && mode == V8DImode)
48322 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48323 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48324 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48325 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48326 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48327 else if (TARGET_XOP && mode == V2DImode)
48329 /* op1: A,B,C,D, op2: E,F,G,H */
48330 op1 = gen_lowpart (V4SImode, op1);
48331 op2 = gen_lowpart (V4SImode, op2);
48333 t1 = gen_reg_rtx (V4SImode);
48334 t2 = gen_reg_rtx (V4SImode);
48335 t3 = gen_reg_rtx (V2DImode);
48336 t4 = gen_reg_rtx (V2DImode);
48338 /* t1: B,A,D,C */
48339 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48340 GEN_INT (1),
48341 GEN_INT (0),
48342 GEN_INT (3),
48343 GEN_INT (2)));
48345 /* t2: (B*E),(A*F),(D*G),(C*H) */
48346 emit_insn (gen_mulv4si3 (t2, t1, op2));
48348 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48349 emit_insn (gen_xop_phadddq (t3, t2));
48351 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48352 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48354 /* Multiply the lower parts and add everything together. */
48355 t5 = gen_reg_rtx (V2DImode);
48356 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48357 gen_lowpart (V4SImode, op1),
48358 gen_lowpart (V4SImode, op2)));
48359 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48362 else
48364 machine_mode nmode;
48365 rtx (*umul) (rtx, rtx, rtx);
48367 if (mode == V2DImode)
48369 umul = gen_vec_widen_umult_even_v4si;
48370 nmode = V4SImode;
48372 else if (mode == V4DImode)
48374 umul = gen_vec_widen_umult_even_v8si;
48375 nmode = V8SImode;
48377 else if (mode == V8DImode)
48379 umul = gen_vec_widen_umult_even_v16si;
48380 nmode = V16SImode;
48382 else
48383 gcc_unreachable ();
48386 /* Multiply low parts. */
48387 t1 = gen_reg_rtx (mode);
48388 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48390 /* Shift input vectors right 32 bits so we can multiply high parts. */
48391 t6 = GEN_INT (32);
48392 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48393 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48395 /* Multiply high parts by low parts. */
48396 t4 = gen_reg_rtx (mode);
48397 t5 = gen_reg_rtx (mode);
48398 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48399 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48401 /* Combine and shift the highparts back. */
48402 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48403 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48405 /* Combine high and low parts. */
48406 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48409 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48410 gen_rtx_MULT (mode, op1, op2));
48413 /* Return 1 if control transfer instruction INSN
48414 should be encoded with the bnd prefix.
48415 If insn is NULL then return 1 when control
48416 transfer instructions should be prefixed with
48417 bnd by default for the current function. */
48419 bool
48420 ix86_bnd_prefixed_insn_p (rtx insn)
48422 /* For call insns check special flag. */
48423 if (insn && CALL_P (insn))
48425 rtx call = get_call_rtx_from (insn);
48426 if (call)
48427 return CALL_EXPR_WITH_BOUNDS_P (call);
48430 /* All other insns are prefixed only if function is instrumented. */
48431 return chkp_function_instrumented_p (current_function_decl);
48434 /* Return 1 if control transfer instruction INSN
48435 should be encoded with the notrack prefix. */
48437 static bool
48438 ix86_notrack_prefixed_insn_p (rtx insn)
48440 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48441 return false;
48443 if (CALL_P (insn))
48445 rtx call = get_call_rtx_from (insn);
48446 gcc_assert (call != NULL_RTX);
48447 rtx addr = XEXP (call, 0);
48449 /* Do not emit 'notrack' if it's not an indirect call. */
48450 if (MEM_P (addr)
48451 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48452 return false;
48453 else
48454 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48457 if (JUMP_P (insn) && !flag_cet_switch)
48459 rtx target = JUMP_LABEL (insn);
48460 if (target == NULL_RTX || ANY_RETURN_P (target))
48461 return false;
48463 /* Check whether the jump is a switch table jump. */
48464 rtx_insn *label = as_a<rtx_insn *> (target);
48465 rtx_insn *table = next_insn (label);
48466 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48467 return false;
48468 else
48469 return true;
48471 return false;
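/* Illustration (assumption, not taken from this file): with
   -fcf-protection=branch on an IBT target, an indirect call through

     void (*fp) (void) __attribute__ ((nocf_check));

   carries a REG_CALL_NOCF_CHECK note, so the hook above returns true and
   the call is emitted with a notrack prefix; a direct call to a
   SYMBOL_REF never gets the prefix.  */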
48474 /* Calculate integer abs() using only SSE2 instructions. */
48476 void
48477 ix86_expand_sse2_abs (rtx target, rtx input)
48479 machine_mode mode = GET_MODE (target);
48480 rtx tmp0, tmp1, x;
48482 switch (mode)
48484 /* For 32-bit signed integer X, the best way to calculate the absolute
48485 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48486 case E_V4SImode:
48487 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48488 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48489 NULL, 0, OPTAB_DIRECT);
48490 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48491 NULL, 0, OPTAB_DIRECT);
48492 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48493 target, 0, OPTAB_DIRECT);
48494 break;
48496 /* For 16-bit signed integer X, the best way to calculate the absolute
48497 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48498 case E_V8HImode:
48499 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48501 x = expand_simple_binop (mode, SMAX, tmp0, input,
48502 target, 0, OPTAB_DIRECT);
48503 break;
48505 /* For 8-bit signed integer X, the best way to calculate the absolute
48506 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48507 as SSE2 provides the PMINUB insn. */
48508 case E_V16QImode:
48509 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48511 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48512 target, 0, OPTAB_DIRECT);
48513 break;
48515 default:
48516 gcc_unreachable ();
48519 if (x != target)
48520 emit_move_insn (target, x);
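/* Illustration only (scalar sketch, not compiler code): the V4SImode case
   above builds, per element, the identity
     s = x >> 31        (arithmetic shift broadcasts the sign bit)
     abs = (s ^ x) - s
   e.g. for x = -5: s = -1, s ^ x = 4, and 4 - (-1) = 5.  */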
48523 /* Expand an extract from a vector register through pextr insn.
48524 Return true if successful. */
48526 bool
48527 ix86_expand_pextr (rtx *operands)
48529 rtx dst = operands[0];
48530 rtx src = operands[1];
48532 unsigned int size = INTVAL (operands[2]);
48533 unsigned int pos = INTVAL (operands[3]);
48535 if (SUBREG_P (dst))
48537 /* Reject non-lowpart subregs. */
48538 if (SUBREG_BYTE (dst) > 0)
48539 return false;
48540 dst = SUBREG_REG (dst);
48543 if (SUBREG_P (src))
48545 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48546 src = SUBREG_REG (src);
48549 switch (GET_MODE (src))
48551 case E_V16QImode:
48552 case E_V8HImode:
48553 case E_V4SImode:
48554 case E_V2DImode:
48555 case E_V1TImode:
48556 case E_TImode:
48558 machine_mode srcmode, dstmode;
48559 rtx d, pat;
48561 if (!int_mode_for_size (size, 0).exists (&dstmode))
48562 return false;
48564 switch (dstmode)
48566 case E_QImode:
48567 if (!TARGET_SSE4_1)
48568 return false;
48569 srcmode = V16QImode;
48570 break;
48572 case E_HImode:
48573 if (!TARGET_SSE2)
48574 return false;
48575 srcmode = V8HImode;
48576 break;
48578 case E_SImode:
48579 if (!TARGET_SSE4_1)
48580 return false;
48581 srcmode = V4SImode;
48582 break;
48584 case E_DImode:
48585 gcc_assert (TARGET_64BIT);
48586 if (!TARGET_SSE4_1)
48587 return false;
48588 srcmode = V2DImode;
48589 break;
48591 default:
48592 return false;
48595 /* Reject extractions from misaligned positions. */
48596 if (pos & (size-1))
48597 return false;
48599 if (GET_MODE (dst) == dstmode)
48600 d = dst;
48601 else
48602 d = gen_reg_rtx (dstmode);
48604 /* Construct insn pattern. */
48605 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48606 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48608 /* Let the rtl optimizers know about the zero extension performed. */
48609 if (dstmode == QImode || dstmode == HImode)
48611 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48612 d = gen_lowpart (SImode, d);
48615 emit_insn (gen_rtx_SET (d, pat));
48617 if (d != dst)
48618 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48619 return true;
48622 default:
48623 return false;
48627 /* Expand an insert into a vector register through pinsr insn.
48628 Return true if successful. */
48630 bool
48631 ix86_expand_pinsr (rtx *operands)
48633 rtx dst = operands[0];
48634 rtx src = operands[3];
48636 unsigned int size = INTVAL (operands[1]);
48637 unsigned int pos = INTVAL (operands[2]);
48639 if (SUBREG_P (dst))
48641 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48642 dst = SUBREG_REG (dst);
48645 switch (GET_MODE (dst))
48647 case E_V16QImode:
48648 case E_V8HImode:
48649 case E_V4SImode:
48650 case E_V2DImode:
48651 case E_V1TImode:
48652 case E_TImode:
48654 machine_mode srcmode, dstmode;
48655 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48656 rtx d;
48658 if (!int_mode_for_size (size, 0).exists (&srcmode))
48659 return false;
48661 switch (srcmode)
48663 case E_QImode:
48664 if (!TARGET_SSE4_1)
48665 return false;
48666 dstmode = V16QImode;
48667 pinsr = gen_sse4_1_pinsrb;
48668 break;
48670 case E_HImode:
48671 if (!TARGET_SSE2)
48672 return false;
48673 dstmode = V8HImode;
48674 pinsr = gen_sse2_pinsrw;
48675 break;
48677 case E_SImode:
48678 if (!TARGET_SSE4_1)
48679 return false;
48680 dstmode = V4SImode;
48681 pinsr = gen_sse4_1_pinsrd;
48682 break;
48684 case E_DImode:
48685 gcc_assert (TARGET_64BIT);
48686 if (!TARGET_SSE4_1)
48687 return false;
48688 dstmode = V2DImode;
48689 pinsr = gen_sse4_1_pinsrq;
48690 break;
48692 default:
48693 return false;
48696 /* Reject insertions to misaligned positions. */
48697 if (pos & (size-1))
48698 return false;
48700 if (SUBREG_P (src))
48702 unsigned int srcpos = SUBREG_BYTE (src);
48704 if (srcpos > 0)
48706 rtx extr_ops[4];
48708 extr_ops[0] = gen_reg_rtx (srcmode);
48709 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48710 extr_ops[2] = GEN_INT (size);
48711 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48713 if (!ix86_expand_pextr (extr_ops))
48714 return false;
48716 src = extr_ops[0];
48718 else
48719 src = gen_lowpart (srcmode, SUBREG_REG (src));
48722 if (GET_MODE (dst) == dstmode)
48723 d = dst;
48724 else
48725 d = gen_reg_rtx (dstmode);
48727 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48728 gen_lowpart (srcmode, src),
48729 GEN_INT (1 << (pos / size))));
48730 if (d != dst)
48731 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48732 return true;
48735 default:
48736 return false;
48740 /* This function returns the calling-ABI-specific va_list type node.
48741 It returns the FNDECL-specific va_list type. */
48743 static tree
48744 ix86_fn_abi_va_list (tree fndecl)
48746 if (!TARGET_64BIT)
48747 return va_list_type_node;
48748 gcc_assert (fndecl != NULL_TREE);
48750 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48751 return ms_va_list_type_node;
48752 else
48753 return sysv_va_list_type_node;
48756 /* Returns the canonical va_list type specified by TYPE. If there
48757 is no valid TYPE provided, it returns NULL_TREE. */
48759 static tree
48760 ix86_canonical_va_list_type (tree type)
48762 if (TARGET_64BIT)
48764 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48765 return ms_va_list_type_node;
48767 if ((TREE_CODE (type) == ARRAY_TYPE
48768 && integer_zerop (array_type_nelts (type)))
48769 || POINTER_TYPE_P (type))
48771 tree elem_type = TREE_TYPE (type);
48772 if (TREE_CODE (elem_type) == RECORD_TYPE
48773 && lookup_attribute ("sysv_abi va_list",
48774 TYPE_ATTRIBUTES (elem_type)))
48775 return sysv_va_list_type_node;
48778 return NULL_TREE;
48781 return std_canonical_va_list_type (type);
48784 /* Iterate through the target-specific builtin types for va_list.
48785 IDX denotes the iterator, *PTREE is set to the result type of
48786 the va_list builtin, and *PNAME to its internal type.
48787 Returns zero if there is no element for this index, otherwise
48788 IDX should be increased upon the next call.
48789 Note, do not iterate a base builtin's name like __builtin_va_list.
48790 Used from c_common_nodes_and_builtins. */
48792 static int
48793 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48795 if (TARGET_64BIT)
48797 switch (idx)
48799 default:
48800 break;
48802 case 0:
48803 *ptree = ms_va_list_type_node;
48804 *pname = "__builtin_ms_va_list";
48805 return 1;
48807 case 1:
48808 *ptree = sysv_va_list_type_node;
48809 *pname = "__builtin_sysv_va_list";
48810 return 1;
48814 return 0;
48817 #undef TARGET_SCHED_DISPATCH
48818 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48819 #undef TARGET_SCHED_DISPATCH_DO
48820 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48821 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48822 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48823 #undef TARGET_SCHED_REORDER
48824 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48825 #undef TARGET_SCHED_ADJUST_PRIORITY
48826 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48827 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48828 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48829 ix86_dependencies_evaluation_hook
48832 /* Implementation of the reassociation_width target hook, used by the
48833 reassoc phase to identify the parallelism level in a reassociated
48834 tree. The statement's tree_code is passed in OP; the arguments'
48835 type is passed in MODE. */
48837 static int
48838 ix86_reassociation_width (unsigned int op, machine_mode mode)
48840 int width = 1;
48841 /* Vector part. */
48842 if (VECTOR_MODE_P (mode))
48844 int div = 1;
48845 if (INTEGRAL_MODE_P (mode))
48846 width = ix86_cost->reassoc_vec_int;
48847 else if (FLOAT_MODE_P (mode))
48848 width = ix86_cost->reassoc_vec_fp;
48850 if (width == 1)
48851 return 1;
48853 /* Integer vector instructions execute in the FP unit
48854 and can execute 3 additions and one multiplication per cycle. */
48855 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48856 && op != PLUS && op != MINUS)
48857 return 1;
48859 /* Account for targets that split wide vectors into multiple parts. */
48860 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48861 div = GET_MODE_BITSIZE (mode) / 128;
48862 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48863 div = GET_MODE_BITSIZE (mode) / 64;
48864 width = (width + div - 1) / div;
48866 /* Scalar part. */
48867 else if (INTEGRAL_MODE_P (mode))
48868 width = ix86_cost->reassoc_int;
48869 else if (FLOAT_MODE_P (mode))
48870 width = ix86_cost->reassoc_fp;
48872 /* Avoid using too many registers in 32-bit mode. */
48873 if (!TARGET_64BIT && width > 2)
48874 width = 2;
48875 return width;
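/* Worked example (hypothetical numbers, for illustration only): if
   reassoc_vec_fp were 4 and MODE were a 256-bit float vector mode on a
   TARGET_AVX128_OPTIMAL tune, then div would be 256/128 = 2 and the hook
   would return (4 + 2 - 1) / 2 = 2.  */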
48878 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48879 place emms and femms instructions. */
48881 static machine_mode
48882 ix86_preferred_simd_mode (scalar_mode mode)
48884 if (!TARGET_SSE)
48885 return word_mode;
48887 switch (mode)
48889 case E_QImode:
48890 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48891 return V64QImode;
48892 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48893 return V32QImode;
48894 else
48895 return V16QImode;
48897 case E_HImode:
48898 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48899 return V32HImode;
48900 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48901 return V16HImode;
48902 else
48903 return V8HImode;
48905 case E_SImode:
48906 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48907 return V16SImode;
48908 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48909 return V8SImode;
48910 else
48911 return V4SImode;
48913 case E_DImode:
48914 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48915 return V8DImode;
48916 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48917 return V4DImode;
48918 else
48919 return V2DImode;
48921 case E_SFmode:
48922 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48923 return V16SFmode;
48924 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48925 return V8SFmode;
48926 else
48927 return V4SFmode;
48929 case E_DFmode:
48930 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48931 return V8DFmode;
48932 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48933 return V4DFmode;
48934 else if (TARGET_SSE2)
48935 return V2DFmode;
48936 /* FALLTHRU */
48938 default:
48939 return word_mode;
48943 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
48944 vectors. If AVX512F is enabled then try vectorizing with 512bit,
48945 256bit and 128bit vectors. */
48947 static unsigned int
48948 ix86_autovectorize_vector_sizes (void)
48950 unsigned int bytesizes = 0;
48952 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48953 bytesizes |= (64 | 32 | 16);
48954 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48955 bytesizes |= (32 | 16);
48957 return bytesizes;
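/* Illustration (not in the original comment): the return value is a
   bitmask of candidate vector sizes in bytes.  E.g. plain AVX2 yields
   32 | 16 == 48, so the vectorizer tries 256-bit and then 128-bit
   vectors; with AVX512F (and no prefer-256 tuning) 64-byte vectors are
   tried as well.  */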
48960 /* Implementation of targetm.vectorize.get_mask_mode. */
48962 static opt_machine_mode
48963 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
48965 unsigned elem_size = vector_size / nunits;
48967 /* Scalar mask case. */
48968 if ((TARGET_AVX512F && vector_size == 64)
48969 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48971 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48972 return smallest_int_mode_for_size (nunits);
48975 scalar_int_mode elem_mode
48976 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48978 gcc_assert (elem_size * nunits == vector_size);
48980 return mode_for_vector (elem_mode, nunits);
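/* Examples (illustrative, not from the original source): for
   TARGET_AVX512BW with vector_size == 64 and nunits == 64 (a V64QI
   vector), the scalar-mask path returns the 64-bit integer mode, i.e. a
   DImode mask.  Without AVX512VL, a 16-byte vector of 4 ints
   (elem_size == 4) falls through and yields the vector mask mode
   V4SImode.  */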
48985 /* Return class of registers which could be used for pseudo of MODE
48986 and of class RCLASS for spilling instead of memory. Return NO_REGS
48987 if it is not possible or non-profitable. */
48989 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48991 static reg_class_t
48992 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48994 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48995 && TARGET_SSE2
48996 && TARGET_INTER_UNIT_MOVES_TO_VEC
48997 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48998 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48999 && INTEGER_CLASS_P (rclass))
49000 return ALL_SSE_REGS;
49001 return NO_REGS;
49004 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49005 but returns a lower bound. */
49007 static unsigned int
49008 ix86_max_noce_ifcvt_seq_cost (edge e)
49010 bool predictable_p = predictable_edge_p (e);
49012 enum compiler_param param
49013 = (predictable_p
49014 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49015 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49017 /* If we have a parameter set, use that, otherwise take a guess using
49018 BRANCH_COST. */
49019 if (global_options_set.x_param_values[param])
49020 return PARAM_VALUE (param);
49021 else
49022 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49025 /* Return true if SEQ is a good candidate as a replacement for the
49026 if-convertible sequence described in IF_INFO. */
49028 static bool
49029 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49031 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49033 int cmov_cnt = 0;
49034 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49035 Maybe we should allow even more conditional moves as long as they
49036 are used far enough not to stall the CPU, or also consider
49037 IF_INFO->TEST_BB succ edge probabilities. */
49038 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49040 rtx set = single_set (insn);
49041 if (!set)
49042 continue;
49043 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49044 continue;
49045 rtx src = SET_SRC (set);
49046 machine_mode mode = GET_MODE (src);
49047 if (GET_MODE_CLASS (mode) != MODE_INT
49048 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49049 continue;
49050 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49051 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49052 continue;
49053 /* insn is CMOV or FCMOV. */
49054 if (++cmov_cnt > 1)
49055 return false;
49058 return default_noce_conversion_profitable_p (seq, if_info);
49061 /* Implement targetm.vectorize.init_cost. */
49063 static void *
49064 ix86_init_cost (struct loop *)
49066 unsigned *cost = XNEWVEC (unsigned, 3);
49067 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49068 return cost;
49071 /* Implement targetm.vectorize.add_stmt_cost. */
49073 static unsigned
49074 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49075 struct _stmt_vec_info *stmt_info, int misalign,
49076 enum vect_cost_model_location where)
49078 unsigned *cost = (unsigned *) data;
49079 unsigned retval = 0;
49081 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49082 int stmt_cost = -1;
49084 if ((kind == vector_stmt || kind == scalar_stmt)
49085 && stmt_info
49086 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49088 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49089 bool fp = false;
49090 machine_mode mode = TImode;
49092 if (vectype != NULL)
49094 fp = FLOAT_TYPE_P (vectype);
49095 mode = TYPE_MODE (vectype);
49097 /*machine_mode inner_mode = mode;
49098 if (VECTOR_MODE_P (mode))
49099 inner_mode = GET_MODE_INNER (mode);*/
49101 switch (subcode)
49103 case PLUS_EXPR:
49104 case POINTER_PLUS_EXPR:
49105 case MINUS_EXPR:
49106 if (kind == scalar_stmt)
49108 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49109 stmt_cost = ix86_cost->addss;
49110 else if (X87_FLOAT_MODE_P (mode))
49111 stmt_cost = ix86_cost->fadd;
49112 else
49113 stmt_cost = ix86_cost->add;
49115 else
49116 stmt_cost = ix86_vec_cost (mode,
49117 fp ? ix86_cost->addss
49118 : ix86_cost->sse_op,
49119 true);
49120 break;
49122 case MULT_EXPR:
49123 case WIDEN_MULT_EXPR:
49124 case MULT_HIGHPART_EXPR:
49125 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49126 break;
49127 case FMA_EXPR:
49128 stmt_cost = ix86_vec_cost (mode,
49129 mode == SFmode ? ix86_cost->fmass
49130 : ix86_cost->fmasd,
49131 true);
49132 break;
49133 case NEGATE_EXPR:
49134 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49135 stmt_cost = ix86_cost->sse_op;
49136 else if (X87_FLOAT_MODE_P (mode))
49137 stmt_cost = ix86_cost->fchs;
49138 else if (VECTOR_MODE_P (mode))
49139 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49140 else
49141 stmt_cost = ix86_cost->add;
49142 break;
49143 case TRUNC_DIV_EXPR:
49144 case CEIL_DIV_EXPR:
49145 case FLOOR_DIV_EXPR:
49146 case ROUND_DIV_EXPR:
49147 case TRUNC_MOD_EXPR:
49148 case CEIL_MOD_EXPR:
49149 case FLOOR_MOD_EXPR:
49150 case RDIV_EXPR:
49151 case ROUND_MOD_EXPR:
49152 case EXACT_DIV_EXPR:
49153 stmt_cost = ix86_division_cost (ix86_cost, mode);
49154 break;
49156 case RSHIFT_EXPR:
49157 case LSHIFT_EXPR:
49158 case LROTATE_EXPR:
49159 case RROTATE_EXPR:
49161 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49162 stmt_cost = ix86_shift_rotate_cost
49163 (ix86_cost, mode,
49164 TREE_CODE (op2) == INTEGER_CST,
49165 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49166 true, false, false, NULL, NULL);
49168 break;
49169 case NOP_EXPR:
49170 stmt_cost = 0;
49171 break;
49173 case BIT_IOR_EXPR:
49174 case ABS_EXPR:
49175 case MIN_EXPR:
49176 case MAX_EXPR:
49177 case BIT_XOR_EXPR:
49178 case BIT_AND_EXPR:
49179 case BIT_NOT_EXPR:
49180 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49181 stmt_cost = ix86_cost->sse_op;
49182 else if (VECTOR_MODE_P (mode))
49183 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49184 else
49185 stmt_cost = ix86_cost->add;
49186 break;
49187 default:
49188 break;
49191 if (stmt_cost == -1)
49192 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49194 /* Penalize DFmode vector operations for Bonnell. */
49195 if (TARGET_BONNELL && kind == vector_stmt
49196 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49197 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49199 /* Statements in an inner loop relative to the loop being
49200 vectorized are weighted more heavily. The value here is
49201 arbitrary and could potentially be improved with analysis. */
49202 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49203 count *= 50; /* FIXME. */
49205 retval = (unsigned) (count * stmt_cost);
49207 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49208 for Silvermont, as it has an out-of-order integer pipeline and can execute
49209 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49210 if ((TARGET_SILVERMONT || TARGET_INTEL)
49211 && stmt_info && stmt_info->stmt)
49213 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49214 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49215 retval = (retval * 17) / 10;
49218 cost[where] += retval;
49220 return retval;
49223 /* Implement targetm.vectorize.finish_cost. */
49225 static void
49226 ix86_finish_cost (void *data, unsigned *prologue_cost,
49227 unsigned *body_cost, unsigned *epilogue_cost)
49229 unsigned *cost = (unsigned *) data;
49230 *prologue_cost = cost[vect_prologue];
49231 *body_cost = cost[vect_body];
49232 *epilogue_cost = cost[vect_epilogue];
49235 /* Implement targetm.vectorize.destroy_cost_data. */
49237 static void
49238 ix86_destroy_cost_data (void *data)
49240 free (data);
49243 /* Validate target specific memory model bits in VAL. */
49245 static unsigned HOST_WIDE_INT
49246 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49248 enum memmodel model = memmodel_from_int (val);
49249 bool strong;
49251 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49252 |MEMMODEL_MASK)
49253 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49255 warning (OPT_Winvalid_memory_model,
49256 "unknown architecture specific memory model");
49257 return MEMMODEL_SEQ_CST;
49259 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49260 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49262 warning (OPT_Winvalid_memory_model,
49263 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49264 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49266 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49268 warning (OPT_Winvalid_memory_model,
49269 "HLE_RELEASE not used with RELEASE or stronger memory model");
49270 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49272 return val;
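/* Usage sketch (assumption, mirroring the documented __atomic interface
   rather than anything in this file): the extra bits validated above come
   from user code such as

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     ...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Pairing HLE_ACQUIRE with a model weaker than ACQUIRE (or HLE_RELEASE
   with one weaker than RELEASE) triggers the warnings above.  */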
49275 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49276 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49277 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49278 or number of vecsize_mangle variants that should be emitted. */
49280 static int
49281 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49282 struct cgraph_simd_clone *clonei,
49283 tree base_type, int num)
49285 int ret = 1;
49287 if (clonei->simdlen
49288 && (clonei->simdlen < 2
49289 || clonei->simdlen > 1024
49290 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49292 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49293 "unsupported simdlen %d", clonei->simdlen);
49294 return 0;
49297 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49298 if (TREE_CODE (ret_type) != VOID_TYPE)
49299 switch (TYPE_MODE (ret_type))
49301 case E_QImode:
49302 case E_HImode:
49303 case E_SImode:
49304 case E_DImode:
49305 case E_SFmode:
49306 case E_DFmode:
49307 /* case E_SCmode: */
49308 /* case E_DCmode: */
49309 break;
49310 default:
49311 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49312 "unsupported return type %qT for simd\n", ret_type);
49313 return 0;
49316 tree t;
49317 int i;
49319 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49320 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49321 switch (TYPE_MODE (TREE_TYPE (t)))
49323 case E_QImode:
49324 case E_HImode:
49325 case E_SImode:
49326 case E_DImode:
49327 case E_SFmode:
49328 case E_DFmode:
49329 /* case E_SCmode: */
49330 /* case E_DCmode: */
49331 break;
49332 default:
49333 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49334 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49335 return 0;
49338 if (!TREE_PUBLIC (node->decl))
49340 /* If the function isn't exported, we can pick up just one ISA
49341 for the clones. */
49342 if (TARGET_AVX512F)
49343 clonei->vecsize_mangle = 'e';
49344 else if (TARGET_AVX2)
49345 clonei->vecsize_mangle = 'd';
49346 else if (TARGET_AVX)
49347 clonei->vecsize_mangle = 'c';
49348 else
49349 clonei->vecsize_mangle = 'b';
49350 ret = 1;
49352 else
49354 clonei->vecsize_mangle = "bcde"[num];
49355 ret = 4;
49357 clonei->mask_mode = VOIDmode;
49358 switch (clonei->vecsize_mangle)
49360 case 'b':
49361 clonei->vecsize_int = 128;
49362 clonei->vecsize_float = 128;
49363 break;
49364 case 'c':
49365 clonei->vecsize_int = 128;
49366 clonei->vecsize_float = 256;
49367 break;
49368 case 'd':
49369 clonei->vecsize_int = 256;
49370 clonei->vecsize_float = 256;
49371 break;
49372 case 'e':
49373 clonei->vecsize_int = 512;
49374 clonei->vecsize_float = 512;
49375 if (TYPE_MODE (base_type) == QImode)
49376 clonei->mask_mode = DImode;
49377 else
49378 clonei->mask_mode = SImode;
49379 break;
49381 if (clonei->simdlen == 0)
49383 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49384 clonei->simdlen = clonei->vecsize_int;
49385 else
49386 clonei->simdlen = clonei->vecsize_float;
49387 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49389 else if (clonei->simdlen > 16)
49391 /* For compatibility with ICC, use the same upper bounds
49392 for simdlen. In particular, for CTYPE below, use the return type,
49393 unless the function returns void, in which case use the characteristic
49394 type. If it is possible for given SIMDLEN to pass CTYPE value
49395 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49396 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49397 emit corresponding clone. */
49398 tree ctype = ret_type;
49399 if (TREE_CODE (ret_type) == VOID_TYPE)
49400 ctype = base_type;
49401 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49402 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49403 cnt /= clonei->vecsize_int;
49404 else
49405 cnt /= clonei->vecsize_float;
49406 if (cnt > (TARGET_64BIT ? 16 : 8))
49408 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49409 "unsupported simdlen %d", clonei->simdlen);
49410 return 0;
49413 return ret;
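/* Worked example (illustrative): for a non-exported

     #pragma omp declare simd
     float f (float x);

   compiled with only SSE2, the 'b' variant is chosen, vecsize_float is
   128, and simdlen becomes 128 / 32 = 4.  The AVX-512 'e' variant of the
   same function would get simdlen 512 / 32 = 16.  */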
49416 /* Add target attribute to SIMD clone NODE if needed. */
49418 static void
49419 ix86_simd_clone_adjust (struct cgraph_node *node)
49421 const char *str = NULL;
49422 gcc_assert (node->decl == cfun->decl);
49423 switch (node->simdclone->vecsize_mangle)
49425 case 'b':
49426 if (!TARGET_SSE2)
49427 str = "sse2";
49428 break;
49429 case 'c':
49430 if (!TARGET_AVX)
49431 str = "avx";
49432 break;
49433 case 'd':
49434 if (!TARGET_AVX2)
49435 str = "avx2";
49436 break;
49437 case 'e':
49438 if (!TARGET_AVX512F)
49439 str = "avx512f";
49440 break;
49441 default:
49442 gcc_unreachable ();
49444 if (str == NULL)
49445 return;
49446 push_cfun (NULL);
49447 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49448 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49449 gcc_assert (ok);
49450 pop_cfun ();
49451 ix86_reset_previous_fndecl ();
49452 ix86_set_current_function (node->decl);
49455 /* If SIMD clone NODE can't be used in a vectorized loop
49456 in the current function, return -1; otherwise return a badness of using it
49457 (0 if it is most desirable from the vecsize_mangle point of view, 1
49458 slightly less desirable, etc.). */
49460 static int
49461 ix86_simd_clone_usable (struct cgraph_node *node)
49463 switch (node->simdclone->vecsize_mangle)
49465 case 'b':
49466 if (!TARGET_SSE2)
49467 return -1;
49468 if (!TARGET_AVX)
49469 return 0;
49470 return TARGET_AVX2 ? 2 : 1;
49471 case 'c':
49472 if (!TARGET_AVX)
49473 return -1;
49474 return TARGET_AVX2 ? 1 : 0;
49475 case 'd':
49476 if (!TARGET_AVX2)
49477 return -1;
49478 return 0;
49479 case 'e':
49480 if (!TARGET_AVX512F)
49481 return -1;
49482 return 0;
49483 default:
49484 gcc_unreachable ();
49488 /* This function adjusts the unroll factor based on
49489 the hardware capabilities. For example, bdver3 has
49490 a loop buffer which makes unrolling of smaller
49491 loops less important. This function decides the
49492 unroll factor using the number of memory references
49493 (the value 32 is used) as a heuristic. */
49495 static unsigned
49496 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49498 basic_block *bbs;
49499 rtx_insn *insn;
49500 unsigned i;
49501 unsigned mem_count = 0;
49503 if (!TARGET_ADJUST_UNROLL)
49504 return nunroll;
49506 /* Count the number of memory references within the loop body.
49507 This value determines the unrolling factor for bdver3 and bdver4
49508 architectures. */
49509 subrtx_iterator::array_type array;
49510 bbs = get_loop_body (loop);
49511 for (i = 0; i < loop->num_nodes; i++)
49512 FOR_BB_INSNS (bbs[i], insn)
49513 if (NONDEBUG_INSN_P (insn))
49514 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49515 if (const_rtx x = *iter)
49516 if (MEM_P (x))
49518 machine_mode mode = GET_MODE (x);
49519 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49520 if (n_words > 4)
49521 mem_count += 2;
49522 else
49523 mem_count += 1;
49525 free (bbs);
49527 if (mem_count && mem_count <= 32)
49528 return 32 / mem_count;
49530 return nunroll;
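/* Worked example (illustrative): on a bdver3/bdver4 tune, a loop body
   containing 8 counted memory references gets an unroll factor of
   32 / 8 = 4, regardless of the NUNROLL suggested by the caller; with
   more than 32 references the caller's value is kept.  */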
49534 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49536 static bool
49537 ix86_float_exceptions_rounding_supported_p (void)
49539 /* For x87 floating point with standard excess precision handling,
49540 there is no adddf3 pattern (since x87 floating point only has
49541 XFmode operations) so the default hook implementation gets this
49542 wrong. */
49543 return TARGET_80387 || TARGET_SSE_MATH;
49546 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49548 static void
49549 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49551 if (!TARGET_80387 && !TARGET_SSE_MATH)
49552 return;
49553 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49554 if (TARGET_80387)
49556 tree fenv_index_type = build_index_type (size_int (6));
49557 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49558 tree fenv_var = create_tmp_var_raw (fenv_type);
49559 TREE_ADDRESSABLE (fenv_var) = 1;
49560 tree fenv_ptr = build_pointer_type (fenv_type);
49561 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49562 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49563 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49564 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49565 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49566 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49567 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49568 tree hold_fnclex = build_call_expr (fnclex, 0);
49569 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49570 NULL_TREE, NULL_TREE);
49571 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49572 hold_fnclex);
49573 *clear = build_call_expr (fnclex, 0);
49574 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49575 tree fnstsw_call = build_call_expr (fnstsw, 0);
49576 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49577 sw_var, fnstsw_call);
49578 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49579 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49580 exceptions_var, exceptions_x87);
49581 *update = build2 (COMPOUND_EXPR, integer_type_node,
49582 sw_mod, update_mod);
49583 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49584 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49586 if (TARGET_SSE_MATH)
49588 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49589 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49590 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49591 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49592 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49593 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49594 mxcsr_orig_var, stmxcsr_hold_call);
49595 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49596 mxcsr_orig_var,
49597 build_int_cst (unsigned_type_node, 0x1f80));
49598 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49599 build_int_cst (unsigned_type_node, 0xffffffc0));
49600 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49601 mxcsr_mod_var, hold_mod_val);
49602 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49603 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49604 hold_assign_orig, hold_assign_mod);
49605 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49606 ldmxcsr_hold_call);
49607 if (*hold)
49608 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49609 else
49610 *hold = hold_all;
49611 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49612 if (*clear)
49613 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49614 ldmxcsr_clear_call);
49615 else
49616 *clear = ldmxcsr_clear_call;
49617 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49618 tree exceptions_sse = fold_convert (integer_type_node,
49619 stxmcsr_update_call);
49620 if (*update)
49622 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49623 exceptions_var, exceptions_sse);
49624 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49625 exceptions_var, exceptions_mod);
49626 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49627 exceptions_assign);
49629 else
49630 *update = build2 (MODIFY_EXPR, integer_type_node,
49631 exceptions_var, exceptions_sse);
49632 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49633 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49634 ldmxcsr_update_call);
49636 tree atomic_feraiseexcept
49637 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49638 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49639 1, exceptions_var);
49640 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49641 atomic_feraiseexcept_call);
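/* Worked example (illustrative): the *hold sequence built above computes
   mxcsr_mod = (mxcsr | 0x1f80) & 0xffffffc0, i.e. it sets the six
   exception mask bits (bits 7-12, 0x1f80) and clears the six exception
   flag bits (bits 0-5, 0x3f), so the protected region runs with all SSE
   exceptions masked and no stale flags; *update later ORs the new flags
   into EXCEPTIONS_VAR and restores the original MXCSR.  */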
49644 /* Return mode to be used for bounds or VOIDmode
49645 if bounds are not supported. */
49647 static machine_mode
49648 ix86_mpx_bound_mode ()
49650 /* Do not support pointer checker if MPX
49651 is not enabled. */
49652 if (!TARGET_MPX)
49654 if (flag_check_pointer_bounds)
49655 warning (0, "Pointer Checker requires MPX support on this target."
49656 " Use -mmpx options to enable MPX.");
49657 return VOIDmode;
49660 return BNDmode;
49663 /* Return constant used to statically initialize constant bounds.
49665 This function is used to create special bound values. For now
49666 only INIT bounds and NONE bounds are expected. More special
49667 values may be added later. */
49669 static tree
49670 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49672 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49673 : build_zero_cst (pointer_sized_int_node);
49674 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49675 : build_minus_one_cst (pointer_sized_int_node);
49677 /* This function is supposed to be used to create INIT and
49678 NONE bounds only. */
49679 gcc_assert ((lb == 0 && ub == -1)
49680 || (lb == -1 && ub == 0));
49682 return build_complex (NULL, low, high);
49685 /* Generate a list of statements STMTS to initialize pointer bounds
49686 variable VAR with bounds LB and UB. Return the number of generated
49687 statements. */
49689 static int
49690 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49692 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49693 tree lhs, modify, var_p;
49695 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49696 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49698 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49699 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49700 append_to_statement_list (modify, stmts);
49702 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49703 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49704 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49705 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49706 append_to_statement_list (modify, stmts);
49708 return 2;
49711 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49712 /* For i386, a common symbol is local only for non-PIE binaries. For
49713 x86-64, a common symbol is local only for non-PIE binaries or if the
49714 linker supports copy relocations in PIE binaries. */
49716 static bool
49717 ix86_binds_local_p (const_tree exp)
49719 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49720 (!flag_pic
49721 || (TARGET_64BIT
49722 && HAVE_LD_PIE_COPYRELOC != 0)));
49724 #endif
49726 /* If MEM is in the form of [base+offset], extract the two parts
49727 of address and set to BASE and OFFSET, otherwise return false. */
49729 static bool
49730 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49732 rtx addr;
49734 gcc_assert (MEM_P (mem));
49736 addr = XEXP (mem, 0);
49738 if (GET_CODE (addr) == CONST)
49739 addr = XEXP (addr, 0);
49741 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49743 *base = addr;
49744 *offset = const0_rtx;
49745 return true;
49748 if (GET_CODE (addr) == PLUS
49749 && (REG_P (XEXP (addr, 0))
49750 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49751 && CONST_INT_P (XEXP (addr, 1)))
49753 *base = XEXP (addr, 0);
49754 *offset = XEXP (addr, 1);
49755 return true;
49758 return false;
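/* Illustration (not in the original source): for a MEM whose address is
   (plus (reg rbx) (const_int 8)) this sets *BASE to the rbx REG and
   *OFFSET to (const_int 8); a bare (reg rbx) or (symbol_ref) address is
   treated as offset 0.  Anything else, e.g. an indexed address, is
   rejected.  */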
49761 /* Given OPERANDS of consecutive load/store, check if we can merge
49762 them into move multiple. LOAD is true if they are load instructions.
49763 MODE is the mode of memory operands. */
49765 bool
49766 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49767 machine_mode mode)
49769 HOST_WIDE_INT offval_1, offval_2, msize;
49770 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49772 if (load)
49774 mem_1 = operands[1];
49775 mem_2 = operands[3];
49776 reg_1 = operands[0];
49777 reg_2 = operands[2];
49779 else
49781 mem_1 = operands[0];
49782 mem_2 = operands[2];
49783 reg_1 = operands[1];
49784 reg_2 = operands[3];
49787 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49789 if (REGNO (reg_1) != REGNO (reg_2))
49790 return false;
49792 /* Check if the addresses are in the form of [base+offset]. */
49793 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49794 return false;
49795 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49796 return false;
49798 /* Check if the bases are the same. */
49799 if (!rtx_equal_p (base_1, base_2))
49800 return false;
49802 offval_1 = INTVAL (offset_1);
49803 offval_2 = INTVAL (offset_2);
49804 msize = GET_MODE_SIZE (mode);
49805 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
49806 if (offval_1 + msize != offval_2)
49807 return false;
49809 return true;
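/* Worked example (illustrative): with DFmode operands, MSIZE is 8, so
   memory operands at [base+16] and [base+24] satisfy
   offval_1 + msize == offval_2 and the pair is accepted (the two
   register operands must also be the same register); [base+16] together
   with [base+32] is rejected as non-adjacent.  */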
49812 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49814 static bool
49815 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49816 optimization_type opt_type)
49818 switch (op)
49820 case asin_optab:
49821 case acos_optab:
49822 case log1p_optab:
49823 case exp_optab:
49824 case exp10_optab:
49825 case exp2_optab:
49826 case expm1_optab:
49827 case ldexp_optab:
49828 case scalb_optab:
49829 case round_optab:
49830 return opt_type == OPTIMIZE_FOR_SPEED;
49832 case rint_optab:
49833 if (SSE_FLOAT_MODE_P (mode1)
49834 && TARGET_SSE_MATH
49835 && !flag_trapping_math
49836 && !TARGET_SSE4_1)
49837 return opt_type == OPTIMIZE_FOR_SPEED;
49838 return true;
49840 case floor_optab:
49841 case ceil_optab:
49842 case btrunc_optab:
49843 if (SSE_FLOAT_MODE_P (mode1)
49844 && TARGET_SSE_MATH
49845 && !flag_trapping_math
49846 && TARGET_SSE4_1)
49847 return true;
49848 return opt_type == OPTIMIZE_FOR_SPEED;
49850 case rsqrt_optab:
49851 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49853 default:
49854 return true;
49858 /* Address space support.
49860 This is not "far pointers" in the 16-bit sense, but an easy way
49861 to use %fs and %gs segment prefixes. Therefore:
49863 (a) All address spaces have the same modes,
49864 (b) All address spaces have the same address forms,
49865 (c) While %fs and %gs are technically subsets of the generic
49866 address space, they are probably not subsets of each other.
49867 (d) Since we have no access to the segment base register values
49868 without resorting to a system call, we cannot convert a
49869 non-default address space to a default address space.
49870 Therefore we do not claim %fs or %gs are subsets of generic.
49872 Therefore we can (mostly) use the default hooks. */
49874 /* All use of segmentation is assumed to make address 0 valid. */
49876 static bool
49877 ix86_addr_space_zero_address_valid (addr_space_t as)
49879 return as != ADDR_SPACE_GENERIC;
49882 static void
49883 ix86_init_libfuncs (void)
49885 if (TARGET_64BIT)
49887 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49888 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49890 else
49892 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49893 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49896 #if TARGET_MACHO
49897 darwin_rename_builtins ();
49898 #endif
49901 /* Generate call to __divmoddi4. */
49903 static void
49904 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49905 rtx op0, rtx op1,
49906 rtx *quot_p, rtx *rem_p)
49908 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49910 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49911 mode,
49912 op0, GET_MODE (op0),
49913 op1, GET_MODE (op1),
49914 XEXP (rem, 0), Pmode);
49915 *quot_p = quot;
49916 *rem_p = rem;
49919 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49920 FPU, assume that the fpcw is set to extended precision; when using
49921 only SSE, rounding is correct; when using both SSE and the FPU,
49922 the rounding precision is indeterminate, since either may be chosen
49923 apparently at random. */
49925 static enum flt_eval_method
49926 ix86_excess_precision (enum excess_precision_type type)
49928 switch (type)
49930 case EXCESS_PRECISION_TYPE_FAST:
49931 /* The fastest type to promote to will always be the native type,
49932 whether that occurs with implicit excess precision or
49933 otherwise. */
49934 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49935 case EXCESS_PRECISION_TYPE_STANDARD:
49936 case EXCESS_PRECISION_TYPE_IMPLICIT:
49937 /* Otherwise, the excess precision we want when we are
49938 in a standards compliant mode, and the implicit precision we
49939 provide would be identical were it not for the unpredictable
49940 cases. */
49941 if (!TARGET_80387)
49942 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49943 else if (!TARGET_MIX_SSE_I387)
49945 if (!TARGET_SSE_MATH)
49946 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49947 else if (TARGET_SSE2)
49948 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49951 /* If we are in standards compliant mode, but we know we will
49952 calculate in unpredictable precision, return
49953 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49954 excess precision if the target can't guarantee it will honor
49955 it. */
49956 return (type == EXCESS_PRECISION_TYPE_STANDARD
49957 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49958 : FLT_EVAL_METHOD_UNPREDICTABLE);
49959 default:
49960 gcc_unreachable ();
49963 return FLT_EVAL_METHOD_UNPREDICTABLE;
49966 /* Target-specific selftests. */
49968 #if CHECKING_P
49970 namespace selftest {
49972 /* Verify that hard regs are dumped as expected (in compact mode). */
49974 static void
49975 ix86_test_dumping_hard_regs ()
49977 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49978 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49981 /* Test dumping an insn with repeated references to the same SCRATCH,
49982 to verify the rtx_reuse code. */
49984 static void
49985 ix86_test_dumping_memory_blockage ()
49987 set_new_first_and_last_insn (NULL, NULL);
49989 rtx pat = gen_memory_blockage ();
49990 rtx_reuse_manager r;
49991 r.preprocess (pat);
49993 /* Verify that the repeated references to the SCRATCH show use of
49994 reuse IDs. The first should be prefixed with a reuse ID,
49995 and the second should be dumped as a "reuse_rtx" of that ID.
49996 The expected string assumes Pmode == DImode. */
49997 if (Pmode == DImode)
49998 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49999 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
50000 " (unspec:BLK [\n"
50001 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
50002 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
50005 /* Verify loading an RTL dump; specifically a dump of copying
50006 a param on x86_64 from a hard reg into the frame.
50007 This test is target-specific since the dump contains target-specific
50008 hard reg names. */
50010 static void
50011 ix86_test_loading_dump_fragment_1 ()
50013 rtl_dump_test t (SELFTEST_LOCATION,
50014 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
50016 rtx_insn *insn = get_insn_by_uid (1);
50018 /* The block structure and indentation here are purely for
50019 readability; they mirror the structure of the rtx. */
50020 tree mem_expr;
50022 rtx pat = PATTERN (insn);
50023 ASSERT_EQ (SET, GET_CODE (pat));
50025 rtx dest = SET_DEST (pat);
50026 ASSERT_EQ (MEM, GET_CODE (dest));
50027 /* Verify the "/c" was parsed. */
50028 ASSERT_TRUE (RTX_FLAG (dest, call));
50029 ASSERT_EQ (SImode, GET_MODE (dest));
50031 rtx addr = XEXP (dest, 0);
50032 ASSERT_EQ (PLUS, GET_CODE (addr));
50033 ASSERT_EQ (DImode, GET_MODE (addr));
50035 rtx lhs = XEXP (addr, 0);
50036 /* Verify that the "frame" REG was consolidated. */
50037 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
50040 rtx rhs = XEXP (addr, 1);
50041 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
50042 ASSERT_EQ (-4, INTVAL (rhs));
50045 /* Verify the "[1 i+0 S4 A32]" was parsed. */
50046 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
50047 /* "i" should have been handled by synthesizing a global int
50048 variable named "i". */
50049 mem_expr = MEM_EXPR (dest);
50050 ASSERT_NE (mem_expr, NULL);
50051 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
50052 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
50053 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
50054 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
50055 /* "+0". */
50056 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
50057 ASSERT_EQ (0, MEM_OFFSET (dest));
50058 /* "S4". */
50059 ASSERT_EQ (4, MEM_SIZE (dest));
50060 /* "A32. */
50061 ASSERT_EQ (32, MEM_ALIGN (dest));
50064 rtx src = SET_SRC (pat);
50065 ASSERT_EQ (REG, GET_CODE (src));
50066 ASSERT_EQ (SImode, GET_MODE (src));
50067 ASSERT_EQ (5, REGNO (src));
50068 tree reg_expr = REG_EXPR (src);
50069 /* "i" here should point to the same var as for the MEM_EXPR. */
50070 ASSERT_EQ (reg_expr, mem_expr);
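/* A hedged sketch of the insn the asserts above imply; the actual
   contents of copy-hard-reg-into-frame.rtl are not reproduced here,
   and this is only reconstructed from the checks (hard reg 5 is "di"):

     (cinsn 1 (set (mem/c:SI (plus:DI (reg/f:DI frame)
                                      (const_int -4)) [1 i+0 S4 A32])
                   (reg:SI di)))  */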
50075 /* Verify that the RTL loader copes with a call_insn dump.
50076 This test is target-specific since the dump contains a target-specific
50077 hard reg name. */
50079 static void
50080 ix86_test_loading_call_insn ()
50082 /* The test dump includes register "xmm0", which requires TARGET_SSE
50083 to exist. */
50084 if (!TARGET_SSE)
50085 return;
50087 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
50089 rtx_insn *insn = get_insns ();
50090 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
50092 /* "/j". */
50093 ASSERT_TRUE (RTX_FLAG (insn, jump));
50095 rtx pat = PATTERN (insn);
50096 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
50098 /* Verify REG_NOTES. */
50100 /* "(expr_list:REG_CALL_DECL". */
50101 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
50102 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
50103 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
50105 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
50106 rtx_expr_list *note1 = note0->next ();
50107 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
50109 ASSERT_EQ (NULL, note1->next ());
50112 /* Verify CALL_INSN_FUNCTION_USAGE. */
50114 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
50115 rtx_expr_list *usage
50116 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
50117 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
50118 ASSERT_EQ (DFmode, GET_MODE (usage));
50119 ASSERT_EQ (USE, GET_CODE (usage->element ()));
50120 ASSERT_EQ (NULL, usage->next ());
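/* A hedged, schematic sketch of the call_insn shape the asserts above
   imply (reconstructed from the checks, not copied from call-insn.rtl;
   fields the test does not constrain are elided with "..."):

     (ccall_insn/j (set ... (call ...))
        (expr_list:REG_CALL_DECL ...
           (expr_list:REG_EH_REGION (const_int 0) (nil)))
        (expr_list:DF (use (reg:DF xmm0)) (nil)))  */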
50124 /* Verify that the RTL loader copes with a dump from print_rtx_function.
50125 This test is target-specific since the dump contains target-specific
50126 hard reg names. */
50128 static void
50129 ix86_test_loading_full_dump ()
50131 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
50133 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50135 rtx_insn *insn_1 = get_insn_by_uid (1);
50136 ASSERT_EQ (NOTE, GET_CODE (insn_1));
50138 rtx_insn *insn_7 = get_insn_by_uid (7);
50139 ASSERT_EQ (INSN, GET_CODE (insn_7));
50140 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
50142 rtx_insn *insn_15 = get_insn_by_uid (15);
50143 ASSERT_EQ (INSN, GET_CODE (insn_15));
50144 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
50146 /* Verify crtl->return_rtx. */
50147 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
50148 ASSERT_EQ (0, REGNO (crtl->return_rtx));
50149 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
50152 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
50153 In particular, verify that it correctly loads the 2nd operand.
50154 This test is target-specific since these are machine-specific
50155 operands (and enums). */
50157 static void
50158 ix86_test_loading_unspec ()
50160 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
50162 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50164 ASSERT_TRUE (cfun);
50166 /* Test of an UNSPEC. */
50167 rtx_insn *insn = get_insns ();
50168 ASSERT_EQ (INSN, GET_CODE (insn));
50169 rtx set = single_set (insn);
50170 ASSERT_NE (NULL, set);
50171 rtx dst = SET_DEST (set);
50172 ASSERT_EQ (MEM, GET_CODE (dst));
50173 rtx src = SET_SRC (set);
50174 ASSERT_EQ (UNSPEC, GET_CODE (src));
50175 ASSERT_EQ (BLKmode, GET_MODE (src));
50176 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
50178 rtx v0 = XVECEXP (src, 0, 0);
50180 /* Verify that the two uses of the first SCRATCH have pointer
50181 equality. */
50182 rtx scratch_a = XEXP (dst, 0);
50183 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
50185 rtx scratch_b = XEXP (v0, 0);
50186 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
50188 ASSERT_EQ (scratch_a, scratch_b);
50190 /* Verify that the two mems are thus treated as equal. */
50191 ASSERT_TRUE (rtx_equal_p (dst, v0));
50193 /* Verify that the insn is recognized. */
50194 ASSERT_NE (-1, recog_memoized (insn));
50196 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
50197 insn = NEXT_INSN (insn);
50198 ASSERT_EQ (INSN, GET_CODE (insn));
50200 set = single_set (insn);
50201 ASSERT_NE (NULL, set);
50203 src = SET_SRC (set);
50204 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
50205 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
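/* A hedged, schematic sketch of the two insns the asserts above imply
   (reconstructed from the checks, not copied from unspec.rtl):

     (set (mem/v:BLK (scratch:DI) ...)
          (unspec:BLK [(mem/v:BLK (scratch:DI) ...)] UNSPEC_MEMORY_BLOCKAGE))

     (set ... (unspec_volatile ... UNSPECV_RDTSCP))

   with the two SCRATCHes being one and the same rtx after loading.  */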
50208 /* Run all target-specific selftests. */
50210 static void
50211 ix86_run_selftests (void)
50213 ix86_test_dumping_hard_regs ();
50214 ix86_test_dumping_memory_blockage ();
50216 /* Various tests of loading RTL dumps, here because they contain
50217 ix86-isms (e.g. names of hard regs). */
50218 ix86_test_loading_dump_fragment_1 ();
50219 ix86_test_loading_call_insn ();
50220 ix86_test_loading_full_dump ();
50221 ix86_test_loading_unspec ();
50224 } // namespace selftest
50226 #endif /* CHECKING_P */
50228 /* Initialize the GCC target structure. */
50229 #undef TARGET_RETURN_IN_MEMORY
50230 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50232 #undef TARGET_LEGITIMIZE_ADDRESS
50233 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50235 #undef TARGET_ATTRIBUTE_TABLE
50236 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50237 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50238 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50239 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50240 # undef TARGET_MERGE_DECL_ATTRIBUTES
50241 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50242 #endif
50244 #undef TARGET_COMP_TYPE_ATTRIBUTES
50245 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50247 #undef TARGET_INIT_BUILTINS
50248 #define TARGET_INIT_BUILTINS ix86_init_builtins
50249 #undef TARGET_BUILTIN_DECL
50250 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50251 #undef TARGET_EXPAND_BUILTIN
50252 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50254 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50255 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50256 ix86_builtin_vectorized_function
50258 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50259 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50261 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50262 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50264 #undef TARGET_BUILTIN_RECIPROCAL
50265 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50267 #undef TARGET_ASM_FUNCTION_EPILOGUE
50268 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50270 #undef TARGET_ENCODE_SECTION_INFO
50271 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50272 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50273 #else
50274 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50275 #endif
50277 #undef TARGET_ASM_OPEN_PAREN
50278 #define TARGET_ASM_OPEN_PAREN ""
50279 #undef TARGET_ASM_CLOSE_PAREN
50280 #define TARGET_ASM_CLOSE_PAREN ""
50282 #undef TARGET_ASM_BYTE_OP
50283 #define TARGET_ASM_BYTE_OP ASM_BYTE
50285 #undef TARGET_ASM_ALIGNED_HI_OP
50286 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50287 #undef TARGET_ASM_ALIGNED_SI_OP
50288 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50289 #ifdef ASM_QUAD
50290 #undef TARGET_ASM_ALIGNED_DI_OP
50291 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50292 #endif
50294 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50295 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50297 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50298 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50300 #undef TARGET_ASM_UNALIGNED_HI_OP
50301 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50302 #undef TARGET_ASM_UNALIGNED_SI_OP
50303 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50304 #undef TARGET_ASM_UNALIGNED_DI_OP
50305 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50307 #undef TARGET_PRINT_OPERAND
50308 #define TARGET_PRINT_OPERAND ix86_print_operand
50309 #undef TARGET_PRINT_OPERAND_ADDRESS
50310 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50311 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50312 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50313 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50314 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50316 #undef TARGET_SCHED_INIT_GLOBAL
50317 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50318 #undef TARGET_SCHED_ADJUST_COST
50319 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50320 #undef TARGET_SCHED_ISSUE_RATE
50321 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50322 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50323 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50324 ia32_multipass_dfa_lookahead
50325 #undef TARGET_SCHED_MACRO_FUSION_P
50326 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50327 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50328 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50330 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50331 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50333 #undef TARGET_MEMMODEL_CHECK
50334 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50336 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50337 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50339 #ifdef HAVE_AS_TLS
50340 #undef TARGET_HAVE_TLS
50341 #define TARGET_HAVE_TLS true
50342 #endif
50343 #undef TARGET_CANNOT_FORCE_CONST_MEM
50344 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50345 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50346 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50348 #undef TARGET_DELEGITIMIZE_ADDRESS
50349 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50351 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
50352 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
50354 #undef TARGET_MS_BITFIELD_LAYOUT_P
50355 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50357 #if TARGET_MACHO
50358 #undef TARGET_BINDS_LOCAL_P
50359 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50360 #else
50361 #undef TARGET_BINDS_LOCAL_P
50362 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50363 #endif
50364 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50365 #undef TARGET_BINDS_LOCAL_P
50366 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50367 #endif
50369 #undef TARGET_ASM_OUTPUT_MI_THUNK
50370 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50371 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50372 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50374 #undef TARGET_ASM_FILE_START
50375 #define TARGET_ASM_FILE_START x86_file_start
50377 #undef TARGET_OPTION_OVERRIDE
50378 #define TARGET_OPTION_OVERRIDE ix86_option_override
50380 #undef TARGET_REGISTER_MOVE_COST
50381 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50382 #undef TARGET_MEMORY_MOVE_COST
50383 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50384 #undef TARGET_RTX_COSTS
50385 #define TARGET_RTX_COSTS ix86_rtx_costs
50386 #undef TARGET_ADDRESS_COST
50387 #define TARGET_ADDRESS_COST ix86_address_cost
50389 #undef TARGET_FLAGS_REGNUM
50390 #define TARGET_FLAGS_REGNUM FLAGS_REG
50391 #undef TARGET_FIXED_CONDITION_CODE_REGS
50392 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50393 #undef TARGET_CC_MODES_COMPATIBLE
50394 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50396 #undef TARGET_MACHINE_DEPENDENT_REORG
50397 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50399 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50400 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50402 #undef TARGET_BUILD_BUILTIN_VA_LIST
50403 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50405 #undef TARGET_FOLD_BUILTIN
50406 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50408 #undef TARGET_GIMPLE_FOLD_BUILTIN
50409 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50411 #undef TARGET_COMPARE_VERSION_PRIORITY
50412 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50414 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50415 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50416 ix86_generate_version_dispatcher_body
50418 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50419 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50420 ix86_get_function_versions_dispatcher
50422 #undef TARGET_ENUM_VA_LIST_P
50423 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50425 #undef TARGET_FN_ABI_VA_LIST
50426 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50428 #undef TARGET_CANONICAL_VA_LIST_TYPE
50429 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50431 #undef TARGET_EXPAND_BUILTIN_VA_START
50432 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50434 #undef TARGET_MD_ASM_ADJUST
50435 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50437 #undef TARGET_C_EXCESS_PRECISION
50438 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
50439 #undef TARGET_PROMOTE_PROTOTYPES
50440 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50441 #undef TARGET_SETUP_INCOMING_VARARGS
50442 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50443 #undef TARGET_MUST_PASS_IN_STACK
50444 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50445 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
50446 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
50447 #undef TARGET_FUNCTION_ARG_ADVANCE
50448 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50449 #undef TARGET_FUNCTION_ARG
50450 #define TARGET_FUNCTION_ARG ix86_function_arg
50451 #undef TARGET_INIT_PIC_REG
50452 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50453 #undef TARGET_USE_PSEUDO_PIC_REG
50454 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50455 #undef TARGET_FUNCTION_ARG_BOUNDARY
50456 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50457 #undef TARGET_PASS_BY_REFERENCE
50458 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50459 #undef TARGET_INTERNAL_ARG_POINTER
50460 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50461 #undef TARGET_UPDATE_STACK_BOUNDARY
50462 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50463 #undef TARGET_GET_DRAP_RTX
50464 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50465 #undef TARGET_STRICT_ARGUMENT_NAMING
50466 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50467 #undef TARGET_STATIC_CHAIN
50468 #define TARGET_STATIC_CHAIN ix86_static_chain
50469 #undef TARGET_TRAMPOLINE_INIT
50470 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50471 #undef TARGET_RETURN_POPS_ARGS
50472 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50474 #undef TARGET_WARN_FUNC_RETURN
50475 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
50477 #undef TARGET_LEGITIMATE_COMBINED_INSN
50478 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50480 #undef TARGET_ASAN_SHADOW_OFFSET
50481 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50483 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50484 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50486 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50487 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50489 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50490 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50492 #undef TARGET_C_MODE_FOR_SUFFIX
50493 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50495 #ifdef HAVE_AS_TLS
50496 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50497 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50498 #endif
50500 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50501 #undef TARGET_INSERT_ATTRIBUTES
50502 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50503 #endif
50505 #undef TARGET_MANGLE_TYPE
50506 #define TARGET_MANGLE_TYPE ix86_mangle_type
50508 #undef TARGET_STACK_PROTECT_GUARD
50509 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50511 #if !TARGET_MACHO
50512 #undef TARGET_STACK_PROTECT_FAIL
50513 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50514 #endif
50516 #undef TARGET_FUNCTION_VALUE
50517 #define TARGET_FUNCTION_VALUE ix86_function_value
50519 #undef TARGET_FUNCTION_VALUE_REGNO_P
50520 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50522 #undef TARGET_PROMOTE_FUNCTION_MODE
50523 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50525 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50526 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50528 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50529 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50531 #undef TARGET_INSTANTIATE_DECLS
50532 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50534 #undef TARGET_SECONDARY_RELOAD
50535 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50536 #undef TARGET_SECONDARY_MEMORY_NEEDED
50537 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50538 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50539 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50541 #undef TARGET_CLASS_MAX_NREGS
50542 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50544 #undef TARGET_PREFERRED_RELOAD_CLASS
50545 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50546 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50547 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50548 #undef TARGET_CLASS_LIKELY_SPILLED_P
50549 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50551 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50552 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50553 ix86_builtin_vectorization_cost
50554 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50555 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50556 ix86_vectorize_vec_perm_const_ok
50557 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50558 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50559 ix86_preferred_simd_mode
50560 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50561 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50562 ix86_autovectorize_vector_sizes
50563 #undef TARGET_VECTORIZE_GET_MASK_MODE
50564 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50565 #undef TARGET_VECTORIZE_INIT_COST
50566 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50567 #undef TARGET_VECTORIZE_ADD_STMT_COST
50568 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50569 #undef TARGET_VECTORIZE_FINISH_COST
50570 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50571 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50572 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50574 #undef TARGET_SET_CURRENT_FUNCTION
50575 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50577 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50578 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50580 #undef TARGET_OPTION_SAVE
50581 #define TARGET_OPTION_SAVE ix86_function_specific_save
50583 #undef TARGET_OPTION_RESTORE
50584 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50586 #undef TARGET_OPTION_POST_STREAM_IN
50587 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50589 #undef TARGET_OPTION_PRINT
50590 #define TARGET_OPTION_PRINT ix86_function_specific_print
50592 #undef TARGET_OPTION_FUNCTION_VERSIONS
50593 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50595 #undef TARGET_CAN_INLINE_P
50596 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50598 #undef TARGET_LEGITIMATE_ADDRESS_P
50599 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50601 #undef TARGET_REGISTER_PRIORITY
50602 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50604 #undef TARGET_REGISTER_USAGE_LEVELING_P
50605 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50607 #undef TARGET_LEGITIMATE_CONSTANT_P
50608 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50610 #undef TARGET_COMPUTE_FRAME_LAYOUT
50611 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50613 #undef TARGET_FRAME_POINTER_REQUIRED
50614 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50616 #undef TARGET_CAN_ELIMINATE
50617 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50619 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50620 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50622 #undef TARGET_ASM_CODE_END
50623 #define TARGET_ASM_CODE_END ix86_code_end
50625 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50626 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50628 #undef TARGET_CANONICALIZE_COMPARISON
50629 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50631 #undef TARGET_LOOP_UNROLL_ADJUST
50632 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50634 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50635 #undef TARGET_SPILL_CLASS
50636 #define TARGET_SPILL_CLASS ix86_spill_class
50638 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50639 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50640 ix86_simd_clone_compute_vecsize_and_simdlen
50642 #undef TARGET_SIMD_CLONE_ADJUST
50643 #define TARGET_SIMD_CLONE_ADJUST \
50644 ix86_simd_clone_adjust
50646 #undef TARGET_SIMD_CLONE_USABLE
50647 #define TARGET_SIMD_CLONE_USABLE \
50648 ix86_simd_clone_usable
50650 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50651 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50652 ix86_float_exceptions_rounding_supported_p
50654 #undef TARGET_MODE_EMIT
50655 #define TARGET_MODE_EMIT ix86_emit_mode_set
50657 #undef TARGET_MODE_NEEDED
50658 #define TARGET_MODE_NEEDED ix86_mode_needed
50660 #undef TARGET_MODE_AFTER
50661 #define TARGET_MODE_AFTER ix86_mode_after
50663 #undef TARGET_MODE_ENTRY
50664 #define TARGET_MODE_ENTRY ix86_mode_entry
50666 #undef TARGET_MODE_EXIT
50667 #define TARGET_MODE_EXIT ix86_mode_exit
50669 #undef TARGET_MODE_PRIORITY
50670 #define TARGET_MODE_PRIORITY ix86_mode_priority
50672 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50673 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50675 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50676 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50678 #undef TARGET_STORE_BOUNDS_FOR_ARG
50679 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50681 #undef TARGET_LOAD_RETURNED_BOUNDS
50682 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50684 #undef TARGET_STORE_RETURNED_BOUNDS
50685 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50687 #undef TARGET_CHKP_BOUND_MODE
50688 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50690 #undef TARGET_BUILTIN_CHKP_FUNCTION
50691 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50693 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50694 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50696 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50697 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50699 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50700 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50702 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50703 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50705 #undef TARGET_OFFLOAD_OPTIONS
50706 #define TARGET_OFFLOAD_OPTIONS \
50707 ix86_offload_options
50709 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50710 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50712 #undef TARGET_OPTAB_SUPPORTED_P
50713 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50715 #undef TARGET_HARD_REGNO_SCRATCH_OK
50716 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50718 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50719 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50721 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50722 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50724 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50725 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50727 #undef TARGET_INIT_LIBFUNCS
50728 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50730 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50731 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50733 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50734 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50736 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50737 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50739 #undef TARGET_HARD_REGNO_NREGS
50740 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50741 #undef TARGET_HARD_REGNO_MODE_OK
50742 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50744 #undef TARGET_MODES_TIEABLE_P
50745 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50747 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50748 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50749 ix86_hard_regno_call_part_clobbered
50751 #undef TARGET_CAN_CHANGE_MODE_CLASS
50752 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50754 #undef TARGET_STATIC_RTX_ALIGNMENT
50755 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
50756 #undef TARGET_CONSTANT_ALIGNMENT
50757 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50759 #undef TARGET_EMPTY_RECORD_P
50760 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
50762 #undef TARGET_WARN_PARAMETER_PASSING_ABI
50763 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
50765 #if CHECKING_P
50766 #undef TARGET_RUN_TARGET_SELFTESTS
50767 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50768 #endif /* #if CHECKING_P */
50770 struct gcc_target targetm = TARGET_INITIALIZER;
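/* A minimal sketch, for illustration only: the hooks #define'd above are
   not called directly by generic code; TARGET_INITIALIZER (from
   target-def.h) gathers them into the targetm vector, through which the
   middle end dispatches.  For example, with the definitions above,

     targetm.calls.return_in_memory (type, fntype)

   resolves to ix86_return_in_memory, and

     targetm.legitimize_address (x, oldx, mode)

   resolves to ix86_legitimize_address.  */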
50772 #include "gt-i386.h"