PR target/84146
[official-gcc.git] / gcc / config / i386 / i386.c
blobfaa9d9415f69387a565321cd551b60b91eab3448
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
93 #include "wide-int-bitmask.h"
95 /* This file should be included last. */
96 #include "target-def.h"
98 #include "x86-tune-costs.h"
100 static rtx legitimize_dllimport_symbol (rtx, bool);
101 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
102 static rtx legitimize_pe_coff_symbol (rtx, bool);
103 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
104 static bool ix86_save_reg (unsigned int, bool, bool);
105 static bool ix86_function_naked (const_tree);
106 static bool ix86_notrack_prefixed_insn_p (rtx);
107 static void ix86_emit_restore_reg_using_pop (rtx);
110 #ifndef CHECK_STACK_LIMIT
111 #define CHECK_STACK_LIMIT (-1)
112 #endif
114 /* Return index of given mode in mult and division cost tables. */
115 #define MODE_INDEX(mode) \
116 ((mode) == QImode ? 0 \
117 : (mode) == HImode ? 1 \
118 : (mode) == SImode ? 2 \
119 : (mode) == DImode ? 3 \
120 : 4)
123 /* Set by -mtune. */
124 const struct processor_costs *ix86_tune_cost = NULL;
126 /* Set by -mtune or -Os. */
127 const struct processor_costs *ix86_cost = NULL;
129 /* Processor feature/optimization bitmasks. */
130 #define m_386 (1U<<PROCESSOR_I386)
131 #define m_486 (1U<<PROCESSOR_I486)
132 #define m_PENT (1U<<PROCESSOR_PENTIUM)
133 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
134 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
135 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
136 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
137 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
138 #define m_CORE2 (1U<<PROCESSOR_CORE2)
139 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
140 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
141 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
142 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
143 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
144 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
145 #define m_KNL (1U<<PROCESSOR_KNL)
146 #define m_KNM (1U<<PROCESSOR_KNM)
147 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
148 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
149 #define m_ICELAKE (1U<<PROCESSOR_ICELAKE)
150 #define m_INTEL (1U<<PROCESSOR_INTEL)
152 #define m_GEODE (1U<<PROCESSOR_GEODE)
153 #define m_K6 (1U<<PROCESSOR_K6)
154 #define m_K6_GEODE (m_K6 | m_GEODE)
155 #define m_K8 (1U<<PROCESSOR_K8)
156 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
157 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
158 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
159 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
160 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
161 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
162 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
163 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
164 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
165 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
166 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
167 #define m_BTVER (m_BTVER1 | m_BTVER2)
168 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
169 | m_ZNVER1)
171 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
173 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
174 #undef DEF_TUNE
175 #define DEF_TUNE(tune, name, selector) name,
176 #include "x86-tune.def"
177 #undef DEF_TUNE
180 /* Feature tests against the various tunings. */
181 unsigned char ix86_tune_features[X86_TUNE_LAST];
183 /* Feature tests against the various tunings used to create ix86_tune_features
184 based on the processor mask. */
185 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
186 #undef DEF_TUNE
187 #define DEF_TUNE(tune, name, selector) selector,
188 #include "x86-tune.def"
189 #undef DEF_TUNE
192 /* Feature tests against the various architecture variations. */
193 unsigned char ix86_arch_features[X86_ARCH_LAST];
195 /* Feature tests against the various architecture variations, used to create
196 ix86_arch_features based on the processor mask. */
197 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
198 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
199 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
201 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
202 ~m_386,
204 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
205 ~(m_386 | m_486),
207 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
208 ~m_386,
210 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
211 ~m_386,
214 /* In case the average insn count for single function invocation is
215 lower than this constant, emit fast (but longer) prologue and
216 epilogue code. */
217 #define FAST_PROLOGUE_INSN_COUNT 20
219 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
220 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
221 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
222 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
224 /* Array of the smallest class containing reg number REGNO, indexed by
225 REGNO. Used by REGNO_REG_CLASS in i386.h. */
227 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
229 /* ax, dx, cx, bx */
230 AREG, DREG, CREG, BREG,
231 /* si, di, bp, sp */
232 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
233 /* FP registers */
234 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
235 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
236 /* arg pointer */
237 NON_Q_REGS,
238 /* flags, fpsr, fpcr, frame */
239 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
240 /* SSE registers */
241 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
242 SSE_REGS, SSE_REGS,
243 /* MMX registers */
244 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
245 MMX_REGS, MMX_REGS,
246 /* REX registers */
247 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
248 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
249 /* SSE REX registers */
250 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
251 SSE_REGS, SSE_REGS,
252 /* AVX-512 SSE registers */
253 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
256 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
257 /* Mask registers. */
258 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
259 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
260 /* MPX bound registers */
261 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
264 /* The "default" register map used in 32bit mode. */
266 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
268 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
269 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
270 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
271 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
272 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
273 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
275 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
276 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
277 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
278 101, 102, 103, 104, /* bound registers */
281 /* The "default" register map used in 64bit mode. */
283 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
285 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
286 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
287 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
288 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
289 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
290 8,9,10,11,12,13,14,15, /* extended integer registers */
291 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
292 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
293 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
294 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
295 126, 127, 128, 129, /* bound registers */
298 /* Define the register numbers to be used in Dwarf debugging information.
299 The SVR4 reference port C compiler uses the following register numbers
300 in its Dwarf output code:
301 0 for %eax (gcc regno = 0)
302 1 for %ecx (gcc regno = 2)
303 2 for %edx (gcc regno = 1)
304 3 for %ebx (gcc regno = 3)
305 4 for %esp (gcc regno = 7)
306 5 for %ebp (gcc regno = 6)
307 6 for %esi (gcc regno = 4)
308 7 for %edi (gcc regno = 5)
309 The following three DWARF register numbers are never generated by
310 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
311 believed these numbers have these meanings.
312 8 for %eip (no gcc equivalent)
313 9 for %eflags (gcc regno = 17)
314 10 for %trapno (no gcc equivalent)
315 It is not at all clear how we should number the FP stack registers
316 for the x86 architecture. If the version of SDB on x86/svr4 were
317 a bit less brain dead with respect to floating-point then we would
318 have a precedent to follow with respect to DWARF register numbers
319 for x86 FP registers, but the SDB on x86/svr4 was so completely
320 broken with respect to FP registers that it is hardly worth thinking
321 of it as something to strive for compatibility with.
322 The version of x86/svr4 SDB I had does (partially)
323 seem to believe that DWARF register number 11 is associated with
324 the x86 register %st(0), but that's about all. Higher DWARF
325 register numbers don't seem to be associated with anything in
326 particular, and even for DWARF regno 11, SDB only seemed to under-
327 stand that it should say that a variable lives in %st(0) (when
328 asked via an `=' command) if we said it was in DWARF regno 11,
329 but SDB still printed garbage when asked for the value of the
330 variable in question (via a `/' command).
331 (Also note that the labels SDB printed for various FP stack regs
332 when doing an `x' command were all wrong.)
333 Note that these problems generally don't affect the native SVR4
334 C compiler because it doesn't allow the use of -O with -g and
335 because when it is *not* optimizing, it allocates a memory
336 location for each floating-point variable, and the memory
337 location is what gets described in the DWARF AT_location
338 attribute for the variable in question.
339 Regardless of the severe mental illness of the x86/svr4 SDB, we
340 do something sensible here and we use the following DWARF
341 register numbers. Note that these are all stack-top-relative
342 numbers.
343 11 for %st(0) (gcc regno = 8)
344 12 for %st(1) (gcc regno = 9)
345 13 for %st(2) (gcc regno = 10)
346 14 for %st(3) (gcc regno = 11)
347 15 for %st(4) (gcc regno = 12)
348 16 for %st(5) (gcc regno = 13)
349 17 for %st(6) (gcc regno = 14)
350 18 for %st(7) (gcc regno = 15)
352 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
354 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
355 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
356 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
357 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
358 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
359 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
361 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
362 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
363 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
364 101, 102, 103, 104, /* bound registers */
367 /* Define parameter passing and return registers. */
369 static int const x86_64_int_parameter_registers[6] =
371 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
374 static int const x86_64_ms_abi_int_parameter_registers[4] =
376 CX_REG, DX_REG, R8_REG, R9_REG
379 static int const x86_64_int_return_registers[4] =
381 AX_REG, DX_REG, DI_REG, SI_REG
384 /* Additional registers that are clobbered by SYSV calls. */
386 #define NUM_X86_64_MS_CLOBBERED_REGS 12
387 static int const x86_64_ms_sysv_extra_clobbered_registers
388 [NUM_X86_64_MS_CLOBBERED_REGS] =
390 SI_REG, DI_REG,
391 XMM6_REG, XMM7_REG,
392 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
393 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
396 enum xlogue_stub {
397 XLOGUE_STUB_SAVE,
398 XLOGUE_STUB_RESTORE,
399 XLOGUE_STUB_RESTORE_TAIL,
400 XLOGUE_STUB_SAVE_HFP,
401 XLOGUE_STUB_RESTORE_HFP,
402 XLOGUE_STUB_RESTORE_HFP_TAIL,
404 XLOGUE_STUB_COUNT
407 enum xlogue_stub_sets {
408 XLOGUE_SET_ALIGNED,
409 XLOGUE_SET_ALIGNED_PLUS_8,
410 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
411 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
413 XLOGUE_SET_COUNT
416 /* Register save/restore layout used by out-of-line stubs. */
417 class xlogue_layout {
418 public:
419 struct reginfo
421 unsigned regno;
422 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
423 rsi) to where each register is stored. */
426 unsigned get_nregs () const {return m_nregs;}
427 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
429 const reginfo &get_reginfo (unsigned reg) const
431 gcc_assert (reg < m_nregs);
432 return m_regs[reg];
435 static const char *get_stub_name (enum xlogue_stub stub,
436 unsigned n_extra_args);
438 /* Returns an rtx for the stub's symbol based upon
439 1.) the specified stub (save, restore or restore_ret) and
440 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
441 3.) rather or not stack alignment is being performed. */
442 static rtx get_stub_rtx (enum xlogue_stub stub);
444 /* Returns the amount of stack space (including padding) that the stub
445 needs to store registers based upon data in the machine_function. */
446 HOST_WIDE_INT get_stack_space_used () const
448 const struct machine_function *m = cfun->machine;
449 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
451 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
452 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
455 /* Returns the offset for the base pointer used by the stub. */
456 HOST_WIDE_INT get_stub_ptr_offset () const
458 return STUB_INDEX_OFFSET + m_stack_align_off_in;
461 static const struct xlogue_layout &get_instance ();
462 static unsigned count_stub_managed_regs ();
463 static bool is_stub_managed_reg (unsigned regno, unsigned count);
465 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
466 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
467 static const unsigned MAX_REGS = 18;
468 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
469 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
470 static const unsigned STUB_NAME_MAX_LEN = 20;
471 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
472 static const unsigned REG_ORDER[MAX_REGS];
473 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
475 private:
476 xlogue_layout ();
477 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
478 xlogue_layout (const xlogue_layout &);
480 /* True if hard frame pointer is used. */
481 bool m_hfp;
483 /* Max number of register this layout manages. */
484 unsigned m_nregs;
486 /* Incoming offset from 16-byte alignment. */
487 HOST_WIDE_INT m_stack_align_off_in;
489 /* Register order and offsets. */
490 struct reginfo m_regs[MAX_REGS];
492 /* Lazy-inited cache of symbol names for stubs. */
493 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
494 [STUB_NAME_MAX_LEN];
496 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
499 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
500 "savms64",
501 "resms64",
502 "resms64x",
503 "savms64f",
504 "resms64f",
505 "resms64fx"
508 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
509 /* The below offset values are where each register is stored for the layout
510 relative to incoming stack pointer. The value of each m_regs[].offset will
511 be relative to the incoming base pointer (rax or rsi) used by the stub.
513 s_instances: 0 1 2 3
514 Offset: realigned or aligned + 8
515 Register aligned aligned + 8 aligned w/HFP w/HFP */
516 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
517 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
518 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
519 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
520 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
521 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
522 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
523 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
524 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
525 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
526 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
527 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
528 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
529 BP_REG, /* 0xc0 0xc8 N/A N/A */
530 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
531 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
532 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
533 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
536 /* Instantiate static const values. */
537 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
538 const unsigned xlogue_layout::MIN_REGS;
539 const unsigned xlogue_layout::MAX_REGS;
540 const unsigned xlogue_layout::MAX_EXTRA_REGS;
541 const unsigned xlogue_layout::VARIANT_COUNT;
542 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
544 /* Initialize xlogue_layout::s_stub_names to zero. */
545 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
546 [STUB_NAME_MAX_LEN];
548 /* Instantiates all xlogue_layout instances. */
549 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
550 xlogue_layout (0, false),
551 xlogue_layout (8, false),
552 xlogue_layout (0, true),
553 xlogue_layout (8, true)
556 /* Return an appropriate const instance of xlogue_layout based upon values
557 in cfun->machine and crtl. */
558 const struct xlogue_layout &
559 xlogue_layout::get_instance ()
561 enum xlogue_stub_sets stub_set;
562 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
564 if (stack_realign_fp)
565 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
566 else if (frame_pointer_needed)
567 stub_set = aligned_plus_8
568 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
569 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
570 else
571 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
573 return s_instances[stub_set];
576 /* Determine how many clobbered registers can be saved by the stub.
577 Returns the count of registers the stub will save and restore. */
578 unsigned
579 xlogue_layout::count_stub_managed_regs ()
581 bool hfp = frame_pointer_needed || stack_realign_fp;
582 unsigned i, count;
583 unsigned regno;
585 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
587 regno = REG_ORDER[i];
588 if (regno == BP_REG && hfp)
589 continue;
590 if (!ix86_save_reg (regno, false, false))
591 break;
592 ++count;
594 return count;
597 /* Determine if register REGNO is a stub managed register given the
598 total COUNT of stub managed registers. */
599 bool
600 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
602 bool hfp = frame_pointer_needed || stack_realign_fp;
603 unsigned i;
605 for (i = 0; i < count; ++i)
607 gcc_assert (i < MAX_REGS);
608 if (REG_ORDER[i] == BP_REG && hfp)
609 ++count;
610 else if (REG_ORDER[i] == regno)
611 return true;
613 return false;
616 /* Constructor for xlogue_layout. */
617 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
618 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
619 m_stack_align_off_in (stack_align_off_in)
621 HOST_WIDE_INT offset = stack_align_off_in;
622 unsigned i, j;
624 for (i = j = 0; i < MAX_REGS; ++i)
626 unsigned regno = REG_ORDER[i];
628 if (regno == BP_REG && hfp)
629 continue;
630 if (SSE_REGNO_P (regno))
632 offset += 16;
633 /* Verify that SSE regs are always aligned. */
634 gcc_assert (!((stack_align_off_in + offset) & 15));
636 else
637 offset += 8;
639 m_regs[j].regno = regno;
640 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
642 gcc_assert (j == m_nregs);
645 const char *
646 xlogue_layout::get_stub_name (enum xlogue_stub stub,
647 unsigned n_extra_regs)
649 const int have_avx = TARGET_AVX;
650 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
652 /* Lazy init */
653 if (!*name)
655 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
656 (have_avx ? "avx" : "sse"),
657 STUB_BASE_NAMES[stub],
658 MIN_REGS + n_extra_regs);
659 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
662 return name;
665 /* Return rtx of a symbol ref for the entry point (based upon
666 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
668 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
670 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
671 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
672 gcc_assert (stub < XLOGUE_STUB_COUNT);
673 gcc_assert (crtl->stack_realign_finalized);
675 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
678 /* Define the structure for the machine field in struct function. */
680 struct GTY(()) stack_local_entry {
681 unsigned short mode;
682 unsigned short n;
683 rtx rtl;
684 struct stack_local_entry *next;
687 /* Which cpu are we scheduling for. */
688 enum attr_cpu ix86_schedule;
690 /* Which cpu are we optimizing for. */
691 enum processor_type ix86_tune;
693 /* Which instruction set architecture to use. */
694 enum processor_type ix86_arch;
696 /* True if processor has SSE prefetch instruction. */
697 unsigned char x86_prefetch_sse;
699 /* -mstackrealign option */
700 static const char ix86_force_align_arg_pointer_string[]
701 = "force_align_arg_pointer";
703 static rtx (*ix86_gen_leave) (void);
704 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
707 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
708 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
709 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_clzero) (rtx);
711 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
713 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
715 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
716 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
718 /* Preferred alignment for stack boundary in bits. */
719 unsigned int ix86_preferred_stack_boundary;
721 /* Alignment for incoming stack boundary in bits specified at
722 command line. */
723 static unsigned int ix86_user_incoming_stack_boundary;
725 /* Default alignment for incoming stack boundary in bits. */
726 static unsigned int ix86_default_incoming_stack_boundary;
728 /* Alignment for incoming stack boundary in bits. */
729 unsigned int ix86_incoming_stack_boundary;
731 /* Calling abi specific va_list type nodes. */
732 static GTY(()) tree sysv_va_list_type_node;
733 static GTY(()) tree ms_va_list_type_node;
735 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
736 char internal_label_prefix[16];
737 int internal_label_prefix_len;
739 /* Fence to use after loop using movnt. */
740 tree x86_mfence;
742 /* Register class used for passing given 64bit part of the argument.
743 These represent classes as documented by the PS ABI, with the exception
744 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
745 use SF or DFmode move instead of DImode to avoid reformatting penalties.
747 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
748 whenever possible (upper half does contain padding). */
749 enum x86_64_reg_class
751 X86_64_NO_CLASS,
752 X86_64_INTEGER_CLASS,
753 X86_64_INTEGERSI_CLASS,
754 X86_64_SSE_CLASS,
755 X86_64_SSESF_CLASS,
756 X86_64_SSEDF_CLASS,
757 X86_64_SSEUP_CLASS,
758 X86_64_X87_CLASS,
759 X86_64_X87UP_CLASS,
760 X86_64_COMPLEX_X87_CLASS,
761 X86_64_MEMORY_CLASS
764 #define MAX_CLASSES 8
766 /* Table of constants used by fldpi, fldln2, etc.... */
767 static REAL_VALUE_TYPE ext_80387_constants_table [5];
768 static bool ext_80387_constants_init;
771 static struct machine_function * ix86_init_machine_status (void);
772 static rtx ix86_function_value (const_tree, const_tree, bool);
773 static bool ix86_function_value_regno_p (const unsigned int);
774 static unsigned int ix86_function_arg_boundary (machine_mode,
775 const_tree);
776 static rtx ix86_static_chain (const_tree, bool);
777 static int ix86_function_regparm (const_tree, const_tree);
778 static void ix86_compute_frame_layout (void);
779 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
780 rtx, rtx, int);
781 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
782 static tree ix86_canonical_va_list_type (tree);
783 static void predict_jump (int);
784 static unsigned int split_stack_prologue_scratch_regno (void);
785 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
787 enum ix86_function_specific_strings
789 IX86_FUNCTION_SPECIFIC_ARCH,
790 IX86_FUNCTION_SPECIFIC_TUNE,
791 IX86_FUNCTION_SPECIFIC_MAX
794 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
795 const char *, const char *, enum fpmath_unit,
796 bool);
797 static void ix86_function_specific_save (struct cl_target_option *,
798 struct gcc_options *opts);
799 static void ix86_function_specific_restore (struct gcc_options *opts,
800 struct cl_target_option *);
801 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
802 static void ix86_function_specific_print (FILE *, int,
803 struct cl_target_option *);
804 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
805 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
806 struct gcc_options *,
807 struct gcc_options *,
808 struct gcc_options *);
809 static bool ix86_can_inline_p (tree, tree);
810 static void ix86_set_current_function (tree);
811 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
813 static enum calling_abi ix86_function_abi (const_tree);
816 #ifndef SUBTARGET32_DEFAULT_CPU
817 #define SUBTARGET32_DEFAULT_CPU "i386"
818 #endif
820 /* Whether -mtune= or -march= were specified */
821 static int ix86_tune_defaulted;
822 static int ix86_arch_specified;
824 /* Vectorization library interface and handlers. */
825 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
827 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
828 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
830 /* Processor target table, indexed by processor number */
831 struct ptt
833 const char *const name; /* processor name */
834 const struct processor_costs *cost; /* Processor costs */
835 const int align_loop; /* Default alignments. */
836 const int align_loop_max_skip;
837 const int align_jump;
838 const int align_jump_max_skip;
839 const int align_func;
842 /* This table must be in sync with enum processor_type in i386.h. */
843 static const struct ptt processor_target_table[PROCESSOR_max] =
845 {"generic", &generic_cost, 16, 10, 16, 10, 16},
846 {"i386", &i386_cost, 4, 3, 4, 3, 4},
847 {"i486", &i486_cost, 16, 15, 16, 15, 16},
848 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
849 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
850 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
851 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
852 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
853 {"core2", &core_cost, 16, 10, 16, 10, 16},
854 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
855 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
856 {"haswell", &core_cost, 16, 10, 16, 10, 16},
857 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
858 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
859 {"knl", &slm_cost, 16, 15, 16, 7, 16},
860 {"knm", &slm_cost, 16, 15, 16, 7, 16},
861 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
862 {"cannonlake", &skylake_cost, 16, 10, 16, 10, 16},
863 {"icelake", &skylake_cost, 16, 10, 16, 10, 16},
864 {"intel", &intel_cost, 16, 15, 16, 7, 16},
865 {"geode", &geode_cost, 0, 0, 0, 0, 0},
866 {"k6", &k6_cost, 32, 7, 32, 7, 32},
867 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
868 {"k8", &k8_cost, 16, 7, 16, 7, 16},
869 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
870 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
871 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
872 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
873 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
874 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
875 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
876 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
879 static unsigned int
880 rest_of_handle_insert_vzeroupper (void)
882 int i;
884 /* vzeroupper instructions are inserted immediately after reload to
885 account for possible spills from 256bit or 512bit registers. The pass
886 reuses mode switching infrastructure by re-running mode insertion
887 pass, so disable entities that have already been processed. */
888 for (i = 0; i < MAX_386_ENTITIES; i++)
889 ix86_optimize_mode_switching[i] = 0;
891 ix86_optimize_mode_switching[AVX_U128] = 1;
893 /* Call optimize_mode_switching. */
894 g->get_passes ()->execute_pass_mode_switching ();
895 return 0;
898 /* Return 1 if INSN uses or defines a hard register.
899 Hard register uses in a memory address are ignored.
900 Clobbers and flags definitions are ignored. */
902 static bool
903 has_non_address_hard_reg (rtx_insn *insn)
905 df_ref ref;
906 FOR_EACH_INSN_DEF (ref, insn)
907 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
908 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
909 && DF_REF_REGNO (ref) != FLAGS_REG)
910 return true;
912 FOR_EACH_INSN_USE (ref, insn)
913 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
914 return true;
916 return false;
919 /* Check if comparison INSN may be transformed
920 into vector comparison. Currently we transform
921 zero checks only which look like:
923 (set (reg:CCZ 17 flags)
924 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
925 (subreg:SI (reg:DI x) 0))
926 (const_int 0 [0]))) */
928 static bool
929 convertible_comparison_p (rtx_insn *insn)
931 if (!TARGET_SSE4_1)
932 return false;
934 rtx def_set = single_set (insn);
936 gcc_assert (def_set);
938 rtx src = SET_SRC (def_set);
939 rtx dst = SET_DEST (def_set);
941 gcc_assert (GET_CODE (src) == COMPARE);
943 if (GET_CODE (dst) != REG
944 || REGNO (dst) != FLAGS_REG
945 || GET_MODE (dst) != CCZmode)
946 return false;
948 rtx op1 = XEXP (src, 0);
949 rtx op2 = XEXP (src, 1);
951 if (op2 != CONST0_RTX (GET_MODE (op2)))
952 return false;
954 if (GET_CODE (op1) != IOR)
955 return false;
957 op2 = XEXP (op1, 1);
958 op1 = XEXP (op1, 0);
960 if (!SUBREG_P (op1)
961 || !SUBREG_P (op2)
962 || GET_MODE (op1) != SImode
963 || GET_MODE (op2) != SImode
964 || ((SUBREG_BYTE (op1) != 0
965 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
966 && (SUBREG_BYTE (op2) != 0
967 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
968 return false;
970 op1 = SUBREG_REG (op1);
971 op2 = SUBREG_REG (op2);
973 if (op1 != op2
974 || !REG_P (op1)
975 || GET_MODE (op1) != DImode)
976 return false;
978 return true;
981 /* The DImode version of scalar_to_vector_candidate_p. */
983 static bool
984 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
986 rtx def_set = single_set (insn);
988 if (!def_set)
989 return false;
991 if (has_non_address_hard_reg (insn))
992 return false;
994 rtx src = SET_SRC (def_set);
995 rtx dst = SET_DEST (def_set);
997 if (GET_CODE (src) == COMPARE)
998 return convertible_comparison_p (insn);
1000 /* We are interested in DImode promotion only. */
1001 if ((GET_MODE (src) != DImode
1002 && !CONST_INT_P (src))
1003 || GET_MODE (dst) != DImode)
1004 return false;
1006 if (!REG_P (dst) && !MEM_P (dst))
1007 return false;
1009 switch (GET_CODE (src))
1011 case ASHIFTRT:
1012 if (!TARGET_AVX512VL)
1013 return false;
1014 /* FALLTHRU */
1016 case ASHIFT:
1017 case LSHIFTRT:
1018 if (!REG_P (XEXP (src, 1))
1019 && (!SUBREG_P (XEXP (src, 1))
1020 || SUBREG_BYTE (XEXP (src, 1)) != 0
1021 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1022 && (!CONST_INT_P (XEXP (src, 1))
1023 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1024 return false;
1026 if (GET_MODE (XEXP (src, 1)) != QImode
1027 && !CONST_INT_P (XEXP (src, 1)))
1028 return false;
1029 break;
1031 case PLUS:
1032 case MINUS:
1033 case IOR:
1034 case XOR:
1035 case AND:
1036 if (!REG_P (XEXP (src, 1))
1037 && !MEM_P (XEXP (src, 1))
1038 && !CONST_INT_P (XEXP (src, 1)))
1039 return false;
1041 if (GET_MODE (XEXP (src, 1)) != DImode
1042 && !CONST_INT_P (XEXP (src, 1)))
1043 return false;
1044 break;
1046 case NEG:
1047 case NOT:
1048 break;
1050 case REG:
1051 return true;
1053 case MEM:
1054 case CONST_INT:
1055 return REG_P (dst);
1057 default:
1058 return false;
1061 if (!REG_P (XEXP (src, 0))
1062 && !MEM_P (XEXP (src, 0))
1063 && !CONST_INT_P (XEXP (src, 0))
1064 /* Check for andnot case. */
1065 && (GET_CODE (src) != AND
1066 || GET_CODE (XEXP (src, 0)) != NOT
1067 || !REG_P (XEXP (XEXP (src, 0), 0))))
1068 return false;
1070 if (GET_MODE (XEXP (src, 0)) != DImode
1071 && !CONST_INT_P (XEXP (src, 0)))
1072 return false;
1074 return true;
1077 /* The TImode version of scalar_to_vector_candidate_p. */
1079 static bool
1080 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1082 rtx def_set = single_set (insn);
1084 if (!def_set)
1085 return false;
1087 if (has_non_address_hard_reg (insn))
1088 return false;
1090 rtx src = SET_SRC (def_set);
1091 rtx dst = SET_DEST (def_set);
1093 /* Only TImode load and store are allowed. */
1094 if (GET_MODE (dst) != TImode)
1095 return false;
1097 if (MEM_P (dst))
1099 /* Check for store. Memory must be aligned or unaligned store
1100 is optimal. Only support store from register, standard SSE
1101 constant or CONST_WIDE_INT generated from piecewise store.
1103 ??? Verify performance impact before enabling CONST_INT for
1104 __int128 store. */
1105 if (misaligned_operand (dst, TImode)
1106 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1107 return false;
1109 switch (GET_CODE (src))
1111 default:
1112 return false;
1114 case REG:
1115 case CONST_WIDE_INT:
1116 return true;
1118 case CONST_INT:
1119 return standard_sse_constant_p (src, TImode);
1122 else if (MEM_P (src))
1124 /* Check for load. Memory must be aligned or unaligned load is
1125 optimal. */
1126 return (REG_P (dst)
1127 && (!misaligned_operand (src, TImode)
1128 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1131 return false;
1134 /* Return 1 if INSN may be converted into vector
1135 instruction. */
1137 static bool
1138 scalar_to_vector_candidate_p (rtx_insn *insn)
1140 if (TARGET_64BIT)
1141 return timode_scalar_to_vector_candidate_p (insn);
1142 else
1143 return dimode_scalar_to_vector_candidate_p (insn);
1146 /* The DImode version of remove_non_convertible_regs. */
1148 static void
1149 dimode_remove_non_convertible_regs (bitmap candidates)
1151 bitmap_iterator bi;
1152 unsigned id;
1153 bitmap regs = BITMAP_ALLOC (NULL);
1155 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1157 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1158 rtx reg = SET_DEST (def_set);
1160 if (!REG_P (reg)
1161 || bitmap_bit_p (regs, REGNO (reg))
1162 || HARD_REGISTER_P (reg))
1163 continue;
1165 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1166 def;
1167 def = DF_REF_NEXT_REG (def))
1169 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1171 if (dump_file)
1172 fprintf (dump_file,
1173 "r%d has non convertible definition in insn %d\n",
1174 REGNO (reg), DF_REF_INSN_UID (def));
1176 bitmap_set_bit (regs, REGNO (reg));
1177 break;
1182 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1184 for (df_ref def = DF_REG_DEF_CHAIN (id);
1185 def;
1186 def = DF_REF_NEXT_REG (def))
1187 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1189 if (dump_file)
1190 fprintf (dump_file, "Removing insn %d from candidates list\n",
1191 DF_REF_INSN_UID (def));
1193 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1197 BITMAP_FREE (regs);
1200 /* For a register REGNO, scan instructions for its defs and uses.
1201 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1203 static void
1204 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1205 unsigned int regno)
1207 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1208 def;
1209 def = DF_REF_NEXT_REG (def))
1211 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1213 if (dump_file)
1214 fprintf (dump_file,
1215 "r%d has non convertible def in insn %d\n",
1216 regno, DF_REF_INSN_UID (def));
1218 bitmap_set_bit (regs, regno);
1219 break;
1223 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1224 ref;
1225 ref = DF_REF_NEXT_REG (ref))
1227 /* Debug instructions are skipped. */
1228 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1229 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1231 if (dump_file)
1232 fprintf (dump_file,
1233 "r%d has non convertible use in insn %d\n",
1234 regno, DF_REF_INSN_UID (ref));
1236 bitmap_set_bit (regs, regno);
1237 break;
1242 /* The TImode version of remove_non_convertible_regs. */
1244 static void
1245 timode_remove_non_convertible_regs (bitmap candidates)
1247 bitmap_iterator bi;
1248 unsigned id;
1249 bitmap regs = BITMAP_ALLOC (NULL);
1251 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1253 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1254 rtx dest = SET_DEST (def_set);
1255 rtx src = SET_SRC (def_set);
1257 if ((!REG_P (dest)
1258 || bitmap_bit_p (regs, REGNO (dest))
1259 || HARD_REGISTER_P (dest))
1260 && (!REG_P (src)
1261 || bitmap_bit_p (regs, REGNO (src))
1262 || HARD_REGISTER_P (src)))
1263 continue;
1265 if (REG_P (dest))
1266 timode_check_non_convertible_regs (candidates, regs,
1267 REGNO (dest));
1269 if (REG_P (src))
1270 timode_check_non_convertible_regs (candidates, regs,
1271 REGNO (src));
1274 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1276 for (df_ref def = DF_REG_DEF_CHAIN (id);
1277 def;
1278 def = DF_REF_NEXT_REG (def))
1279 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1281 if (dump_file)
1282 fprintf (dump_file, "Removing insn %d from candidates list\n",
1283 DF_REF_INSN_UID (def));
1285 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1288 for (df_ref ref = DF_REG_USE_CHAIN (id);
1289 ref;
1290 ref = DF_REF_NEXT_REG (ref))
1291 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1293 if (dump_file)
1294 fprintf (dump_file, "Removing insn %d from candidates list\n",
1295 DF_REF_INSN_UID (ref));
1297 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1301 BITMAP_FREE (regs);
1304 /* For a given bitmap of insn UIDs scans all instruction and
1305 remove insn from CANDIDATES in case it has both convertible
1306 and not convertible definitions.
1308 All insns in a bitmap are conversion candidates according to
1309 scalar_to_vector_candidate_p. Currently it implies all insns
1310 are single_set. */
1312 static void
1313 remove_non_convertible_regs (bitmap candidates)
1315 if (TARGET_64BIT)
1316 timode_remove_non_convertible_regs (candidates);
1317 else
1318 dimode_remove_non_convertible_regs (candidates);
1321 class scalar_chain
1323 public:
1324 scalar_chain ();
1325 virtual ~scalar_chain ();
1327 static unsigned max_id;
1329 /* ID of a chain. */
1330 unsigned int chain_id;
1331 /* A queue of instructions to be included into a chain. */
1332 bitmap queue;
1333 /* Instructions included into a chain. */
1334 bitmap insns;
1335 /* All registers defined by a chain. */
1336 bitmap defs;
1337 /* Registers used in both vector and sclar modes. */
1338 bitmap defs_conv;
1340 void build (bitmap candidates, unsigned insn_uid);
1341 virtual int compute_convert_gain () = 0;
1342 int convert ();
1344 protected:
1345 void add_to_queue (unsigned insn_uid);
1346 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1348 private:
1349 void add_insn (bitmap candidates, unsigned insn_uid);
1350 void analyze_register_chain (bitmap candidates, df_ref ref);
1351 virtual void mark_dual_mode_def (df_ref def) = 0;
1352 virtual void convert_insn (rtx_insn *insn) = 0;
1353 virtual void convert_registers () = 0;
1356 class dimode_scalar_chain : public scalar_chain
1358 public:
1359 int compute_convert_gain ();
1360 private:
1361 void mark_dual_mode_def (df_ref def);
1362 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1363 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1364 void convert_insn (rtx_insn *insn);
1365 void convert_op (rtx *op, rtx_insn *insn);
1366 void convert_reg (unsigned regno);
1367 void make_vector_copies (unsigned regno);
1368 void convert_registers ();
1369 int vector_const_cost (rtx exp);
1372 class timode_scalar_chain : public scalar_chain
1374 public:
1375 /* Convert from TImode to V1TImode is always faster. */
1376 int compute_convert_gain () { return 1; }
1378 private:
1379 void mark_dual_mode_def (df_ref def);
1380 void fix_debug_reg_uses (rtx reg);
1381 void convert_insn (rtx_insn *insn);
1382 /* We don't convert registers to difference size. */
1383 void convert_registers () {}
1386 unsigned scalar_chain::max_id = 0;
1388 /* Initialize new chain. */
1390 scalar_chain::scalar_chain ()
1392 chain_id = ++max_id;
1394 if (dump_file)
1395 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1397 bitmap_obstack_initialize (NULL);
1398 insns = BITMAP_ALLOC (NULL);
1399 defs = BITMAP_ALLOC (NULL);
1400 defs_conv = BITMAP_ALLOC (NULL);
1401 queue = NULL;
1404 /* Free chain's data. */
1406 scalar_chain::~scalar_chain ()
1408 BITMAP_FREE (insns);
1409 BITMAP_FREE (defs);
1410 BITMAP_FREE (defs_conv);
1411 bitmap_obstack_release (NULL);
1414 /* Add instruction into chains' queue. */
1416 void
1417 scalar_chain::add_to_queue (unsigned insn_uid)
1419 if (bitmap_bit_p (insns, insn_uid)
1420 || bitmap_bit_p (queue, insn_uid))
1421 return;
1423 if (dump_file)
1424 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1425 insn_uid, chain_id);
1426 bitmap_set_bit (queue, insn_uid);
1429 /* For DImode conversion, mark register defined by DEF as requiring
1430 conversion. */
1432 void
1433 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1435 gcc_assert (DF_REF_REG_DEF_P (def));
1437 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1438 return;
1440 if (dump_file)
1441 fprintf (dump_file,
1442 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1443 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1445 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1448 /* For TImode conversion, it is unused. */
1450 void
1451 timode_scalar_chain::mark_dual_mode_def (df_ref)
1453 gcc_unreachable ();
1456 /* Check REF's chain to add new insns into a queue
1457 and find registers requiring conversion. */
1459 void
1460 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1462 df_link *chain;
1464 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1465 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1466 add_to_queue (DF_REF_INSN_UID (ref));
1468 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1470 unsigned uid = DF_REF_INSN_UID (chain->ref);
1472 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1473 continue;
1475 if (!DF_REF_REG_MEM_P (chain->ref))
1477 if (bitmap_bit_p (insns, uid))
1478 continue;
1480 if (bitmap_bit_p (candidates, uid))
1482 add_to_queue (uid);
1483 continue;
1487 if (DF_REF_REG_DEF_P (chain->ref))
1489 if (dump_file)
1490 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1491 DF_REF_REGNO (chain->ref), uid);
1492 mark_dual_mode_def (chain->ref);
1494 else
1496 if (dump_file)
1497 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1498 DF_REF_REGNO (chain->ref), uid);
1499 mark_dual_mode_def (ref);
1504 /* Add instruction into a chain. */
1506 void
1507 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1509 if (bitmap_bit_p (insns, insn_uid))
1510 return;
1512 if (dump_file)
1513 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1515 bitmap_set_bit (insns, insn_uid);
1517 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1518 rtx def_set = single_set (insn);
1519 if (def_set && REG_P (SET_DEST (def_set))
1520 && !HARD_REGISTER_P (SET_DEST (def_set)))
1521 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1523 df_ref ref;
1524 df_ref def;
1525 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1526 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1527 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1528 def;
1529 def = DF_REF_NEXT_REG (def))
1530 analyze_register_chain (candidates, def);
1531 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1532 if (!DF_REF_REG_MEM_P (ref))
1533 analyze_register_chain (candidates, ref);
1536 /* Build new chain starting from insn INSN_UID recursively
1537 adding all dependent uses and definitions. */
1539 void
1540 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1542 queue = BITMAP_ALLOC (NULL);
1543 bitmap_set_bit (queue, insn_uid);
1545 if (dump_file)
1546 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1548 while (!bitmap_empty_p (queue))
1550 insn_uid = bitmap_first_set_bit (queue);
1551 bitmap_clear_bit (queue, insn_uid);
1552 bitmap_clear_bit (candidates, insn_uid);
1553 add_insn (candidates, insn_uid);
1556 if (dump_file)
1558 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1559 fprintf (dump_file, " insns: ");
1560 dump_bitmap (dump_file, insns);
1561 if (!bitmap_empty_p (defs_conv))
1563 bitmap_iterator bi;
1564 unsigned id;
1565 const char *comma = "";
1566 fprintf (dump_file, " defs to convert: ");
1567 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1569 fprintf (dump_file, "%sr%d", comma, id);
1570 comma = ", ";
1572 fprintf (dump_file, "\n");
1576 BITMAP_FREE (queue);
1579 /* Return a cost of building a vector costant
1580 instead of using a scalar one. */
1583 dimode_scalar_chain::vector_const_cost (rtx exp)
1585 gcc_assert (CONST_INT_P (exp));
1587 if (standard_sse_constant_p (exp, V2DImode))
1588 return COSTS_N_INSNS (1);
1589 return ix86_cost->sse_load[1];
1592 /* Compute a gain for chain conversion. */
1595 dimode_scalar_chain::compute_convert_gain ()
1597 bitmap_iterator bi;
1598 unsigned insn_uid;
1599 int gain = 0;
1600 int cost = 0;
1602 if (dump_file)
1603 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1605 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1607 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1608 rtx def_set = single_set (insn);
1609 rtx src = SET_SRC (def_set);
1610 rtx dst = SET_DEST (def_set);
1612 if (REG_P (src) && REG_P (dst))
1613 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1614 else if (REG_P (src) && MEM_P (dst))
1615 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1616 else if (MEM_P (src) && REG_P (dst))
1617 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1618 else if (GET_CODE (src) == ASHIFT
1619 || GET_CODE (src) == ASHIFTRT
1620 || GET_CODE (src) == LSHIFTRT)
1622 if (CONST_INT_P (XEXP (src, 0)))
1623 gain -= vector_const_cost (XEXP (src, 0));
1624 if (CONST_INT_P (XEXP (src, 1)))
1626 gain += ix86_cost->shift_const;
1627 if (INTVAL (XEXP (src, 1)) >= 32)
1628 gain -= COSTS_N_INSNS (1);
1630 else
1631 /* Additional gain for omitting two CMOVs. */
1632 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1634 else if (GET_CODE (src) == PLUS
1635 || GET_CODE (src) == MINUS
1636 || GET_CODE (src) == IOR
1637 || GET_CODE (src) == XOR
1638 || GET_CODE (src) == AND)
1640 gain += ix86_cost->add;
1641 /* Additional gain for andnot for targets without BMI. */
1642 if (GET_CODE (XEXP (src, 0)) == NOT
1643 && !TARGET_BMI)
1644 gain += 2 * ix86_cost->add;
1646 if (CONST_INT_P (XEXP (src, 0)))
1647 gain -= vector_const_cost (XEXP (src, 0));
1648 if (CONST_INT_P (XEXP (src, 1)))
1649 gain -= vector_const_cost (XEXP (src, 1));
1651 else if (GET_CODE (src) == NEG
1652 || GET_CODE (src) == NOT)
1653 gain += ix86_cost->add - COSTS_N_INSNS (1);
1654 else if (GET_CODE (src) == COMPARE)
1656 /* Assume comparison cost is the same. */
1658 else if (CONST_INT_P (src))
1660 if (REG_P (dst))
1661 gain += COSTS_N_INSNS (2);
1662 else if (MEM_P (dst))
1663 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1664 gain -= vector_const_cost (src);
1666 else
1667 gcc_unreachable ();
1670 if (dump_file)
1671 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1673 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1674 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1676 if (dump_file)
1677 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1679 gain -= cost;
1681 if (dump_file)
1682 fprintf (dump_file, " Total gain: %d\n", gain);
1684 return gain;
1687 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1690 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1692 if (x == reg)
1693 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1695 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1696 int i, j;
1697 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1699 if (fmt[i] == 'e')
1700 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1701 else if (fmt[i] == 'E')
1702 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1703 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1704 reg, new_reg);
1707 return x;
1710 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1712 void
1713 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1714 rtx reg, rtx new_reg)
1716 replace_with_subreg (single_set (insn), reg, new_reg);
1719 /* Insert generated conversion instruction sequence INSNS
1720 after instruction AFTER. New BB may be required in case
1721 instruction has EH region attached. */
1723 void
1724 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1726 if (!control_flow_insn_p (after))
1728 emit_insn_after (insns, after);
1729 return;
1732 basic_block bb = BLOCK_FOR_INSN (after);
1733 edge e = find_fallthru_edge (bb->succs);
1734 gcc_assert (e);
1736 basic_block new_bb = split_edge (e);
1737 emit_insn_after (insns, BB_HEAD (new_bb));
1740 /* Make vector copies for all register REGNO definitions
1741 and replace its uses in a chain. */
1743 void
1744 dimode_scalar_chain::make_vector_copies (unsigned regno)
1746 rtx reg = regno_reg_rtx[regno];
1747 rtx vreg = gen_reg_rtx (DImode);
1748 bool count_reg = false;
1749 df_ref ref;
1751 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1752 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1754 df_ref use;
1756 /* Detect the count register of a shift instruction. */
1757 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1758 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1760 rtx_insn *insn = DF_REF_INSN (use);
1761 rtx def_set = single_set (insn);
1763 gcc_assert (def_set);
1765 rtx src = SET_SRC (def_set);
1767 if ((GET_CODE (src) == ASHIFT
1768 || GET_CODE (src) == ASHIFTRT
1769 || GET_CODE (src) == LSHIFTRT)
1770 && !CONST_INT_P (XEXP (src, 1))
1771 && reg_or_subregno (XEXP (src, 1)) == regno)
1772 count_reg = true;
1775 start_sequence ();
1776 if (count_reg)
1778 rtx qreg = gen_lowpart (QImode, reg);
1779 rtx tmp = gen_reg_rtx (SImode);
1781 if (TARGET_ZERO_EXTEND_WITH_AND
1782 && optimize_function_for_speed_p (cfun))
1784 emit_move_insn (tmp, const0_rtx);
1785 emit_insn (gen_movstrictqi
1786 (gen_lowpart (QImode, tmp), qreg));
1788 else
1789 emit_insn (gen_rtx_SET
1790 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1792 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1794 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1795 emit_move_insn (slot, tmp);
1796 tmp = copy_rtx (slot);
1799 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1801 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1803 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1804 emit_move_insn (adjust_address (tmp, SImode, 0),
1805 gen_rtx_SUBREG (SImode, reg, 0));
1806 emit_move_insn (adjust_address (tmp, SImode, 4),
1807 gen_rtx_SUBREG (SImode, reg, 4));
1808 emit_move_insn (vreg, tmp);
1810 else if (TARGET_SSE4_1)
1812 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 CONST0_RTX (V4SImode),
1814 gen_rtx_SUBREG (SImode, reg, 0)));
1815 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1816 gen_rtx_SUBREG (V4SImode, vreg, 0),
1817 gen_rtx_SUBREG (SImode, reg, 4),
1818 GEN_INT (2)));
1820 else
1822 rtx tmp = gen_reg_rtx (DImode);
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 0)));
1826 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1827 CONST0_RTX (V4SImode),
1828 gen_rtx_SUBREG (SImode, reg, 4)));
1829 emit_insn (gen_vec_interleave_lowv4si
1830 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1831 gen_rtx_SUBREG (V4SImode, vreg, 0),
1832 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1834 rtx_insn *seq = get_insns ();
1835 end_sequence ();
1836 rtx_insn *insn = DF_REF_INSN (ref);
1837 emit_conversion_insns (seq, insn);
1839 if (dump_file)
1840 fprintf (dump_file,
1841 " Copied r%d to a vector register r%d for insn %d\n",
1842 regno, REGNO (vreg), INSN_UID (insn));
1845 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1846 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1848 rtx_insn *insn = DF_REF_INSN (ref);
1849 if (count_reg)
1851 rtx def_set = single_set (insn);
1852 gcc_assert (def_set);
1854 rtx src = SET_SRC (def_set);
1856 if ((GET_CODE (src) == ASHIFT
1857 || GET_CODE (src) == ASHIFTRT
1858 || GET_CODE (src) == LSHIFTRT)
1859 && !CONST_INT_P (XEXP (src, 1))
1860 && reg_or_subregno (XEXP (src, 1)) == regno)
1861 XEXP (src, 1) = vreg;
1863 else
1864 replace_with_subreg_in_insn (insn, reg, vreg);
1866 if (dump_file)
1867 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1868 regno, REGNO (vreg), INSN_UID (insn));
1872 /* Convert all definitions of register REGNO
1873 and fix its uses. Scalar copies may be created
1874 in case register is used in not convertible insn. */
1876 void
1877 dimode_scalar_chain::convert_reg (unsigned regno)
1879 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1880 rtx reg = regno_reg_rtx[regno];
1881 rtx scopy = NULL_RTX;
1882 df_ref ref;
1883 bitmap conv;
1885 conv = BITMAP_ALLOC (NULL);
1886 bitmap_copy (conv, insns);
1888 if (scalar_copy)
1889 scopy = gen_reg_rtx (DImode);
1891 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1893 rtx_insn *insn = DF_REF_INSN (ref);
1894 rtx def_set = single_set (insn);
1895 rtx src = SET_SRC (def_set);
1896 rtx reg = DF_REF_REG (ref);
1898 if (!MEM_P (src))
1900 replace_with_subreg_in_insn (insn, reg, reg);
1901 bitmap_clear_bit (conv, INSN_UID (insn));
1904 if (scalar_copy)
1906 start_sequence ();
1907 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1909 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1910 emit_move_insn (tmp, reg);
1911 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1912 adjust_address (tmp, SImode, 0));
1913 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1914 adjust_address (tmp, SImode, 4));
1916 else if (TARGET_SSE4_1)
1918 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1919 emit_insn
1920 (gen_rtx_SET
1921 (gen_rtx_SUBREG (SImode, scopy, 0),
1922 gen_rtx_VEC_SELECT (SImode,
1923 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1925 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1926 emit_insn
1927 (gen_rtx_SET
1928 (gen_rtx_SUBREG (SImode, scopy, 4),
1929 gen_rtx_VEC_SELECT (SImode,
1930 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1932 else
1934 rtx vcopy = gen_reg_rtx (V2DImode);
1935 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1936 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1937 gen_rtx_SUBREG (SImode, vcopy, 0));
1938 emit_move_insn (vcopy,
1939 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1940 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1941 gen_rtx_SUBREG (SImode, vcopy, 0));
1943 rtx_insn *seq = get_insns ();
1944 end_sequence ();
1945 emit_conversion_insns (seq, insn);
1947 if (dump_file)
1948 fprintf (dump_file,
1949 " Copied r%d to a scalar register r%d for insn %d\n",
1950 regno, REGNO (scopy), INSN_UID (insn));
1954 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1955 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1957 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1959 rtx_insn *insn = DF_REF_INSN (ref);
1961 rtx def_set = single_set (insn);
1962 gcc_assert (def_set);
1964 rtx src = SET_SRC (def_set);
1965 rtx dst = SET_DEST (def_set);
1967 if ((GET_CODE (src) == ASHIFT
1968 || GET_CODE (src) == ASHIFTRT
1969 || GET_CODE (src) == LSHIFTRT)
1970 && !CONST_INT_P (XEXP (src, 1))
1971 && reg_or_subregno (XEXP (src, 1)) == regno)
1973 rtx tmp2 = gen_reg_rtx (V2DImode);
1975 start_sequence ();
1977 if (TARGET_SSE4_1)
1978 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1979 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1980 else
1982 rtx vec_cst
1983 = gen_rtx_CONST_VECTOR (V2DImode,
1984 gen_rtvec (2, GEN_INT (0xff),
1985 const0_rtx));
1986 vec_cst
1987 = validize_mem (force_const_mem (V2DImode, vec_cst));
1989 emit_insn (gen_rtx_SET
1990 (tmp2,
1991 gen_rtx_AND (V2DImode,
1992 gen_rtx_SUBREG (V2DImode, reg, 0),
1993 vec_cst)));
1995 rtx_insn *seq = get_insns ();
1996 end_sequence ();
1998 emit_insn_before (seq, insn);
2000 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
2002 else if (!MEM_P (dst) || !REG_P (src))
2003 replace_with_subreg_in_insn (insn, reg, reg);
2005 bitmap_clear_bit (conv, INSN_UID (insn));
2008 /* Skip debug insns and uninitialized uses. */
2009 else if (DF_REF_CHAIN (ref)
2010 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2012 gcc_assert (scopy);
2013 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2014 df_insn_rescan (DF_REF_INSN (ref));
2017 BITMAP_FREE (conv);
2020 /* Convert operand OP in INSN. We should handle
2021 memory operands and uninitialized registers.
2022 All other register uses are converted during
2023 register conversion. */
2025 void
2026 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2028 *op = copy_rtx_if_shared (*op);
2030 if (GET_CODE (*op) == NOT)
2032 convert_op (&XEXP (*op, 0), insn);
2033 PUT_MODE (*op, V2DImode);
2035 else if (MEM_P (*op))
2037 rtx tmp = gen_reg_rtx (DImode);
2039 emit_insn_before (gen_move_insn (tmp, *op), insn);
2040 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2042 if (dump_file)
2043 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2044 INSN_UID (insn), REGNO (tmp));
2046 else if (REG_P (*op))
2048 /* The register use may not have been converted because
2049 this register has no definition. Otherwise it
2050 would have been converted in convert_reg. */
2051 df_ref ref;
2052 FOR_EACH_INSN_USE (ref, insn)
2053 if (DF_REF_REGNO (ref) == REGNO (*op))
2055 gcc_assert (!DF_REF_CHAIN (ref));
2056 break;
2058 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2060 else if (CONST_INT_P (*op))
2062 rtx vec_cst;
2063 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2065 /* Prefer an all-ones vector in case of -1. */
2066 if (constm1_operand (*op, GET_MODE (*op)))
2067 vec_cst = CONSTM1_RTX (V2DImode);
2068 else
2069 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2070 gen_rtvec (2, *op, const0_rtx));
2072 if (!standard_sse_constant_p (vec_cst, V2DImode))
2074 start_sequence ();
2075 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2076 rtx_insn *seq = get_insns ();
2077 end_sequence ();
2078 emit_insn_before (seq, insn);
2081 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2082 *op = tmp;
2084 else
2086 gcc_assert (SUBREG_P (*op));
2087 gcc_assert (GET_MODE (*op) == V2DImode);
2091 /* Convert INSN to vector mode. */
2093 void
2094 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2096 rtx def_set = single_set (insn);
2097 rtx src = SET_SRC (def_set);
2098 rtx dst = SET_DEST (def_set);
2099 rtx subreg;
2101 if (MEM_P (dst) && !REG_P (src))
2103 /* There are no scalar integer instructions and therefore
2104 temporary register usage is required. */
2105 rtx tmp = gen_reg_rtx (DImode);
2106 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2107 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2110 switch (GET_CODE (src))
2112 case ASHIFT:
2113 case ASHIFTRT:
2114 case LSHIFTRT:
2115 convert_op (&XEXP (src, 0), insn);
2116 PUT_MODE (src, V2DImode);
2117 break;
2119 case PLUS:
2120 case MINUS:
2121 case IOR:
2122 case XOR:
2123 case AND:
2124 convert_op (&XEXP (src, 0), insn);
2125 convert_op (&XEXP (src, 1), insn);
2126 PUT_MODE (src, V2DImode);
2127 break;
2129 case NEG:
2130 src = XEXP (src, 0);
2131 convert_op (&src, insn);
2132 subreg = gen_reg_rtx (V2DImode);
2133 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2134 src = gen_rtx_MINUS (V2DImode, subreg, src);
2135 break;
2137 case NOT:
2138 src = XEXP (src, 0);
2139 convert_op (&src, insn);
2140 subreg = gen_reg_rtx (V2DImode);
2141 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2142 src = gen_rtx_XOR (V2DImode, src, subreg);
2143 break;
2145 case MEM:
2146 if (!REG_P (dst))
2147 convert_op (&src, insn);
2148 break;
2150 case REG:
2151 if (!MEM_P (dst))
2152 convert_op (&src, insn);
2153 break;
2155 case SUBREG:
2156 gcc_assert (GET_MODE (src) == V2DImode);
2157 break;
2159 case COMPARE:
2160 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2162 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2163 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2165 if (REG_P (src))
2166 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2167 else
2168 subreg = copy_rtx_if_shared (src);
2169 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2170 copy_rtx_if_shared (subreg),
2171 copy_rtx_if_shared (subreg)),
2172 insn);
2173 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2174 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2175 copy_rtx_if_shared (src)),
2176 UNSPEC_PTEST);
2177 break;
2179 case CONST_INT:
2180 convert_op (&src, insn);
2181 break;
2183 default:
2184 gcc_unreachable ();
2187 SET_SRC (def_set) = src;
2188 SET_DEST (def_set) = dst;
2190 /* Drop possible dead definitions. */
2191 PATTERN (insn) = def_set;
2193 INSN_CODE (insn) = -1;
2194 recog_memoized (insn);
2195 df_insn_rescan (insn);
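/* Note on the COMPARE case above (a simplified description rather than an
   exact RTL trace): a DImode test against zero is vectorized by first
   duplicating the 64-bit value into both halves of the V2DImode register
   with vec_interleave_lowv2di (punpcklqdq) and then emitting a ptest of
   that register against itself; ptest sets ZF exactly when the tested
   value is zero, which is the flag the original scalar compare produced.  */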
2198 /* Fix uses of converted REG in debug insns. */
2200 void
2201 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2203 if (!flag_var_tracking)
2204 return;
2206 df_ref ref, next;
2207 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2209 rtx_insn *insn = DF_REF_INSN (ref);
2210 /* Make sure the next ref is for a different instruction,
2211 so that we're not affected by the rescan. */
2212 next = DF_REF_NEXT_REG (ref);
2213 while (next && DF_REF_INSN (next) == insn)
2214 next = DF_REF_NEXT_REG (next);
2216 if (DEBUG_INSN_P (insn))
2218 /* It may be a debug insn with a TImode variable in
2219 a register. */
2220 bool changed = false;
2221 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2223 rtx *loc = DF_REF_LOC (ref);
2224 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2226 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2227 changed = true;
2230 if (changed)
2231 df_insn_rescan (insn);
2236 /* Convert INSN from TImode to V1TImode. */
2238 void
2239 timode_scalar_chain::convert_insn (rtx_insn *insn)
2241 rtx def_set = single_set (insn);
2242 rtx src = SET_SRC (def_set);
2243 rtx dst = SET_DEST (def_set);
2245 switch (GET_CODE (dst))
2247 case REG:
2249 rtx tmp = find_reg_equal_equiv_note (insn);
2250 if (tmp)
2251 PUT_MODE (XEXP (tmp, 0), V1TImode);
2252 PUT_MODE (dst, V1TImode);
2253 fix_debug_reg_uses (dst);
2255 break;
2256 case MEM:
2257 PUT_MODE (dst, V1TImode);
2258 break;
2260 default:
2261 gcc_unreachable ();
2264 switch (GET_CODE (src))
2266 case REG:
2267 PUT_MODE (src, V1TImode);
2268 /* Call fix_debug_reg_uses only if SRC is never defined. */
2269 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2270 fix_debug_reg_uses (src);
2271 break;
2273 case MEM:
2274 PUT_MODE (src, V1TImode);
2275 break;
2277 case CONST_WIDE_INT:
2278 if (NONDEBUG_INSN_P (insn))
2280 /* Since there are no instructions to store a 128-bit constant,
2281 a temporary register is required. */
2282 rtx tmp = gen_reg_rtx (V1TImode);
2283 start_sequence ();
2284 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2285 src = validize_mem (force_const_mem (V1TImode, src));
2286 rtx_insn *seq = get_insns ();
2287 end_sequence ();
2288 if (seq)
2289 emit_insn_before (seq, insn);
2290 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2291 dst = tmp;
2293 break;
2295 case CONST_INT:
2296 switch (standard_sse_constant_p (src, TImode))
2298 case 1:
2299 src = CONST0_RTX (GET_MODE (dst));
2300 break;
2301 case 2:
2302 src = CONSTM1_RTX (GET_MODE (dst));
2303 break;
2304 default:
2305 gcc_unreachable ();
2307 if (NONDEBUG_INSN_P (insn))
2309 rtx tmp = gen_reg_rtx (V1TImode);
2310 /* Since there are no instructions to store a standard SSE
2311 constant, a temporary register is required. */
2312 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2313 dst = tmp;
2315 break;
2317 default:
2318 gcc_unreachable ();
2321 SET_SRC (def_set) = src;
2322 SET_DEST (def_set) = dst;
2324 /* Drop possible dead definitions. */
2325 PATTERN (insn) = def_set;
2327 INSN_CODE (insn) = -1;
2328 recog_memoized (insn);
2329 df_insn_rescan (insn);
2332 void
2333 dimode_scalar_chain::convert_registers ()
2335 bitmap_iterator bi;
2336 unsigned id;
2338 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2339 convert_reg (id);
2341 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2342 make_vector_copies (id);
2345 /* Convert the whole chain, creating the required register
2346 conversions and copies. */
2349 scalar_chain::convert ()
2351 bitmap_iterator bi;
2352 unsigned id;
2353 int converted_insns = 0;
2355 if (!dbg_cnt (stv_conversion))
2356 return 0;
2358 if (dump_file)
2359 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2361 convert_registers ();
2363 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2365 convert_insn (DF_INSN_UID_GET (id)->insn);
2366 converted_insns++;
2369 return converted_insns;
2372 /* Main STV pass function. Find and convert scalar
2373 instructions into vector mode when profitable. */
2375 static unsigned int
2376 convert_scalars_to_vector ()
2378 basic_block bb;
2379 bitmap candidates;
2380 int converted_insns = 0;
2382 bitmap_obstack_initialize (NULL);
2383 candidates = BITMAP_ALLOC (NULL);
2385 calculate_dominance_info (CDI_DOMINATORS);
2386 df_set_flags (DF_DEFER_INSN_RESCAN);
2387 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2388 df_md_add_problem ();
2389 df_analyze ();
2391 /* Find all instructions we want to convert into vector mode. */
2392 if (dump_file)
2393 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2395 FOR_EACH_BB_FN (bb, cfun)
2397 rtx_insn *insn;
2398 FOR_BB_INSNS (bb, insn)
2399 if (scalar_to_vector_candidate_p (insn))
2401 if (dump_file)
2402 fprintf (dump_file, " insn %d is marked as a candidate\n",
2403 INSN_UID (insn));
2405 bitmap_set_bit (candidates, INSN_UID (insn));
2409 remove_non_convertible_regs (candidates);
2411 if (bitmap_empty_p (candidates))
2412 if (dump_file)
2413 fprintf (dump_file, "There are no candidates for optimization.\n");
2415 while (!bitmap_empty_p (candidates))
2417 unsigned uid = bitmap_first_set_bit (candidates);
2418 scalar_chain *chain;
2420 if (TARGET_64BIT)
2421 chain = new timode_scalar_chain;
2422 else
2423 chain = new dimode_scalar_chain;
2425 /* Find the instruction chain we want to convert to vector mode.
2426 Check all uses and definitions to estimate all required
2427 conversions. */
2428 chain->build (candidates, uid);
2430 if (chain->compute_convert_gain () > 0)
2431 converted_insns += chain->convert ();
2432 else
2433 if (dump_file)
2434 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2435 chain->chain_id);
2437 delete chain;
2440 if (dump_file)
2441 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2443 BITMAP_FREE (candidates);
2444 bitmap_obstack_release (NULL);
2445 df_process_deferred_rescans ();
2447 /* Conversion means we may have 128-bit register spills/fills,
2448 which require an aligned stack. */
2449 if (converted_insns)
2451 if (crtl->stack_alignment_needed < 128)
2452 crtl->stack_alignment_needed = 128;
2453 if (crtl->stack_alignment_estimated < 128)
2454 crtl->stack_alignment_estimated = 128;
2455 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2456 if (TARGET_64BIT)
2457 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2458 parm; parm = DECL_CHAIN (parm))
2460 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2461 continue;
2462 if (DECL_RTL_SET_P (parm)
2463 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2465 rtx r = DECL_RTL (parm);
2466 if (REG_P (r))
2467 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2469 if (DECL_INCOMING_RTL (parm)
2470 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2472 rtx r = DECL_INCOMING_RTL (parm);
2473 if (REG_P (r))
2474 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2479 return 0;
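/* Source-level illustration of what this pass targets (a sketch; whether a
   particular insn is converted depends on the chain gain computed above).
   On ia32 with -msse2 -mstv, a DImode bitwise operation such as

     long long stv_example (long long a, long long b) { return a & b; }

   is a typical candidate: the pair of 32-bit ANDs becomes a single V2DImode
   AND plus the copies emitted by make_vector_copies/convert_reg.  On x86-64
   the analogous TImode chains are handled by timode_scalar_chain instead.  */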
2482 namespace {
2484 const pass_data pass_data_insert_vzeroupper =
2486 RTL_PASS, /* type */
2487 "vzeroupper", /* name */
2488 OPTGROUP_NONE, /* optinfo_flags */
2489 TV_MACH_DEP, /* tv_id */
2490 0, /* properties_required */
2491 0, /* properties_provided */
2492 0, /* properties_destroyed */
2493 0, /* todo_flags_start */
2494 TODO_df_finish, /* todo_flags_finish */
2497 class pass_insert_vzeroupper : public rtl_opt_pass
2499 public:
2500 pass_insert_vzeroupper(gcc::context *ctxt)
2501 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2504 /* opt_pass methods: */
2505 virtual bool gate (function *)
2507 return TARGET_AVX
2508 && TARGET_VZEROUPPER && flag_expensive_optimizations
2509 && !optimize_size;
2512 virtual unsigned int execute (function *)
2514 return rest_of_handle_insert_vzeroupper ();
2517 }; // class pass_insert_vzeroupper
2519 const pass_data pass_data_stv =
2521 RTL_PASS, /* type */
2522 "stv", /* name */
2523 OPTGROUP_NONE, /* optinfo_flags */
2524 TV_MACH_DEP, /* tv_id */
2525 0, /* properties_required */
2526 0, /* properties_provided */
2527 0, /* properties_destroyed */
2528 0, /* todo_flags_start */
2529 TODO_df_finish, /* todo_flags_finish */
2532 class pass_stv : public rtl_opt_pass
2534 public:
2535 pass_stv (gcc::context *ctxt)
2536 : rtl_opt_pass (pass_data_stv, ctxt),
2537 timode_p (false)
2540 /* opt_pass methods: */
2541 virtual bool gate (function *)
2543 return (timode_p == !!TARGET_64BIT
2544 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2547 virtual unsigned int execute (function *)
2549 return convert_scalars_to_vector ();
2552 opt_pass *clone ()
2554 return new pass_stv (m_ctxt);
2557 void set_pass_param (unsigned int n, bool param)
2559 gcc_assert (n == 0);
2560 timode_p = param;
2563 private:
2564 bool timode_p;
2565 }; // class pass_stv
2567 } // anon namespace
2569 rtl_opt_pass *
2570 make_pass_insert_vzeroupper (gcc::context *ctxt)
2572 return new pass_insert_vzeroupper (ctxt);
2575 rtl_opt_pass *
2576 make_pass_stv (gcc::context *ctxt)
2578 return new pass_stv (ctxt);
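/* Sketch of how these factory functions are typically consumed (the
   reference pass name and position below are illustrative, not the values
   this port actually uses when it registers the passes):

     opt_pass *stv_pass = make_pass_stv (g);
     struct register_pass_info stv_info
       = { stv_pass, "combine", 1, PASS_POS_INSERT_AFTER };
     register_pass (&stv_info);

   The pass object is created once per insertion point and handed to the
   pass manager; pass_stv additionally receives its timode_p flag through
   set_pass_param.  */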
2581 /* Inserting ENDBRANCH instructions. */
2583 static unsigned int
2584 rest_of_insert_endbranch (void)
2586 timevar_push (TV_MACH_DEP);
2588 rtx cet_eb;
2589 rtx_insn *insn;
2590 basic_block bb;
2592 /* Currently emit an ENDBRANCH if the function is tracked, i.e. the
2593 'nocf_check' attribute is absent among the function attributes. Later an
2594 optimization will be introduced that analyzes whether the address of a
2595 static function is taken; a static function whose address is never taken
2596 will get the nocf_check attribute, which will reduce the number of ENDBRANCHes. */
2598 if (!lookup_attribute ("nocf_check",
2599 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2600 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2602 cet_eb = gen_nop_endbr ();
2604 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2605 insn = BB_HEAD (bb);
2606 emit_insn_before (cet_eb, insn);
2609 bb = 0;
2610 FOR_EACH_BB_FN (bb, cfun)
2612 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2613 insn = NEXT_INSN (insn))
2615 if (CALL_P (insn))
2617 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2618 continue;
2619 /* Generate an ENDBRANCH after a CALL that can return more than
2620 once, i.e. setjmp-like functions. */
2622 /* Skip notes that must immediately follow the call insn. */
2623 rtx_insn *next_insn = insn;
2624 if (NEXT_INSN (insn)
2625 && NOTE_P (NEXT_INSN (insn))
2626 && (NOTE_KIND (NEXT_INSN (insn))
2627 == NOTE_INSN_CALL_ARG_LOCATION))
2628 next_insn = NEXT_INSN (insn);
2630 cet_eb = gen_nop_endbr ();
2631 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2632 continue;
2635 if (JUMP_P (insn) && flag_cet_switch)
2637 rtx target = JUMP_LABEL (insn);
2638 if (target == NULL_RTX || ANY_RETURN_P (target))
2639 continue;
2641 /* Check that the jump is to a switch table. */
2642 rtx_insn *label = as_a<rtx_insn *> (target);
2643 rtx_insn *table = next_insn (label);
2644 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2645 continue;
2647 /* For the indirect jump, find all the places it can jump to and insert
2648 an ENDBRANCH there. This is done under a special flag (flag_cet_switch)
2649 that controls ENDBRANCH generation for switch statements. */
2650 edge_iterator ei;
2651 edge e;
2652 basic_block dest_blk;
2654 FOR_EACH_EDGE (e, ei, bb->succs)
2656 rtx_insn *insn;
2658 dest_blk = e->dest;
2659 insn = BB_HEAD (dest_blk);
2660 gcc_assert (LABEL_P (insn));
2661 cet_eb = gen_nop_endbr ();
2662 emit_insn_after (cet_eb, insn);
2664 continue;
2667 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2668 || (NOTE_P (insn)
2669 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2670 /* TODO. Check /s bit also. */
2672 cet_eb = gen_nop_endbr ();
2673 emit_insn_after (cet_eb, insn);
2674 continue;
2679 timevar_pop (TV_MACH_DEP);
2680 return 0;
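/* Placement summary for the walk above (illustrative; the exact set of
   ENDBRANCH insns depends on -fcf-protection=branch, flag_cet_switch and
   the function's CFG):
     - one ENDBR at the entry of every function without the nocf_check
       attribute that may be reached indirectly,
     - one ENDBR after every call carrying a REG_SETJMP note (calls that
       can return more than once),
     - with the switch flag enabled, one ENDBR at each target label of a
       jump-table dispatch,
     - one ENDBR after each preserved or deleted label.  */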
2683 namespace {
2685 const pass_data pass_data_insert_endbranch =
2687 RTL_PASS, /* type. */
2688 "cet", /* name. */
2689 OPTGROUP_NONE, /* optinfo_flags. */
2690 TV_MACH_DEP, /* tv_id. */
2691 0, /* properties_required. */
2692 0, /* properties_provided. */
2693 0, /* properties_destroyed. */
2694 0, /* todo_flags_start. */
2695 0, /* todo_flags_finish. */
2698 class pass_insert_endbranch : public rtl_opt_pass
2700 public:
2701 pass_insert_endbranch (gcc::context *ctxt)
2702 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2705 /* opt_pass methods: */
2706 virtual bool gate (function *)
2708 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2711 virtual unsigned int execute (function *)
2713 return rest_of_insert_endbranch ();
2716 }; // class pass_insert_endbranch
2718 } // anon namespace
2720 rtl_opt_pass *
2721 make_pass_insert_endbranch (gcc::context *ctxt)
2723 return new pass_insert_endbranch (ctxt);
2726 /* Return true if a red zone is in use. We can't use the red zone when
2727 there are local indirect jumps, like "indirect_jump" or "tablejump",
2728 which jump to another place in the function, since the "call" in the
2729 indirect thunk pushes the return address onto the stack, destroying
2730 the red zone.
2732 TODO: If we could reserve the first 2 WORDs, one for PUSH and another
2733 for CALL, in the red zone, we could allow local indirect jumps with
2734 an indirect thunk. */
2736 bool
2737 ix86_using_red_zone (void)
2739 return (TARGET_RED_ZONE
2740 && !TARGET_64BIT_MS_ABI
2741 && (!cfun->machine->has_local_indirect_jump
2742 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2745 /* Return a string that documents the current -m options. The caller is
2746 responsible for freeing the string. */
2748 static char *
2749 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2750 int flags, int flags2,
2751 const char *arch, const char *tune,
2752 enum fpmath_unit fpmath, bool add_nl_p)
2754 struct ix86_target_opts
2756 const char *option; /* option string */
2757 HOST_WIDE_INT mask; /* isa mask options */
2760 /* This table is ordered so that options like -msse4.2 that imply other
2761 ISAs come first. The target string will be displayed in the same order. */
2762 static struct ix86_target_opts isa2_opts[] =
2764 { "-mcx16", OPTION_MASK_ISA_CX16 },
2765 { "-mmpx", OPTION_MASK_ISA_MPX },
2766 { "-mvaes", OPTION_MASK_ISA_VAES },
2767 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2768 { "-msgx", OPTION_MASK_ISA_SGX },
2769 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2770 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2771 { "-mibt", OPTION_MASK_ISA_IBT },
2772 { "-mhle", OPTION_MASK_ISA_HLE },
2773 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2774 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2775 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2777 static struct ix86_target_opts isa_opts[] =
2779 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2780 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2781 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2782 { "-mgfni", OPTION_MASK_ISA_GFNI },
2783 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2784 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2785 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2786 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2787 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2788 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2789 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2790 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2791 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2792 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2793 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2794 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2795 { "-mfma", OPTION_MASK_ISA_FMA },
2796 { "-mxop", OPTION_MASK_ISA_XOP },
2797 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2798 { "-mf16c", OPTION_MASK_ISA_F16C },
2799 { "-mavx", OPTION_MASK_ISA_AVX },
2800 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2801 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2802 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2803 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2804 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2805 { "-msse3", OPTION_MASK_ISA_SSE3 },
2806 { "-maes", OPTION_MASK_ISA_AES },
2807 { "-msha", OPTION_MASK_ISA_SHA },
2808 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2809 { "-msse2", OPTION_MASK_ISA_SSE2 },
2810 { "-msse", OPTION_MASK_ISA_SSE },
2811 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2812 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2813 { "-mmmx", OPTION_MASK_ISA_MMX },
2814 { "-mrtm", OPTION_MASK_ISA_RTM },
2815 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2816 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2817 { "-madx", OPTION_MASK_ISA_ADX },
2818 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2819 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2820 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2821 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2822 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2823 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2824 { "-mabm", OPTION_MASK_ISA_ABM },
2825 { "-mbmi", OPTION_MASK_ISA_BMI },
2826 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2827 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2828 { "-mtbm", OPTION_MASK_ISA_TBM },
2829 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2830 { "-msahf", OPTION_MASK_ISA_SAHF },
2831 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2832 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2833 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2834 { "-mpku", OPTION_MASK_ISA_PKU },
2835 { "-mlwp", OPTION_MASK_ISA_LWP },
2836 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2837 { "-mclwb", OPTION_MASK_ISA_CLWB },
2838 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2841 /* Flag options. */
2842 static struct ix86_target_opts flag_opts[] =
2844 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2845 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2846 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2847 { "-m80387", MASK_80387 },
2848 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2849 { "-malign-double", MASK_ALIGN_DOUBLE },
2850 { "-mcld", MASK_CLD },
2851 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2852 { "-mieee-fp", MASK_IEEE_FP },
2853 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2854 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2855 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2856 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2857 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2858 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2859 { "-mno-red-zone", MASK_NO_RED_ZONE },
2860 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2861 { "-mrecip", MASK_RECIP },
2862 { "-mrtd", MASK_RTD },
2863 { "-msseregparm", MASK_SSEREGPARM },
2864 { "-mstack-arg-probe", MASK_STACK_PROBE },
2865 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2866 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2867 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2868 { "-mvzeroupper", MASK_VZEROUPPER },
2869 { "-mstv", MASK_STV },
2870 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2871 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2872 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2875 /* Additional flag options. */
2876 static struct ix86_target_opts flag2_opts[] =
2878 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2881 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2882 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2884 char isa_other[40];
2885 char isa2_other[40];
2886 char flags_other[40];
2887 char flags2_other[40];
2888 unsigned num = 0;
2889 unsigned i, j;
2890 char *ret;
2891 char *ptr;
2892 size_t len;
2893 size_t line_len;
2894 size_t sep_len;
2895 const char *abi;
2897 memset (opts, '\0', sizeof (opts));
2899 /* Add -march= option. */
2900 if (arch)
2902 opts[num][0] = "-march=";
2903 opts[num++][1] = arch;
2906 /* Add -mtune= option. */
2907 if (tune)
2909 opts[num][0] = "-mtune=";
2910 opts[num++][1] = tune;
2913 /* Add -m32/-m64/-mx32. */
2914 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2916 if ((isa & OPTION_MASK_ABI_64) != 0)
2917 abi = "-m64";
2918 else
2919 abi = "-mx32";
2920 isa &= ~ (OPTION_MASK_ISA_64BIT
2921 | OPTION_MASK_ABI_64
2922 | OPTION_MASK_ABI_X32);
2924 else
2925 abi = "-m32";
2926 opts[num++][0] = abi;
2928 /* Pick out the options in isa2 options. */
2929 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2931 if ((isa2 & isa2_opts[i].mask) != 0)
2933 opts[num++][0] = isa2_opts[i].option;
2934 isa2 &= ~ isa2_opts[i].mask;
2938 if (isa2 && add_nl_p)
2940 opts[num++][0] = isa2_other;
2941 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2944 /* Pick out the options in isa options. */
2945 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2947 if ((isa & isa_opts[i].mask) != 0)
2949 opts[num++][0] = isa_opts[i].option;
2950 isa &= ~ isa_opts[i].mask;
2954 if (isa && add_nl_p)
2956 opts[num++][0] = isa_other;
2957 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2960 /* Add flag options. */
2961 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2963 if ((flags & flag_opts[i].mask) != 0)
2965 opts[num++][0] = flag_opts[i].option;
2966 flags &= ~ flag_opts[i].mask;
2970 if (flags && add_nl_p)
2972 opts[num++][0] = flags_other;
2973 sprintf (flags_other, "(other flags: %#x)", flags);
2976 /* Add additional flag options. */
2977 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2979 if ((flags2 & flag2_opts[i].mask) != 0)
2981 opts[num++][0] = flag2_opts[i].option;
2982 flags2 &= ~ flag2_opts[i].mask;
2986 if (flags2 && add_nl_p)
2988 opts[num++][0] = flags2_other;
2989 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2992 /* Add -fpmath= option. */
2993 if (fpmath)
2995 opts[num][0] = "-mfpmath=";
2996 switch ((int) fpmath)
2998 case FPMATH_387:
2999 opts[num++][1] = "387";
3000 break;
3002 case FPMATH_SSE:
3003 opts[num++][1] = "sse";
3004 break;
3006 case FPMATH_387 | FPMATH_SSE:
3007 opts[num++][1] = "sse+387";
3008 break;
3010 default:
3011 gcc_unreachable ();
3015 /* Any options? */
3016 if (num == 0)
3017 return NULL;
3019 gcc_assert (num < ARRAY_SIZE (opts));
3021 /* Size the string. */
3022 len = 0;
3023 sep_len = (add_nl_p) ? 3 : 1;
3024 for (i = 0; i < num; i++)
3026 len += sep_len;
3027 for (j = 0; j < 2; j++)
3028 if (opts[i][j])
3029 len += strlen (opts[i][j]);
3032 /* Build the string. */
3033 ret = ptr = (char *) xmalloc (len);
3034 line_len = 0;
3036 for (i = 0; i < num; i++)
3038 size_t len2[2];
3040 for (j = 0; j < 2; j++)
3041 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3043 if (i != 0)
3045 *ptr++ = ' ';
3046 line_len++;
3048 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3050 *ptr++ = '\\';
3051 *ptr++ = '\n';
3052 line_len = 0;
3056 for (j = 0; j < 2; j++)
3057 if (opts[i][j])
3059 memcpy (ptr, opts[i][j], len2[j]);
3060 ptr += len2[j];
3061 line_len += len2[j];
3065 *ptr = '\0';
3066 gcc_assert (ret + len >= ptr);
3068 return ret;
3071 /* Return true if profiling code should be emitted before the
3072 prologue, and false otherwise.
3073 Note: For x86 with "hotfix" it is sorried (i.e. rejected with sorry (), not supported). */
3074 static bool
3075 ix86_profile_before_prologue (void)
3077 return flag_fentry != 0;
3080 /* Function that is callable from the debugger to print the current
3081 options. */
3082 void ATTRIBUTE_UNUSED
3083 ix86_debug_options (void)
3085 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3086 target_flags, ix86_target_flags,
3087 ix86_arch_string,ix86_tune_string,
3088 ix86_fpmath, true);
3090 if (opts)
3092 fprintf (stderr, "%s\n\n", opts);
3093 free (opts);
3095 else
3096 fputs ("<no options>\n\n", stderr);
3098 return;
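/* Debugger usage sketch (an example session, not captured output):

     (gdb) call ix86_debug_options ()

   from a running cc1/cc1plus prints the option string built by
   ix86_target_string above to stderr, or "<no options>" when nothing is
   set.  */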
3101 /* Return true if T is one of the bytes we should avoid with
3102 -mmitigate-rop. */
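/* These byte values are the RET opcodes: 0xc3 (ret), 0xc2 (ret imm16),
   0xcb (retf) and 0xca (retf imm16).  Avoiding them inside other
   instruction encodings removes easy return-oriented-programming gadgets.  */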
3104 static bool
3105 ix86_rop_should_change_byte_p (int t)
3107 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3110 static const char *stringop_alg_names[] = {
3111 #define DEF_ENUM
3112 #define DEF_ALG(alg, name) #name,
3113 #include "stringop.def"
3114 #undef DEF_ENUM
3115 #undef DEF_ALG
3118 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3119 The string is of the following form (or a comma-separated list of such entries):
3121 strategy_alg:max_size:[align|noalign]
3123 where the full size range for the strategy is either [0, max_size] or
3124 [min_size, max_size], in which min_size is the max_size + 1 of the
3125 preceding range. The last size range must have max_size == -1.
3127 Examples:
3130 -mmemcpy-strategy=libcall:-1:noalign
3132 this is equivalent to (for known-size memcpy) -mstringop-strategy=libcall
3136 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3138 This tells the compiler to use the following strategy for memset:
3139 1) when the expected size is between [1, 16], use the rep_8byte strategy;
3140 2) when the size is between [17, 2048], use vector_loop;
3141 3) when the size is > 2048, use libcall. */
3143 struct stringop_size_range
3145 int max;
3146 stringop_alg alg;
3147 bool noalign;
3150 static void
3151 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3153 const struct stringop_algs *default_algs;
3154 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3155 char *curr_range_str, *next_range_str;
3156 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3157 int i = 0, n = 0;
3159 if (is_memset)
3160 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3161 else
3162 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3164 curr_range_str = strategy_str;
3168 int maxs;
3169 char alg_name[128];
3170 char align[16];
3171 next_range_str = strchr (curr_range_str, ',');
3172 if (next_range_str)
3173 *next_range_str++ = '\0';
3175 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3176 align) != 3)
3178 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3179 return;
3182 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3184 error ("size ranges of option %qs should be increasing", opt);
3185 return;
3188 for (i = 0; i < last_alg; i++)
3189 if (!strcmp (alg_name, stringop_alg_names[i]))
3190 break;
3192 if (i == last_alg)
3194 error ("wrong strategy name %qs specified for option %qs",
3195 alg_name, opt);
3197 auto_vec <const char *> candidates;
3198 for (i = 0; i < last_alg; i++)
3199 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3200 candidates.safe_push (stringop_alg_names[i]);
3202 char *s;
3203 const char *hint
3204 = candidates_list_and_hint (alg_name, s, candidates);
3205 if (hint)
3206 inform (input_location,
3207 "valid arguments to %qs are: %s; did you mean %qs?",
3208 opt, s, hint);
3209 else
3210 inform (input_location, "valid arguments to %qs are: %s",
3211 opt, s);
3212 XDELETEVEC (s);
3213 return;
3216 if ((stringop_alg) i == rep_prefix_8_byte
3217 && !TARGET_64BIT)
3219 /* rep; movq isn't available in 32-bit code. */
3220 error ("strategy name %qs specified for option %qs "
3221 "not supported for 32-bit code", alg_name, opt);
3222 return;
3225 input_ranges[n].max = maxs;
3226 input_ranges[n].alg = (stringop_alg) i;
3227 if (!strcmp (align, "align"))
3228 input_ranges[n].noalign = false;
3229 else if (!strcmp (align, "noalign"))
3230 input_ranges[n].noalign = true;
3231 else
3233 error ("unknown alignment %qs specified for option %qs", align, opt);
3234 return;
3236 n++;
3237 curr_range_str = next_range_str;
3239 while (curr_range_str);
3241 if (input_ranges[n - 1].max != -1)
3243 error ("the max value for the last size range should be -1"
3244 " for option %qs", opt);
3245 return;
3248 if (n > MAX_STRINGOP_ALGS)
3250 error ("too many size ranges specified in option %qs", opt);
3251 return;
3254 /* Now override the default algs array. */
3255 for (i = 0; i < n; i++)
3257 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3258 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3259 = input_ranges[i].alg;
3260 *const_cast<int *>(&default_algs->size[i].noalign)
3261 = input_ranges[i].noalign;
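/* Worked example (using the -mmemset-strategy string from the comment
   above): "rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign"
   overwrites the first three entries of the default size table with

     { max = 16,   alg = rep_8byte,   noalign = true  }
     { max = 2048, alg = vector_loop, noalign = false }
     { max = -1,   alg = libcall,     noalign = true  }

   so sizes up to 16 use rep_8byte, 17-2048 use vector_loop, and anything
   larger goes through a library call.  */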
3266 /* Parse the -mtune-ctrl= option. When DUMP is true,
3267 print the features that are explicitly set. */
3269 static void
3270 parse_mtune_ctrl_str (bool dump)
3272 if (!ix86_tune_ctrl_string)
3273 return;
3275 char *next_feature_string = NULL;
3276 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3277 char *orig = curr_feature_string;
3278 int i;
3281 bool clear = false;
3283 next_feature_string = strchr (curr_feature_string, ',');
3284 if (next_feature_string)
3285 *next_feature_string++ = '\0';
3286 if (*curr_feature_string == '^')
3288 curr_feature_string++;
3289 clear = true;
3291 for (i = 0; i < X86_TUNE_LAST; i++)
3293 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3295 ix86_tune_features[i] = !clear;
3296 if (dump)
3297 fprintf (stderr, "Explicitly %s feature %s\n",
3298 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3299 break;
3302 if (i == X86_TUNE_LAST)
3303 error ("unknown parameter to option -mtune-ctrl: %s",
3304 clear ? curr_feature_string - 1 : curr_feature_string);
3305 curr_feature_string = next_feature_string;
3307 while (curr_feature_string);
3308 free (orig);
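/* Usage sketch (the feature names here are examples only; the accepted set
   is whatever ix86_tune_feature_names[] contains for this build):

     -mtune-ctrl=use_leave,^avx128_optimal

   sets the first feature, clears the second (the '^' prefix), and with
   DUMP enabled prints one "Explicitly set/clear feature ..." line per
   entry.  */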
3311 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3312 processor type. */
3314 static void
3315 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3317 unsigned int ix86_tune_mask = 1u << ix86_tune;
3318 int i;
3320 for (i = 0; i < X86_TUNE_LAST; ++i)
3322 if (ix86_tune_no_default)
3323 ix86_tune_features[i] = 0;
3324 else
3325 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3328 if (dump)
3330 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3331 for (i = 0; i < X86_TUNE_LAST; i++)
3332 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3333 ix86_tune_features[i] ? "on" : "off");
3336 parse_mtune_ctrl_str (dump);
3340 /* Default align_* from the processor table. */
3342 static void
3343 ix86_default_align (struct gcc_options *opts)
3345 if (opts->x_align_loops == 0)
3347 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3348 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3350 if (opts->x_align_jumps == 0)
3352 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3353 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3355 if (opts->x_align_functions == 0)
3357 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3361 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3363 static void
3364 ix86_override_options_after_change (void)
3366 ix86_default_align (&global_options);
3369 /* Override various settings based on options. If MAIN_ARGS_P, the
3370 options are from the command line, otherwise they are from
3371 attributes. Return true if there's an error related to the -march
3372 option. */
3374 static bool
3375 ix86_option_override_internal (bool main_args_p,
3376 struct gcc_options *opts,
3377 struct gcc_options *opts_set)
3379 int i;
3380 unsigned int ix86_arch_mask;
3381 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3383 const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3384 const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3385 const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3386 const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3387 const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3388 const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3389 const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3390 const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3391 const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3392 const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3393 const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3394 const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3395 const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3396 const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3397 const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3398 const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3399 const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3400 const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3401 const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3402 const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3403 const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3404 const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3405 const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3406 const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3407 const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3408 const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3409 const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3410 const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3411 const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3412 const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3413 const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3414 const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3415 const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3416 const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3417 const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3418 const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3419 const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3420 const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3421 const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3422 const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3423 const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3424 const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3425 const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3426 const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3427 const wide_int_bitmask PTA_MPX (HOST_WIDE_INT_1U << 44);
3428 const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3429 const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3430 const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3431 const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3432 const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3433 const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3434 const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3435 const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3436 const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3437 const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3438 const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3439 const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3440 const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3441 const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3442 const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3443 const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3444 const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3445 const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3446 const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
3447 const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3448 const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3449 const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3450 const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3451 const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3452 const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3453 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
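/* The seven PTA_* bits just above (PTA_AVX512VNNI through PTA_RDPID) no
   longer fit into the low 64 bits and therefore use the two-argument
   wide_int_bitmask form, which places them in the mask's second word.  */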
3455 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3456 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3457 const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3458 | PTA_POPCNT;
3459 const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3460 const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3461 | PTA_XSAVEOPT;
3462 const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3463 | PTA_RDRND | PTA_F16C;
3464 const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3465 | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3466 const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3467 | PTA_RDSEED;
3468 const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3469 | PTA_XSAVEC | PTA_XSAVES;
3470 const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3471 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3472 | PTA_CLWB;
3473 const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI
3474 | PTA_AVX512IFMA | PTA_SHA;
3475 const wide_int_bitmask PTA_ICELAKE = PTA_CANNONLAKE | PTA_AVX512VNNI
3476 | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3477 | PTA_RDPID;
3478 const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3479 | PTA_AVX512F | PTA_AVX512CD;
3480 const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3481 const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3482 const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3483 | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3485 static struct pta
3487 const char *const name; /* processor name or nickname. */
3488 const enum processor_type processor;
3489 const enum attr_cpu schedule;
3490 const wide_int_bitmask flags;
3492 const processor_alias_table[] =
3494 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3495 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3496 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3497 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3498 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3499 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3500 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3501 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3502 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3503 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3504 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3505 PTA_MMX | PTA_SSE | PTA_FXSR},
3506 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_FXSR},
3508 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3509 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3510 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3512 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3513 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3514 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3515 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3516 PTA_MMX | PTA_SSE | PTA_FXSR},
3517 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3518 PTA_MMX | PTA_SSE | PTA_FXSR},
3519 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3520 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3521 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3522 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3523 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3524 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3525 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3526 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3527 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3528 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3529 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3530 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3531 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3532 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3533 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3534 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3535 PTA_SANDYBRIDGE},
3536 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3537 PTA_SANDYBRIDGE},
3538 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3539 PTA_IVYBRIDGE},
3540 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3541 PTA_IVYBRIDGE},
3542 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3543 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3544 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3545 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3546 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3547 PTA_SKYLAKE_AVX512},
3548 {"cannonlake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_CANNONLAKE},
3549 {"icelake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_ICELAKE},
3550 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3551 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3552 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3553 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3554 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3555 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3556 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3557 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3558 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3559 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3560 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3561 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3562 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3563 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3564 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3565 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3566 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3567 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3568 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3569 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3570 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3571 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3572 {"x86-64", PROCESSOR_K8, CPU_K8,
3573 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3574 {"eden-x2", PROCESSOR_K8, CPU_K8,
3575 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3576 {"nano", PROCESSOR_K8, CPU_K8,
3577 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3578 | PTA_SSSE3 | PTA_FXSR},
3579 {"nano-1000", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3581 | PTA_SSSE3 | PTA_FXSR},
3582 {"nano-2000", PROCESSOR_K8, CPU_K8,
3583 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3584 | PTA_SSSE3 | PTA_FXSR},
3585 {"nano-3000", PROCESSOR_K8, CPU_K8,
3586 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3587 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3588 {"nano-x2", PROCESSOR_K8, CPU_K8,
3589 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3590 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3591 {"eden-x4", PROCESSOR_K8, CPU_K8,
3592 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3593 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3594 {"nano-x4", PROCESSOR_K8, CPU_K8,
3595 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3596 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3597 {"k8", PROCESSOR_K8, CPU_K8,
3598 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3599 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3600 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3601 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3602 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3603 {"opteron", PROCESSOR_K8, CPU_K8,
3604 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3605 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3606 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3607 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3608 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3609 {"athlon64", PROCESSOR_K8, CPU_K8,
3610 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3611 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3612 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3613 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3614 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3615 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3616 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3617 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3618 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3619 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3620 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3621 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3622 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3623 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3624 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3625 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3626 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3627 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3628 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3629 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3630 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3631 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3632 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3633 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3634 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3635 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3636 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3637 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3638 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3639 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3640 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3641 | PTA_XSAVEOPT | PTA_FSGSBASE},
3642 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3643 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3644 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3645 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3646 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3647 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3648 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3649 | PTA_MOVBE | PTA_MWAITX},
3650 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3651 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3652 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3653 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3654 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3655 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3656 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3657 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3658 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3659 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3660 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3661 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3662 | PTA_FXSR | PTA_XSAVE},
3663 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3664 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3665 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3666 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3667 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3668 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3670 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3671 PTA_64BIT
3672 | PTA_HLE /* flags are only used for -march switch. */ },
3675 /* -mrecip options. */
3676 static struct
3678 const char *string; /* option name */
3679 unsigned int mask; /* mask bits to set */
3681 const recip_options[] =
3683 { "all", RECIP_MASK_ALL },
3684 { "none", RECIP_MASK_NONE },
3685 { "div", RECIP_MASK_DIV },
3686 { "sqrt", RECIP_MASK_SQRT },
3687 { "vec-div", RECIP_MASK_VEC_DIV },
3688 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3691 int const pta_size = ARRAY_SIZE (processor_alias_table);
3693 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3694 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3695 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3696 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3697 #ifdef TARGET_BI_ARCH
3698 else
3700 #if TARGET_BI_ARCH == 1
3701 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3702 is on and OPTION_MASK_ABI_X32 is off. We turn off
3703 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3704 -mx32. */
3705 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3706 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3707 #else
3708 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3709 on and OPTION_MASK_ABI_64 is off. We turn off
3710 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3711 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3712 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3713 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3714 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3715 #endif
3716 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3717 && TARGET_IAMCU_P (opts->x_target_flags))
3718 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3719 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3721 #endif
3723 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3725 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3726 OPTION_MASK_ABI_64 for TARGET_X32. */
3727 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3728 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3730 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3731 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3732 | OPTION_MASK_ABI_X32
3733 | OPTION_MASK_ABI_64);
3734 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3736 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3737 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3738 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3739 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3742 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3743 SUBTARGET_OVERRIDE_OPTIONS;
3744 #endif
3746 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3747 SUBSUBTARGET_OVERRIDE_OPTIONS;
3748 #endif
3750 /* -fPIC is the default for x86_64. */
3751 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3752 opts->x_flag_pic = 2;
3754 /* Need to check -mtune=generic first. */
3755 if (opts->x_ix86_tune_string)
3757 /* As special support for cross compilers we read -mtune=native
3758 as -mtune=generic. With native compilers we won't see the
3759 -mtune=native, as it was changed by the driver. */
3760 if (!strcmp (opts->x_ix86_tune_string, "native"))
3762 opts->x_ix86_tune_string = "generic";
3764 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3765 warning (OPT_Wdeprecated,
3766 main_args_p
3767 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3768 "or %<-mtune=generic%> instead as appropriate")
3769 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3770 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3771 " instead as appropriate"));
3773 else
3775 if (opts->x_ix86_arch_string)
3776 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3777 if (!opts->x_ix86_tune_string)
3779 opts->x_ix86_tune_string
3780 = processor_target_table[TARGET_CPU_DEFAULT].name;
3781 ix86_tune_defaulted = 1;
3784 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3785 or defaulted. We need to use a sensible tune option. */
3786 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3788 opts->x_ix86_tune_string = "generic";
3792 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3793 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3795 /* rep; movq isn't available in 32-bit code. */
3796 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3797 opts->x_ix86_stringop_alg = no_stringop;
3800 if (!opts->x_ix86_arch_string)
3801 opts->x_ix86_arch_string
3802 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3803 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3804 else
3805 ix86_arch_specified = 1;
3807 if (opts_set->x_ix86_pmode)
3809 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3810 && opts->x_ix86_pmode == PMODE_SI)
3811 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3812 && opts->x_ix86_pmode == PMODE_DI))
3813 error ("address mode %qs not supported in the %s bit mode",
3814 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3815 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3817 else
3818 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3819 ? PMODE_DI : PMODE_SI;
3821 if (!opts_set->x_ix86_abi)
3822 opts->x_ix86_abi = DEFAULT_ABI;
3824 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3825 error ("-mabi=ms not supported with X32 ABI");
3826 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3828 /* For targets using the MS ABI, enable ms-extensions if not
3829 explicitly turned off. For non-MS ABIs we turn this
3830 option off. */
3831 if (!opts_set->x_flag_ms_extensions)
3832 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3834 if (opts_set->x_ix86_cmodel)
3836 switch (opts->x_ix86_cmodel)
3838 case CM_SMALL:
3839 case CM_SMALL_PIC:
3840 if (opts->x_flag_pic)
3841 opts->x_ix86_cmodel = CM_SMALL_PIC;
3842 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3843 error ("code model %qs not supported in the %s bit mode",
3844 "small", "32");
3845 break;
3847 case CM_MEDIUM:
3848 case CM_MEDIUM_PIC:
3849 if (opts->x_flag_pic)
3850 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3851 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3852 error ("code model %qs not supported in the %s bit mode",
3853 "medium", "32");
3854 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3855 error ("code model %qs not supported in x32 mode",
3856 "medium");
3857 break;
3859 case CM_LARGE:
3860 case CM_LARGE_PIC:
3861 if (opts->x_flag_pic)
3862 opts->x_ix86_cmodel = CM_LARGE_PIC;
3863 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3864 error ("code model %qs not supported in the %s bit mode",
3865 "large", "32");
3866 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3867 error ("code model %qs not supported in x32 mode",
3868 "large");
3869 break;
3871 case CM_32:
3872 if (opts->x_flag_pic)
3873 error ("code model %s does not support PIC mode", "32");
3874 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3875 error ("code model %qs not supported in the %s bit mode",
3876 "32", "64");
3877 break;
3879 case CM_KERNEL:
3880 if (opts->x_flag_pic)
3882 error ("code model %s does not support PIC mode", "kernel");
3883 opts->x_ix86_cmodel = CM_32;
3885 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3886 error ("code model %qs not supported in the %s bit mode",
3887 "kernel", "32");
3888 break;
3890 default:
3891 gcc_unreachable ();
3894 else
3896 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3897 use of rip-relative addressing. This eliminates fixups that
3898 would otherwise be needed if this object is to be placed in a
3899 DLL, and is essentially just as efficient as direct addressing. */
3900 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3901 && (TARGET_RDOS || TARGET_PECOFF))
3902 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3903 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3904 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3905 else
3906 opts->x_ix86_cmodel = CM_32;
3908 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3910 error ("-masm=intel not supported in this configuration");
3911 opts->x_ix86_asm_dialect = ASM_ATT;
3913 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3914 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3915 sorry ("%i-bit mode not compiled in",
3916 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3918 for (i = 0; i < pta_size; i++)
3919 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3921 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3923 error (main_args_p
3924 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3925 "switch")
3926 : G_("%<generic%> CPU can be used only for "
3927 "%<target(\"tune=\")%> attribute"));
3928 return false;
3930 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3932 error (main_args_p
3933 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3934 "switch")
3935 : G_("%<intel%> CPU can be used only for "
3936 "%<target(\"tune=\")%> attribute"));
3937 return false;
3940 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3941 && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3943 error ("CPU you selected does not support x86-64 "
3944 "instruction set");
3945 return false;
3948 ix86_schedule = processor_alias_table[i].schedule;
3949 ix86_arch = processor_alias_table[i].processor;
3950 /* Default cpu tuning to the architecture. */
3951 ix86_tune = ix86_arch;
3953 if (((processor_alias_table[i].flags & PTA_MMX) != 0)
3954 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3955 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3956 if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
3957 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3958 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3959 if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
3960 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3961 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3962 if (((processor_alias_table[i].flags & PTA_SSE) != 0)
3963 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3964 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3965 if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
3966 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3967 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3968 if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
3969 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3970 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3971 if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
3972 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3973 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3974 if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
3975 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3976 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3977 if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
3978 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3979 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3980 if (((processor_alias_table[i].flags & PTA_AVX) != 0)
3981 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3982 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3983 if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
3984 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3985 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3986 if (((processor_alias_table[i].flags & PTA_FMA) != 0)
3987 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3988 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3989 if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
3990 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3991 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3992 if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
3993 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3994 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3995 if (((processor_alias_table[i].flags & PTA_XOP) != 0)
3996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3998 if (((processor_alias_table[i].flags & PTA_LWP) != 0)
3999 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
4000 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
4001 if (((processor_alias_table[i].flags & PTA_ABM) != 0)
4002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
4003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
4004 if (((processor_alias_table[i].flags & PTA_BMI) != 0)
4005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
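/* The ABM extension comprises both LZCNT and POPCNT, which is why PTA_ABM
   is also accepted in the LZCNT check below and in the POPCNT check
   further down.  */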
4007 if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
4008 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4009 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4010 if (((processor_alias_table[i].flags & PTA_TBM) != 0)
4011 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4012 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4013 if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
4014 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4015 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4016 if (((processor_alias_table[i].flags & PTA_CX16) != 0)
4017 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4018 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4019 if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
4020 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4021 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
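/* SAHF is always usable in 32-bit mode; in 64-bit mode it is enabled unless
   the selected CPU is marked PTA_NO_SAHF (early 64-bit processors lack
   LAHF/SAHF in long mode).  */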
4022 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4023 && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
4024 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4025 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4026 if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
4027 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4028 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4029 if (((processor_alias_table[i].flags & PTA_AES) != 0)
4030 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4031 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4032 if (((processor_alias_table[i].flags & PTA_SHA) != 0)
4033 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4034 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4035 if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
4036 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4037 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4038 if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
4039 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4040 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4041 if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
4042 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4043 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4044 if (((processor_alias_table[i].flags & PTA_F16C) != 0)
4045 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4046 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4047 if (((processor_alias_table[i].flags & PTA_RTM) != 0)
4048 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4049 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4050 if (((processor_alias_table[i].flags & PTA_HLE) != 0)
4051 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4052 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4053 if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
4054 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4055 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4056 if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
4057 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4058 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4059 if (((processor_alias_table[i].flags & PTA_ADX) != 0)
4060 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4061 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4062 if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
4063 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4064 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4065 if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
4066 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4067 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4068 if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
4069 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4070 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4071 if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
4072 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4073 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4074 if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
4075 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4076 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4077 if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
4078 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4079 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4080 if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
4081 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4082 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4083 if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
4084 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4085 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4086 if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
4087 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4088 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4089 if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4090 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4091 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4092 if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4093 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4094 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4095 if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4096 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4097 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4098 if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4099 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4100 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4101 if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4102 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4103 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4104 if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4105 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4106 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4107 if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4108 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4109 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4110 if (((processor_alias_table[i].flags & PTA_MPX) != 0)
4111 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4112 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4113 if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4114 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4115 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4116 if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4117 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4118 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4119 if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4120 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4121 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4122 if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4123 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4124 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4125 if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4126 && !(opts->x_ix86_isa_flags_explicit
4127 & OPTION_MASK_ISA_AVX512VBMI2))
4128 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4129 if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4130 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4131 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4132 if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4133 && !(opts->x_ix86_isa_flags_explicit
4134 & OPTION_MASK_ISA_AVX512BITALG))
4135 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4137 if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4138 && !(opts->x_ix86_isa_flags2_explicit
4139 & OPTION_MASK_ISA_AVX5124VNNIW))
4140 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4141 if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4142 && !(opts->x_ix86_isa_flags2_explicit
4143 & OPTION_MASK_ISA_AVX5124FMAPS))
4144 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4145 if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4146 && !(opts->x_ix86_isa_flags_explicit
4147 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4148 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4149 if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4150 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4151 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4152 if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4153 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4154 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4155 if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4156 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4157 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4159 if ((processor_alias_table[i].flags
4160 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4161 x86_prefetch_sse = true;
4162 if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4163 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4164 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4165 if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4166 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4167 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4169 /* Don't enable x87 instructions if only
4170 general registers are allowed. */
4171 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4172 && !(opts_set->x_target_flags & MASK_80387))
4174 if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4175 opts->x_target_flags &= ~MASK_80387;
4176 else
4177 opts->x_target_flags |= MASK_80387;
4179 break;
4182 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4183 error ("Intel MPX does not support x32");
4188 if (i == pta_size)
4190 error (main_args_p
4191 ? G_("bad value (%qs) for %<-march=%> switch")
4192 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4193 opts->x_ix86_arch_string);
4195 auto_vec <const char *> candidates;
4196 for (i = 0; i < pta_size; i++)
4197 if (strcmp (processor_alias_table[i].name, "generic")
4198 && strcmp (processor_alias_table[i].name, "intel")
4199 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4200 || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4201 candidates.safe_push (processor_alias_table[i].name);
4203 char *s;
4204 const char *hint
4205 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4206 if (hint)
4207 inform (input_location,
4208 main_args_p
4209 ? G_("valid arguments to %<-march=%> switch are: "
4210 "%s; did you mean %qs?")
4211 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4212 "%s; did you mean %qs?"), s, hint);
4213 else
4214 inform (input_location,
4215 main_args_p
4216 ? G_("valid arguments to %<-march=%> switch are: %s")
4217 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4218 "are: %s"), s);
4219 XDELETEVEC (s);
4222 ix86_arch_mask = 1u << ix86_arch;
4223 for (i = 0; i < X86_ARCH_LAST; ++i)
4224 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
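/* After this loop each ix86_arch_features entry records whether the
   corresponding feature applies to the selected -march, using
   ix86_arch_mask as the per-architecture bit.  */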
4226 for (i = 0; i < pta_size; i++)
4227 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4229 ix86_schedule = processor_alias_table[i].schedule;
4230 ix86_tune = processor_alias_table[i].processor;
4231 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4233 if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4235 if (ix86_tune_defaulted)
4237 opts->x_ix86_tune_string = "x86-64";
4238 for (i = 0; i < pta_size; i++)
4239 if (! strcmp (opts->x_ix86_tune_string,
4240 processor_alias_table[i].name))
4241 break;
4242 ix86_schedule = processor_alias_table[i].schedule;
4243 ix86_tune = processor_alias_table[i].processor;
4245 else
4246 error ("CPU you selected does not support x86-64 "
4247 "instruction set");
4250 /* Intel CPUs have always interpreted SSE prefetch instructions as
4251 NOPs; so, we can enable SSE prefetch instructions even when
4252 -mtune (rather than -march) points us to a processor that has them.
4253 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4254 higher processors. */
4255 if (TARGET_CMOV
4256 && ((processor_alias_table[i].flags
4257 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4258 x86_prefetch_sse = true;
4259 break;
4262 if (ix86_tune_specified && i == pta_size)
4264 error (main_args_p
4265 ? G_("bad value (%qs) for %<-mtune=%> switch")
4266 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4267 opts->x_ix86_tune_string);
4269 auto_vec <const char *> candidates;
4270 for (i = 0; i < pta_size; i++)
4271 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4272 || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4273 candidates.safe_push (processor_alias_table[i].name);
4275 char *s;
4276 const char *hint
4277 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4278 if (hint)
4279 inform (input_location,
4280 main_args_p
4281 ? G_("valid arguments to %<-mtune=%> switch are: "
4282 "%s; did you mean %qs?")
4283 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4284 "%s; did you mean %qs?"), s, hint);
4285 else
4286 inform (input_location,
4287 main_args_p
4288 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4289 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4290 "are: %s"), s);
4291 XDELETEVEC (s);
4294 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4296 #ifndef USE_IX86_FRAME_POINTER
4297 #define USE_IX86_FRAME_POINTER 0
4298 #endif
4300 #ifndef USE_X86_64_FRAME_POINTER
4301 #define USE_X86_64_FRAME_POINTER 0
4302 #endif
4304 /* Set the default values for switches whose default depends on TARGET_64BIT
4305 in case they weren't overwritten by command line options. */
4306 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4308 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4309 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4310 if (opts->x_flag_asynchronous_unwind_tables
4311 && !opts_set->x_flag_unwind_tables
4312 && TARGET_64BIT_MS_ABI)
4313 opts->x_flag_unwind_tables = 1;
4314 if (opts->x_flag_asynchronous_unwind_tables == 2)
4315 opts->x_flag_unwind_tables
4316 = opts->x_flag_asynchronous_unwind_tables = 1;
4317 if (opts->x_flag_pcc_struct_return == 2)
4318 opts->x_flag_pcc_struct_return = 0;
4320 else
4322 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4323 opts->x_flag_omit_frame_pointer
4324 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4325 if (opts->x_flag_asynchronous_unwind_tables == 2)
4326 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4327 if (opts->x_flag_pcc_struct_return == 2)
4329 /* Intel MCU psABI specifies that -freg-struct-return should
4330 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4331 we check -miamcu so that -freg-struct-return is always
4332 turned on if -miamcu is used. */
4333 if (TARGET_IAMCU_P (opts->x_target_flags))
4334 opts->x_flag_pcc_struct_return = 0;
4335 else
4336 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4340 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4341 /* TODO: ix86_cost should be chosen at instruction or function granularity,
4342 so that for cold code we use size_cost even in !optimize_size compilation. */
4343 if (opts->x_optimize_size)
4344 ix86_cost = &ix86_size_cost;
4345 else
4346 ix86_cost = ix86_tune_cost;
4348 /* Arrange to set up i386_stack_locals for all functions. */
4349 init_machine_status = ix86_init_machine_status;
4351 /* Validate -mregparm= value. */
4352 if (opts_set->x_ix86_regparm)
4354 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4355 warning (0, "-mregparm is ignored in 64-bit mode");
4356 else if (TARGET_IAMCU_P (opts->x_target_flags))
4357 warning (0, "-mregparm is ignored for Intel MCU psABI");
4358 if (opts->x_ix86_regparm > REGPARM_MAX)
4360 error ("-mregparm=%d is not between 0 and %d",
4361 opts->x_ix86_regparm, REGPARM_MAX);
4362 opts->x_ix86_regparm = 0;
4365 if (TARGET_IAMCU_P (opts->x_target_flags)
4366 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4367 opts->x_ix86_regparm = REGPARM_MAX;
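/* The 64-bit ABIs and the Intel MCU psABI pass arguments in registers by
   definition, so the register-parameter count is pinned to the maximum
   there; -mregparm only affects other 32-bit targets.  */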
4369 /* Default align_* from the processor table. */
4370 ix86_default_align (opts);
4372 /* Provide default for -mbranch-cost= value. */
4373 if (!opts_set->x_ix86_branch_cost)
4374 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4376 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4378 opts->x_target_flags
4379 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4381 /* Enable by default the SSE and MMX builtins. Do allow the user to
4382 explicitly disable any of these. In particular, disabling SSE and
4383 MMX for kernel code is extremely useful. */
4384 if (!ix86_arch_specified)
4385 opts->x_ix86_isa_flags
4386 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4387 | TARGET_SUBTARGET64_ISA_DEFAULT)
4388 & ~opts->x_ix86_isa_flags_explicit);
4390 if (TARGET_RTD_P (opts->x_target_flags))
4391 warning (0,
4392 main_args_p
4393 ? G_("%<-mrtd%> is ignored in 64bit mode")
4394 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4396 else
4398 opts->x_target_flags
4399 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4401 if (!ix86_arch_specified)
4402 opts->x_ix86_isa_flags
4403 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4405 /* The i386 ABI does not specify a red zone. It still makes sense to use it
4406 when the programmer takes care to keep the stack from being destroyed. */
4407 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4408 opts->x_target_flags |= MASK_NO_RED_ZONE;
4411 /* Keep nonleaf frame pointers. */
4412 if (opts->x_flag_omit_frame_pointer)
4413 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4414 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4415 opts->x_flag_omit_frame_pointer = 1;
4417 /* If we're doing fast math, we don't care about comparison order
4418 wrt NaNs. This lets us use a shorter comparison sequence. */
4419 if (opts->x_flag_finite_math_only)
4420 opts->x_target_flags &= ~MASK_IEEE_FP;
4422 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4423 since the insns won't need emulation. */
4424 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4425 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4427 /* Likewise, if the target doesn't have a 387, or we've specified
4428 software floating point, don't use 387 inline intrinsics. */
4429 if (!TARGET_80387_P (opts->x_target_flags))
4430 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4432 /* Turn on MMX builtins for -msse. */
4433 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4434 opts->x_ix86_isa_flags
4435 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4437 /* Enable SSE prefetch. */
4438 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4439 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4440 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4441 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4442 x86_prefetch_sse = true;
4444 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4445 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4446 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4447 opts->x_ix86_isa_flags
4448 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4450 /* Enable lzcnt instruction for -mabm. */
4451 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
4452 opts->x_ix86_isa_flags
4453 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4455 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4456 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
4457 opts->x_ix86_isa_flags
4458 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4459 & ~opts->x_ix86_isa_flags_explicit);
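/* The & ~opts->x_ix86_isa_flags_explicit term above keeps any of these ISA
   bits that were enabled explicitly rather than by default.  */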
4461 /* Validate -mpreferred-stack-boundary= value or default it to
4462 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4463 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4464 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4466 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4467 int max = TARGET_SEH ? 4 : 12;
4469 if (opts->x_ix86_preferred_stack_boundary_arg < min
4470 || opts->x_ix86_preferred_stack_boundary_arg > max)
4472 if (min == max)
4473 error ("-mpreferred-stack-boundary is not supported "
4474 "for this target");
4475 else
4476 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4477 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4479 else
4480 ix86_preferred_stack_boundary
4481 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
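/* The option argument is a power-of-two exponent in bytes: for example,
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT, i.e. a
   128-bit (16-byte) preferred boundary.  */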
4484 /* Set the default value for -mstackrealign. */
4485 if (!opts_set->x_ix86_force_align_arg_pointer)
4486 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4488 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4490 /* Validate -mincoming-stack-boundary= value or default it to
4491 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4492 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4493 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4495 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4497 if (opts->x_ix86_incoming_stack_boundary_arg < min
4498 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4499 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4500 opts->x_ix86_incoming_stack_boundary_arg, min);
4501 else
4503 ix86_user_incoming_stack_boundary
4504 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4505 ix86_incoming_stack_boundary
4506 = ix86_user_incoming_stack_boundary;
4510 #ifndef NO_PROFILE_COUNTERS
4511 if (flag_nop_mcount)
4512 error ("-mnop-mcount is not compatible with this target");
4513 #endif
4514 if (flag_nop_mcount && flag_pic)
4515 error ("-mnop-mcount is not implemented for -fPIC");
4517 /* Accept -msseregparm only if at least SSE support is enabled. */
4518 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4519 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4520 error (main_args_p
4521 ? G_("%<-msseregparm%> used without SSE enabled")
4522 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4524 if (opts_set->x_ix86_fpmath)
4526 if (opts->x_ix86_fpmath & FPMATH_SSE)
4528 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4530 if (TARGET_80387_P (opts->x_target_flags))
4532 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4533 opts->x_ix86_fpmath = FPMATH_387;
4536 else if ((opts->x_ix86_fpmath & FPMATH_387)
4537 && !TARGET_80387_P (opts->x_target_flags))
4539 warning (0, "387 instruction set disabled, using SSE arithmetics");
4540 opts->x_ix86_fpmath = FPMATH_SSE;
4544 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4545 -mfpmath=387. The latter is however the default on many targets, since
4546 the extra 80-bit precision of temporaries is considered to be part of the ABI.
4547 Overwrite the default at least for -ffast-math.
4548 TODO: -mfpmath=both seems to produce similarly performing code with slightly
4549 smaller binaries. It is however not clear whether register allocation is
4550 ready for this setting.
4551 Also -mfpmath=387 codegen is overall a lot more compact (about 4-5%) than
4552 SSE codegen. We may switch to 387 with -ffast-math for size-optimized
4553 functions. */
4554 else if (fast_math_flags_set_p (&global_options)
4555 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4556 opts->x_ix86_fpmath = FPMATH_SSE;
4557 else
4558 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4560 /* Use an external vectorized library when vectorizing intrinsics. */
4561 if (opts_set->x_ix86_veclibabi_type)
4562 switch (opts->x_ix86_veclibabi_type)
4564 case ix86_veclibabi_type_svml:
4565 ix86_veclib_handler = ix86_veclibabi_svml;
4566 break;
4568 case ix86_veclibabi_type_acml:
4569 ix86_veclib_handler = ix86_veclibabi_acml;
4570 break;
4572 default:
4573 gcc_unreachable ();
4576 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4577 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4578 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4580 /* If stack probes are required, the space used for large function
4581 arguments on the stack must also be probed, so enable
4582 -maccumulate-outgoing-args so this happens in the prologue. */
4583 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4584 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4586 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4587 warning (0,
4588 main_args_p
4589 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4590 "for correctness")
4591 : G_("stack probing requires "
4592 "%<target(\"accumulate-outgoing-args\")%> for "
4593 "correctness"));
4594 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4597 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4598 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4599 if (fixed_regs[BP_REG]
4600 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4602 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4603 warning (0,
4604 main_args_p
4605 ? G_("fixed ebp register requires "
4606 "%<-maccumulate-outgoing-args%>")
4607 : G_("fixed ebp register requires "
4608 "%<target(\"accumulate-outgoing-args\")%>"));
4609 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4612 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4614 char *p;
4615 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4616 p = strchr (internal_label_prefix, 'X');
4617 internal_label_prefix_len = p - internal_label_prefix;
4618 *p = '\0';
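/* On a typical ELF target the generated label is "*.LX0", so the stored
   prefix becomes "*.L" with length 3 (the exact string depends on the
   target's ASM_GENERATE_INTERNAL_LABEL definition; this is only an
   illustration).  */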
4621 /* When a scheduling description is not available, disable the scheduler passes
4622 so they won't slow down the compilation or make x87 code slower. */
4623 if (!TARGET_SCHEDULE)
4624 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4626 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4627 ix86_tune_cost->simultaneous_prefetches,
4628 opts->x_param_values,
4629 opts_set->x_param_values);
4630 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4631 ix86_tune_cost->prefetch_block,
4632 opts->x_param_values,
4633 opts_set->x_param_values);
4634 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4635 ix86_tune_cost->l1_cache_size,
4636 opts->x_param_values,
4637 opts_set->x_param_values);
4638 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4639 ix86_tune_cost->l2_cache_size,
4640 opts->x_param_values,
4641 opts_set->x_param_values);
4643 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4644 if (opts->x_flag_prefetch_loop_arrays < 0
4645 && HAVE_prefetch
4646 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4647 && !opts->x_optimize_size
4648 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4649 opts->x_flag_prefetch_loop_arrays = 1;
4651 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4652 can be optimized to ap = __builtin_next_arg (0). */
4653 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4654 targetm.expand_builtin_va_start = NULL;
4656 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4658 ix86_gen_leave = gen_leave_rex64;
4659 if (Pmode == DImode)
4661 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4662 ix86_gen_tls_local_dynamic_base_64
4663 = gen_tls_local_dynamic_base_64_di;
4665 else
4667 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4668 ix86_gen_tls_local_dynamic_base_64
4669 = gen_tls_local_dynamic_base_64_si;
4672 else
4673 ix86_gen_leave = gen_leave;
4675 if (Pmode == DImode)
4677 ix86_gen_add3 = gen_adddi3;
4678 ix86_gen_sub3 = gen_subdi3;
4679 ix86_gen_sub3_carry = gen_subdi3_carry;
4680 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4681 ix86_gen_andsp = gen_anddi3;
4682 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4683 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4684 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4685 ix86_gen_monitor = gen_sse3_monitor_di;
4686 ix86_gen_monitorx = gen_monitorx_di;
4687 ix86_gen_clzero = gen_clzero_di;
4689 else
4691 ix86_gen_add3 = gen_addsi3;
4692 ix86_gen_sub3 = gen_subsi3;
4693 ix86_gen_sub3_carry = gen_subsi3_carry;
4694 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4695 ix86_gen_andsp = gen_andsi3;
4696 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4697 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4698 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4699 ix86_gen_monitor = gen_sse3_monitor_si;
4700 ix86_gen_monitorx = gen_monitorx_si;
4701 ix86_gen_clzero = gen_clzero_si;
4704 #ifdef USE_IX86_CLD
4705 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4706 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4707 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4708 #endif
4710 /* Set the default value for -mfentry. */
4711 if (!opts_set->x_flag_fentry)
4712 opts->x_flag_fentry = TARGET_SEH;
4713 else
4715 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4716 && opts->x_flag_fentry)
4717 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4718 "with -fpic");
4719 else if (TARGET_SEH && !opts->x_flag_fentry)
4720 sorry ("-mno-fentry isn%'t compatible with SEH");
4723 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4724 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4726 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4727 && TARGET_EMIT_VZEROUPPER)
4728 opts->x_target_flags |= MASK_VZEROUPPER;
4729 if (!(opts_set->x_target_flags & MASK_STV))
4730 opts->x_target_flags |= MASK_STV;
4731 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4732 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4733 stack realignment is an extra cost the pass doesn't take into
4734 account, and the pass can't realign the stack. */
4735 if (ix86_preferred_stack_boundary < 128
4736 || ix86_incoming_stack_boundary < 128
4737 || opts->x_ix86_force_align_arg_pointer)
4738 opts->x_target_flags &= ~MASK_STV;
4739 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4740 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4741 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4742 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4743 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4744 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4746 /* Enable 128-bit AVX instruction generation
4747 for the auto-vectorizer. */
4748 if (TARGET_AVX128_OPTIMAL
4749 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4750 opts->x_prefer_vector_width_type = PVW_AVX128;
4752 /* Use 256-bit AVX instruction generation
4753 in the auto-vectorizer. */
4754 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4755 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4756 opts->x_prefer_vector_width_type = PVW_AVX256;
4758 if (opts->x_ix86_recip_name)
4760 char *p = ASTRDUP (opts->x_ix86_recip_name);
4761 char *q;
4762 unsigned int mask, i;
4763 bool invert;
4765 while ((q = strtok (p, ",")) != NULL)
4767 p = NULL;
4768 if (*q == '!')
4770 invert = true;
4771 q++;
4773 else
4774 invert = false;
4776 if (!strcmp (q, "default"))
4777 mask = RECIP_MASK_ALL;
4778 else
4780 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4781 if (!strcmp (q, recip_options[i].string))
4783 mask = recip_options[i].mask;
4784 break;
4787 if (i == ARRAY_SIZE (recip_options))
4789 error ("unknown option for -mrecip=%s", q);
4790 invert = false;
4791 mask = RECIP_MASK_NONE;
4795 opts->x_recip_mask_explicit |= mask;
4796 if (invert)
4797 opts->x_recip_mask &= ~mask;
4798 else
4799 opts->x_recip_mask |= mask;
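/* Example of the syntax accepted by the loop above: -mrecip=all,!sqrt turns
   on every reciprocal estimate except the scalar square-root one, and the
   token "default" behaves like "all".  */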
4803 if (TARGET_RECIP_P (opts->x_target_flags))
4804 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4805 else if (opts_set->x_target_flags & MASK_RECIP)
4806 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4808 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4809 for 64-bit Bionic. Also default long double to 64-bit for Intel
4810 MCU psABI. */
4811 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4812 && !(opts_set->x_target_flags
4813 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4814 opts->x_target_flags |= (TARGET_64BIT
4815 ? MASK_LONG_DOUBLE_128
4816 : MASK_LONG_DOUBLE_64);
4818 /* Only one of them can be active. */
4819 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4820 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4822 /* Handle stack protector */
4823 if (!opts_set->x_ix86_stack_protector_guard)
4824 opts->x_ix86_stack_protector_guard
4825 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4827 #ifdef TARGET_THREAD_SSP_OFFSET
4828 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4829 #endif
4831 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4833 char *endp;
4834 const char *str = ix86_stack_protector_guard_offset_str;
4836 errno = 0;
4837 int64_t offset;
4839 #if defined(INT64_T_IS_LONG)
4840 offset = strtol (str, &endp, 0);
4841 #else
4842 offset = strtoll (str, &endp, 0);
4843 #endif
4845 if (!*str || *endp || errno)
4846 error ("%qs is not a valid number "
4847 "in -mstack-protector-guard-offset=", str);
4849 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4850 HOST_WIDE_INT_C (0x7fffffff)))
4851 error ("%qs is not a valid offset "
4852 "in -mstack-protector-guard-offset=", str);
4854 ix86_stack_protector_guard_offset = offset;
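/* Users such as the Linux kernel combine this with
   -mstack-protector-guard-reg= to read the canary from a fixed TLS or
   per-CPU slot, e.g. something like
   -mstack-protector-guard-reg=gs -mstack-protector-guard-offset=40
   (illustrative values, not mandated by this code).  */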
4857 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4859 /* The kernel uses a different segment register for performance
4860 reasons; a system call would not have to trash the userspace
4861 segment register, which would be expensive. */
4862 if (ix86_cmodel == CM_KERNEL)
4863 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4865 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4867 const char *str = ix86_stack_protector_guard_reg_str;
4868 addr_space_t seg = ADDR_SPACE_GENERIC;
4870 /* Discard optional register prefix. */
4871 if (str[0] == '%')
4872 str++;
4874 if (strlen (str) == 2 && str[1] == 's')
4876 if (str[0] == 'f')
4877 seg = ADDR_SPACE_SEG_FS;
4878 else if (str[0] == 'g')
4879 seg = ADDR_SPACE_SEG_GS;
4882 if (seg == ADDR_SPACE_GENERIC)
4883 error ("%qs is not a valid base register "
4884 "in -mstack-protector-guard-reg=",
4885 ix86_stack_protector_guard_reg_str);
4887 ix86_stack_protector_guard_reg = seg;
4890 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4891 if (opts->x_ix86_tune_memcpy_strategy)
4893 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4894 ix86_parse_stringop_strategy_string (str, false);
4895 free (str);
4898 if (opts->x_ix86_tune_memset_strategy)
4900 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4901 ix86_parse_stringop_strategy_string (str, true);
4902 free (str);
4905 /* Save the initial options in case the user uses function-specific
4906 options later. */
4907 if (main_args_p)
4908 target_option_default_node = target_option_current_node
4909 = build_target_option_node (opts);
4911 /* Do not support control flow instrumentation if CET is not enabled. */
4912 if (opts->x_flag_cf_protection != CF_NONE)
4914 switch (flag_cf_protection)
4916 case CF_NONE:
4917 break;
4918 case CF_BRANCH:
4919 if (! TARGET_IBT_P (opts->x_ix86_isa_flags2))
4921 error ("%<-fcf-protection=branch%> requires Intel CET "
4922 "support. Use -mcet or -mibt option to enable CET");
4923 flag_cf_protection = CF_NONE;
4924 return false;
4926 break;
4927 case CF_RETURN:
4928 if (! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4930 error ("%<-fcf-protection=return%> requires Intel CET "
4931 "support. Use -mcet or -mshstk option to enable CET");
4932 flag_cf_protection = CF_NONE;
4933 return false;
4935 break;
4936 case CF_FULL:
4937 if ( ! TARGET_IBT_P (opts->x_ix86_isa_flags2)
4938 || ! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4940 error ("%<-fcf-protection=full%> requires Intel CET "
4941 "support. Use -mcet or both of -mibt and "
4942 "-mshstk options to enable CET");
4943 flag_cf_protection = CF_NONE;
4944 return false;
4946 break;
4947 default:
4948 gcc_unreachable ();
4951 opts->x_flag_cf_protection =
4952 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4955 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4956 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4957 opts->x_param_values,
4958 opts_set->x_param_values);
4960 return true;
4963 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4965 static void
4966 ix86_option_override (void)
4968 ix86_option_override_internal (true, &global_options, &global_options_set);
4971 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4972 static char *
4973 ix86_offload_options (void)
4975 if (TARGET_LP64)
4976 return xstrdup ("-foffload-abi=lp64");
4977 return xstrdup ("-foffload-abi=ilp32");
4980 /* Update register usage after having seen the compiler flags. */
4982 static void
4983 ix86_conditional_register_usage (void)
4985 int i, c_mask;
4987 /* If there are no caller-saved registers, preserve all registers
4988 except fixed_regs and registers used for the function return value,
4989 since aggregate_value_p checks call_used_regs[regno] on the return
4990 value. */
4991 if (cfun && cfun->machine->no_caller_saved_registers)
4992 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4993 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4994 call_used_regs[i] = 0;
4996 /* For 32-bit targets, squash the REX registers. */
4997 if (! TARGET_64BIT)
4999 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
5000 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5001 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
5002 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5003 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5004 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5007 /* See the definition of CALL_USED_REGISTERS in i386.h. */
5008 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
5010 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
5012 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5014 /* Set/reset conditionally defined registers from
5015 CALL_USED_REGISTERS initializer. */
5016 if (call_used_regs[i] > 1)
5017 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
5019 /* Calculate registers of CLOBBERED_REGS register set
5020 as call used registers from GENERAL_REGS register set. */
5021 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
5022 && call_used_regs[i])
5023 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
5026 /* If MMX is disabled, squash the registers. */
5027 if (! TARGET_MMX)
5028 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5029 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
5030 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5032 /* If SSE is disabled, squash the registers. */
5033 if (! TARGET_SSE)
5034 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5035 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
5036 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5038 /* If the FPU is disabled, squash the registers. */
5039 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
5040 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5041 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5042 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5044 /* If AVX512F is disabled, squash the registers. */
5045 if (! TARGET_AVX512F)
5047 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5048 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5050 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5051 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5054 /* If MPX is disabled, squash the registers. */
5055 if (! TARGET_MPX)
5056 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5057 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5060 /* Canonicalize a comparison from one we don't have to one we do have. */
5062 static void
5063 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5064 bool op0_preserve_value)
5066 /* The order of operands in an x87 ficom compare is forced by combine in
5067 the simplify_comparison () function. The float operator is treated as RTX_OBJ
5068 with precedence over other operators and is always put in the first
5069 place. Swap the condition and operands to match the ficom instruction. */
5070 if (!op0_preserve_value
5071 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5073 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5075 /* We are called only for compares that are split to the SAHF instruction.
5076 Ensure that we have a setcc/jcc insn for the swapped condition. */
5077 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5079 std::swap (*op0, *op1);
5080 *code = (int) scode;
5085 /* Save the current options */
5087 static void
5088 ix86_function_specific_save (struct cl_target_option *ptr,
5089 struct gcc_options *opts)
5091 ptr->arch = ix86_arch;
5092 ptr->schedule = ix86_schedule;
5093 ptr->prefetch_sse = x86_prefetch_sse;
5094 ptr->tune = ix86_tune;
5095 ptr->branch_cost = ix86_branch_cost;
5096 ptr->tune_defaulted = ix86_tune_defaulted;
5097 ptr->arch_specified = ix86_arch_specified;
5098 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5099 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5100 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5101 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5102 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5103 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5104 ptr->x_ix86_abi = opts->x_ix86_abi;
5105 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5106 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5107 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5108 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5109 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5110 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5111 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5112 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5113 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5114 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5115 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5116 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5117 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5118 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5119 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5120 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5121 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5122 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5123 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5124 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5126 /* The fields are char but the variables are not; make sure the
5127 values fit in the fields. */
5128 gcc_assert (ptr->arch == ix86_arch);
5129 gcc_assert (ptr->schedule == ix86_schedule);
5130 gcc_assert (ptr->tune == ix86_tune);
5131 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5134 /* Restore the current options */
5136 static void
5137 ix86_function_specific_restore (struct gcc_options *opts,
5138 struct cl_target_option *ptr)
5140 enum processor_type old_tune = ix86_tune;
5141 enum processor_type old_arch = ix86_arch;
5142 unsigned int ix86_arch_mask;
5143 int i;
5145 /* We don't change -fPIC. */
5146 opts->x_flag_pic = flag_pic;
5148 ix86_arch = (enum processor_type) ptr->arch;
5149 ix86_schedule = (enum attr_cpu) ptr->schedule;
5150 ix86_tune = (enum processor_type) ptr->tune;
5151 x86_prefetch_sse = ptr->prefetch_sse;
5152 opts->x_ix86_branch_cost = ptr->branch_cost;
5153 ix86_tune_defaulted = ptr->tune_defaulted;
5154 ix86_arch_specified = ptr->arch_specified;
5155 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5156 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5157 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5158 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5159 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5160 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5161 opts->x_ix86_abi = ptr->x_ix86_abi;
5162 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5163 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5164 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5165 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5166 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5167 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5168 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5169 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5170 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5171 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5172 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5173 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5174 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5175 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5176 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5177 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5178 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5179 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5180 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5181 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5182 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5183 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5184 so that for cold code we use size_cost even in !optimize_size compilation. */
5185 if (opts->x_optimize_size)
5186 ix86_cost = &ix86_size_cost;
5187 else
5188 ix86_cost = ix86_tune_cost;
5190 /* Recreate the arch feature tests if the arch changed */
5191 if (old_arch != ix86_arch)
5193 ix86_arch_mask = 1u << ix86_arch;
5194 for (i = 0; i < X86_ARCH_LAST; ++i)
5195 ix86_arch_features[i]
5196 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5199 /* Recreate the tune optimization tests */
5200 if (old_tune != ix86_tune)
5201 set_ix86_tune_features (ix86_tune, false);
5204 /* Adjust target options after streaming them in. This is mainly about
5205 reconciling them with global options. */
5207 static void
5208 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5210 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5211 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5212 for PIC, or error out. */
5213 if (flag_pic)
5214 switch (ptr->x_ix86_cmodel)
5216 case CM_SMALL:
5217 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5218 break;
5220 case CM_MEDIUM:
5221 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5222 break;
5224 case CM_LARGE:
5225 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5226 break;
5228 case CM_KERNEL:
5229 error ("code model %s does not support PIC mode", "kernel");
5230 break;
5232 default:
5233 break;
5235 else
5236 switch (ptr->x_ix86_cmodel)
5238 case CM_SMALL_PIC:
5239 ptr->x_ix86_cmodel = CM_SMALL;
5240 break;
5242 case CM_MEDIUM_PIC:
5243 ptr->x_ix86_cmodel = CM_MEDIUM;
5244 break;
5246 case CM_LARGE_PIC:
5247 ptr->x_ix86_cmodel = CM_LARGE;
5248 break;
5250 default:
5251 break;
5255 /* Print the current options */
5257 static void
5258 ix86_function_specific_print (FILE *file, int indent,
5259 struct cl_target_option *ptr)
5261 char *target_string
5262 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5263 ptr->x_target_flags, ptr->x_ix86_target_flags,
5264 NULL, NULL, ptr->x_ix86_fpmath, false);
5266 gcc_assert (ptr->arch < PROCESSOR_max);
5267 fprintf (file, "%*sarch = %d (%s)\n",
5268 indent, "",
5269 ptr->arch, processor_target_table[ptr->arch].name);
5271 gcc_assert (ptr->tune < PROCESSOR_max);
5272 fprintf (file, "%*stune = %d (%s)\n",
5273 indent, "",
5274 ptr->tune, processor_target_table[ptr->tune].name);
5276 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5278 if (target_string)
5280 fprintf (file, "%*s%s\n", indent, "", target_string);
5281 free (target_string);
5286 /* Inner function to process the attribute((target(...))), take an argument and
5287 set the current options from the argument. If we have a list, recursively go
5288 over the list. */
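/* Illustrative argument strings accepted here (hypothetical uses; the
   valid ISA names come from the attrs[] table below):
       __attribute__((target("avx2,no-sse4a")))
       __attribute__((target("arch=haswell,tune=generic,fpmath=sse")))
   Options are comma separated, a "no-" prefix negates an ISA option, and
   "arch=", "tune=" and "fpmath=" take string or enum arguments.  */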
5290 static bool
5291 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5292 struct gcc_options *opts,
5293 struct gcc_options *opts_set,
5294 struct gcc_options *enum_opts_set)
5296 char *next_optstr;
5297 bool ret = true;
5299 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5300 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5301 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5302 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5303 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5305 enum ix86_opt_type
5307 ix86_opt_unknown,
5308 ix86_opt_yes,
5309 ix86_opt_no,
5310 ix86_opt_str,
5311 ix86_opt_enum,
5312 ix86_opt_isa
5315 static const struct
5317 const char *string;
5318 size_t len;
5319 enum ix86_opt_type type;
5320 int opt;
5321 int mask;
5322 } attrs[] = {
5323 /* isa options */
5324 IX86_ATTR_ISA ("sgx", OPT_msgx),
5325 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5326 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5327 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5328 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5329 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5330 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5332 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5333 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5334 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5335 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5336 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5337 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5338 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5339 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5340 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5341 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5342 IX86_ATTR_ISA ("fma", OPT_mfma),
5343 IX86_ATTR_ISA ("xop", OPT_mxop),
5344 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5345 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5346 IX86_ATTR_ISA ("avx", OPT_mavx),
5347 IX86_ATTR_ISA ("sse4", OPT_msse4),
5348 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5349 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5350 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5351 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5352 IX86_ATTR_ISA ("sse3", OPT_msse3),
5353 IX86_ATTR_ISA ("aes", OPT_maes),
5354 IX86_ATTR_ISA ("sha", OPT_msha),
5355 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5356 IX86_ATTR_ISA ("sse2", OPT_msse2),
5357 IX86_ATTR_ISA ("sse", OPT_msse),
5358 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5359 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5360 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5361 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5362 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5363 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5364 IX86_ATTR_ISA ("adx", OPT_madx),
5365 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5366 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5367 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5368 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5369 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5370 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5371 IX86_ATTR_ISA ("abm", OPT_mabm),
5372 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5373 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5374 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5375 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5376 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5377 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5378 IX86_ATTR_ISA ("sahf", OPT_msahf),
5379 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5380 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5381 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5382 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5383 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5384 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5385 IX86_ATTR_ISA ("pku", OPT_mpku),
5386 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5387 IX86_ATTR_ISA ("hle", OPT_mhle),
5388 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5389 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5390 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5391 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5392 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5393 IX86_ATTR_ISA ("ibt", OPT_mibt),
5394 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5395 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5396 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5398 /* enum options */
5399 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5401 /* string options */
5402 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5403 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5405 /* flag options */
5406 IX86_ATTR_YES ("cld",
5407 OPT_mcld,
5408 MASK_CLD),
5410 IX86_ATTR_NO ("fancy-math-387",
5411 OPT_mfancy_math_387,
5412 MASK_NO_FANCY_MATH_387),
5414 IX86_ATTR_YES ("ieee-fp",
5415 OPT_mieee_fp,
5416 MASK_IEEE_FP),
5418 IX86_ATTR_YES ("inline-all-stringops",
5419 OPT_minline_all_stringops,
5420 MASK_INLINE_ALL_STRINGOPS),
5422 IX86_ATTR_YES ("inline-stringops-dynamically",
5423 OPT_minline_stringops_dynamically,
5424 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5426 IX86_ATTR_NO ("align-stringops",
5427 OPT_mno_align_stringops,
5428 MASK_NO_ALIGN_STRINGOPS),
5430 IX86_ATTR_YES ("recip",
5431 OPT_mrecip,
5432 MASK_RECIP),
5436 /* If this is a list, recurse to get the options. */
5437 if (TREE_CODE (args) == TREE_LIST)
5439 bool ret = true;
5441 for (; args; args = TREE_CHAIN (args))
5442 if (TREE_VALUE (args)
5443 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5444 p_strings, opts, opts_set,
5445 enum_opts_set))
5446 ret = false;
5448 return ret;
5451 else if (TREE_CODE (args) != STRING_CST)
5453 error ("attribute %<target%> argument not a string");
5454 return false;
5457 /* Handle multiple arguments separated by commas. */
5458 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5460 while (next_optstr && *next_optstr != '\0')
5462 char *p = next_optstr;
5463 char *orig_p = p;
5464 char *comma = strchr (next_optstr, ',');
5465 const char *opt_string;
5466 size_t len, opt_len;
5467 int opt;
5468 bool opt_set_p;
5469 char ch;
5470 unsigned i;
5471 enum ix86_opt_type type = ix86_opt_unknown;
5472 int mask = 0;
5474 if (comma)
5476 *comma = '\0';
5477 len = comma - next_optstr;
5478 next_optstr = comma + 1;
5480 else
5482 len = strlen (p);
5483 next_optstr = NULL;
5486 /* Recognize no-xxx. */
5487 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5489 opt_set_p = false;
5490 p += 3;
5491 len -= 3;
5493 else
5494 opt_set_p = true;
5496 /* Find the option. */
5497 ch = *p;
5498 opt = N_OPTS;
5499 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5501 type = attrs[i].type;
5502 opt_len = attrs[i].len;
5503 if (ch == attrs[i].string[0]
5504 && ((type != ix86_opt_str && type != ix86_opt_enum)
5505 ? len == opt_len
5506 : len > opt_len)
5507 && memcmp (p, attrs[i].string, opt_len) == 0)
5509 opt = attrs[i].opt;
5510 mask = attrs[i].mask;
5511 opt_string = attrs[i].string;
5512 break;
5516 /* Process the option. */
5517 if (opt == N_OPTS)
5519 error ("attribute(target(\"%s\")) is unknown", orig_p);
5520 ret = false;
5523 else if (type == ix86_opt_isa)
5525 struct cl_decoded_option decoded;
5527 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5528 ix86_handle_option (opts, opts_set,
5529 &decoded, input_location);
5532 else if (type == ix86_opt_yes || type == ix86_opt_no)
5534 if (type == ix86_opt_no)
5535 opt_set_p = !opt_set_p;
5537 if (opt_set_p)
5538 opts->x_target_flags |= mask;
5539 else
5540 opts->x_target_flags &= ~mask;
5543 else if (type == ix86_opt_str)
5545 if (p_strings[opt])
5547 error ("option(\"%s\") was already specified", opt_string);
5548 ret = false;
5550 else
5551 p_strings[opt] = xstrdup (p + opt_len);
5554 else if (type == ix86_opt_enum)
5556 bool arg_ok;
5557 int value;
5559 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5560 if (arg_ok)
5561 set_option (opts, enum_opts_set, opt, value,
5562 p + opt_len, DK_UNSPECIFIED, input_location,
5563 global_dc);
5564 else
5566 error ("attribute(target(\"%s\")) is unknown", orig_p);
5567 ret = false;
5571 else
5572 gcc_unreachable ();
5575 return ret;
5578 /* Release allocated strings. */
5579 static void
5580 release_options_strings (char **option_strings)
5582 /* Free up memory allocated to hold the strings */
5583 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5584 free (option_strings[i]);
5587 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5589 tree
5590 ix86_valid_target_attribute_tree (tree args,
5591 struct gcc_options *opts,
5592 struct gcc_options *opts_set)
5594 const char *orig_arch_string = opts->x_ix86_arch_string;
5595 const char *orig_tune_string = opts->x_ix86_tune_string;
5596 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5597 int orig_tune_defaulted = ix86_tune_defaulted;
5598 int orig_arch_specified = ix86_arch_specified;
5599 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5600 tree t = NULL_TREE;
5601 struct cl_target_option *def
5602 = TREE_TARGET_OPTION (target_option_default_node);
5603 struct gcc_options enum_opts_set;
5605 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5607 /* Process each of the options on the chain. */
5608 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5609 opts_set, &enum_opts_set))
5610 return error_mark_node;
5612 /* If the changed options are different from the default, rerun
5613 ix86_option_override_internal, and then save the options away.
5614 The string options are attribute options, and will be undone
5615 when we copy the save structure. */
5616 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5617 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5618 || opts->x_target_flags != def->x_target_flags
5619 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5620 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5621 || enum_opts_set.x_ix86_fpmath)
5623 /* If we are using the default tune= or arch=, undo the string assigned,
5624 and use the default. */
5625 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5627 opts->x_ix86_arch_string
5628 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5630 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5631 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5632 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5633 | OPTION_MASK_ABI_64
5634 | OPTION_MASK_ABI_X32
5635 | OPTION_MASK_CODE16);
5636 opts->x_ix86_isa_flags2 = 0;
5638 else if (!orig_arch_specified)
5639 opts->x_ix86_arch_string = NULL;
5641 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5642 opts->x_ix86_tune_string
5643 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5644 else if (orig_tune_defaulted)
5645 opts->x_ix86_tune_string = NULL;
5647 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5648 if (enum_opts_set.x_ix86_fpmath)
5649 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5651 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5652 bool r = ix86_option_override_internal (false, opts, opts_set);
5653 if (!r)
5655 release_options_strings (option_strings);
5656 return error_mark_node;
5659 /* Add any builtin functions with the new isa if any. */
5660 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5662 /* Save the current options unless we are validating options for
5663 #pragma. */
5664 t = build_target_option_node (opts);
5666 opts->x_ix86_arch_string = orig_arch_string;
5667 opts->x_ix86_tune_string = orig_tune_string;
5668 opts_set->x_ix86_fpmath = orig_fpmath_set;
5670 release_options_strings (option_strings);
5673 return t;
5676 /* Hook to validate attribute((target("string"))). */
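/* For instance (an illustrative sketch):
       __attribute__((target("avx2")))    int foo (void);
       __attribute__((target("default"))) int foo (void);
   The "default" variant is accepted unchanged here; it only matters for
   function multi-versioning, as handled just below.  */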
5678 static bool
5679 ix86_valid_target_attribute_p (tree fndecl,
5680 tree ARG_UNUSED (name),
5681 tree args,
5682 int ARG_UNUSED (flags))
5684 struct gcc_options func_options;
5685 tree new_target, new_optimize;
5686 bool ret = true;
5688 /* attribute((target("default"))) does nothing, beyond
5689 affecting multi-versioning. */
5690 if (TREE_VALUE (args)
5691 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5692 && TREE_CHAIN (args) == NULL_TREE
5693 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5694 return true;
5696 tree old_optimize = build_optimization_node (&global_options);
5698 /* Get the optimization options of the current function. */
5699 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5701 if (!func_optimize)
5702 func_optimize = old_optimize;
5704 /* Init func_options. */
5705 memset (&func_options, 0, sizeof (func_options));
5706 init_options_struct (&func_options, NULL);
5707 lang_hooks.init_options_struct (&func_options);
5709 cl_optimization_restore (&func_options,
5710 TREE_OPTIMIZATION (func_optimize));
5712 /* Initialize func_options to the default before its target options can
5713 be set. */
5714 cl_target_option_restore (&func_options,
5715 TREE_TARGET_OPTION (target_option_default_node));
5717 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5718 &global_options_set);
5720 new_optimize = build_optimization_node (&func_options);
5722 if (new_target == error_mark_node)
5723 ret = false;
5725 else if (fndecl && new_target)
5727 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5729 if (old_optimize != new_optimize)
5730 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5733 finalize_options_struct (&func_options);
5735 return ret;
5739 /* Hook to determine if one function can safely inline another. */
5741 static bool
5742 ix86_can_inline_p (tree caller, tree callee)
5744 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5745 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5746 if (!callee_tree)
5747 callee_tree = target_option_default_node;
5748 if (!caller_tree)
5749 caller_tree = target_option_default_node;
5750 if (callee_tree == caller_tree)
5751 return true;
5753 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5754 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5755 bool ret = false;
5757 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
5758 function can inline an SSE2 function but an SSE2 function can't inline
5759 an SSE4 function. */
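/* A sketch of the rule, using hypothetical declarations:
     __attribute__((target("sse4.2"))) void caller (void);
     __attribute__((target("sse2")))   int  callee (void);
   caller may inline callee, but an SSE2-only caller must not inline an
   SSE4.2 callee.  */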
5760 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5761 != callee_opts->x_ix86_isa_flags)
5762 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5763 != callee_opts->x_ix86_isa_flags2))
5764 ret = false;
5766 /* See if we have the same non-isa options. */
5767 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5768 ret = false;
5770 /* See if arch, tune, etc. are the same. */
5771 else if (caller_opts->arch != callee_opts->arch)
5772 ret = false;
5774 else if (caller_opts->tune != callee_opts->tune)
5775 ret = false;
5777 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5778 /* If the callee doesn't use FP expressions, differences in
5779 ix86_fpmath can be ignored. We are called from FEs
5780 for multi-versioning call optimization, so beware of
5781 ipa_fn_summaries not available. */
5782 && (! ipa_fn_summaries
5783 || ipa_fn_summaries->get
5784 (cgraph_node::get (callee))->fp_expressions))
5785 ret = false;
5787 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5788 ret = false;
5790 else
5791 ret = true;
5793 return ret;
5797 /* Remember the last target of ix86_set_current_function. */
5798 static GTY(()) tree ix86_previous_fndecl;
5800 /* Set targets globals to the default (or current #pragma GCC target
5801 if active). Invalidate ix86_previous_fndecl cache. */
5803 void
5804 ix86_reset_previous_fndecl (void)
5806 tree new_tree = target_option_current_node;
5807 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5808 if (TREE_TARGET_GLOBALS (new_tree))
5809 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5810 else if (new_tree == target_option_default_node)
5811 restore_target_globals (&default_target_globals);
5812 else
5813 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5814 ix86_previous_fndecl = NULL_TREE;
5817 /* Set the func_type field from the function FNDECL. */
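/* The interrupt attribute recognized here is typically used as follows
   (an illustrative sketch; the frame and error-code types follow the
   documented x86 interrupt-handler interface):
       void __attribute__((interrupt)) isr (struct interrupt_frame *frame);
       void __attribute__((interrupt)) eh (struct interrupt_frame *frame,
                                           uword_t error_code);
   A two-argument handler is classified as TYPE_EXCEPTION below, a
   one-argument handler as TYPE_INTERRUPT.  */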
5819 static void
5820 ix86_set_func_type (tree fndecl)
5822 if (cfun->machine->func_type == TYPE_UNKNOWN)
5824 if (lookup_attribute ("interrupt",
5825 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5827 if (ix86_function_naked (fndecl))
5828 error_at (DECL_SOURCE_LOCATION (fndecl),
5829 "interrupt and naked attributes are not compatible");
5831 int nargs = 0;
5832 for (tree arg = DECL_ARGUMENTS (fndecl);
5833 arg;
5834 arg = TREE_CHAIN (arg))
5835 nargs++;
5836 cfun->machine->no_caller_saved_registers = true;
5837 cfun->machine->func_type
5838 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5840 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5842 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5843 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5844 sorry ("Only DWARF debug format is supported for interrupt "
5845 "service routine.");
5847 else
5849 cfun->machine->func_type = TYPE_NORMAL;
5850 if (lookup_attribute ("no_caller_saved_registers",
5851 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5852 cfun->machine->no_caller_saved_registers = true;
5857 /* Set the indirect_branch_type field from the function FNDECL. */
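/* The per-function attributes handled here look like, e.g. (illustrative
   declarations only):
       __attribute__((indirect_branch("thunk-extern"))) void f (void);
       __attribute__((function_return("keep"))) void g (void);
   The accepted arguments are "keep", "thunk", "thunk-inline" and
   "thunk-extern", mirroring the -mindirect-branch= and -mfunction-return=
   command-line options.  */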
5859 static void
5860 ix86_set_indirect_branch_type (tree fndecl)
5862 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5864 tree attr = lookup_attribute ("indirect_branch",
5865 DECL_ATTRIBUTES (fndecl));
5866 if (attr != NULL)
5868 tree args = TREE_VALUE (attr);
5869 if (args == NULL)
5870 gcc_unreachable ();
5871 tree cst = TREE_VALUE (args);
5872 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5873 cfun->machine->indirect_branch_type = indirect_branch_keep;
5874 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5875 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5876 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5877 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5878 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5879 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5880 else
5881 gcc_unreachable ();
5883 else
5884 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5886 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5887 nor -mindirect-branch=thunk-extern. */
5888 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5889 && ((cfun->machine->indirect_branch_type
5890 == indirect_branch_thunk_extern)
5891 || (cfun->machine->indirect_branch_type
5892 == indirect_branch_thunk)))
5893 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5894 "compatible",
5895 ((cfun->machine->indirect_branch_type
5896 == indirect_branch_thunk_extern)
5897 ? "thunk-extern" : "thunk"));
5900 if (cfun->machine->function_return_type == indirect_branch_unset)
5902 tree attr = lookup_attribute ("function_return",
5903 DECL_ATTRIBUTES (fndecl));
5904 if (attr != NULL)
5906 tree args = TREE_VALUE (attr);
5907 if (args == NULL)
5908 gcc_unreachable ();
5909 tree cst = TREE_VALUE (args);
5910 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5911 cfun->machine->function_return_type = indirect_branch_keep;
5912 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5913 cfun->machine->function_return_type = indirect_branch_thunk;
5914 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5915 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5916 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5917 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5918 else
5919 gcc_unreachable ();
5921 else
5922 cfun->machine->function_return_type = ix86_function_return;
5924 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5925 nor -mfunction-return=thunk-extern. */
5926 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5927 && ((cfun->machine->function_return_type
5928 == indirect_branch_thunk_extern)
5929 || (cfun->machine->function_return_type
5930 == indirect_branch_thunk)))
5931 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5932 "compatible",
5933 ((cfun->machine->function_return_type
5934 == indirect_branch_thunk_extern)
5935 ? "thunk-extern" : "thunk"));
5939 /* Establish appropriate back-end context for processing the function
5940 FNDECL. The argument might be NULL to indicate processing at top
5941 level, outside of any function scope. */
5942 static void
5943 ix86_set_current_function (tree fndecl)
5945 /* Only change the context if the function changes. This hook is called
5946 several times in the course of compiling a function, and we don't want to
5947 slow things down too much or call target_reinit when it isn't safe. */
5948 if (fndecl == ix86_previous_fndecl)
5950 /* There may be 2 function bodies for the same function FNDECL,
5951 one is extern inline and one isn't. Call ix86_set_func_type
5952 to set the func_type field. */
5953 if (fndecl != NULL_TREE)
5955 ix86_set_func_type (fndecl);
5956 ix86_set_indirect_branch_type (fndecl);
5958 return;
5961 tree old_tree;
5962 if (ix86_previous_fndecl == NULL_TREE)
5963 old_tree = target_option_current_node;
5964 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5965 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5966 else
5967 old_tree = target_option_default_node;
5969 if (fndecl == NULL_TREE)
5971 if (old_tree != target_option_current_node)
5972 ix86_reset_previous_fndecl ();
5973 return;
5976 ix86_set_func_type (fndecl);
5977 ix86_set_indirect_branch_type (fndecl);
5979 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5980 if (new_tree == NULL_TREE)
5981 new_tree = target_option_default_node;
5983 if (old_tree != new_tree)
5985 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5986 if (TREE_TARGET_GLOBALS (new_tree))
5987 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5988 else if (new_tree == target_option_default_node)
5989 restore_target_globals (&default_target_globals);
5990 else
5991 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5993 ix86_previous_fndecl = fndecl;
5995 static bool prev_no_caller_saved_registers;
5997 /* 64-bit MS and SYSV ABIs have different sets of call-used registers.
5998 Avoid expensive re-initialization of init_regs each time we switch
5999 function context. */
6000 if (TARGET_64BIT
6001 && (call_used_regs[SI_REG]
6002 == (cfun->machine->call_abi == MS_ABI)))
6003 reinit_regs ();
6004 /* Need to re-initialize init_regs if caller-saved registers are
6005 changed. */
6006 else if (prev_no_caller_saved_registers
6007 != cfun->machine->no_caller_saved_registers)
6008 reinit_regs ();
6010 if (cfun->machine->func_type != TYPE_NORMAL
6011 || cfun->machine->no_caller_saved_registers)
6013 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
6014 may change processor state. */
6015 const char *isa;
6016 if (TARGET_MPX)
6017 isa = "MPX";
6018 else if (TARGET_SSE)
6019 isa = "SSE";
6020 else if (TARGET_MMX)
6021 isa = "MMX/3Dnow";
6022 else if (TARGET_80387)
6023 isa = "80387";
6024 else
6025 isa = NULL;
6026 if (isa != NULL)
6028 if (cfun->machine->func_type != TYPE_NORMAL)
6029 sorry ("%s instructions aren't allowed in %s service routine",
6030 isa, (cfun->machine->func_type == TYPE_EXCEPTION
6031 ? "exception" : "interrupt"));
6032 else
6033 sorry ("%s instructions aren't allowed in function with "
6034 "no_caller_saved_registers attribute", isa);
6035 /* Don't issue the same error twice. */
6036 cfun->machine->func_type = TYPE_NORMAL;
6037 cfun->machine->no_caller_saved_registers = false;
6041 prev_no_caller_saved_registers
6042 = cfun->machine->no_caller_saved_registers;
6046 /* Return true if this goes in large data/bss. */
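/* For example (an illustrative sketch): with -mcmodel=medium, a global
   object larger than the -mlarge-data-threshold= value (ix86_section_threshold)
   is placed in the large .ldata/.lbss sections handled below, while smaller
   objects stay in the ordinary .data/.bss sections.  */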
6048 static bool
6049 ix86_in_large_data_p (tree exp)
6051 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6052 return false;
6054 if (exp == NULL_TREE)
6055 return false;
6057 /* Functions are never large data. */
6058 if (TREE_CODE (exp) == FUNCTION_DECL)
6059 return false;
6061 /* Automatic variables are never large data. */
6062 if (VAR_P (exp) && !is_global_var (exp))
6063 return false;
6065 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6067 const char *section = DECL_SECTION_NAME (exp);
6068 if (strcmp (section, ".ldata") == 0
6069 || strcmp (section, ".lbss") == 0)
6070 return true;
6071 return false;
6073 else
6075 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6077 /* If this is an incomplete type with size 0, then we can't put it
6078 in data because it might be too big when completed. Also,
6079 int_size_in_bytes returns -1 if size can vary or is larger than
6080 an integer, in which case it is also safer to assume that it goes in
6081 large data. */
6082 if (size <= 0 || size > ix86_section_threshold)
6083 return true;
6086 return false;
6089 /* i386-specific section flag to mark large sections. */
6090 #define SECTION_LARGE SECTION_MACH_DEP
6092 /* Switch to the appropriate section for output of DECL.
6093 DECL is either a `VAR_DECL' node or a constant of some sort.
6094 RELOC indicates whether forming the initial value of DECL requires
6095 link-time relocations. */
6097 ATTRIBUTE_UNUSED static section *
6098 x86_64_elf_select_section (tree decl, int reloc,
6099 unsigned HOST_WIDE_INT align)
6101 if (ix86_in_large_data_p (decl))
6103 const char *sname = NULL;
6104 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6105 switch (categorize_decl_for_section (decl, reloc))
6107 case SECCAT_DATA:
6108 sname = ".ldata";
6109 break;
6110 case SECCAT_DATA_REL:
6111 sname = ".ldata.rel";
6112 break;
6113 case SECCAT_DATA_REL_LOCAL:
6114 sname = ".ldata.rel.local";
6115 break;
6116 case SECCAT_DATA_REL_RO:
6117 sname = ".ldata.rel.ro";
6118 break;
6119 case SECCAT_DATA_REL_RO_LOCAL:
6120 sname = ".ldata.rel.ro.local";
6121 break;
6122 case SECCAT_BSS:
6123 sname = ".lbss";
6124 flags |= SECTION_BSS;
6125 break;
6126 case SECCAT_RODATA:
6127 case SECCAT_RODATA_MERGE_STR:
6128 case SECCAT_RODATA_MERGE_STR_INIT:
6129 case SECCAT_RODATA_MERGE_CONST:
6130 sname = ".lrodata";
6131 flags &= ~SECTION_WRITE;
6132 break;
6133 case SECCAT_SRODATA:
6134 case SECCAT_SDATA:
6135 case SECCAT_SBSS:
6136 gcc_unreachable ();
6137 case SECCAT_TEXT:
6138 case SECCAT_TDATA:
6139 case SECCAT_TBSS:
6140 /* We don't split these for the medium model. Place them into
6141 default sections and hope for the best. */
6142 break;
6144 if (sname)
6146 /* We might get called with string constants, but get_named_section
6147 doesn't like them as they are not DECLs. Also, we need to set
6148 flags in that case. */
6149 if (!DECL_P (decl))
6150 return get_section (sname, flags, NULL);
6151 return get_named_section (decl, sname, reloc);
6154 return default_elf_select_section (decl, reloc, align);
6157 /* Select a set of attributes for section NAME based on the properties
6158 of DECL and whether or not RELOC indicates that DECL's initializer
6159 might contain runtime relocations. */
6161 static unsigned int ATTRIBUTE_UNUSED
6162 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6164 unsigned int flags = default_section_type_flags (decl, name, reloc);
6166 if (ix86_in_large_data_p (decl))
6167 flags |= SECTION_LARGE;
6169 if (decl == NULL_TREE
6170 && (strcmp (name, ".ldata.rel.ro") == 0
6171 || strcmp (name, ".ldata.rel.ro.local") == 0))
6172 flags |= SECTION_RELRO;
6174 if (strcmp (name, ".lbss") == 0
6175 || strncmp (name, ".lbss.", 5) == 0
6176 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6177 flags |= SECTION_BSS;
6179 return flags;
6182 /* Build up a unique section name, expressed as a
6183 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6184 RELOC indicates whether the initial value of EXP requires
6185 link-time relocations. */
6187 static void ATTRIBUTE_UNUSED
6188 x86_64_elf_unique_section (tree decl, int reloc)
6190 if (ix86_in_large_data_p (decl))
6192 const char *prefix = NULL;
6193 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6194 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6196 switch (categorize_decl_for_section (decl, reloc))
6198 case SECCAT_DATA:
6199 case SECCAT_DATA_REL:
6200 case SECCAT_DATA_REL_LOCAL:
6201 case SECCAT_DATA_REL_RO:
6202 case SECCAT_DATA_REL_RO_LOCAL:
6203 prefix = one_only ? ".ld" : ".ldata";
6204 break;
6205 case SECCAT_BSS:
6206 prefix = one_only ? ".lb" : ".lbss";
6207 break;
6208 case SECCAT_RODATA:
6209 case SECCAT_RODATA_MERGE_STR:
6210 case SECCAT_RODATA_MERGE_STR_INIT:
6211 case SECCAT_RODATA_MERGE_CONST:
6212 prefix = one_only ? ".lr" : ".lrodata";
6213 break;
6214 case SECCAT_SRODATA:
6215 case SECCAT_SDATA:
6216 case SECCAT_SBSS:
6217 gcc_unreachable ();
6218 case SECCAT_TEXT:
6219 case SECCAT_TDATA:
6220 case SECCAT_TBSS:
6221 /* We don't split these for the medium model. Place them into
6222 default sections and hope for the best. */
6223 break;
6225 if (prefix)
6227 const char *name, *linkonce;
6228 char *string;
6230 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6231 name = targetm.strip_name_encoding (name);
6233 /* If we're using one_only, then there needs to be a .gnu.linkonce
6234 prefix to the section name. */
6235 linkonce = one_only ? ".gnu.linkonce" : "";
6237 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6239 set_decl_section_name (decl, string);
6240 return;
6243 default_unique_section (decl, reloc);
6246 #ifdef COMMON_ASM_OP
6248 #ifndef LARGECOMM_SECTION_ASM_OP
6249 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6250 #endif
6252 /* This says how to output assembler code to declare an
6253 uninitialized external linkage data object.
6255 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6256 directive for large objects. */
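/* For a large common object this emits, e.g. (illustrative output):
       .largecomm  buf,1048576,32
   instead of the usual ".comm" directive.  */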
6257 void
6258 x86_elf_aligned_decl_common (FILE *file, tree decl,
6259 const char *name, unsigned HOST_WIDE_INT size,
6260 int align)
6262 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6263 && size > (unsigned int)ix86_section_threshold)
6265 switch_to_section (get_named_section (decl, ".lbss", 0));
6266 fputs (LARGECOMM_SECTION_ASM_OP, file);
6268 else
6269 fputs (COMMON_ASM_OP, file);
6270 assemble_name (file, name);
6271 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6272 size, align / BITS_PER_UNIT);
6274 #endif
6276 /* Utility function for targets to use in implementing
6277 ASM_OUTPUT_ALIGNED_BSS. */
6279 void
6280 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6281 unsigned HOST_WIDE_INT size, int align)
6283 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6284 && size > (unsigned int)ix86_section_threshold)
6285 switch_to_section (get_named_section (decl, ".lbss", 0));
6286 else
6287 switch_to_section (bss_section);
6288 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6289 #ifdef ASM_DECLARE_OBJECT_NAME
6290 last_assemble_variable_decl = decl;
6291 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6292 #else
6293 /* Standard thing is just output label for the object. */
6294 ASM_OUTPUT_LABEL (file, name);
6295 #endif /* ASM_DECLARE_OBJECT_NAME */
6296 ASM_OUTPUT_SKIP (file, size ? size : 1);
6299 /* Decide whether we must probe the stack before any space allocation
6300 on this target. It's essentially TARGET_STACK_PROBE except when
6301 -fstack-check causes the stack to be already probed differently. */
6303 bool
6304 ix86_target_stack_probe (void)
6306 /* Do not probe the stack twice if static stack checking is enabled. */
6307 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6308 return false;
6310 return TARGET_STACK_PROBE;
6313 /* Decide whether we can make a sibling call to a function. DECL is the
6314 declaration of the function being targeted by the call and EXP is the
6315 CALL_EXPR representing the call. */
6317 static bool
6318 ix86_function_ok_for_sibcall (tree decl, tree exp)
6320 tree type, decl_or_type;
6321 rtx a, b;
6322 bool bind_global = decl && !targetm.binds_local_p (decl);
6324 if (ix86_function_naked (current_function_decl))
6325 return false;
6327 /* Sibling call isn't OK if there are no caller-saved registers
6328 since all registers must be preserved before return. */
6329 if (cfun->machine->no_caller_saved_registers)
6330 return false;
6332 /* If we are generating position-independent code, we cannot sibcall
6333 optimize direct calls to global functions, as the PLT requires
6334 %ebx be live. (Darwin does not have a PLT.) */
6335 if (!TARGET_MACHO
6336 && !TARGET_64BIT
6337 && flag_pic
6338 && flag_plt
6339 && bind_global)
6340 return false;
6342 /* If we need to align the outgoing stack, then sibcalling would
6343 unalign the stack, which may break the called function. */
6344 if (ix86_minimum_incoming_stack_boundary (true)
6345 < PREFERRED_STACK_BOUNDARY)
6346 return false;
6348 if (decl)
6350 decl_or_type = decl;
6351 type = TREE_TYPE (decl);
6353 else
6355 /* We're looking at the CALL_EXPR, we need the type of the function. */
6356 type = CALL_EXPR_FN (exp); /* pointer expression */
6357 type = TREE_TYPE (type); /* pointer type */
6358 type = TREE_TYPE (type); /* function type */
6359 decl_or_type = type;
6362 /* Check that the return value locations are the same. Like
6363 if we are returning floats on the 80387 register stack, we cannot
6364 make a sibcall from a function that doesn't return a float to a
6365 function that does or, conversely, from a function that does return
6366 a float to a function that doesn't; the necessary stack adjustment
6367 would not be executed. This is also the place we notice
6368 differences in the return value ABI. Note that it is ok for one
6369 of the functions to have void return type as long as the return
6370 value of the other is passed in a register. */
6371 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6372 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6373 cfun->decl, false);
6374 if (STACK_REG_P (a) || STACK_REG_P (b))
6376 if (!rtx_equal_p (a, b))
6377 return false;
6379 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6381 else if (!rtx_equal_p (a, b))
6382 return false;
6384 if (TARGET_64BIT)
6386 /* The SYSV ABI has more call-clobbered registers;
6387 disallow sibcalls from MS to SYSV. */
6388 if (cfun->machine->call_abi == MS_ABI
6389 && ix86_function_type_abi (type) == SYSV_ABI)
6390 return false;
6392 else
6394 /* If this call is indirect, we'll need to be able to use a
6395 call-clobbered register for the address of the target function.
6396 Make sure that all such registers are not used for passing
6397 parameters. Note that DLLIMPORT functions and calls to global
6398 functions via the GOT slot are indirect.
6399 if (!decl
6400 || (bind_global && flag_pic && !flag_plt)
6401 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6403 /* Check if regparm >= 3 since arg_reg_available is set to
6404 false if regparm == 0. If regparm is 1 or 2, there is
6405 always a call-clobbered register available.
6407 ??? The symbol indirect call doesn't need a call-clobbered
6408 register. But we don't know if this is a symbol indirect
6409 call or not here. */
6410 if (ix86_function_regparm (type, NULL) >= 3
6411 && !cfun->machine->arg_reg_available)
6412 return false;
6416 /* Otherwise okay. That also includes certain types of indirect calls. */
6417 return true;
6420 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6421 and "sseregparm" calling convention attributes;
6422 arguments as in struct attribute_spec.handler. */
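/* Typical uses of these attributes (illustrative declarations only):
       int __attribute__((fastcall))   f (int a, int b);      a, b in ECX/EDX
       int __attribute__((regparm(3))) g (int a, int b, int c);
       int __attribute__((stdcall))    h (int a);             callee pops args
   Incompatible combinations such as fastcall + regparm are rejected
   below.  */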
6424 static tree
6425 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6426 bool *no_add_attrs)
6428 if (TREE_CODE (*node) != FUNCTION_TYPE
6429 && TREE_CODE (*node) != METHOD_TYPE
6430 && TREE_CODE (*node) != FIELD_DECL
6431 && TREE_CODE (*node) != TYPE_DECL)
6433 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6434 name);
6435 *no_add_attrs = true;
6436 return NULL_TREE;
6439 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6440 if (is_attribute_p ("regparm", name))
6442 tree cst;
6444 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6446 error ("fastcall and regparm attributes are not compatible");
6449 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6451 error ("regparam and thiscall attributes are not compatible");
6454 cst = TREE_VALUE (args);
6455 if (TREE_CODE (cst) != INTEGER_CST)
6457 warning (OPT_Wattributes,
6458 "%qE attribute requires an integer constant argument",
6459 name);
6460 *no_add_attrs = true;
6462 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6464 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6465 name, REGPARM_MAX);
6466 *no_add_attrs = true;
6469 return NULL_TREE;
6472 if (TARGET_64BIT)
6474 /* Do not warn when emulating the MS ABI. */
6475 if ((TREE_CODE (*node) != FUNCTION_TYPE
6476 && TREE_CODE (*node) != METHOD_TYPE)
6477 || ix86_function_type_abi (*node) != MS_ABI)
6478 warning (OPT_Wattributes, "%qE attribute ignored",
6479 name);
6480 *no_add_attrs = true;
6481 return NULL_TREE;
6484 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6485 if (is_attribute_p ("fastcall", name))
6487 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6489 error ("fastcall and cdecl attributes are not compatible");
6491 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6493 error ("fastcall and stdcall attributes are not compatible");
6495 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6497 error ("fastcall and regparm attributes are not compatible");
6499 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6501 error ("fastcall and thiscall attributes are not compatible");
6505 /* Can combine stdcall with fastcall (redundant), regparm and
6506 sseregparm. */
6507 else if (is_attribute_p ("stdcall", name))
6509 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6511 error ("stdcall and cdecl attributes are not compatible");
6513 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6515 error ("stdcall and fastcall attributes are not compatible");
6517 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6519 error ("stdcall and thiscall attributes are not compatible");
6523 /* Can combine cdecl with regparm and sseregparm. */
6524 else if (is_attribute_p ("cdecl", name))
6526 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6528 error ("stdcall and cdecl attributes are not compatible");
6530 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6532 error ("fastcall and cdecl attributes are not compatible");
6534 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6536 error ("cdecl and thiscall attributes are not compatible");
6539 else if (is_attribute_p ("thiscall", name))
6541 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6542 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6543 name);
6544 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6546 error ("stdcall and thiscall attributes are not compatible");
6548 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6550 error ("fastcall and thiscall attributes are not compatible");
6552 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6554 error ("cdecl and thiscall attributes are not compatible");
6558 /* Can combine sseregparm with all attributes. */
6560 return NULL_TREE;
6563 /* The transactional memory builtins are implicitly regparm or fastcall
6564 depending on the ABI. Override the generic do-nothing attribute that
6565 these builtins were declared with, and replace it with one of the two
6566 attributes that we expect elsewhere. */
6568 static tree
6569 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6570 int flags, bool *no_add_attrs)
6572 tree alt;
6574 /* In no case do we want to add the placeholder attribute. */
6575 *no_add_attrs = true;
6577 /* The 64-bit ABI is unchanged for transactional memory. */
6578 if (TARGET_64BIT)
6579 return NULL_TREE;
6581 /* ??? Is there a better way to validate 32-bit windows? We have
6582 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6583 if (CHECK_STACK_LIMIT > 0)
6584 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6585 else
6587 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6588 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6590 decl_attributes (node, alt, flags);
6592 return NULL_TREE;
6595 /* This function determines from TYPE the calling-convention. */
6597 unsigned int
6598 ix86_get_callcvt (const_tree type)
6600 unsigned int ret = 0;
6601 bool is_stdarg;
6602 tree attrs;
6604 if (TARGET_64BIT)
6605 return IX86_CALLCVT_CDECL;
6607 attrs = TYPE_ATTRIBUTES (type);
6608 if (attrs != NULL_TREE)
6610 if (lookup_attribute ("cdecl", attrs))
6611 ret |= IX86_CALLCVT_CDECL;
6612 else if (lookup_attribute ("stdcall", attrs))
6613 ret |= IX86_CALLCVT_STDCALL;
6614 else if (lookup_attribute ("fastcall", attrs))
6615 ret |= IX86_CALLCVT_FASTCALL;
6616 else if (lookup_attribute ("thiscall", attrs))
6617 ret |= IX86_CALLCVT_THISCALL;
6619 /* Regparm isn't allowed for thiscall and fastcall. */
6620 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6622 if (lookup_attribute ("regparm", attrs))
6623 ret |= IX86_CALLCVT_REGPARM;
6624 if (lookup_attribute ("sseregparm", attrs))
6625 ret |= IX86_CALLCVT_SSEREGPARM;
6628 if (IX86_BASE_CALLCVT(ret) != 0)
6629 return ret;
6632 is_stdarg = stdarg_p (type);
6633 if (TARGET_RTD && !is_stdarg)
6634 return IX86_CALLCVT_STDCALL | ret;
6636 if (ret != 0
6637 || is_stdarg
6638 || TREE_CODE (type) != METHOD_TYPE
6639 || ix86_function_type_abi (type) != MS_ABI)
6640 return IX86_CALLCVT_CDECL | ret;
6642 return IX86_CALLCVT_THISCALL;
6645 /* Return 0 if the attributes for two types are incompatible, 1 if they
6646 are compatible, and 2 if they are nearly compatible (which causes a
6647 warning to be generated). */
6649 static int
6650 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6652 unsigned int ccvt1, ccvt2;
6654 if (TREE_CODE (type1) != FUNCTION_TYPE
6655 && TREE_CODE (type1) != METHOD_TYPE)
6656 return 1;
6658 ccvt1 = ix86_get_callcvt (type1);
6659 ccvt2 = ix86_get_callcvt (type2);
6660 if (ccvt1 != ccvt2)
6661 return 0;
6662 if (ix86_function_regparm (type1, NULL)
6663 != ix86_function_regparm (type2, NULL))
6664 return 0;
6666 return 1;
6669 /* Return the regparm value for a function with the indicated TYPE and DECL.
6670 DECL may be NULL when calling function indirectly
6671 or considering a libcall. */
6673 static int
6674 ix86_function_regparm (const_tree type, const_tree decl)
6676 tree attr;
6677 int regparm;
6678 unsigned int ccvt;
6680 if (TARGET_64BIT)
6681 return (ix86_function_type_abi (type) == SYSV_ABI
6682 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6683 ccvt = ix86_get_callcvt (type);
6684 regparm = ix86_regparm;
6686 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6688 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6689 if (attr)
6691 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6692 return regparm;
6695 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6696 return 2;
6697 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6698 return 1;
6700 /* Use register calling convention for local functions when possible. */
6701 if (decl
6702 && TREE_CODE (decl) == FUNCTION_DECL)
6704 cgraph_node *target = cgraph_node::get (decl);
6705 if (target)
6706 target = target->function_symbol ();
6708 /* Caller and callee must agree on the calling convention, so
6709 checking just `optimize' here would mean that with
6710 __attribute__((optimize (...))) the caller could use the regparm
6711 convention and the callee not, or vice versa. Instead look at
6712 whether the callee is optimized or not. */
6713 if (target && opt_for_fn (target->decl, optimize)
6714 && !(profile_flag && !flag_fentry))
6716 cgraph_local_info *i = &target->local;
6717 if (i && i->local && i->can_change_signature)
6719 int local_regparm, globals = 0, regno;
6721 /* Make sure no regparm register is taken by a
6722 fixed register variable. */
6723 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6724 local_regparm++)
6725 if (fixed_regs[local_regparm])
6726 break;
6728 /* We don't want to use regparm(3) for nested functions as
6729 these use a static chain pointer in the third argument. */
6730 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6731 local_regparm = 2;
6733 /* Save a register for the split stack. */
6734 if (flag_split_stack)
6736 if (local_regparm == 3)
6737 local_regparm = 2;
6738 else if (local_regparm == 2
6739 && DECL_STATIC_CHAIN (target->decl))
6740 local_regparm = 1;
6743 /* Each fixed register usage increases register pressure,
6744 so fewer registers should be used for argument passing.
6745 This functionality can be overridden by an explicit
6746 regparm value. */
6747 for (regno = AX_REG; regno <= DI_REG; regno++)
6748 if (fixed_regs[regno])
6749 globals++;
6751 local_regparm
6752 = globals < local_regparm ? local_regparm - globals : 0;
6754 if (local_regparm > regparm)
6755 regparm = local_regparm;
6760 return regparm;
6763 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6764 DFmode (2) arguments in SSE registers for a function with the
6765 indicated TYPE and DECL. DECL may be NULL when calling function
6766 indirectly or considering a libcall. Return -1 if any FP parameter
6767 should be rejected with an error. This is used in situations where we imply
6768 the SSE calling convention but the function is called from another function with
6769 SSE disabled. Otherwise return 0. */
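/* Typical use (an illustrative sketch):
       float __attribute__((sseregparm)) fdot (float x, double y);
   passes X and Y in SSE registers instead of on the stack, provided
   SSE/SSE2 are enabled; without SSE such a call is rejected below.  */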
6771 static int
6772 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6774 gcc_assert (!TARGET_64BIT);
6776 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6777 by the sseregparm attribute. */
6778 if (TARGET_SSEREGPARM
6779 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6781 if (!TARGET_SSE)
6783 if (warn)
6785 if (decl)
6786 error ("calling %qD with attribute sseregparm without "
6787 "SSE/SSE2 enabled", decl);
6788 else
6789 error ("calling %qT with attribute sseregparm without "
6790 "SSE/SSE2 enabled", type);
6792 return 0;
6795 return 2;
6798 if (!decl)
6799 return 0;
6801 cgraph_node *target = cgraph_node::get (decl);
6802 if (target)
6803 target = target->function_symbol ();
6805 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6806 (and DFmode for SSE2) arguments in SSE registers. */
6807 if (target
6808 /* TARGET_SSE_MATH */
6809 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6810 && opt_for_fn (target->decl, optimize)
6811 && !(profile_flag && !flag_fentry))
6813 cgraph_local_info *i = &target->local;
6814 if (i && i->local && i->can_change_signature)
6816 /* Refuse to produce wrong code when local function with SSE enabled
6817 is called from SSE disabled function.
6818 FIXME: We need a way to detect these cases cross-ltrans partition
6819 and avoid using SSE calling conventions on local functions called
6820 from function with SSE disabled. For now at least delay the
6821 warning until we know we are going to produce wrong code.
6822 See PR66047 */
6823 if (!TARGET_SSE && warn)
6824 return -1;
6825 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6826 ->x_ix86_isa_flags) ? 2 : 1;
6830 return 0;
6833 /* Return true if EAX is live at the start of the function. Used by
6834 ix86_expand_prologue to determine if we need special help before
6835 calling allocate_stack_worker. */
6837 static bool
6838 ix86_eax_live_at_start_p (void)
6840 /* Cheat. Don't bother working forward from ix86_function_regparm
6841 to the function type to whether an actual argument is located in
6842 eax. Instead just look at cfg info, which is still close enough
6843 to correct at this point. This gives false positives for broken
6844 functions that might use uninitialized data that happens to be
6845 allocated in eax, but who cares? */
6846 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6849 static bool
6850 ix86_keep_aggregate_return_pointer (tree fntype)
6852 tree attr;
6854 if (!TARGET_64BIT)
6856 attr = lookup_attribute ("callee_pop_aggregate_return",
6857 TYPE_ATTRIBUTES (fntype));
6858 if (attr)
6859 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6861 /* For 32-bit MS-ABI the default is to keep aggregate
6862 return pointer. */
6863 if (ix86_function_type_abi (fntype) == MS_ABI)
6864 return true;
6866 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6869 /* Value is the number of bytes of arguments automatically
6870 popped when returning from a subroutine call.
6871 FUNDECL is the declaration node of the function (as a tree),
6872 FUNTYPE is the data type of the function (as a tree),
6873 or for a library call it is an identifier node for the subroutine name.
6874 SIZE is the number of bytes of arguments passed on the stack.
6876 On the 80386, the RTD insn may be used to pop them if the number
6877 of args is fixed, but if the number is variable then the caller
6878 must pop them all. RTD can't be used for library calls now
6879 because the library is compiled with the Unix compiler.
6880 Use of RTD is a selectable option, since it is incompatible with
6881 standard Unix calling sequences. If the option is not selected,
6882 the caller must always pop the args.
6884 The attribute stdcall is equivalent to RTD on a per module basis. */
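/* For example (illustrative): with
       int __attribute__((stdcall)) f (int a, int b);
   the callee pops its 8 bytes of stack arguments, typically via "ret $8",
   so this hook returns SIZE (8) rather than 0.  */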
6886 static poly_int64
6887 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6889 unsigned int ccvt;
6891 /* None of the 64-bit ABIs pop arguments. */
6892 if (TARGET_64BIT)
6893 return 0;
6895 ccvt = ix86_get_callcvt (funtype);
6897 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6898 | IX86_CALLCVT_THISCALL)) != 0
6899 && ! stdarg_p (funtype))
6900 return size;
6902 /* Lose any fake structure return argument if it is passed on the stack. */
6903 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6904 && !ix86_keep_aggregate_return_pointer (funtype))
6906 int nregs = ix86_function_regparm (funtype, fundecl);
6907 if (nregs == 0)
6908 return GET_MODE_SIZE (Pmode);
6911 return 0;
6914 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6916 static bool
6917 ix86_legitimate_combined_insn (rtx_insn *insn)
6919 int i;
6921 /* Check operand constraints in case hard registers were propagated
6922 into insn pattern. This check prevents combine pass from
6923 generating insn patterns with invalid hard register operands.
6924 These invalid insns can eventually confuse reload to error out
6925 with a spill failure. See also PRs 46829 and 46843. */
6927 gcc_assert (INSN_CODE (insn) >= 0);
6929 extract_insn (insn);
6930 preprocess_constraints (insn);
6932 int n_operands = recog_data.n_operands;
6933 int n_alternatives = recog_data.n_alternatives;
6934 for (i = 0; i < n_operands; i++)
6936 rtx op = recog_data.operand[i];
6937 machine_mode mode = GET_MODE (op);
6938 const operand_alternative *op_alt;
6939 int offset = 0;
6940 bool win;
6941 int j;
6943 /* A unary operator may be accepted by the predicate, but it
6944 is irrelevant for matching constraints. */
6945 if (UNARY_P (op))
6946 op = XEXP (op, 0);
6948 if (SUBREG_P (op))
6950 if (REG_P (SUBREG_REG (op))
6951 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6952 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6953 GET_MODE (SUBREG_REG (op)),
6954 SUBREG_BYTE (op),
6955 GET_MODE (op));
6956 op = SUBREG_REG (op);
6959 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6960 continue;
6962 op_alt = recog_op_alt;
6964 /* Operand has no constraints, anything is OK. */
6965 win = !n_alternatives;
6967 alternative_mask preferred = get_preferred_alternatives (insn);
6968 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6970 if (!TEST_BIT (preferred, j))
6971 continue;
6972 if (op_alt[i].anything_ok
6973 || (op_alt[i].matches != -1
6974 && operands_match_p
6975 (recog_data.operand[i],
6976 recog_data.operand[op_alt[i].matches]))
6977 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6979 win = true;
6980 break;
6984 if (!win)
6985 return false;
6988 return true;
6991 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
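/* AddressSanitizer maps an address to its shadow byte roughly as
       shadow = (addr >> 3) + ix86_asan_shadow_offset ()
   (a sketch of the usual mapping), so e.g. plain x86-64 Linux uses the
   0x7fff8000 offset returned below.  */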
6993 static unsigned HOST_WIDE_INT
6994 ix86_asan_shadow_offset (void)
6996 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6997 : HOST_WIDE_INT_C (0x7fff8000))
6998 : (HOST_WIDE_INT_1 << 29);
7001 /* Argument support functions. */
7003 /* Return true when register may be used to pass function parameters. */
7004 bool
7005 ix86_function_arg_regno_p (int regno)
7007 int i;
7008 enum calling_abi call_abi;
7009 const int *parm_regs;
7011 if (TARGET_MPX && BND_REGNO_P (regno))
7012 return true;
7014 if (!TARGET_64BIT)
7016 if (TARGET_MACHO)
7017 return (regno < REGPARM_MAX
7018 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
7019 else
7020 return (regno < REGPARM_MAX
7021 || (TARGET_MMX && MMX_REGNO_P (regno)
7022 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
7023 || (TARGET_SSE && SSE_REGNO_P (regno)
7024 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
7027 if (TARGET_SSE && SSE_REGNO_P (regno)
7028 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
7029 return true;
7031 /* TODO: The function should depend on current function ABI but
7032 builtins.c would need updating then. Therefore we use the
7033 default ABI. */
7034 call_abi = ix86_cfun_abi ();
7036 /* RAX is used as hidden argument to va_arg functions. */
7037 if (call_abi == SYSV_ABI && regno == AX_REG)
7038 return true;
7040 if (call_abi == MS_ABI)
7041 parm_regs = x86_64_ms_abi_int_parameter_registers;
7042 else
7043 parm_regs = x86_64_int_parameter_registers;
7045 for (i = 0; i < (call_abi == MS_ABI
7046 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7047 if (regno == parm_regs[i])
7048 return true;
7049 return false;
7052 /* Return true if we do not know how to pass TYPE solely in registers. */
7054 static bool
7055 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7057 if (must_pass_in_stack_var_size_or_pad (mode, type))
7058 return true;
7060 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7061 The layout_type routine is crafty and tries to trick us into passing
7062 currently unsupported vector types on the stack by using TImode. */
7063 return (!TARGET_64BIT && mode == TImode
7064 && type && TREE_CODE (type) != VECTOR_TYPE);
7067 /* Return the size, in bytes, of the area reserved for arguments passed
7068 in registers for the function represented by FNDECL, depending on the
7069 ABI used. */
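/* For instance, a 64-bit __attribute__((ms_abi)) function gets the 32-byte
   register parameter ("shadow") area required by the Microsoft x64 calling
   convention, while the SYSV ABI reserves none.  */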
7070 int
7071 ix86_reg_parm_stack_space (const_tree fndecl)
7073 enum calling_abi call_abi = SYSV_ABI;
7074 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7075 call_abi = ix86_function_abi (fndecl);
7076 else
7077 call_abi = ix86_function_type_abi (fndecl);
7078 if (TARGET_64BIT && call_abi == MS_ABI)
7079 return 32;
7080 return 0;
7083 /* We add this as a workaround in order to use libc_has_function
7084 hook in i386.md. */
7085 bool
7086 ix86_libc_has_function (enum function_class fn_class)
7088 return targetm.libc_has_function (fn_class);
7091 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
7092 specifying the call ABI used. */
7093 enum calling_abi
7094 ix86_function_type_abi (const_tree fntype)
7096 enum calling_abi abi = ix86_abi;
7098 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7099 return abi;
7101 if (abi == SYSV_ABI
7102 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7104 static int warned;
7105 if (TARGET_X32 && !warned)
7107 error ("X32 does not support ms_abi attribute");
7108 warned = 1;
7111 abi = MS_ABI;
7113 else if (abi == MS_ABI
7114 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7115 abi = SYSV_ABI;
7117 return abi;
7120 static enum calling_abi
7121 ix86_function_abi (const_tree fndecl)
7123 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7126 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
7127 call ABI used. */
7128 enum calling_abi
7129 ix86_cfun_abi (void)
7131 return cfun ? cfun->machine->call_abi : ix86_abi;
7134 static bool
7135 ix86_function_ms_hook_prologue (const_tree fn)
7137 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7139 if (decl_function_context (fn) != NULL_TREE)
7140 error_at (DECL_SOURCE_LOCATION (fn),
7141 "ms_hook_prologue is not compatible with nested function");
7142 else
7143 return true;
7145 return false;
7148 static bool
7149 ix86_function_naked (const_tree fn)
7151 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7152 return true;
7154 return false;
7157 /* Write the extra assembler code needed to declare a function properly. */
7159 void
7160 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7161 tree decl)
7163 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7165 if (is_ms_hook)
7167 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7168 unsigned int filler_cc = 0xcccccccc;
7170 for (i = 0; i < filler_count; i += 4)
7171 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7174 #ifdef SUBTARGET_ASM_UNWIND_INIT
7175 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7176 #endif
7178 ASM_OUTPUT_LABEL (asm_out_file, fname);
7180 /* Output magic byte marker, if hot-patch attribute is set. */
7181 if (is_ms_hook)
7183 if (TARGET_64BIT)
7185 /* leaq [%rsp + 0], %rsp */
7186 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7187 asm_out_file);
7189 else
7191 /* movl.s %edi, %edi
7192 push %ebp
7193 movl.s %esp, %ebp */
7194 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
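/* As an illustration (the function name is made up), a declaration such as

     void hook_target (void) __attribute__ ((ms_hook_prologue));

   gets the 0xCC filler bytes emitted before its label and the hot-patch
   byte sequence above emitted right after it, so that a runtime patcher
   can later overwrite the entry with a short jump into the padding.  */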
7199 /* Implementation of the call ABI switching target hook. The call
7200 register sets specific to FNDECL are set up. See also
7201 ix86_conditional_register_usage for more details. */
7202 void
7203 ix86_call_abi_override (const_tree fndecl)
7205 cfun->machine->call_abi = ix86_function_abi (fndecl);
7208 /* Return true if a pseudo register should be created and used to hold
7209 the GOT address for PIC code. */
7210 bool
7211 ix86_use_pseudo_pic_reg (void)
7213 if ((TARGET_64BIT
7214 && (ix86_cmodel == CM_SMALL_PIC
7215 || TARGET_PECOFF))
7216 || !flag_pic)
7217 return false;
7218 return true;
7221 /* Initialize large model PIC register. */
7223 static void
7224 ix86_init_large_pic_reg (unsigned int tmp_regno)
7226 rtx_code_label *label;
7227 rtx tmp_reg;
7229 gcc_assert (Pmode == DImode);
7230 label = gen_label_rtx ();
7231 emit_label (label);
7232 LABEL_PRESERVE_P (label) = 1;
7233 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7234 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7235 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7236 label));
7237 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7238 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7239 pic_offset_table_rtx, tmp_reg));
7240 const char *name = LABEL_NAME (label);
7241 PUT_CODE (label, NOTE);
7242 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7243 NOTE_DELETED_LABEL_NAME (label) = name;
7246 /* Create and initialize PIC register if required. */
7247 static void
7248 ix86_init_pic_reg (void)
7250 edge entry_edge;
7251 rtx_insn *seq;
7253 if (!ix86_use_pseudo_pic_reg ())
7254 return;
7256 start_sequence ();
7258 if (TARGET_64BIT)
7260 if (ix86_cmodel == CM_LARGE_PIC)
7261 ix86_init_large_pic_reg (R11_REG);
7262 else
7263 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7265 else
7267 /* If there is a future mcount call in the function, it is more profitable
7268 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7269 rtx reg = crtl->profile
7270 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7271 : pic_offset_table_rtx;
7272 rtx_insn *insn = emit_insn (gen_set_got (reg));
7273 RTX_FRAME_RELATED_P (insn) = 1;
7274 if (crtl->profile)
7275 emit_move_insn (pic_offset_table_rtx, reg);
7276 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7279 seq = get_insns ();
7280 end_sequence ();
7282 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7283 insert_insn_on_edge (seq, entry_edge);
7284 commit_one_edge_insertion (entry_edge);
7287 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7288 for a call to a function whose data type is FNTYPE.
7289 For a library call, FNTYPE is 0. */
7291 void
7292 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7293 tree fntype, /* tree ptr for function decl */
7294 rtx libname, /* SYMBOL_REF of library name or 0 */
7295 tree fndecl,
7296 int caller)
7298 struct cgraph_local_info *i = NULL;
7299 struct cgraph_node *target = NULL;
7301 memset (cum, 0, sizeof (*cum));
7303 if (fndecl)
7305 target = cgraph_node::get (fndecl);
7306 if (target)
7308 target = target->function_symbol ();
7309 i = cgraph_node::local_info (target->decl);
7310 cum->call_abi = ix86_function_abi (target->decl);
7312 else
7313 cum->call_abi = ix86_function_abi (fndecl);
7315 else
7316 cum->call_abi = ix86_function_type_abi (fntype);
7318 cum->caller = caller;
7320 /* Set up the number of registers to use for passing arguments. */
7321 cum->nregs = ix86_regparm;
7322 if (TARGET_64BIT)
7324 cum->nregs = (cum->call_abi == SYSV_ABI
7325 ? X86_64_REGPARM_MAX
7326 : X86_64_MS_REGPARM_MAX);
7328 if (TARGET_SSE)
7330 cum->sse_nregs = SSE_REGPARM_MAX;
7331 if (TARGET_64BIT)
7333 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7334 ? X86_64_SSE_REGPARM_MAX
7335 : X86_64_MS_SSE_REGPARM_MAX);
7338 if (TARGET_MMX)
7339 cum->mmx_nregs = MMX_REGPARM_MAX;
7340 cum->warn_avx512f = true;
7341 cum->warn_avx = true;
7342 cum->warn_sse = true;
7343 cum->warn_mmx = true;
7345 /* Because the type might mismatch between caller and callee, we need to
7346 use the actual type of the function for local calls.
7347 FIXME: cgraph_analyze can be told to actually record whether a function
7348 uses va_start, so for local functions maybe_vaarg can be made more
7349 aggressive, helping K&R code.
7350 FIXME: once the type system is fixed, we won't need this code anymore. */
7351 if (i && i->local && i->can_change_signature)
7352 fntype = TREE_TYPE (target->decl);
7353 cum->stdarg = stdarg_p (fntype);
7354 cum->maybe_vaarg = (fntype
7355 ? (!prototype_p (fntype) || stdarg_p (fntype))
7356 : !libname);
7358 cum->bnd_regno = FIRST_BND_REG;
7359 cum->bnds_in_bt = 0;
7360 cum->force_bnd_pass = 0;
7361 cum->decl = fndecl;
7363 cum->warn_empty = !warn_abi || cum->stdarg;
7364 if (!cum->warn_empty && fntype)
7366 function_args_iterator iter;
7367 tree argtype;
7368 bool seen_empty_type = false;
7369 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7371 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7372 break;
7373 if (TYPE_EMPTY_P (argtype))
7374 seen_empty_type = true;
7375 else if (seen_empty_type)
7377 cum->warn_empty = true;
7378 break;
7383 if (!TARGET_64BIT)
7385 /* If there are variable arguments, then we won't pass anything
7386 in registers in 32-bit mode. */
7387 if (stdarg_p (fntype))
7389 cum->nregs = 0;
7390 /* Since in 32-bit mode variable arguments are always passed on the
7391 stack, there is a scratch register available for an indirect
7392 sibcall. */
7393 cfun->machine->arg_reg_available = true;
7394 cum->sse_nregs = 0;
7395 cum->mmx_nregs = 0;
7396 cum->warn_avx512f = false;
7397 cum->warn_avx = false;
7398 cum->warn_sse = false;
7399 cum->warn_mmx = false;
7400 return;
7403 /* Use ecx and edx registers if function has fastcall attribute,
7404 else look for regparm information. */
7405 if (fntype)
7407 unsigned int ccvt = ix86_get_callcvt (fntype);
7408 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7410 cum->nregs = 1;
7411 cum->fastcall = 1; /* Same first register as in fastcall. */
7413 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7415 cum->nregs = 2;
7416 cum->fastcall = 1;
7418 else
7419 cum->nregs = ix86_function_regparm (fntype, fndecl);
7422 /* Set up the number of SSE registers used for passing SFmode
7423 and DFmode arguments. Warn for mismatching ABI. */
7424 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7427 cfun->machine->arg_reg_available = (cum->nregs > 0);
7430 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7431 But in the case of vector types, it is some vector mode.
7433 When we have only some of our vector ISA extensions enabled, then there
7434 are some modes for which vector_mode_supported_p is false. For these
7435 modes, the generic vector support in gcc will choose some non-vector mode
7436 in order to implement the type. By computing the natural mode, we'll
7437 select the proper ABI location for the operand and not depend on whatever
7438 the middle-end decides to do with these vector types.
7440 The middle-end can't deal with vector types > 16 bytes. In this
7441 case, we return the original mode and warn about the ABI change if CUM
7442 isn't NULL.
7444 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7445 available for the function return value. */
7447 static machine_mode
7448 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7449 bool in_return)
7451 machine_mode mode = TYPE_MODE (type);
7453 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7455 HOST_WIDE_INT size = int_size_in_bytes (type);
7456 if ((size == 8 || size == 16 || size == 32 || size == 64)
7457 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7458 && TYPE_VECTOR_SUBPARTS (type) > 1)
7460 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7462 /* There are no XFmode vector modes. */
7463 if (innermode == XFmode)
7464 return mode;
7466 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7467 mode = MIN_MODE_VECTOR_FLOAT;
7468 else
7469 mode = MIN_MODE_VECTOR_INT;
7471 /* Get the mode which has this inner mode and number of units. */
7472 FOR_EACH_MODE_FROM (mode, mode)
7473 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7474 && GET_MODE_INNER (mode) == innermode)
7476 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7478 static bool warnedavx512f;
7479 static bool warnedavx512f_ret;
7481 if (cum && cum->warn_avx512f && !warnedavx512f)
7483 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7484 "without AVX512F enabled changes the ABI"))
7485 warnedavx512f = true;
7487 else if (in_return && !warnedavx512f_ret)
7489 if (warning (OPT_Wpsabi, "AVX512F vector return "
7490 "without AVX512F enabled changes the ABI"))
7491 warnedavx512f_ret = true;
7494 return TYPE_MODE (type);
7496 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7498 static bool warnedavx;
7499 static bool warnedavx_ret;
7501 if (cum && cum->warn_avx && !warnedavx)
7503 if (warning (OPT_Wpsabi, "AVX vector argument "
7504 "without AVX enabled changes the ABI"))
7505 warnedavx = true;
7507 else if (in_return && !warnedavx_ret)
7509 if (warning (OPT_Wpsabi, "AVX vector return "
7510 "without AVX enabled changes the ABI"))
7511 warnedavx_ret = true;
7514 return TYPE_MODE (type);
7516 else if (((size == 8 && TARGET_64BIT) || size == 16)
7517 && !TARGET_SSE
7518 && !TARGET_IAMCU)
7520 static bool warnedsse;
7521 static bool warnedsse_ret;
7523 if (cum && cum->warn_sse && !warnedsse)
7525 if (warning (OPT_Wpsabi, "SSE vector argument "
7526 "without SSE enabled changes the ABI"))
7527 warnedsse = true;
7529 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7531 if (warning (OPT_Wpsabi, "SSE vector return "
7532 "without SSE enabled changes the ABI"))
7533 warnedsse_ret = true;
7536 else if ((size == 8 && !TARGET_64BIT)
7537 && (!cfun
7538 || cfun->machine->func_type == TYPE_NORMAL)
7539 && !TARGET_MMX
7540 && !TARGET_IAMCU)
7542 static bool warnedmmx;
7543 static bool warnedmmx_ret;
7545 if (cum && cum->warn_mmx && !warnedmmx)
7547 if (warning (OPT_Wpsabi, "MMX vector argument "
7548 "without MMX enabled changes the ABI"))
7549 warnedmmx = true;
7551 else if (in_return && !warnedmmx_ret)
7553 if (warning (OPT_Wpsabi, "MMX vector return "
7554 "without MMX enabled changes the ABI"))
7555 warnedmmx_ret = true;
7558 return mode;
7561 gcc_unreachable ();
7565 return mode;
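/* As an illustration of the warning paths above (the typedef name is only
   an example): with just -msse2 in effect, passing an argument of a
   32-byte vector type such as

     typedef int v8si __attribute__ ((vector_size (32)));

   takes the !TARGET_AVX branch, emits the -Wpsabi note "AVX vector
   argument without AVX enabled changes the ABI" at most once, and falls
   back to TYPE_MODE (type) instead of V8SImode.  */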
7568 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7569 this may not agree with the mode that the type system has chosen for the
7570 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7571 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7573 static rtx
7574 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7575 unsigned int regno)
7577 rtx tmp;
7579 if (orig_mode != BLKmode)
7580 tmp = gen_rtx_REG (orig_mode, regno);
7581 else
7583 tmp = gen_rtx_REG (mode, regno);
7584 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7585 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7588 return tmp;
7591 /* x86-64 register passing implementation. See the x86-64 ABI for details.
7592 The goal of this code is to classify each 8 bytes of an incoming argument
7593 by register class and assign registers accordingly. */
7595 /* Return the union class of CLASS1 and CLASS2.
7596 See the x86-64 PS ABI for details. */
7598 static enum x86_64_reg_class
7599 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7601 /* Rule #1: If both classes are equal, this is the resulting class. */
7602 if (class1 == class2)
7603 return class1;
7605 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7606 the other class. */
7607 if (class1 == X86_64_NO_CLASS)
7608 return class2;
7609 if (class2 == X86_64_NO_CLASS)
7610 return class1;
7612 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7613 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7614 return X86_64_MEMORY_CLASS;
7616 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7617 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7618 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7619 return X86_64_INTEGERSI_CLASS;
7620 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7621 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7622 return X86_64_INTEGER_CLASS;
7624 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7625 MEMORY is used. */
7626 if (class1 == X86_64_X87_CLASS
7627 || class1 == X86_64_X87UP_CLASS
7628 || class1 == X86_64_COMPLEX_X87_CLASS
7629 || class2 == X86_64_X87_CLASS
7630 || class2 == X86_64_X87UP_CLASS
7631 || class2 == X86_64_COMPLEX_X87_CLASS)
7632 return X86_64_MEMORY_CLASS;
7634 /* Rule #6: Otherwise class SSE is used. */
7635 return X86_64_SSE_CLASS;
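/* A worked example of these rules: for "union { double d; int i; }" the
   double field classifies its eightbyte as X86_64_SSEDF_CLASS and the int
   field as X86_64_INTEGERSI_CLASS; merging the two hits Rule #4, so the
   whole union ends up in X86_64_INTEGER_CLASS and is passed in a
   general-purpose register.  */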
7638 /* Classify the argument of type TYPE and mode MODE.
7639 CLASSES will be filled by the register class used to pass each word
7640 of the operand. The number of words is returned. In case the parameter
7641 should be passed in memory, 0 is returned. As a special case for zero
7642 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7644 BIT_OFFSET is used internally for handling records and specifies the
7645 offset in bits modulo 512 to avoid overflow cases.
7647 See the x86-64 PS ABI for details. */
7650 static int
7651 classify_argument (machine_mode mode, const_tree type,
7652 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7654 HOST_WIDE_INT bytes =
7655 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7656 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7658 /* Variable sized entities are always passed/returned in memory. */
7659 if (bytes < 0)
7660 return 0;
7662 if (mode != VOIDmode
7663 && targetm.calls.must_pass_in_stack (mode, type))
7664 return 0;
7666 if (type && AGGREGATE_TYPE_P (type))
7668 int i;
7669 tree field;
7670 enum x86_64_reg_class subclasses[MAX_CLASSES];
7672 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7673 if (bytes > 64)
7674 return 0;
7676 for (i = 0; i < words; i++)
7677 classes[i] = X86_64_NO_CLASS;
7679 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
7680 signal the memory class, so handle this as a special case. */
7681 if (!words)
7683 classes[0] = X86_64_NO_CLASS;
7684 return 1;
7687 /* Classify each field of record and merge classes. */
7688 switch (TREE_CODE (type))
7690 case RECORD_TYPE:
7691 /* And now merge the fields of structure. */
7692 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7694 if (TREE_CODE (field) == FIELD_DECL)
7696 int num;
7698 if (TREE_TYPE (field) == error_mark_node)
7699 continue;
7701 /* Bitfields are always classified as integer. Handle them
7702 early, since later code would consider them to be
7703 misaligned integers. */
7704 if (DECL_BIT_FIELD (field))
7706 for (i = (int_bit_position (field)
7707 + (bit_offset % 64)) / 8 / 8;
7708 i < ((int_bit_position (field) + (bit_offset % 64))
7709 + tree_to_shwi (DECL_SIZE (field))
7710 + 63) / 8 / 8; i++)
7711 classes[i] =
7712 merge_classes (X86_64_INTEGER_CLASS,
7713 classes[i]);
7715 else
7717 int pos;
7719 type = TREE_TYPE (field);
7721 /* Flexible array member is ignored. */
7722 if (TYPE_MODE (type) == BLKmode
7723 && TREE_CODE (type) == ARRAY_TYPE
7724 && TYPE_SIZE (type) == NULL_TREE
7725 && TYPE_DOMAIN (type) != NULL_TREE
7726 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7727 == NULL_TREE))
7729 static bool warned;
7731 if (!warned && warn_psabi)
7733 warned = true;
7734 inform (input_location,
7735 "the ABI of passing struct with"
7736 " a flexible array member has"
7737 " changed in GCC 4.4");
7739 continue;
7741 num = classify_argument (TYPE_MODE (type), type,
7742 subclasses,
7743 (int_bit_position (field)
7744 + bit_offset) % 512);
7745 if (!num)
7746 return 0;
7747 pos = (int_bit_position (field)
7748 + (bit_offset % 64)) / 8 / 8;
7749 for (i = 0; i < num && (i + pos) < words; i++)
7750 classes[i + pos] =
7751 merge_classes (subclasses[i], classes[i + pos]);
7755 break;
7757 case ARRAY_TYPE:
7758 /* Arrays are handled as small records. */
7760 int num;
7761 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7762 TREE_TYPE (type), subclasses, bit_offset);
7763 if (!num)
7764 return 0;
7766 /* The partial classes are now full classes. */
7767 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7768 subclasses[0] = X86_64_SSE_CLASS;
7769 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7770 && !((bit_offset % 64) == 0 && bytes == 4))
7771 subclasses[0] = X86_64_INTEGER_CLASS;
7773 for (i = 0; i < words; i++)
7774 classes[i] = subclasses[i % num];
7776 break;
7778 case UNION_TYPE:
7779 case QUAL_UNION_TYPE:
7780 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7782 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7784 if (TREE_CODE (field) == FIELD_DECL)
7786 int num;
7788 if (TREE_TYPE (field) == error_mark_node)
7789 continue;
7791 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7792 TREE_TYPE (field), subclasses,
7793 bit_offset);
7794 if (!num)
7795 return 0;
7796 for (i = 0; i < num && i < words; i++)
7797 classes[i] = merge_classes (subclasses[i], classes[i]);
7800 break;
7802 default:
7803 gcc_unreachable ();
7806 if (words > 2)
7808 /* When the size is > 16 bytes, if the first class isn't
7809 X86_64_SSE_CLASS or any of the other classes aren't
7810 X86_64_SSEUP_CLASS, everything should be passed in
7811 memory. */
7812 if (classes[0] != X86_64_SSE_CLASS)
7813 return 0;
7815 for (i = 1; i < words; i++)
7816 if (classes[i] != X86_64_SSEUP_CLASS)
7817 return 0;
7820 /* Final merger cleanup. */
7821 for (i = 0; i < words; i++)
7823 /* If one class is MEMORY, everything should be passed in
7824 memory. */
7825 if (classes[i] == X86_64_MEMORY_CLASS)
7826 return 0;
7828 /* The X86_64_SSEUP_CLASS should be always preceded by
7829 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7830 if (classes[i] == X86_64_SSEUP_CLASS
7831 && classes[i - 1] != X86_64_SSE_CLASS
7832 && classes[i - 1] != X86_64_SSEUP_CLASS)
7834 /* The first one should never be X86_64_SSEUP_CLASS. */
7835 gcc_assert (i != 0);
7836 classes[i] = X86_64_SSE_CLASS;
7839 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7840 everything should be passed in memory. */
7841 if (classes[i] == X86_64_X87UP_CLASS
7842 && (classes[i - 1] != X86_64_X87_CLASS))
7844 static bool warned;
7846 /* The first one should never be X86_64_X87UP_CLASS. */
7847 gcc_assert (i != 0);
7848 if (!warned && warn_psabi)
7850 warned = true;
7851 inform (input_location,
7852 "the ABI of passing union with long double"
7853 " has changed in GCC 4.4");
7855 return 0;
7858 return words;
7861 /* Compute the alignment needed. We align all types to their natural
7862 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7863 if (mode != VOIDmode && mode != BLKmode)
7865 int mode_alignment = GET_MODE_BITSIZE (mode);
7867 if (mode == XFmode)
7868 mode_alignment = 128;
7869 else if (mode == XCmode)
7870 mode_alignment = 256;
7871 if (COMPLEX_MODE_P (mode))
7872 mode_alignment /= 2;
7873 /* Misaligned fields are always returned in memory. */
7874 if (bit_offset % mode_alignment)
7875 return 0;
7878 /* For V1xx modes, just use the base mode. */
7879 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7880 && GET_MODE_UNIT_SIZE (mode) == bytes)
7881 mode = GET_MODE_INNER (mode);
7883 /* Classification of atomic types. */
7884 switch (mode)
7886 case E_SDmode:
7887 case E_DDmode:
7888 classes[0] = X86_64_SSE_CLASS;
7889 return 1;
7890 case E_TDmode:
7891 classes[0] = X86_64_SSE_CLASS;
7892 classes[1] = X86_64_SSEUP_CLASS;
7893 return 2;
7894 case E_DImode:
7895 case E_SImode:
7896 case E_HImode:
7897 case E_QImode:
7898 case E_CSImode:
7899 case E_CHImode:
7900 case E_CQImode:
7902 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7904 /* Analyze last 128 bits only. */
7905 size = (size - 1) & 0x7f;
7907 if (size < 32)
7909 classes[0] = X86_64_INTEGERSI_CLASS;
7910 return 1;
7912 else if (size < 64)
7914 classes[0] = X86_64_INTEGER_CLASS;
7915 return 1;
7917 else if (size < 64+32)
7919 classes[0] = X86_64_INTEGER_CLASS;
7920 classes[1] = X86_64_INTEGERSI_CLASS;
7921 return 2;
7923 else if (size < 64+64)
7925 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7926 return 2;
7928 else
7929 gcc_unreachable ();
7931 case E_CDImode:
7932 case E_TImode:
7933 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7934 return 2;
7935 case E_COImode:
7936 case E_OImode:
7937 /* OImode shouldn't be used directly. */
7938 gcc_unreachable ();
7939 case E_CTImode:
7940 return 0;
7941 case E_SFmode:
7942 if (!(bit_offset % 64))
7943 classes[0] = X86_64_SSESF_CLASS;
7944 else
7945 classes[0] = X86_64_SSE_CLASS;
7946 return 1;
7947 case E_DFmode:
7948 classes[0] = X86_64_SSEDF_CLASS;
7949 return 1;
7950 case E_XFmode:
7951 classes[0] = X86_64_X87_CLASS;
7952 classes[1] = X86_64_X87UP_CLASS;
7953 return 2;
7954 case E_TFmode:
7955 classes[0] = X86_64_SSE_CLASS;
7956 classes[1] = X86_64_SSEUP_CLASS;
7957 return 2;
7958 case E_SCmode:
7959 classes[0] = X86_64_SSE_CLASS;
7960 if (!(bit_offset % 64))
7961 return 1;
7962 else
7964 static bool warned;
7966 if (!warned && warn_psabi)
7968 warned = true;
7969 inform (input_location,
7970 "the ABI of passing structure with complex float"
7971 " member has changed in GCC 4.4");
7973 classes[1] = X86_64_SSESF_CLASS;
7974 return 2;
7976 case E_DCmode:
7977 classes[0] = X86_64_SSEDF_CLASS;
7978 classes[1] = X86_64_SSEDF_CLASS;
7979 return 2;
7980 case E_XCmode:
7981 classes[0] = X86_64_COMPLEX_X87_CLASS;
7982 return 1;
7983 case E_TCmode:
7984 /* This mode is larger than 16 bytes. */
7985 return 0;
7986 case E_V8SFmode:
7987 case E_V8SImode:
7988 case E_V32QImode:
7989 case E_V16HImode:
7990 case E_V4DFmode:
7991 case E_V4DImode:
7992 classes[0] = X86_64_SSE_CLASS;
7993 classes[1] = X86_64_SSEUP_CLASS;
7994 classes[2] = X86_64_SSEUP_CLASS;
7995 classes[3] = X86_64_SSEUP_CLASS;
7996 return 4;
7997 case E_V8DFmode:
7998 case E_V16SFmode:
7999 case E_V8DImode:
8000 case E_V16SImode:
8001 case E_V32HImode:
8002 case E_V64QImode:
8003 classes[0] = X86_64_SSE_CLASS;
8004 classes[1] = X86_64_SSEUP_CLASS;
8005 classes[2] = X86_64_SSEUP_CLASS;
8006 classes[3] = X86_64_SSEUP_CLASS;
8007 classes[4] = X86_64_SSEUP_CLASS;
8008 classes[5] = X86_64_SSEUP_CLASS;
8009 classes[6] = X86_64_SSEUP_CLASS;
8010 classes[7] = X86_64_SSEUP_CLASS;
8011 return 8;
8012 case E_V4SFmode:
8013 case E_V4SImode:
8014 case E_V16QImode:
8015 case E_V8HImode:
8016 case E_V2DFmode:
8017 case E_V2DImode:
8018 classes[0] = X86_64_SSE_CLASS;
8019 classes[1] = X86_64_SSEUP_CLASS;
8020 return 2;
8021 case E_V1TImode:
8022 case E_V1DImode:
8023 case E_V2SFmode:
8024 case E_V2SImode:
8025 case E_V4HImode:
8026 case E_V8QImode:
8027 classes[0] = X86_64_SSE_CLASS;
8028 return 1;
8029 case E_BLKmode:
8030 case E_VOIDmode:
8031 return 0;
8032 default:
8033 gcc_assert (VECTOR_MODE_P (mode));
8035 if (bytes > 16)
8036 return 0;
8038 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
8040 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8041 classes[0] = X86_64_INTEGERSI_CLASS;
8042 else
8043 classes[0] = X86_64_INTEGER_CLASS;
8044 classes[1] = X86_64_INTEGER_CLASS;
8045 return 1 + (bytes > 8);
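/* Putting the pieces together (the struct tag is only an example), a
   16-byte aggregate such as

     struct sd { double d; long l; };

   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }: the first
   eightbyte goes in the next free SSE register and the second in the next
   free integer register, which is what examine_argument and
   construct_container below arrange.  */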
8049 /* Examine the argument and set the number of registers required in each
8050 class. Return true iff the parameter should be passed in memory. */
8052 static bool
8053 examine_argument (machine_mode mode, const_tree type, int in_return,
8054 int *int_nregs, int *sse_nregs)
8056 enum x86_64_reg_class regclass[MAX_CLASSES];
8057 int n = classify_argument (mode, type, regclass, 0);
8059 *int_nregs = 0;
8060 *sse_nregs = 0;
8062 if (!n)
8063 return true;
8064 for (n--; n >= 0; n--)
8065 switch (regclass[n])
8067 case X86_64_INTEGER_CLASS:
8068 case X86_64_INTEGERSI_CLASS:
8069 (*int_nregs)++;
8070 break;
8071 case X86_64_SSE_CLASS:
8072 case X86_64_SSESF_CLASS:
8073 case X86_64_SSEDF_CLASS:
8074 (*sse_nregs)++;
8075 break;
8076 case X86_64_NO_CLASS:
8077 case X86_64_SSEUP_CLASS:
8078 break;
8079 case X86_64_X87_CLASS:
8080 case X86_64_X87UP_CLASS:
8081 case X86_64_COMPLEX_X87_CLASS:
8082 if (!in_return)
8083 return true;
8084 break;
8085 case X86_64_MEMORY_CLASS:
8086 gcc_unreachable ();
8089 return false;
8092 /* Construct container for the argument used by GCC interface. See
8093 FUNCTION_ARG for the detailed description. */
8095 static rtx
8096 construct_container (machine_mode mode, machine_mode orig_mode,
8097 const_tree type, int in_return, int nintregs, int nsseregs,
8098 const int *intreg, int sse_regno)
8100 /* The following variables hold the static issued_error state. */
8101 static bool issued_sse_arg_error;
8102 static bool issued_sse_ret_error;
8103 static bool issued_x87_ret_error;
8105 machine_mode tmpmode;
8106 int bytes =
8107 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8108 enum x86_64_reg_class regclass[MAX_CLASSES];
8109 int n;
8110 int i;
8111 int nexps = 0;
8112 int needed_sseregs, needed_intregs;
8113 rtx exp[MAX_CLASSES];
8114 rtx ret;
8116 n = classify_argument (mode, type, regclass, 0);
8117 if (!n)
8118 return NULL;
8119 if (examine_argument (mode, type, in_return, &needed_intregs,
8120 &needed_sseregs))
8121 return NULL;
8122 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8123 return NULL;
8125 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8126 some less clueful developer tries to use floating-point anyway. */
8127 if (needed_sseregs && !TARGET_SSE)
8129 if (in_return)
8131 if (!issued_sse_ret_error)
8133 error ("SSE register return with SSE disabled");
8134 issued_sse_ret_error = true;
8137 else if (!issued_sse_arg_error)
8139 error ("SSE register argument with SSE disabled");
8140 issued_sse_arg_error = true;
8142 return NULL;
8145 /* Likewise, error if the ABI requires us to return values in the
8146 x87 registers and the user specified -mno-80387. */
8147 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8148 for (i = 0; i < n; i++)
8149 if (regclass[i] == X86_64_X87_CLASS
8150 || regclass[i] == X86_64_X87UP_CLASS
8151 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8153 if (!issued_x87_ret_error)
8155 error ("x87 register return with x87 disabled");
8156 issued_x87_ret_error = true;
8158 return NULL;
8161 /* First construct simple cases. Avoid SCmode, since we want to use
8162 a single register to pass this type. */
8163 if (n == 1 && mode != SCmode)
8164 switch (regclass[0])
8166 case X86_64_INTEGER_CLASS:
8167 case X86_64_INTEGERSI_CLASS:
8168 return gen_rtx_REG (mode, intreg[0]);
8169 case X86_64_SSE_CLASS:
8170 case X86_64_SSESF_CLASS:
8171 case X86_64_SSEDF_CLASS:
8172 if (mode != BLKmode)
8173 return gen_reg_or_parallel (mode, orig_mode,
8174 SSE_REGNO (sse_regno));
8175 break;
8176 case X86_64_X87_CLASS:
8177 case X86_64_COMPLEX_X87_CLASS:
8178 return gen_rtx_REG (mode, FIRST_STACK_REG);
8179 case X86_64_NO_CLASS:
8180 /* Zero sized array, struct or class. */
8181 return NULL;
8182 default:
8183 gcc_unreachable ();
8185 if (n == 2
8186 && regclass[0] == X86_64_SSE_CLASS
8187 && regclass[1] == X86_64_SSEUP_CLASS
8188 && mode != BLKmode)
8189 return gen_reg_or_parallel (mode, orig_mode,
8190 SSE_REGNO (sse_regno));
8191 if (n == 4
8192 && regclass[0] == X86_64_SSE_CLASS
8193 && regclass[1] == X86_64_SSEUP_CLASS
8194 && regclass[2] == X86_64_SSEUP_CLASS
8195 && regclass[3] == X86_64_SSEUP_CLASS
8196 && mode != BLKmode)
8197 return gen_reg_or_parallel (mode, orig_mode,
8198 SSE_REGNO (sse_regno));
8199 if (n == 8
8200 && regclass[0] == X86_64_SSE_CLASS
8201 && regclass[1] == X86_64_SSEUP_CLASS
8202 && regclass[2] == X86_64_SSEUP_CLASS
8203 && regclass[3] == X86_64_SSEUP_CLASS
8204 && regclass[4] == X86_64_SSEUP_CLASS
8205 && regclass[5] == X86_64_SSEUP_CLASS
8206 && regclass[6] == X86_64_SSEUP_CLASS
8207 && regclass[7] == X86_64_SSEUP_CLASS
8208 && mode != BLKmode)
8209 return gen_reg_or_parallel (mode, orig_mode,
8210 SSE_REGNO (sse_regno));
8211 if (n == 2
8212 && regclass[0] == X86_64_X87_CLASS
8213 && regclass[1] == X86_64_X87UP_CLASS)
8214 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8216 if (n == 2
8217 && regclass[0] == X86_64_INTEGER_CLASS
8218 && regclass[1] == X86_64_INTEGER_CLASS
8219 && (mode == CDImode || mode == TImode)
8220 && intreg[0] + 1 == intreg[1])
8221 return gen_rtx_REG (mode, intreg[0]);
8223 /* Otherwise figure out the entries of the PARALLEL. */
8224 for (i = 0; i < n; i++)
8226 int pos;
8228 switch (regclass[i])
8230 case X86_64_NO_CLASS:
8231 break;
8232 case X86_64_INTEGER_CLASS:
8233 case X86_64_INTEGERSI_CLASS:
8234 /* Merge TImodes on aligned occasions here too. */
8235 if (i * 8 + 8 > bytes)
8237 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8238 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8239 /* We've requested 24 bytes that we
8240 don't have a mode for. Use DImode. */
8241 tmpmode = DImode;
8243 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8244 tmpmode = SImode;
8245 else
8246 tmpmode = DImode;
8247 exp [nexps++]
8248 = gen_rtx_EXPR_LIST (VOIDmode,
8249 gen_rtx_REG (tmpmode, *intreg),
8250 GEN_INT (i*8));
8251 intreg++;
8252 break;
8253 case X86_64_SSESF_CLASS:
8254 exp [nexps++]
8255 = gen_rtx_EXPR_LIST (VOIDmode,
8256 gen_rtx_REG (SFmode,
8257 SSE_REGNO (sse_regno)),
8258 GEN_INT (i*8));
8259 sse_regno++;
8260 break;
8261 case X86_64_SSEDF_CLASS:
8262 exp [nexps++]
8263 = gen_rtx_EXPR_LIST (VOIDmode,
8264 gen_rtx_REG (DFmode,
8265 SSE_REGNO (sse_regno)),
8266 GEN_INT (i*8));
8267 sse_regno++;
8268 break;
8269 case X86_64_SSE_CLASS:
8270 pos = i;
8271 switch (n)
8273 case 1:
8274 tmpmode = DImode;
8275 break;
8276 case 2:
8277 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8279 tmpmode = TImode;
8280 i++;
8282 else
8283 tmpmode = DImode;
8284 break;
8285 case 4:
8286 gcc_assert (i == 0
8287 && regclass[1] == X86_64_SSEUP_CLASS
8288 && regclass[2] == X86_64_SSEUP_CLASS
8289 && regclass[3] == X86_64_SSEUP_CLASS);
8290 tmpmode = OImode;
8291 i += 3;
8292 break;
8293 case 8:
8294 gcc_assert (i == 0
8295 && regclass[1] == X86_64_SSEUP_CLASS
8296 && regclass[2] == X86_64_SSEUP_CLASS
8297 && regclass[3] == X86_64_SSEUP_CLASS
8298 && regclass[4] == X86_64_SSEUP_CLASS
8299 && regclass[5] == X86_64_SSEUP_CLASS
8300 && regclass[6] == X86_64_SSEUP_CLASS
8301 && regclass[7] == X86_64_SSEUP_CLASS);
8302 tmpmode = XImode;
8303 i += 7;
8304 break;
8305 default:
8306 gcc_unreachable ();
8308 exp [nexps++]
8309 = gen_rtx_EXPR_LIST (VOIDmode,
8310 gen_rtx_REG (tmpmode,
8311 SSE_REGNO (sse_regno)),
8312 GEN_INT (pos*8));
8313 sse_regno++;
8314 break;
8315 default:
8316 gcc_unreachable ();
8320 /* Empty aligned struct, union or class. */
8321 if (nexps == 0)
8322 return NULL;
8324 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8325 for (i = 0; i < nexps; i++)
8326 XVECEXP (ret, 0, i) = exp [i];
8327 return ret;
8330 /* Update the data in CUM to advance over an argument of mode MODE
8331 and data type TYPE. (TYPE is null for libcalls where that information
8332 may not be available.)
8334 Return the number of integer registers advanced over. */
8336 static int
8337 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8338 const_tree type, HOST_WIDE_INT bytes,
8339 HOST_WIDE_INT words)
8341 int res = 0;
8342 bool error_p = false;
8344 if (TARGET_IAMCU)
8346 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8347 bytes in registers. */
8348 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8349 goto pass_in_reg;
8350 return res;
8353 switch (mode)
8355 default:
8356 break;
8358 case E_BLKmode:
8359 if (bytes < 0)
8360 break;
8361 /* FALLTHRU */
8363 case E_DImode:
8364 case E_SImode:
8365 case E_HImode:
8366 case E_QImode:
8367 pass_in_reg:
8368 cum->words += words;
8369 cum->nregs -= words;
8370 cum->regno += words;
8371 if (cum->nregs >= 0)
8372 res = words;
8373 if (cum->nregs <= 0)
8375 cum->nregs = 0;
8376 cfun->machine->arg_reg_available = false;
8377 cum->regno = 0;
8379 break;
8381 case E_OImode:
8382 /* OImode shouldn't be used directly. */
8383 gcc_unreachable ();
8385 case E_DFmode:
8386 if (cum->float_in_sse == -1)
8387 error_p = true;
8388 if (cum->float_in_sse < 2)
8389 break;
8390 /* FALLTHRU */
8391 case E_SFmode:
8392 if (cum->float_in_sse == -1)
8393 error_p = true;
8394 if (cum->float_in_sse < 1)
8395 break;
8396 /* FALLTHRU */
8398 case E_V8SFmode:
8399 case E_V8SImode:
8400 case E_V64QImode:
8401 case E_V32HImode:
8402 case E_V16SImode:
8403 case E_V8DImode:
8404 case E_V16SFmode:
8405 case E_V8DFmode:
8406 case E_V32QImode:
8407 case E_V16HImode:
8408 case E_V4DFmode:
8409 case E_V4DImode:
8410 case E_TImode:
8411 case E_V16QImode:
8412 case E_V8HImode:
8413 case E_V4SImode:
8414 case E_V2DImode:
8415 case E_V4SFmode:
8416 case E_V2DFmode:
8417 if (!type || !AGGREGATE_TYPE_P (type))
8419 cum->sse_words += words;
8420 cum->sse_nregs -= 1;
8421 cum->sse_regno += 1;
8422 if (cum->sse_nregs <= 0)
8424 cum->sse_nregs = 0;
8425 cum->sse_regno = 0;
8428 break;
8430 case E_V8QImode:
8431 case E_V4HImode:
8432 case E_V2SImode:
8433 case E_V2SFmode:
8434 case E_V1TImode:
8435 case E_V1DImode:
8436 if (!type || !AGGREGATE_TYPE_P (type))
8438 cum->mmx_words += words;
8439 cum->mmx_nregs -= 1;
8440 cum->mmx_regno += 1;
8441 if (cum->mmx_nregs <= 0)
8443 cum->mmx_nregs = 0;
8444 cum->mmx_regno = 0;
8447 break;
8449 if (error_p)
8451 cum->float_in_sse = 0;
8452 error ("calling %qD with SSE calling convention without "
8453 "SSE/SSE2 enabled", cum->decl);
8454 sorry ("this is a GCC bug that can be worked around by adding "
8455 "attribute used to function called");
8458 return res;
8461 static int
8462 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8463 const_tree type, HOST_WIDE_INT words, bool named)
8465 int int_nregs, sse_nregs;
8467 /* Unnamed 512- and 256-bit vector mode parameters are passed on the stack. */
8468 if (!named && (VALID_AVX512F_REG_MODE (mode)
8469 || VALID_AVX256_REG_MODE (mode)))
8470 return 0;
8472 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8473 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8475 cum->nregs -= int_nregs;
8476 cum->sse_nregs -= sse_nregs;
8477 cum->regno += int_nregs;
8478 cum->sse_regno += sse_nregs;
8479 return int_nregs;
8481 else
8483 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8484 cum->words = ROUND_UP (cum->words, align);
8485 cum->words += words;
8486 return 0;
8490 static int
8491 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8492 HOST_WIDE_INT words)
8494 /* Otherwise, this should be passed indirectly. */
8495 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8497 cum->words += words;
8498 if (cum->nregs > 0)
8500 cum->nregs -= 1;
8501 cum->regno += 1;
8502 return 1;
8504 return 0;
8507 /* Update the data in CUM to advance over an argument of mode MODE and
8508 data type TYPE. (TYPE is null for libcalls where that information
8509 may not be available.) */
8511 static void
8512 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8513 const_tree type, bool named)
8515 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8516 HOST_WIDE_INT bytes, words;
8517 int nregs;
8519 /* The argument of an interrupt handler is a special case and is
8520 handled in ix86_function_arg. */
8521 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8522 return;
8524 if (mode == BLKmode)
8525 bytes = int_size_in_bytes (type);
8526 else
8527 bytes = GET_MODE_SIZE (mode);
8528 words = CEIL (bytes, UNITS_PER_WORD);
8530 if (type)
8531 mode = type_natural_mode (type, NULL, false);
8533 if ((type && POINTER_BOUNDS_TYPE_P (type))
8534 || POINTER_BOUNDS_MODE_P (mode))
8536 /* If we pass bounds in BT then just update the remaining bounds count. */
8537 if (cum->bnds_in_bt)
8539 cum->bnds_in_bt--;
8540 return;
8543 /* Update the remaining number of bounds to force. */
8544 if (cum->force_bnd_pass)
8545 cum->force_bnd_pass--;
8547 cum->bnd_regno++;
8549 return;
8552 /* The first arg not going to Bounds Tables resets this counter. */
8553 cum->bnds_in_bt = 0;
8554 /* For unnamed args we always pass bounds to avoid a bounds mess when
8555 the passed and received types do not match. If bounds do not follow an
8556 unnamed arg, still pretend the required number of bounds were passed. */
8557 if (cum->force_bnd_pass)
8559 cum->bnd_regno += cum->force_bnd_pass;
8560 cum->force_bnd_pass = 0;
8563 if (TARGET_64BIT)
8565 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8567 if (call_abi == MS_ABI)
8568 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8569 else
8570 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8572 else
8573 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8575 /* For stdarg we expect bounds to be passed for each value passed
8576 in register. */
8577 if (cum->stdarg)
8578 cum->force_bnd_pass = nregs;
8579 /* For pointers passed in memory we expect bounds passed in Bounds
8580 Table. */
8581 if (!nregs)
8583 /* Track if there are outgoing arguments on stack. */
8584 if (cum->caller)
8585 cfun->machine->outgoing_args_on_stack = true;
8587 cum->bnds_in_bt = chkp_type_bounds_count (type);
8591 /* Define where to put the arguments to a function.
8592 Value is zero to push the argument on the stack,
8593 or a hard register in which to store the argument.
8595 MODE is the argument's machine mode.
8596 TYPE is the data type of the argument (as a tree).
8597 This is null for libcalls where that information may
8598 not be available.
8599 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8600 the preceding args and about the function being called.
8601 NAMED is nonzero if this argument is a named parameter
8602 (otherwise it is an extra parameter matching an ellipsis). */
8604 static rtx
8605 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8606 machine_mode orig_mode, const_tree type,
8607 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8609 bool error_p = false;
8611 /* Avoid the AL settings for the Unix64 ABI. */
8612 if (mode == VOIDmode)
8613 return constm1_rtx;
8615 if (TARGET_IAMCU)
8617 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8618 bytes in registers. */
8619 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8620 goto pass_in_reg;
8621 return NULL_RTX;
8624 switch (mode)
8626 default:
8627 break;
8629 case E_BLKmode:
8630 if (bytes < 0)
8631 break;
8632 /* FALLTHRU */
8633 case E_DImode:
8634 case E_SImode:
8635 case E_HImode:
8636 case E_QImode:
8637 pass_in_reg:
8638 if (words <= cum->nregs)
8640 int regno = cum->regno;
8642 /* Fastcall allocates the first two DWORD (SImode) or
8643 smaller arguments to ECX and EDX if the argument isn't
8644 an aggregate type. */
8645 if (cum->fastcall)
8647 if (mode == BLKmode
8648 || mode == DImode
8649 || (type && AGGREGATE_TYPE_P (type)))
8650 break;
8652 /* ECX, not EAX, is the first allocated register. */
8653 if (regno == AX_REG)
8654 regno = CX_REG;
8656 return gen_rtx_REG (mode, regno);
8658 break;
8660 case E_DFmode:
8661 if (cum->float_in_sse == -1)
8662 error_p = true;
8663 if (cum->float_in_sse < 2)
8664 break;
8665 /* FALLTHRU */
8666 case E_SFmode:
8667 if (cum->float_in_sse == -1)
8668 error_p = true;
8669 if (cum->float_in_sse < 1)
8670 break;
8671 /* FALLTHRU */
8672 case E_TImode:
8673 /* In 32-bit mode, we pass TImode in XMM registers. */
8674 case E_V16QImode:
8675 case E_V8HImode:
8676 case E_V4SImode:
8677 case E_V2DImode:
8678 case E_V4SFmode:
8679 case E_V2DFmode:
8680 if (!type || !AGGREGATE_TYPE_P (type))
8682 if (cum->sse_nregs)
8683 return gen_reg_or_parallel (mode, orig_mode,
8684 cum->sse_regno + FIRST_SSE_REG);
8686 break;
8688 case E_OImode:
8689 case E_XImode:
8690 /* OImode and XImode shouldn't be used directly. */
8691 gcc_unreachable ();
8693 case E_V64QImode:
8694 case E_V32HImode:
8695 case E_V16SImode:
8696 case E_V8DImode:
8697 case E_V16SFmode:
8698 case E_V8DFmode:
8699 case E_V8SFmode:
8700 case E_V8SImode:
8701 case E_V32QImode:
8702 case E_V16HImode:
8703 case E_V4DFmode:
8704 case E_V4DImode:
8705 if (!type || !AGGREGATE_TYPE_P (type))
8707 if (cum->sse_nregs)
8708 return gen_reg_or_parallel (mode, orig_mode,
8709 cum->sse_regno + FIRST_SSE_REG);
8711 break;
8713 case E_V8QImode:
8714 case E_V4HImode:
8715 case E_V2SImode:
8716 case E_V2SFmode:
8717 case E_V1TImode:
8718 case E_V1DImode:
8719 if (!type || !AGGREGATE_TYPE_P (type))
8721 if (cum->mmx_nregs)
8722 return gen_reg_or_parallel (mode, orig_mode,
8723 cum->mmx_regno + FIRST_MMX_REG);
8725 break;
8727 if (error_p)
8729 cum->float_in_sse = 0;
8730 error ("calling %qD with SSE calling convention without "
8731 "SSE/SSE2 enabled", cum->decl);
8732 sorry ("this is a GCC bug that can be worked around by adding "
8733 "attribute used to function called");
8736 return NULL_RTX;
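/* Example of the 32-bit register-passing cases handled above (the function
   name is only illustrative): with

     int __attribute__ ((fastcall)) f (int a, int b, int c);

   the first two SImode arguments land in ECX and EDX (cum->fastcall remaps
   AX_REG to CX_REG above) and the third goes on the stack, while a
   "thiscall" function gets only ECX for its first argument.  */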
8739 static rtx
8740 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8741 machine_mode orig_mode, const_tree type, bool named)
8743 /* Handle a hidden AL argument containing the number of registers
8744 for varargs x86-64 functions. */
8745 if (mode == VOIDmode)
8746 return GEN_INT (cum->maybe_vaarg
8747 ? (cum->sse_nregs < 0
8748 ? X86_64_SSE_REGPARM_MAX
8749 : cum->sse_regno)
8750 : -1);
8752 switch (mode)
8754 default:
8755 break;
8757 case E_V8SFmode:
8758 case E_V8SImode:
8759 case E_V32QImode:
8760 case E_V16HImode:
8761 case E_V4DFmode:
8762 case E_V4DImode:
8763 case E_V16SFmode:
8764 case E_V16SImode:
8765 case E_V64QImode:
8766 case E_V32HImode:
8767 case E_V8DFmode:
8768 case E_V8DImode:
8769 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
8770 if (!named)
8771 return NULL;
8772 break;
8775 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8776 cum->sse_nregs,
8777 &x86_64_int_parameter_registers [cum->regno],
8778 cum->sse_regno);
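/* The VOIDmode case above supplies the hidden %al value for unprototyped
   and variadic SysV calls: before such a call the caller typically emits
   something roughly like

     movl $1, %eax        # one XMM register carries an argument
     call printf

   so the callee's va_start code knows how many vector registers it may
   need to spill.  */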
8781 static rtx
8782 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8783 machine_mode orig_mode, bool named,
8784 HOST_WIDE_INT bytes)
8786 unsigned int regno;
8788 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
8789 We use the value -2 to specify that the current function call is MS ABI. */
8790 if (mode == VOIDmode)
8791 return GEN_INT (-2);
8793 /* If we've run out of registers, it goes on the stack. */
8794 if (cum->nregs == 0)
8795 return NULL_RTX;
8797 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8799 /* Only floating point modes are passed in anything but integer regs. */
8800 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8802 if (named)
8803 regno = cum->regno + FIRST_SSE_REG;
8804 else
8806 rtx t1, t2;
8808 /* Unnamed floating parameters are passed in both the
8809 SSE and integer registers. */
8810 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8811 t2 = gen_rtx_REG (mode, regno);
8812 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8813 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8814 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8817 /* Handle aggregate types passed in registers. */
8818 if (orig_mode == BLKmode)
8820 if (bytes > 0 && bytes <= 8)
8821 mode = (bytes > 4 ? DImode : SImode);
8822 if (mode == BLKmode)
8823 mode = DImode;
8826 return gen_reg_or_parallel (mode, orig_mode, regno);
8829 /* Return where to put the arguments to a function.
8830 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8832 MODE is the argument's machine mode. TYPE is the data type of the
8833 argument. It is null for libcalls where that information may not be
8834 available. CUM gives information about the preceding args and about
8835 the function being called. NAMED is nonzero if this argument is a
8836 named parameter (otherwise it is an extra parameter matching an
8837 ellipsis). */
8839 static rtx
8840 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8841 const_tree type, bool named)
8843 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8844 machine_mode mode = omode;
8845 HOST_WIDE_INT bytes, words;
8846 rtx arg;
8848 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8850 gcc_assert (type != NULL_TREE);
8851 if (POINTER_TYPE_P (type))
8853 /* This is the pointer argument. */
8854 gcc_assert (TYPE_MODE (type) == Pmode);
8855 /* It is at -WORD(AP) in the current frame in interrupt and
8856 exception handlers. */
8857 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8859 else
8861 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8862 && TREE_CODE (type) == INTEGER_TYPE
8863 && TYPE_MODE (type) == word_mode);
8864 /* The error code is the word-mode integer argument at
8865 -2 * WORD(AP) in the current frame of the exception
8866 handler. */
8867 arg = gen_rtx_MEM (word_mode,
8868 plus_constant (Pmode,
8869 arg_pointer_rtx,
8870 -2 * UNITS_PER_WORD));
8872 return arg;
8875 /* All pointer bounds arguments are handled separately here. */
8876 if ((type && POINTER_BOUNDS_TYPE_P (type))
8877 || POINTER_BOUNDS_MODE_P (mode))
8879 /* Return NULL if bounds are forced to go in Bounds Table. */
8880 if (cum->bnds_in_bt)
8881 arg = NULL;
8882 /* Return the next available bound reg if any. */
8883 else if (cum->bnd_regno <= LAST_BND_REG)
8884 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8885 /* Return the next special slot number otherwise. */
8886 else
8887 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8889 return arg;
8892 if (mode == BLKmode)
8893 bytes = int_size_in_bytes (type);
8894 else
8895 bytes = GET_MODE_SIZE (mode);
8896 words = CEIL (bytes, UNITS_PER_WORD);
8898 /* To simplify the code below, represent vector types with a vector mode
8899 even if MMX/SSE are not active. */
8900 if (type && TREE_CODE (type) == VECTOR_TYPE)
8901 mode = type_natural_mode (type, cum, false);
8903 if (TARGET_64BIT)
8905 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8907 if (call_abi == MS_ABI)
8908 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8909 else
8910 arg = function_arg_64 (cum, mode, omode, type, named);
8912 else
8913 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8915 /* Track if there are outgoing arguments on stack. */
8916 if (arg == NULL_RTX && cum->caller)
8917 cfun->machine->outgoing_args_on_stack = true;
8919 return arg;
8922 /* A C expression that indicates when an argument must be passed by
8923 reference. If nonzero for an argument, a copy of that argument is
8924 made in memory and a pointer to the argument is passed instead of
8925 the argument itself. The pointer is passed in whatever way is
8926 appropriate for passing a pointer to that type. */
8928 static bool
8929 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8930 const_tree type, bool)
8932 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8934 /* Bounds are never passed by reference. */
8935 if ((type && POINTER_BOUNDS_TYPE_P (type))
8936 || POINTER_BOUNDS_MODE_P (mode))
8937 return false;
8939 if (TARGET_64BIT)
8941 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8943 /* See Windows x64 Software Convention. */
8944 if (call_abi == MS_ABI)
8946 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8948 if (type)
8950 /* Arrays are passed by reference. */
8951 if (TREE_CODE (type) == ARRAY_TYPE)
8952 return true;
8954 if (RECORD_OR_UNION_TYPE_P (type))
8956 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8957 are passed by reference. */
8958 msize = int_size_in_bytes (type);
8962 /* __m128 is passed by reference. */
8963 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8965 else if (type && int_size_in_bytes (type) == -1)
8966 return true;
8969 return false;
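/* Summarizing the MS_ABI branch above: arrays, aggregates whose size is
   not 1, 2, 4 or 8 bytes, and 16-byte types such as __m128 are all passed
   by reference under the Windows x64 convention, while scalars and small
   aggregates of exactly those sizes are passed by value.  */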
8972 /* Return true when TYPE should be 128bit aligned for 32bit argument
8973 passing ABI. XXX: This function is obsolete and is only used for
8974 checking psABI compatibility with previous versions of GCC. */
8976 static bool
8977 ix86_compat_aligned_value_p (const_tree type)
8979 machine_mode mode = TYPE_MODE (type);
8980 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8981 || mode == TDmode
8982 || mode == TFmode
8983 || mode == TCmode)
8984 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8985 return true;
8986 if (TYPE_ALIGN (type) < 128)
8987 return false;
8989 if (AGGREGATE_TYPE_P (type))
8991 /* Walk the aggregates recursively. */
8992 switch (TREE_CODE (type))
8994 case RECORD_TYPE:
8995 case UNION_TYPE:
8996 case QUAL_UNION_TYPE:
8998 tree field;
9000 /* Walk all the structure fields. */
9001 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9003 if (TREE_CODE (field) == FIELD_DECL
9004 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
9005 return true;
9007 break;
9010 case ARRAY_TYPE:
9011 /* Just for use if some language passes arrays by value. */
9012 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
9013 return true;
9014 break;
9016 default:
9017 gcc_unreachable ();
9020 return false;
9023 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
9024 XXX: This function is obsolete and is only used for checking psABI
9025 compatibility with previous versions of GCC. */
9027 static unsigned int
9028 ix86_compat_function_arg_boundary (machine_mode mode,
9029 const_tree type, unsigned int align)
9031 /* In 32bit, only _Decimal128 and __float128 are aligned to their
9032 natural boundaries. */
9033 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
9035 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
9036 make an exception for SSE modes since these require 128bit
9037 alignment.
9039 The handling here differs from field_alignment. ICC aligns MMX
9040 arguments to 4 byte boundaries, while structure fields are aligned
9041 to 8 byte boundaries. */
9042 if (!type)
9044 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9045 align = PARM_BOUNDARY;
9047 else
9049 if (!ix86_compat_aligned_value_p (type))
9050 align = PARM_BOUNDARY;
9053 if (align > BIGGEST_ALIGNMENT)
9054 align = BIGGEST_ALIGNMENT;
9055 return align;
9058 /* Return true when TYPE should be 128bit aligned for 32bit argument
9059 passing ABI. */
9061 static bool
9062 ix86_contains_aligned_value_p (const_tree type)
9064 machine_mode mode = TYPE_MODE (type);
9066 if (mode == XFmode || mode == XCmode)
9067 return false;
9069 if (TYPE_ALIGN (type) < 128)
9070 return false;
9072 if (AGGREGATE_TYPE_P (type))
9074 /* Walk the aggregates recursively. */
9075 switch (TREE_CODE (type))
9077 case RECORD_TYPE:
9078 case UNION_TYPE:
9079 case QUAL_UNION_TYPE:
9081 tree field;
9083 /* Walk all the structure fields. */
9084 for (field = TYPE_FIELDS (type);
9085 field;
9086 field = DECL_CHAIN (field))
9088 if (TREE_CODE (field) == FIELD_DECL
9089 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9090 return true;
9092 break;
9095 case ARRAY_TYPE:
9096 /* Just for use if some language passes arrays by value. */
9097 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9098 return true;
9099 break;
9101 default:
9102 gcc_unreachable ();
9105 else
9106 return TYPE_ALIGN (type) >= 128;
9108 return false;
9111 /* Gives the alignment boundary, in bits, of an argument with the
9112 specified mode and type. */
9114 static unsigned int
9115 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9117 unsigned int align;
9118 if (type)
9120 /* Since the main variant type is used for the call, convert the type
9121 to its main variant. */
9122 type = TYPE_MAIN_VARIANT (type);
9123 align = TYPE_ALIGN (type);
9124 if (TYPE_EMPTY_P (type))
9125 return PARM_BOUNDARY;
9127 else
9128 align = GET_MODE_ALIGNMENT (mode);
9129 if (align < PARM_BOUNDARY)
9130 align = PARM_BOUNDARY;
9131 else
9133 static bool warned;
9134 unsigned int saved_align = align;
9136 if (!TARGET_64BIT)
9138 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9139 if (!type)
9141 if (mode == XFmode || mode == XCmode)
9142 align = PARM_BOUNDARY;
9144 else if (!ix86_contains_aligned_value_p (type))
9145 align = PARM_BOUNDARY;
9147 if (align < 128)
9148 align = PARM_BOUNDARY;
9151 if (warn_psabi
9152 && !warned
9153 && align != ix86_compat_function_arg_boundary (mode, type,
9154 saved_align))
9156 warned = true;
9157 inform (input_location,
9158 "The ABI for passing parameters with %d-byte"
9159 " alignment has changed in GCC 4.6",
9160 align / BITS_PER_UNIT);
9164 return align;
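/* For instance, under the 64-bit ABI an __m256 argument reports a 256-bit
   boundary here, while in 32-bit mode an XFmode "long double" is forced
   back to PARM_BOUNDARY, and a psABI note is emitted if the answer differs
   from what GCC 4.5 and earlier used to compute.  */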
9167 /* Return true if N is a possible register number of function value. */
9169 static bool
9170 ix86_function_value_regno_p (const unsigned int regno)
9172 switch (regno)
9174 case AX_REG:
9175 return true;
9176 case DX_REG:
9177 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9178 case DI_REG:
9179 case SI_REG:
9180 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9182 case BND0_REG:
9183 case BND1_REG:
9184 return chkp_function_instrumented_p (current_function_decl);
9186 /* Complex values are returned in %st(0)/%st(1) pair. */
9187 case ST0_REG:
9188 case ST1_REG:
9189 /* TODO: The function should depend on current function ABI but
9190 builtins.c would need updating then. Therefore we use the
9191 default ABI. */
9192 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9193 return false;
9194 return TARGET_FLOAT_RETURNS_IN_80387;
9196 /* Complex values are returned in %xmm0/%xmm1 pair. */
9197 case XMM0_REG:
9198 case XMM1_REG:
9199 return TARGET_SSE;
9201 case MM0_REG:
9202 if (TARGET_MACHO || TARGET_64BIT)
9203 return false;
9204 return TARGET_MMX;
9207 return false;
9210 /* Define how to find the value returned by a function.
9211 VALTYPE is the data type of the value (as a tree).
9212 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9213 otherwise, FUNC is 0. */
9215 static rtx
9216 function_value_32 (machine_mode orig_mode, machine_mode mode,
9217 const_tree fntype, const_tree fn)
9219 unsigned int regno;
9221 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9222 we normally prevent this case when mmx is not available. However
9223 some ABIs may require the result to be returned like DImode. */
9224 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9225 regno = FIRST_MMX_REG;
9227 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9228 we prevent this case when sse is not available. However some ABIs
9229 may require the result to be returned like integer TImode. */
9230 else if (mode == TImode
9231 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9232 regno = FIRST_SSE_REG;
9234 /* 32-byte vector modes in %ymm0. */
9235 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9236 regno = FIRST_SSE_REG;
9238 /* 64-byte vector modes in %zmm0. */
9239 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9240 regno = FIRST_SSE_REG;
9242 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9243 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9244 regno = FIRST_FLOAT_REG;
9245 else
9246 /* Most things go in %eax. */
9247 regno = AX_REG;
9249 /* Override FP return register with %xmm0 for local functions when
9250 SSE math is enabled or for functions with sseregparm attribute. */
9251 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9253 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9254 if (sse_level == -1)
9256 error ("calling %qD with SSE calling convention without "
9257 "SSE/SSE2 enabled", fn);
9258 sorry ("this is a GCC bug that can be worked around by adding "
9259 "attribute used to function called");
9261 else if ((sse_level >= 1 && mode == SFmode)
9262 || (sse_level == 2 && mode == DFmode))
9263 regno = FIRST_SSE_REG;
9266 /* OImode shouldn't be used directly. */
9267 gcc_assert (mode != OImode);
9269 return gen_rtx_REG (orig_mode, regno);
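/* Editorial sketch, not part of the original source: how the ia32 rules
   above map a few C return types onto registers, assuming the default ABI
   and register availability (the declarations are hypothetical).

       int    f1 (void);   // %eax
       double f2 (void);   // %st(0); %xmm0 for local/sseregparm functions
                           //         when SSE math is enabled
       __m64  f3 (void);   // %mm0  (8-byte vector, needs MMX)
       __m128 f4 (void);   // %xmm0 (16-byte vector, needs SSE)  */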
9272 static rtx
9273 function_value_64 (machine_mode orig_mode, machine_mode mode,
9274 const_tree valtype)
9276 rtx ret;
9278 /* Handle libcalls, which don't provide a type node. */
9279 if (valtype == NULL)
9281 unsigned int regno;
9283 switch (mode)
9285 case E_SFmode:
9286 case E_SCmode:
9287 case E_DFmode:
9288 case E_DCmode:
9289 case E_TFmode:
9290 case E_SDmode:
9291 case E_DDmode:
9292 case E_TDmode:
9293 regno = FIRST_SSE_REG;
9294 break;
9295 case E_XFmode:
9296 case E_XCmode:
9297 regno = FIRST_FLOAT_REG;
9298 break;
9299 case E_TCmode:
9300 return NULL;
9301 default:
9302 regno = AX_REG;
9305 return gen_rtx_REG (mode, regno);
9307 else if (POINTER_TYPE_P (valtype))
9309 /* Pointers are always returned in word_mode. */
9310 mode = word_mode;
9313 ret = construct_container (mode, orig_mode, valtype, 1,
9314 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9315 x86_64_int_return_registers, 0);
9317 /* For zero sized structures, construct_container returns NULL, but we
9318 need to keep rest of compiler happy by returning meaningful value. */
9319 if (!ret)
9320 ret = gen_rtx_REG (orig_mode, AX_REG);
9322 return ret;
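/* Editorial sketch, not part of the original source: typical SysV x86-64
   mappings produced by the classification above (hypothetical declarations).

       long        f1 (void);              // %rax
       double      f2 (void);              // %xmm0
       long double f3 (void);              // %st(0), XFmode
       struct { long a, b; } f4 (void);    // %rax / %rdx eightbyte pair  */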
9325 static rtx
9326 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9327 const_tree valtype)
9329 unsigned int regno = AX_REG;
9331 if (TARGET_SSE)
9333 switch (GET_MODE_SIZE (mode))
9335 case 16:
9336 if (valtype != NULL_TREE
9337 && !VECTOR_INTEGER_TYPE_P (valtype)
9339 && !INTEGRAL_TYPE_P (valtype)
9340 && !VECTOR_FLOAT_TYPE_P (valtype))
9341 break;
9342 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9343 && !COMPLEX_MODE_P (mode))
9344 regno = FIRST_SSE_REG;
9345 break;
9346 case 8:
9347 case 4:
9348 if (mode == SFmode || mode == DFmode)
9349 regno = FIRST_SSE_REG;
9350 break;
9351 default:
9352 break;
9355 return gen_rtx_REG (orig_mode, regno);
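/* Editorial sketch, not part of the original source: under the MS x64 ABI
   handled above, most values come back in %rax, while float, double and
   16-byte vector types come back in %xmm0 (hypothetical declarations).

       __int64 f1 (void);   // %rax
       double  f2 (void);   // %xmm0
       __m128  f3 (void);   // %xmm0  */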
9358 static rtx
9359 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9360 machine_mode orig_mode, machine_mode mode)
9362 const_tree fn, fntype;
9364 fn = NULL_TREE;
9365 if (fntype_or_decl && DECL_P (fntype_or_decl))
9366 fn = fntype_or_decl;
9367 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9369 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9370 || POINTER_BOUNDS_MODE_P (mode))
9371 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9372 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9373 return function_value_ms_64 (orig_mode, mode, valtype);
9374 else if (TARGET_64BIT)
9375 return function_value_64 (orig_mode, mode, valtype);
9376 else
9377 return function_value_32 (orig_mode, mode, fntype, fn);
9380 static rtx
9381 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9383 machine_mode mode, orig_mode;
9385 orig_mode = TYPE_MODE (valtype);
9386 mode = type_natural_mode (valtype, NULL, true);
9387 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9390 /* Return an RTX representing a place where a function returns
9391 or receives pointer bounds, or NULL if no bounds are returned.
9393 VALTYPE is a data type of a value returned by the function.
9395 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9396 or FUNCTION_TYPE of the function.
9398 If OUTGOING is false, return a place in which the caller will
9399 see the return value. Otherwise, return a place where a
9400 function returns a value. */
9402 static rtx
9403 ix86_function_value_bounds (const_tree valtype,
9404 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9405 bool outgoing ATTRIBUTE_UNUSED)
9407 rtx res = NULL_RTX;
9409 if (BOUNDED_TYPE_P (valtype))
9410 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9411 else if (chkp_type_has_pointer (valtype))
9413 bitmap slots;
9414 rtx bounds[2];
9415 bitmap_iterator bi;
9416 unsigned i, bnd_no = 0;
9418 bitmap_obstack_initialize (NULL);
9419 slots = BITMAP_ALLOC (NULL);
9420 chkp_find_bound_slots (valtype, slots);
9422 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9424 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9425 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9426 gcc_assert (bnd_no < 2);
9427 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9430 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9432 BITMAP_FREE (slots);
9433 bitmap_obstack_release (NULL);
9435 else
9436 res = NULL_RTX;
9438 return res;
9441 /* Pointer function arguments and return values are promoted to
9442 word_mode for normal functions. */
9444 static machine_mode
9445 ix86_promote_function_mode (const_tree type, machine_mode mode,
9446 int *punsignedp, const_tree fntype,
9447 int for_return)
9449 if (cfun->machine->func_type == TYPE_NORMAL
9450 && type != NULL_TREE
9451 && POINTER_TYPE_P (type))
9453 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9454 return word_mode;
9456 return default_promote_function_mode (type, mode, punsignedp, fntype,
9457 for_return);
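/* Editorial note, not part of the original source: this promotion matters
   chiefly for -mx32, where pointers are 32 bits but word_mode is DImode.
   For a normal function such as

       void f (char *p);

   the pointer is then passed zero-extended in a 64-bit register
   (POINTERS_EXTEND_UNSIGNED); interrupt/exception handlers fall back to the
   default promotion.  */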
9460 /* Return true if a structure, union or array with MODE containing FIELD
9461 should be accessed using BLKmode. */
9463 static bool
9464 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9466 /* Union with XFmode must be in BLKmode. */
9467 return (mode == XFmode
9468 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9469 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9472 static rtx
9473 ix86_libcall_value (machine_mode mode)
9475 return ix86_function_value_1 (NULL, NULL, mode, mode);
9478 /* Return true iff type is returned in memory. */
9480 static bool
9481 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9483 #ifdef SUBTARGET_RETURN_IN_MEMORY
9484 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9485 #else
9486 const machine_mode mode = type_natural_mode (type, NULL, true);
9487 HOST_WIDE_INT size;
9489 if (POINTER_BOUNDS_TYPE_P (type))
9490 return false;
9492 if (TARGET_64BIT)
9494 if (ix86_function_type_abi (fntype) == MS_ABI)
9496 size = int_size_in_bytes (type);
9498 /* __m128 is returned in xmm0. */
9499 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9500 || INTEGRAL_TYPE_P (type)
9501 || VECTOR_FLOAT_TYPE_P (type))
9502 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9503 && !COMPLEX_MODE_P (mode)
9504 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9505 return false;
9507 /* Otherwise, the size must be exactly in [1248]. */
9508 return size != 1 && size != 2 && size != 4 && size != 8;
9510 else
9512 int needed_intregs, needed_sseregs;
9514 return examine_argument (mode, type, 1,
9515 &needed_intregs, &needed_sseregs);
9518 else
9520 size = int_size_in_bytes (type);
9522 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9523 bytes in registers. */
9524 if (TARGET_IAMCU)
9525 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9527 if (mode == BLKmode)
9528 return true;
9530 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9531 return false;
9533 if (VECTOR_MODE_P (mode) || mode == TImode)
9535 /* User-created vectors small enough to fit in EAX. */
9536 if (size < 8)
9537 return false;
9539 /* Unless the ABI prescribes otherwise,
9540 MMX/3dNow values are returned in MM0 if available. */
9542 if (size == 8)
9543 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9545 /* SSE values are returned in XMM0 if available. */
9546 if (size == 16)
9547 return !TARGET_SSE;
9549 /* AVX values are returned in YMM0 if available. */
9550 if (size == 32)
9551 return !TARGET_AVX;
9553 /* AVX512F values are returned in ZMM0 if available. */
9554 if (size == 64)
9555 return !TARGET_AVX512F;
9558 if (mode == XFmode)
9559 return false;
9561 if (size > 12)
9562 return true;
9564 /* OImode shouldn't be used directly. */
9565 gcc_assert (mode != OImode);
9567 return false;
9569 #endif
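/* Editorial sketch, not part of the original source: a few consequences of
   the ia32 branch above, assuming the default (non-IAMCU) configuration.

       long double f (void);          // XFmode: %st(0), never memory
       __m64  g (void);               // %mm0 if MMX and !TARGET_VECT8_RETURNS,
                                      // memory otherwise
       __m128 h (void);               // %xmm0 when SSE is enabled,
                                      // memory otherwise
       struct big { char c[16]; };    // size > 12: always memory

   Aggregates of at most 8 bytes stay in registers only on targets that
   define MS_AGGREGATE_RETURN.  */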
9573 /* Create the va_list data type. */
9575 static tree
9576 ix86_build_builtin_va_list_64 (void)
9578 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9580 record = lang_hooks.types.make_type (RECORD_TYPE);
9581 type_decl = build_decl (BUILTINS_LOCATION,
9582 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9584 f_gpr = build_decl (BUILTINS_LOCATION,
9585 FIELD_DECL, get_identifier ("gp_offset"),
9586 unsigned_type_node);
9587 f_fpr = build_decl (BUILTINS_LOCATION,
9588 FIELD_DECL, get_identifier ("fp_offset"),
9589 unsigned_type_node);
9590 f_ovf = build_decl (BUILTINS_LOCATION,
9591 FIELD_DECL, get_identifier ("overflow_arg_area"),
9592 ptr_type_node);
9593 f_sav = build_decl (BUILTINS_LOCATION,
9594 FIELD_DECL, get_identifier ("reg_save_area"),
9595 ptr_type_node);
9597 va_list_gpr_counter_field = f_gpr;
9598 va_list_fpr_counter_field = f_fpr;
9600 DECL_FIELD_CONTEXT (f_gpr) = record;
9601 DECL_FIELD_CONTEXT (f_fpr) = record;
9602 DECL_FIELD_CONTEXT (f_ovf) = record;
9603 DECL_FIELD_CONTEXT (f_sav) = record;
9605 TYPE_STUB_DECL (record) = type_decl;
9606 TYPE_NAME (record) = type_decl;
9607 TYPE_FIELDS (record) = f_gpr;
9608 DECL_CHAIN (f_gpr) = f_fpr;
9609 DECL_CHAIN (f_fpr) = f_ovf;
9610 DECL_CHAIN (f_ovf) = f_sav;
9612 layout_type (record);
9614 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9615 NULL_TREE, TYPE_ATTRIBUTES (record));
9617 /* The correct type is an array type of one element. */
9618 return build_array_type (record, build_index_type (size_zero_node));
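/* Editorial sketch, not part of the original source: the record built above
   is the familiar SysV x86-64 va_list layout, roughly equivalent to

       typedef struct __va_list_tag {
         unsigned int gp_offset;       // next GPR slot in reg_save_area
         unsigned int fp_offset;       // next SSE slot in reg_save_area
         void *overflow_arg_area;      // stack-passed arguments
         void *reg_save_area;          // register save area built in prologue
       } __va_list_tag;
       typedef __va_list_tag __builtin_va_list[1];   // array of one record  */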
9621 /* Set up the builtin va_list data type and, for 64-bit, the additional
9622 calling-convention-specific va_list data types. */
9624 static tree
9625 ix86_build_builtin_va_list (void)
9627 if (TARGET_64BIT)
9629 /* Initialize ABI specific va_list builtin types.
9631 In lto1, we can encounter two va_list types:
9632 - one as a result of the type-merge across TUs, and
9633 - the one constructed here.
9634 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9635 a type identity check in canonical_va_list_type based on
9636 TYPE_MAIN_VARIANT (which we used to have) will not work.
9637 Instead, we tag each va_list_type_node with its unique attribute, and
9638 look for the attribute in the type identity check in
9639 canonical_va_list_type.
9641 Tagging sysv_va_list_type_node directly with the attribute is
9642 problematic since it's an array of one record, which will decay into a
9643 pointer to the record when used as a parameter (see build_va_arg comments for
9644 an example), dropping the attribute in the process. So we tag the
9645 record instead. */
9647 /* For SYSV_ABI we use an array of one record. */
9648 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9650 /* For MS_ABI we use plain pointer to argument area. */
9651 tree char_ptr_type = build_pointer_type (char_type_node);
9652 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9653 TYPE_ATTRIBUTES (char_ptr_type));
9654 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9656 return ((ix86_abi == MS_ABI)
9657 ? ms_va_list_type_node
9658 : sysv_va_list_type_node);
9660 else
9662 /* For i386 we use plain pointer to argument area. */
9663 return build_pointer_type (char_type_node);
9667 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9669 static void
9670 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9672 rtx save_area, mem;
9673 alias_set_type set;
9674 int i, max;
9676 /* GPR size of varargs save area. */
9677 if (cfun->va_list_gpr_size)
9678 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9679 else
9680 ix86_varargs_gpr_size = 0;
9682 /* FPR size of varargs save area. We don't need it if we don't pass
9683 anything in SSE registers. */
9684 if (TARGET_SSE && cfun->va_list_fpr_size)
9685 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9686 else
9687 ix86_varargs_fpr_size = 0;
9689 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9690 return;
9692 save_area = frame_pointer_rtx;
9693 set = get_varargs_alias_set ();
9695 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9696 if (max > X86_64_REGPARM_MAX)
9697 max = X86_64_REGPARM_MAX;
9699 for (i = cum->regno; i < max; i++)
9701 mem = gen_rtx_MEM (word_mode,
9702 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9703 MEM_NOTRAP_P (mem) = 1;
9704 set_mem_alias_set (mem, set);
9705 emit_move_insn (mem,
9706 gen_rtx_REG (word_mode,
9707 x86_64_int_parameter_registers[i]));
9710 if (ix86_varargs_fpr_size)
9712 machine_mode smode;
9713 rtx_code_label *label;
9714 rtx test;
9716 /* Now emit code to save SSE registers. The AX parameter contains the number
9717 of SSE parameter registers used to call this function, though all we
9718 actually check here is the zero/non-zero status. */
9720 label = gen_label_rtx ();
9721 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9722 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9723 label));
9725 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9726 we used movdqa (i.e. TImode) instead? Perhaps even better would
9727 be if we could determine the real mode of the data, via a hook
9728 into pass_stdarg. Ignore all that for now. */
9729 smode = V4SFmode;
9730 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9731 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9733 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9734 if (max > X86_64_SSE_REGPARM_MAX)
9735 max = X86_64_SSE_REGPARM_MAX;
9737 for (i = cum->sse_regno; i < max; ++i)
9739 mem = plus_constant (Pmode, save_area,
9740 i * 16 + ix86_varargs_gpr_size);
9741 mem = gen_rtx_MEM (smode, mem);
9742 MEM_NOTRAP_P (mem) = 1;
9743 set_mem_alias_set (mem, set);
9744 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9746 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9749 emit_label (label);
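/* Editorial layout sketch, not part of the original source: with all
   registers live, the save area spilled above looks like

       offset   0 ..  47   %rdi %rsi %rdx %rcx %r8 %r9   (6 x 8 bytes)
       offset  48 .. 175   %xmm0 .. %xmm7                (8 x 16 bytes)

   %al carries the number of vector registers the caller used; only its
   zero/non-zero status is tested before the SSE spill loop.  */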
9753 static void
9754 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9756 alias_set_type set = get_varargs_alias_set ();
9757 int i;
9759 /* Reset to zero, as there might have been a sysv va_arg used
9760 before. */
9761 ix86_varargs_gpr_size = 0;
9762 ix86_varargs_fpr_size = 0;
9764 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9766 rtx reg, mem;
9768 mem = gen_rtx_MEM (Pmode,
9769 plus_constant (Pmode, virtual_incoming_args_rtx,
9770 i * UNITS_PER_WORD));
9771 MEM_NOTRAP_P (mem) = 1;
9772 set_mem_alias_set (mem, set);
9774 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9775 emit_move_insn (mem, reg);
9779 static void
9780 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9781 tree type, int *, int no_rtl)
9783 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9784 CUMULATIVE_ARGS next_cum;
9785 tree fntype;
9787 /* This argument doesn't appear to be used anymore. Which is good,
9788 because the old code here didn't suppress rtl generation. */
9789 gcc_assert (!no_rtl);
9791 if (!TARGET_64BIT)
9792 return;
9794 fntype = TREE_TYPE (current_function_decl);
9796 /* For varargs, we do not want to skip the dummy va_dcl argument.
9797 For stdargs, we do want to skip the last named argument. */
9798 next_cum = *cum;
9799 if (stdarg_p (fntype))
9800 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9801 true);
9803 if (cum->call_abi == MS_ABI)
9804 setup_incoming_varargs_ms_64 (&next_cum);
9805 else
9806 setup_incoming_varargs_64 (&next_cum);
9809 static void
9810 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9811 machine_mode mode,
9812 tree type,
9813 int *pretend_size ATTRIBUTE_UNUSED,
9814 int no_rtl)
9816 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9817 CUMULATIVE_ARGS next_cum;
9818 tree fntype;
9819 rtx save_area;
9820 int bnd_reg, i, max;
9822 gcc_assert (!no_rtl);
9824 /* Do nothing if we use plain pointer to argument area. */
9825 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9826 return;
9828 fntype = TREE_TYPE (current_function_decl);
9830 /* For varargs, we do not want to skip the dummy va_dcl argument.
9831 For stdargs, we do want to skip the last named argument. */
9832 next_cum = *cum;
9833 if (stdarg_p (fntype))
9834 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9835 true);
9836 save_area = frame_pointer_rtx;
9838 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9839 if (max > X86_64_REGPARM_MAX)
9840 max = X86_64_REGPARM_MAX;
9842 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9843 if (chkp_function_instrumented_p (current_function_decl))
9844 for (i = cum->regno; i < max; i++)
9846 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9847 rtx ptr = gen_rtx_REG (Pmode,
9848 x86_64_int_parameter_registers[i]);
9849 rtx bounds;
9851 if (bnd_reg <= LAST_BND_REG)
9852 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9853 else
9855 rtx ldx_addr =
9856 plus_constant (Pmode, arg_pointer_rtx,
9857 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9858 bounds = gen_reg_rtx (BNDmode);
9859 emit_insn (BNDmode == BND64mode
9860 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9861 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9864 emit_insn (BNDmode == BND64mode
9865 ? gen_bnd64_stx (addr, ptr, bounds)
9866 : gen_bnd32_stx (addr, ptr, bounds));
9868 bnd_reg++;
9873 /* Return true if TYPE is a va_list represented as a plain char pointer. */
9875 static bool
9876 is_va_list_char_pointer (tree type)
9878 tree canonic;
9880 /* For 32-bit it is always true. */
9881 if (!TARGET_64BIT)
9882 return true;
9883 canonic = ix86_canonical_va_list_type (type);
9884 return (canonic == ms_va_list_type_node
9885 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9888 /* Implement va_start. */
9890 static void
9891 ix86_va_start (tree valist, rtx nextarg)
9893 HOST_WIDE_INT words, n_gpr, n_fpr;
9894 tree f_gpr, f_fpr, f_ovf, f_sav;
9895 tree gpr, fpr, ovf, sav, t;
9896 tree type;
9897 rtx ovf_rtx;
9899 if (flag_split_stack
9900 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9902 unsigned int scratch_regno;
9904 /* When we are splitting the stack, we can't refer to the stack
9905 arguments using internal_arg_pointer, because they may be on
9906 the old stack. The split stack prologue will arrange to
9907 leave a pointer to the old stack arguments in a scratch
9908 register, which we here copy to a pseudo-register. The split
9909 stack prologue can't set the pseudo-register directly because
9910 it (the prologue) runs before any registers have been saved. */
9912 scratch_regno = split_stack_prologue_scratch_regno ();
9913 if (scratch_regno != INVALID_REGNUM)
9915 rtx reg;
9916 rtx_insn *seq;
9918 reg = gen_reg_rtx (Pmode);
9919 cfun->machine->split_stack_varargs_pointer = reg;
9921 start_sequence ();
9922 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9923 seq = get_insns ();
9924 end_sequence ();
9926 push_topmost_sequence ();
9927 emit_insn_after (seq, entry_of_function ());
9928 pop_topmost_sequence ();
9932 /* Only the 64-bit target needs something special. */
9933 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9935 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9936 std_expand_builtin_va_start (valist, nextarg);
9937 else
9939 rtx va_r, next;
9941 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9942 next = expand_binop (ptr_mode, add_optab,
9943 cfun->machine->split_stack_varargs_pointer,
9944 crtl->args.arg_offset_rtx,
9945 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9946 convert_move (va_r, next, 0);
9948 /* Store zero bounds for va_list. */
9949 if (chkp_function_instrumented_p (current_function_decl))
9950 chkp_expand_bounds_reset_for_mem (valist,
9951 make_tree (TREE_TYPE (valist),
9952 next));
9955 return;
9958 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9959 f_fpr = DECL_CHAIN (f_gpr);
9960 f_ovf = DECL_CHAIN (f_fpr);
9961 f_sav = DECL_CHAIN (f_ovf);
9963 valist = build_simple_mem_ref (valist);
9964 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9965 /* The following should be folded into the MEM_REF offset. */
9966 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9967 f_gpr, NULL_TREE);
9968 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9969 f_fpr, NULL_TREE);
9970 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9971 f_ovf, NULL_TREE);
9972 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9973 f_sav, NULL_TREE);
9975 /* Count number of gp and fp argument registers used. */
9976 words = crtl->args.info.words;
9977 n_gpr = crtl->args.info.regno;
9978 n_fpr = crtl->args.info.sse_regno;
9980 if (cfun->va_list_gpr_size)
9982 type = TREE_TYPE (gpr);
9983 t = build2 (MODIFY_EXPR, type,
9984 gpr, build_int_cst (type, n_gpr * 8));
9985 TREE_SIDE_EFFECTS (t) = 1;
9986 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9989 if (TARGET_SSE && cfun->va_list_fpr_size)
9991 type = TREE_TYPE (fpr);
9992 t = build2 (MODIFY_EXPR, type, fpr,
9993 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9994 TREE_SIDE_EFFECTS (t) = 1;
9995 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9998 /* Find the overflow area. */
9999 type = TREE_TYPE (ovf);
10000 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10001 ovf_rtx = crtl->args.internal_arg_pointer;
10002 else
10003 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
10004 t = make_tree (type, ovf_rtx);
10005 if (words != 0)
10006 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
10008 /* Store zero bounds for overflow area pointer. */
10009 if (chkp_function_instrumented_p (current_function_decl))
10010 chkp_expand_bounds_reset_for_mem (ovf, t);
10012 t = build2 (MODIFY_EXPR, type, ovf, t);
10013 TREE_SIDE_EFFECTS (t) = 1;
10014 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10016 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
10018 /* Find the register save area.
10019 The function prologue saves it right above the stack frame. */
10020 type = TREE_TYPE (sav);
10021 t = make_tree (type, frame_pointer_rtx);
10022 if (!ix86_varargs_gpr_size)
10023 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
10025 /* Store zero bounds for save area pointer. */
10026 if (chkp_function_instrumented_p (current_function_decl))
10027 chkp_expand_bounds_reset_for_mem (sav, t);
10029 t = build2 (MODIFY_EXPR, type, sav, t);
10030 TREE_SIDE_EFFECTS (t) = 1;
10031 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
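/* Editorial sketch, not part of the original source: for the SysV case the
   expansion above initializes the va_list roughly as

       ap->gp_offset = <named GPR args> * 8;         // 0 .. 48
       ap->fp_offset = 48 + <named SSE args> * 16;   // 48 .. 176
       ap->overflow_arg_area = <first stack-passed argument>;
       ap->reg_save_area = <save area laid out by the prologue>;

   the 32-bit and MS-ABI char* cases were handled earlier via
   std_expand_builtin_va_start.  */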
10035 /* Implement va_arg. */
10037 static tree
10038 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
10039 gimple_seq *post_p)
10041 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
10042 tree f_gpr, f_fpr, f_ovf, f_sav;
10043 tree gpr, fpr, ovf, sav, t;
10044 int size, rsize;
10045 tree lab_false, lab_over = NULL_TREE;
10046 tree addr, t2;
10047 rtx container;
10048 int indirect_p = 0;
10049 tree ptrtype;
10050 machine_mode nat_mode;
10051 unsigned int arg_boundary;
10053 /* Only the 64-bit target needs something special. */
10054 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10055 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10057 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10058 f_fpr = DECL_CHAIN (f_gpr);
10059 f_ovf = DECL_CHAIN (f_fpr);
10060 f_sav = DECL_CHAIN (f_ovf);
10062 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10063 valist, f_gpr, NULL_TREE);
10065 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10066 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10067 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10069 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10070 if (indirect_p)
10071 type = build_pointer_type (type);
10072 size = arg_int_size_in_bytes (type);
10073 rsize = CEIL (size, UNITS_PER_WORD);
10075 nat_mode = type_natural_mode (type, NULL, false);
10076 switch (nat_mode)
10078 case E_V8SFmode:
10079 case E_V8SImode:
10080 case E_V32QImode:
10081 case E_V16HImode:
10082 case E_V4DFmode:
10083 case E_V4DImode:
10084 case E_V16SFmode:
10085 case E_V16SImode:
10086 case E_V64QImode:
10087 case E_V32HImode:
10088 case E_V8DFmode:
10089 case E_V8DImode:
10090 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
10091 if (!TARGET_64BIT_MS_ABI)
10093 container = NULL;
10094 break;
10096 /* FALLTHRU */
10098 default:
10099 container = construct_container (nat_mode, TYPE_MODE (type),
10100 type, 0, X86_64_REGPARM_MAX,
10101 X86_64_SSE_REGPARM_MAX, intreg,
10103 break;
10106 /* Pull the value out of the saved registers. */
10108 addr = create_tmp_var (ptr_type_node, "addr");
10110 if (container)
10112 int needed_intregs, needed_sseregs;
10113 bool need_temp;
10114 tree int_addr, sse_addr;
10116 lab_false = create_artificial_label (UNKNOWN_LOCATION);
10117 lab_over = create_artificial_label (UNKNOWN_LOCATION);
10119 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10121 need_temp = (!REG_P (container)
10122 && ((needed_intregs && TYPE_ALIGN (type) > 64)
10123 || TYPE_ALIGN (type) > 128));
10125 /* In case we are passing a structure, verify that it is a consecutive block
10126 in the register save area. If not, we need to do moves. */
10127 if (!need_temp && !REG_P (container))
10129 /* Verify that all registers are strictly consecutive */
10130 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10132 int i;
10134 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10136 rtx slot = XVECEXP (container, 0, i);
10137 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10138 || INTVAL (XEXP (slot, 1)) != i * 16)
10139 need_temp = true;
10142 else
10144 int i;
10146 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10148 rtx slot = XVECEXP (container, 0, i);
10149 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10150 || INTVAL (XEXP (slot, 1)) != i * 8)
10151 need_temp = true;
10155 if (!need_temp)
10157 int_addr = addr;
10158 sse_addr = addr;
10160 else
10162 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10163 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10166 /* First ensure that we fit completely in registers. */
10167 if (needed_intregs)
10169 t = build_int_cst (TREE_TYPE (gpr),
10170 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10171 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10172 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10173 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10174 gimplify_and_add (t, pre_p);
10176 if (needed_sseregs)
10178 t = build_int_cst (TREE_TYPE (fpr),
10179 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10180 + X86_64_REGPARM_MAX * 8);
10181 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10182 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10183 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10184 gimplify_and_add (t, pre_p);
10187 /* Compute index to start of area used for integer regs. */
10188 if (needed_intregs)
10190 /* int_addr = gpr + sav; */
10191 t = fold_build_pointer_plus (sav, gpr);
10192 gimplify_assign (int_addr, t, pre_p);
10194 if (needed_sseregs)
10196 /* sse_addr = fpr + sav; */
10197 t = fold_build_pointer_plus (sav, fpr);
10198 gimplify_assign (sse_addr, t, pre_p);
10200 if (need_temp)
10202 int i, prev_size = 0;
10203 tree temp = create_tmp_var (type, "va_arg_tmp");
10205 /* addr = &temp; */
10206 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10207 gimplify_assign (addr, t, pre_p);
10209 for (i = 0; i < XVECLEN (container, 0); i++)
10211 rtx slot = XVECEXP (container, 0, i);
10212 rtx reg = XEXP (slot, 0);
10213 machine_mode mode = GET_MODE (reg);
10214 tree piece_type;
10215 tree addr_type;
10216 tree daddr_type;
10217 tree src_addr, src;
10218 int src_offset;
10219 tree dest_addr, dest;
10220 int cur_size = GET_MODE_SIZE (mode);
10222 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10223 prev_size = INTVAL (XEXP (slot, 1));
10224 if (prev_size + cur_size > size)
10226 cur_size = size - prev_size;
10227 unsigned int nbits = cur_size * BITS_PER_UNIT;
10228 if (!int_mode_for_size (nbits, 1).exists (&mode))
10229 mode = QImode;
10231 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10232 if (mode == GET_MODE (reg))
10233 addr_type = build_pointer_type (piece_type);
10234 else
10235 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10236 true);
10237 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10238 true);
10240 if (SSE_REGNO_P (REGNO (reg)))
10242 src_addr = sse_addr;
10243 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10245 else
10247 src_addr = int_addr;
10248 src_offset = REGNO (reg) * 8;
10250 src_addr = fold_convert (addr_type, src_addr);
10251 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10253 dest_addr = fold_convert (daddr_type, addr);
10254 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10255 if (cur_size == GET_MODE_SIZE (mode))
10257 src = build_va_arg_indirect_ref (src_addr);
10258 dest = build_va_arg_indirect_ref (dest_addr);
10260 gimplify_assign (dest, src, pre_p);
10262 else
10264 tree copy
10265 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10266 3, dest_addr, src_addr,
10267 size_int (cur_size));
10268 gimplify_and_add (copy, pre_p);
10270 prev_size += cur_size;
10274 if (needed_intregs)
10276 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10277 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10278 gimplify_assign (gpr, t, pre_p);
10281 if (needed_sseregs)
10283 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10284 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10285 gimplify_assign (unshare_expr (fpr), t, pre_p);
10288 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10290 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10293 /* ... otherwise out of the overflow area. */
10295 /* When we align a parameter on the stack for the caller, if its
10296 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10297 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
10298 here with the caller. */
10299 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10300 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10301 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10303 /* Care for on-stack alignment if needed. */
10304 if (arg_boundary <= 64 || size == 0)
10305 t = ovf;
10306 else
10308 HOST_WIDE_INT align = arg_boundary / 8;
10309 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10310 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10311 build_int_cst (TREE_TYPE (t), -align));
10314 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10315 gimplify_assign (addr, t, pre_p);
10317 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10318 gimplify_assign (unshare_expr (ovf), t, pre_p);
10320 if (container)
10321 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10323 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10324 addr = fold_convert (ptrtype, addr);
10326 if (indirect_p)
10327 addr = build_va_arg_indirect_ref (addr);
10328 return build_va_arg_indirect_ref (addr);
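/* Editorial sketch, not part of the original source: for a plain integer
   argument the gimple emitted above behaves like

       if (ap->gp_offset < 6 * 8)                  // still room in the GPRs
         {
           addr = ap->reg_save_area + ap->gp_offset;
           ap->gp_offset += 8;
         }
       else                                        // take it from the stack
         {
           addr = ap->overflow_arg_area;           // aligned first if needed
           ap->overflow_arg_area += 8;
         }
       result = *(int *) addr;

   SSE-class arguments use fp_offset and 16-byte slots instead, and values
   split across register classes are first copied into a temporary.  */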
10331 /* Return true if OPNUM's MEM should be matched
10332 in movabs* patterns. */
10334 bool
10335 ix86_check_movabs (rtx insn, int opnum)
10337 rtx set, mem;
10339 set = PATTERN (insn);
10340 if (GET_CODE (set) == PARALLEL)
10341 set = XVECEXP (set, 0, 0);
10342 gcc_assert (GET_CODE (set) == SET);
10343 mem = XEXP (set, opnum);
10344 while (SUBREG_P (mem))
10345 mem = SUBREG_REG (mem);
10346 gcc_assert (MEM_P (mem));
10347 return volatile_ok || !MEM_VOLATILE_P (mem);
10350 /* Return false if INSN contains a MEM with a non-default address space. */
10351 bool
10352 ix86_check_no_addr_space (rtx insn)
10354 subrtx_var_iterator::array_type array;
10355 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10357 rtx x = *iter;
10358 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10359 return false;
10361 return true;
10364 /* Initialize the table of extra 80387 mathematical constants. */
10366 static void
10367 init_ext_80387_constants (void)
10369 static const char * cst[5] =
10371 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10372 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10373 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10374 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10375 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10377 int i;
10379 for (i = 0; i < 5; i++)
10381 real_from_string (&ext_80387_constants_table[i], cst[i]);
10382 /* Ensure each constant is rounded to XFmode precision. */
10383 real_convert (&ext_80387_constants_table[i],
10384 XFmode, &ext_80387_constants_table[i]);
10387 ext_80387_constants_init = 1;
10390 /* Return non-zero if the constant is something that
10391 can be loaded with a special instruction. */
10393 int
10394 standard_80387_constant_p (rtx x)
10396 machine_mode mode = GET_MODE (x);
10398 const REAL_VALUE_TYPE *r;
10400 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10401 return -1;
10403 if (x == CONST0_RTX (mode))
10404 return 1;
10405 if (x == CONST1_RTX (mode))
10406 return 2;
10408 r = CONST_DOUBLE_REAL_VALUE (x);
10410 /* For XFmode constants, try to find a special 80387 instruction when
10411 optimizing for size or on those CPUs that benefit from them. */
10412 if (mode == XFmode
10413 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10415 int i;
10417 if (! ext_80387_constants_init)
10418 init_ext_80387_constants ();
10420 for (i = 0; i < 5; i++)
10421 if (real_identical (r, &ext_80387_constants_table[i]))
10422 return i + 3;
10425 /* Load of the constant -0.0 or -1.0 will be split as
10426 fldz;fchs or fld1;fchs sequence. */
10427 if (real_isnegzero (r))
10428 return 8;
10429 if (real_identical (r, &dconstm1))
10430 return 9;
10432 return 0;
10435 /* Return the opcode of the special instruction to be used to load
10436 the constant X. */
10438 const char *
10439 standard_80387_constant_opcode (rtx x)
10441 switch (standard_80387_constant_p (x))
10443 case 1:
10444 return "fldz";
10445 case 2:
10446 return "fld1";
10447 case 3:
10448 return "fldlg2";
10449 case 4:
10450 return "fldln2";
10451 case 5:
10452 return "fldl2e";
10453 case 6:
10454 return "fldl2t";
10455 case 7:
10456 return "fldpi";
10457 case 8:
10458 case 9:
10459 return "#";
10460 default:
10461 gcc_unreachable ();
10465 /* Return the CONST_DOUBLE representing the 80387 constant that is
10466 loaded by the specified special instruction. The argument IDX
10467 matches the return value from standard_80387_constant_p. */
10469 rtx
10470 standard_80387_constant_rtx (int idx)
10472 int i;
10474 if (! ext_80387_constants_init)
10475 init_ext_80387_constants ();
10477 switch (idx)
10479 case 3:
10480 case 4:
10481 case 5:
10482 case 6:
10483 case 7:
10484 i = idx - 3;
10485 break;
10487 default:
10488 gcc_unreachable ();
10491 return const_double_from_real_value (ext_80387_constants_table[i],
10492 XFmode);
10495 /* Return 1 if X is all zero bits and 2 if X is all one bits
10496 in a supported SSE/AVX vector mode. */
10498 int
10499 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10501 machine_mode mode;
10503 if (!TARGET_SSE)
10504 return 0;
10506 mode = GET_MODE (x);
10508 if (x == const0_rtx || const0_operand (x, mode))
10509 return 1;
10511 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10513 /* VOIDmode integer constant, get mode from the predicate. */
10514 if (mode == VOIDmode)
10515 mode = pred_mode;
10517 switch (GET_MODE_SIZE (mode))
10519 case 64:
10520 if (TARGET_AVX512F)
10521 return 2;
10522 break;
10523 case 32:
10524 if (TARGET_AVX2)
10525 return 2;
10526 break;
10527 case 16:
10528 if (TARGET_SSE2)
10529 return 2;
10530 break;
10531 case 0:
10532 /* VOIDmode */
10533 gcc_unreachable ();
10534 default:
10535 break;
10539 return 0;
10542 /* Return the opcode of the special instruction to be used to load
10543 the constant operands[1] into operands[0]. */
10545 const char *
10546 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10548 machine_mode mode;
10549 rtx x = operands[1];
10551 gcc_assert (TARGET_SSE);
10553 mode = GET_MODE (x);
10555 if (x == const0_rtx || const0_operand (x, mode))
10557 switch (get_attr_mode (insn))
10559 case MODE_TI:
10560 if (!EXT_REX_SSE_REG_P (operands[0]))
10561 return "%vpxor\t%0, %d0";
10562 /* FALLTHRU */
10563 case MODE_XI:
10564 case MODE_OI:
10565 if (EXT_REX_SSE_REG_P (operands[0]))
10566 return (TARGET_AVX512VL
10567 ? "vpxord\t%x0, %x0, %x0"
10568 : "vpxord\t%g0, %g0, %g0");
10569 return "vpxor\t%x0, %x0, %x0";
10571 case MODE_V2DF:
10572 if (!EXT_REX_SSE_REG_P (operands[0]))
10573 return "%vxorpd\t%0, %d0";
10574 /* FALLTHRU */
10575 case MODE_V8DF:
10576 case MODE_V4DF:
10577 if (!EXT_REX_SSE_REG_P (operands[0]))
10578 return "vxorpd\t%x0, %x0, %x0";
10579 else if (TARGET_AVX512DQ)
10580 return (TARGET_AVX512VL
10581 ? "vxorpd\t%x0, %x0, %x0"
10582 : "vxorpd\t%g0, %g0, %g0");
10583 else
10584 return (TARGET_AVX512VL
10585 ? "vpxorq\t%x0, %x0, %x0"
10586 : "vpxorq\t%g0, %g0, %g0");
10588 case MODE_V4SF:
10589 if (!EXT_REX_SSE_REG_P (operands[0]))
10590 return "%vxorps\t%0, %d0";
10591 /* FALLTHRU */
10592 case MODE_V16SF:
10593 case MODE_V8SF:
10594 if (!EXT_REX_SSE_REG_P (operands[0]))
10595 return "vxorps\t%x0, %x0, %x0";
10596 else if (TARGET_AVX512DQ)
10597 return (TARGET_AVX512VL
10598 ? "vxorps\t%x0, %x0, %x0"
10599 : "vxorps\t%g0, %g0, %g0");
10600 else
10601 return (TARGET_AVX512VL
10602 ? "vpxord\t%x0, %x0, %x0"
10603 : "vpxord\t%g0, %g0, %g0");
10605 default:
10606 gcc_unreachable ();
10609 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10611 enum attr_mode insn_mode = get_attr_mode (insn);
10613 switch (insn_mode)
10615 case MODE_XI:
10616 case MODE_V8DF:
10617 case MODE_V16SF:
10618 gcc_assert (TARGET_AVX512F);
10619 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10621 case MODE_OI:
10622 case MODE_V4DF:
10623 case MODE_V8SF:
10624 gcc_assert (TARGET_AVX2);
10625 /* FALLTHRU */
10626 case MODE_TI:
10627 case MODE_V2DF:
10628 case MODE_V4SF:
10629 gcc_assert (TARGET_SSE2);
10630 if (!EXT_REX_SSE_REG_P (operands[0]))
10631 return (TARGET_AVX
10632 ? "vpcmpeqd\t%0, %0, %0"
10633 : "pcmpeqd\t%0, %0");
10634 else if (TARGET_AVX512VL)
10635 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10636 else
10637 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10639 default:
10640 gcc_unreachable ();
10644 gcc_unreachable ();
10647 /* Returns true if INSN can be transformed from a memory load
10648 to a supported FP constant load. */
10650 bool
10651 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10653 rtx src = find_constant_src (insn);
10655 gcc_assert (REG_P (dst));
10657 if (src == NULL
10658 || (SSE_REGNO_P (REGNO (dst))
10659 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10660 || (STACK_REGNO_P (REGNO (dst))
10661 && standard_80387_constant_p (src) < 1))
10662 return false;
10664 return true;
10667 /* Returns true if OP contains a symbol reference */
10669 bool
10670 symbolic_reference_mentioned_p (rtx op)
10672 const char *fmt;
10673 int i;
10675 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10676 return true;
10678 fmt = GET_RTX_FORMAT (GET_CODE (op));
10679 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10681 if (fmt[i] == 'E')
10683 int j;
10685 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10686 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10687 return true;
10690 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10691 return true;
10694 return false;
10697 /* Return true if it is appropriate to emit `ret' instructions in the
10698 body of a function. Do this only if the epilogue is simple, needing a
10699 couple of insns. Prior to reloading, we can't tell how many registers
10700 must be saved, so return false then. Return false if there is no frame
10701 marker to de-allocate. */
10703 bool
10704 ix86_can_use_return_insn_p (void)
10706 if (ix86_function_naked (current_function_decl))
10707 return false;
10709 /* Don't use `ret' instruction in interrupt handler. */
10710 if (! reload_completed
10711 || frame_pointer_needed
10712 || cfun->machine->func_type != TYPE_NORMAL)
10713 return 0;
10715 /* Don't allow more than 32k pop, since that's all we can do
10716 with one instruction. */
10717 if (crtl->args.pops_args && crtl->args.size >= 32768)
10718 return 0;
10720 struct ix86_frame &frame = cfun->machine->frame;
10721 return (frame.stack_pointer_offset == UNITS_PER_WORD
10722 && (frame.nregs + frame.nsseregs) == 0);
10725 /* Value should be nonzero if functions must have frame pointers.
10726 Zero means the frame pointer need not be set up (and parms may
10727 be accessed via the stack pointer) in functions that seem suitable. */
10729 static bool
10730 ix86_frame_pointer_required (void)
10732 /* If we accessed previous frames, then the generated code expects
10733 to be able to access the saved ebp value in our frame. */
10734 if (cfun->machine->accesses_prev_frame)
10735 return true;
10737 /* Several x86 OSes need a frame pointer for other reasons,
10738 usually pertaining to setjmp. */
10739 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10740 return true;
10742 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
10743 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10744 return true;
10746 /* Win64 SEH: very large frames need a frame pointer, as the maximum stack
10747 allocation is 4GB. */
10748 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10749 return true;
10751 /* SSE saves require a frame pointer when the stack is misaligned. */
10752 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10753 return true;
10755 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10756 turns off the frame pointer by default. Turn it back on now if
10757 we've not got a leaf function. */
10758 if (TARGET_OMIT_LEAF_FRAME_POINTER
10759 && (!crtl->is_leaf
10760 || ix86_current_function_calls_tls_descriptor))
10761 return true;
10763 if (crtl->profile && !flag_fentry)
10764 return true;
10766 return false;
10769 /* Record that the current function accesses previous call frames. */
10771 void
10772 ix86_setup_frame_addresses (void)
10774 cfun->machine->accesses_prev_frame = 1;
10777 #ifndef USE_HIDDEN_LINKONCE
10778 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10779 # define USE_HIDDEN_LINKONCE 1
10780 # else
10781 # define USE_HIDDEN_LINKONCE 0
10782 # endif
10783 #endif
10785 /* Label count for call and return thunks. It is used to make unique
10786 labels in call and return thunks. */
10787 static int indirectlabelno;
10789 /* True if call and return thunk functions are needed. */
10790 static bool indirect_thunk_needed = false;
10791 /* True if call and return thunk functions with the BND prefix are
10792 needed. */
10793 static bool indirect_thunk_bnd_needed = false;
10795 /* Bit mask of integer registers which contain the branch target, used
10796 by call and return thunk functions. */
10797 static int indirect_thunks_used;
10798 /* Bit mask of integer registers which contain the branch target, used
10799 by call and return thunk functions with the BND prefix. */
10800 static int indirect_thunks_bnd_used;
10802 #ifndef INDIRECT_LABEL
10803 # define INDIRECT_LABEL "LIND"
10804 #endif
10806 /* Fills in the label name that should be used for the indirect thunk. */
10808 static void
10809 indirect_thunk_name (char name[32], unsigned int regno,
10810 bool need_bnd_p, bool ret_p)
10812 if (regno != INVALID_REGNUM && ret_p)
10813 gcc_unreachable ();
10815 if (USE_HIDDEN_LINKONCE)
10817 const char *bnd = need_bnd_p ? "_bnd" : "";
10818 if (regno != INVALID_REGNUM)
10820 const char *reg_prefix;
10821 if (LEGACY_INT_REGNO_P (regno))
10822 reg_prefix = TARGET_64BIT ? "r" : "e";
10823 else
10824 reg_prefix = "";
10825 sprintf (name, "__x86_indirect_thunk%s_%s%s",
10826 bnd, reg_prefix, reg_names[regno]);
10828 else
10830 const char *ret = ret_p ? "return" : "indirect";
10831 sprintf (name, "__x86_%s_thunk%s", ret, bnd);
10834 else
10836 if (regno != INVALID_REGNUM)
10838 if (need_bnd_p)
10839 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10840 else
10841 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10843 else
10845 if (ret_p)
10847 if (need_bnd_p)
10848 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10849 else
10850 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10852 else
10854 if (need_bnd_p)
10855 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10856 else
10857 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
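/* Editorial examples, not part of the original source: names produced above
   when USE_HIDDEN_LINKONCE is in effect.

       regno == INVALID_REGNUM, !ret_p   ->  __x86_indirect_thunk
       regno == INVALID_REGNUM, ret_p    ->  __x86_return_thunk
       regno == AX_REG, 64-bit           ->  __x86_indirect_thunk_rax
       same, with the BND prefix         ->  __x86_indirect_thunk_bnd_rax

   Without hidden linkonce support, internal LIT/LITR/LRT-style labels are
   generated instead.  */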
10863 /* Output a call and return thunk for indirect branch. If BND_P is
10864 true, the BND prefix is needed. If REGNO != -1, the function
10865 address is in REGNO and the call and return thunk looks like:
10867 call L2
10868 L1:
10869 pause
10870 lfence
10871 jmp L1
10872 L2:
10873 mov %REG, (%sp)
10874 ret
10876 Otherwise, the function address is on the top of stack and the
10877 call and return thunk looks like:
10879 call L2
10880 L1:
10881 pause
10882 lfence
10883 jmp L1
10884 L2:
10885 lea WORD_SIZE(%sp), %sp
10886 ret
10887 */
10889 static void
10890 output_indirect_thunk (bool need_bnd_p, unsigned int regno)
10892 char indirectlabel1[32];
10893 char indirectlabel2[32];
10895 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10896 indirectlabelno++);
10897 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10898 indirectlabelno++);
10900 /* Call */
10901 if (need_bnd_p)
10902 fputs ("\tbnd call\t", asm_out_file);
10903 else
10904 fputs ("\tcall\t", asm_out_file);
10905 assemble_name_raw (asm_out_file, indirectlabel2);
10906 fputc ('\n', asm_out_file);
10908 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10910 /* AMD and Intel CPUs each prefer a different instruction as the loop filler.
10911 Using both pause + lfence is a compromise solution. */
10912 fprintf (asm_out_file, "\tpause\n\tlfence\n");
10914 /* Jump. */
10915 fputs ("\tjmp\t", asm_out_file);
10916 assemble_name_raw (asm_out_file, indirectlabel1);
10917 fputc ('\n', asm_out_file);
10919 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
10921 if (regno != INVALID_REGNUM)
10923 /* MOV. */
10924 rtx xops[2];
10925 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
10926 xops[1] = gen_rtx_REG (word_mode, regno);
10927 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
10929 else
10931 /* LEA. */
10932 rtx xops[2];
10933 xops[0] = stack_pointer_rtx;
10934 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10935 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
10938 if (need_bnd_p)
10939 fputs ("\tbnd ret\n", asm_out_file);
10940 else
10941 fputs ("\tret\n", asm_out_file);
10944 /* Output a function with a call and return thunk for indirect branch.
10945 If BND_P is true, the BND prefix is needed. If REGNO != INVALID_REGNUM,
10946 the function address is in REGNO. Otherwise, the function address is
10947 on the top of stack. */
10949 static void
10950 output_indirect_thunk_function (bool need_bnd_p, unsigned int regno)
10952 char name[32];
10953 tree decl;
10955 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
10956 indirect_thunk_name (name, regno, need_bnd_p, false);
10957 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10958 get_identifier (name),
10959 build_function_type_list (void_type_node, NULL_TREE));
10960 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10961 NULL_TREE, void_type_node);
10962 TREE_PUBLIC (decl) = 1;
10963 TREE_STATIC (decl) = 1;
10964 DECL_IGNORED_P (decl) = 1;
10966 #if TARGET_MACHO
10967 if (TARGET_MACHO)
10969 switch_to_section (darwin_sections[picbase_thunk_section]);
10970 fputs ("\t.weak_definition\t", asm_out_file);
10971 assemble_name (asm_out_file, name);
10972 fputs ("\n\t.private_extern\t", asm_out_file);
10973 assemble_name (asm_out_file, name);
10974 putc ('\n', asm_out_file);
10975 ASM_OUTPUT_LABEL (asm_out_file, name);
10976 DECL_WEAK (decl) = 1;
10978 else
10979 #endif
10980 if (USE_HIDDEN_LINKONCE)
10982 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10984 targetm.asm_out.unique_section (decl, 0);
10985 switch_to_section (get_named_section (decl, NULL, 0));
10987 targetm.asm_out.globalize_label (asm_out_file, name);
10988 fputs ("\t.hidden\t", asm_out_file);
10989 assemble_name (asm_out_file, name);
10990 putc ('\n', asm_out_file);
10991 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10993 else
10995 switch_to_section (text_section);
10996 ASM_OUTPUT_LABEL (asm_out_file, name);
10999 if (regno == INVALID_REGNUM)
11001 /* Create alias for __x86.return_thunk/__x86.return_thunk_bnd. */
11002 char alias[32];
11004 indirect_thunk_name (alias, regno, need_bnd_p, true);
11005 #if TARGET_MACHO
11006 if (TARGET_MACHO)
11008 fputs ("\t.weak_definition\t", asm_out_file);
11009 assemble_name (asm_out_file, alias);
11010 fputs ("\n\t.private_extern\t", asm_out_file);
11011 assemble_name (asm_out_file, alias);
11012 putc ('\n', asm_out_file);
11013 ASM_OUTPUT_LABEL (asm_out_file, alias);
11015 #else
11016 ASM_OUTPUT_DEF (asm_out_file, alias, name);
11017 if (USE_HIDDEN_LINKONCE)
11019 fputs ("\t.globl\t", asm_out_file);
11020 assemble_name (asm_out_file, alias);
11021 putc ('\n', asm_out_file);
11022 fputs ("\t.hidden\t", asm_out_file);
11023 assemble_name (asm_out_file, alias);
11024 putc ('\n', asm_out_file);
11026 #endif
11029 DECL_INITIAL (decl) = make_node (BLOCK);
11030 current_function_decl = decl;
11031 allocate_struct_function (decl, false);
11032 init_function_start (decl);
11033 /* We're about to hide the function body from callees of final_* by
11034 emitting it directly; tell them we're a thunk, if they care. */
11035 cfun->is_thunk = true;
11036 first_function_block_is_cold = false;
11037 /* Make sure unwind info is emitted for the thunk if needed. */
11038 final_start_function (emit_barrier (), asm_out_file, 1);
11040 output_indirect_thunk (need_bnd_p, regno);
11042 final_end_function ();
11043 init_insn_lengths ();
11044 free_after_compilation (cfun);
11045 set_cfun (NULL);
11046 current_function_decl = NULL;
11049 static int pic_labels_used;
11051 /* Fills in the label name that should be used for a pc thunk for
11052 the given register. */
11054 static void
11055 get_pc_thunk_name (char name[32], unsigned int regno)
11057 gcc_assert (!TARGET_64BIT);
11059 if (USE_HIDDEN_LINKONCE)
11060 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11061 else
11062 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11066 /* This function generates code for -fpic that loads %ebx with
11067 the return address of the caller and then returns. */
11069 static void
11070 ix86_code_end (void)
11072 rtx xops[2];
11073 unsigned int regno;
11075 if (indirect_thunk_needed)
11076 output_indirect_thunk_function (false, INVALID_REGNUM);
11077 if (indirect_thunk_bnd_needed)
11078 output_indirect_thunk_function (true, INVALID_REGNUM);
11080 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11082 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11083 if ((indirect_thunks_used & (1 << i)))
11084 output_indirect_thunk_function (false, regno);
11086 if ((indirect_thunks_bnd_used & (1 << i)))
11087 output_indirect_thunk_function (true, regno);
11090 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11092 char name[32];
11093 tree decl;
11095 if ((indirect_thunks_used & (1 << regno)))
11096 output_indirect_thunk_function (false, regno);
11098 if ((indirect_thunks_bnd_used & (1 << regno)))
11099 output_indirect_thunk_function (true, regno);
11101 if (!(pic_labels_used & (1 << regno)))
11102 continue;
11104 get_pc_thunk_name (name, regno);
11106 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11107 get_identifier (name),
11108 build_function_type_list (void_type_node, NULL_TREE));
11109 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11110 NULL_TREE, void_type_node);
11111 TREE_PUBLIC (decl) = 1;
11112 TREE_STATIC (decl) = 1;
11113 DECL_IGNORED_P (decl) = 1;
11115 #if TARGET_MACHO
11116 if (TARGET_MACHO)
11118 switch_to_section (darwin_sections[picbase_thunk_section]);
11119 fputs ("\t.weak_definition\t", asm_out_file);
11120 assemble_name (asm_out_file, name);
11121 fputs ("\n\t.private_extern\t", asm_out_file);
11122 assemble_name (asm_out_file, name);
11123 putc ('\n', asm_out_file);
11124 ASM_OUTPUT_LABEL (asm_out_file, name);
11125 DECL_WEAK (decl) = 1;
11127 else
11128 #endif
11129 if (USE_HIDDEN_LINKONCE)
11131 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11133 targetm.asm_out.unique_section (decl, 0);
11134 switch_to_section (get_named_section (decl, NULL, 0));
11136 targetm.asm_out.globalize_label (asm_out_file, name);
11137 fputs ("\t.hidden\t", asm_out_file);
11138 assemble_name (asm_out_file, name);
11139 putc ('\n', asm_out_file);
11140 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11142 else
11144 switch_to_section (text_section);
11145 ASM_OUTPUT_LABEL (asm_out_file, name);
11148 DECL_INITIAL (decl) = make_node (BLOCK);
11149 current_function_decl = decl;
11150 allocate_struct_function (decl, false);
11151 init_function_start (decl);
11152 /* We're about to hide the function body from callees of final_* by
11153 emitting it directly; tell them we're a thunk, if they care. */
11154 cfun->is_thunk = true;
11155 first_function_block_is_cold = false;
11156 /* Make sure unwind info is emitted for the thunk if needed. */
11157 final_start_function (emit_barrier (), asm_out_file, 1);
11159 /* Pad stack IP move with 4 instructions (two NOPs count
11160 as one instruction). */
11161 if (TARGET_PAD_SHORT_FUNCTION)
11163 int i = 8;
11165 while (i--)
11166 fputs ("\tnop\n", asm_out_file);
11169 xops[0] = gen_rtx_REG (Pmode, regno);
11170 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11171 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11172 output_asm_insn ("%!ret", NULL);
11173 final_end_function ();
11174 init_insn_lengths ();
11175 free_after_compilation (cfun);
11176 set_cfun (NULL);
11177 current_function_decl = NULL;
11180 if (flag_split_stack)
11181 file_end_indicate_split_stack ();
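/* Editorial sketch, not part of the original source: each
   __x86.get_pc_thunk.REG body emitted by the loop above is simply

       mov    (%esp), %REG
       ret

   copying the thunk's own return address (the caller's next PC) into REG,
   which 32-bit PIC code then turns into the GOT pointer (see
   output_set_got below).  */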
11184 /* Emit code for the SET_GOT patterns. */
11186 const char *
11187 output_set_got (rtx dest, rtx label)
11189 rtx xops[3];
11191 xops[0] = dest;
11193 if (TARGET_VXWORKS_RTP && flag_pic)
11195 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11196 xops[2] = gen_rtx_MEM (Pmode,
11197 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11198 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11200 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11201 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11202 an unadorned address. */
11203 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11204 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11205 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11206 return "";
11209 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11211 if (flag_pic)
11213 char name[32];
11214 get_pc_thunk_name (name, REGNO (dest));
11215 pic_labels_used |= 1 << REGNO (dest);
11217 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11218 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11219 output_asm_insn ("%!call\t%X2", xops);
11221 #if TARGET_MACHO
11222 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11223 This is what will be referenced by the Mach-O PIC subsystem. */
11224 if (machopic_should_output_picbase_label () || !label)
11225 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11227 /* When we are restoring the pic base at the site of a nonlocal label,
11228 and we decided to emit the pic base above, we will still output a
11229 local label used for calculating the correction offset (even though
11230 the offset will be 0 in that case). */
11231 if (label)
11232 targetm.asm_out.internal_label (asm_out_file, "L",
11233 CODE_LABEL_NUMBER (label));
11234 #endif
11236 else
11238 if (TARGET_MACHO)
11239 /* We don't need a pic base, we're not producing pic. */
11240 gcc_unreachable ();
11242 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11243 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11244 targetm.asm_out.internal_label (asm_out_file, "L",
11245 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11248 if (!TARGET_MACHO)
11249 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11251 return "";
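/* Editorial sketch, not part of the original source: for -fpic on ia32 the
   usual sequence emitted above is

       call   __x86.get_pc_thunk.bx
       addl   $_GLOBAL_OFFSET_TABLE_, %ebx

   with %ebx replaced by DEST; the non-PIC branch loads the label address
   with a mov, and the VxWorks RTP branch reads the GOT base from
   VXWORKS_GOTT_BASE instead.  */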
11254 /* Generate a "push" pattern for input ARG. */
11256 static rtx
11257 gen_push (rtx arg)
11259 struct machine_function *m = cfun->machine;
11261 if (m->fs.cfa_reg == stack_pointer_rtx)
11262 m->fs.cfa_offset += UNITS_PER_WORD;
11263 m->fs.sp_offset += UNITS_PER_WORD;
11265 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11266 arg = gen_rtx_REG (word_mode, REGNO (arg));
11268 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11269 gen_rtx_PRE_DEC (Pmode,
11270 stack_pointer_rtx)),
11271 arg);
11274 /* Generate a "pop" pattern for input ARG. */
11276 static rtx
11277 gen_pop (rtx arg)
11279 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11280 arg = gen_rtx_REG (word_mode, REGNO (arg));
11282 return gen_rtx_SET (arg,
11283 gen_rtx_MEM (word_mode,
11284 gen_rtx_POST_INC (Pmode,
11285 stack_pointer_rtx)));
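/* Editor's sketch of the RTL built by the two helpers above on a 64-bit
   target (illustrative only):

     gen_push (di)  =>  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI di))
     gen_pop (di)   =>  (set (reg:DI di) (mem:DI (post_inc:DI (reg:DI sp))))

   which the push/pop insn patterns in i386.md then recognize.  */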
11288 /* Return >= 0 if there is an unused call-clobbered register available
11289 for the entire function. */
11291 static unsigned int
11292 ix86_select_alt_pic_regnum (void)
11294 if (ix86_use_pseudo_pic_reg ())
11295 return INVALID_REGNUM;
11297 if (crtl->is_leaf
11298 && !crtl->profile
11299 && !ix86_current_function_calls_tls_descriptor)
11301 int i, drap;
11302 /* Can't use the same register for both PIC and DRAP. */
11303 if (crtl->drap_reg)
11304 drap = REGNO (crtl->drap_reg);
11305 else
11306 drap = -1;
11307 for (i = 2; i >= 0; --i)
11308 if (i != drap && !df_regs_ever_live_p (i))
11309 return i;
11312 return INVALID_REGNUM;
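/* Editor's note (assuming this port's usual hard register numbering, where
   registers 0..2 are %eax, %edx and %ecx): the loop above scans those three
   call-clobbered integer registers for one that is unused throughout the
   function.  */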
11315 /* Return true if REGNO is used by the epilogue. */
11317 bool
11318 ix86_epilogue_uses (int regno)
11320 /* If there are no caller-saved registers, we preserve all registers,
11321 except for MMX and x87 registers which aren't supported when saving
11322 and restoring registers. Don't explicitly save SP register since
11323 it is always preserved. */
11324 return (epilogue_completed
11325 && cfun->machine->no_caller_saved_registers
11326 && !fixed_regs[regno]
11327 && !STACK_REGNO_P (regno)
11328 && !MMX_REGNO_P (regno));
11331 /* Return nonzero if register REGNO can be used as a scratch register
11332 in peephole2. */
11334 static bool
11335 ix86_hard_regno_scratch_ok (unsigned int regno)
11337 /* If there are no caller-saved registers, we can't use any register
11338 as a scratch register after epilogue and use REGNO as scratch
11339 register only if it has been used before to avoid saving and
11340 restoring it. */
11341 return (!cfun->machine->no_caller_saved_registers
11342 || (!epilogue_completed
11343 && df_regs_ever_live_p (regno)));
11346 /* Return true if register class CL should be an additional allocno
11347 class. */
11349 static bool
11350 ix86_additional_allocno_class_p (reg_class_t cl)
11352 return cl == MOD4_SSE_REGS;
11355 /* Return TRUE if we need to save REGNO. */
11357 static bool
11358 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11360 /* If there are no caller-saved registers, we preserve all registers,
11361 except for MMX and x87 registers which aren't supported when saving
11362 and restoring registers. Don't explicitly save SP register since
11363 it is always preserved. */
11364 if (cfun->machine->no_caller_saved_registers)
11366 /* Don't preserve registers used for function return value. */
11367 rtx reg = crtl->return_rtx;
11368 if (reg)
11370 unsigned int i = REGNO (reg);
11371 unsigned int nregs = REG_NREGS (reg);
11372 while (nregs-- > 0)
11373 if ((i + nregs) == regno)
11374 return false;
11376 reg = crtl->return_bnd;
11377 if (reg)
11379 i = REGNO (reg);
11380 nregs = REG_NREGS (reg);
11381 while (nregs-- > 0)
11382 if ((i + nregs) == regno)
11383 return false;
11387 return (df_regs_ever_live_p (regno)
11388 && !fixed_regs[regno]
11389 && !STACK_REGNO_P (regno)
11390 && !MMX_REGNO_P (regno)
11391 && (regno != HARD_FRAME_POINTER_REGNUM
11392 || !frame_pointer_needed));
11395 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11396 && pic_offset_table_rtx)
11398 if (ix86_use_pseudo_pic_reg ())
11400 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11401 _mcount in prologue. */
11402 if (!TARGET_64BIT && flag_pic && crtl->profile)
11403 return true;
11405 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11406 || crtl->profile
11407 || crtl->calls_eh_return
11408 || crtl->uses_const_pool
11409 || cfun->has_nonlocal_label)
11410 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11413 if (crtl->calls_eh_return && maybe_eh_return)
11415 unsigned i;
11416 for (i = 0; ; i++)
11418 unsigned test = EH_RETURN_DATA_REGNO (i);
11419 if (test == INVALID_REGNUM)
11420 break;
11421 if (test == regno)
11422 return true;
11426 if (ignore_outlined && cfun->machine->call_ms2sysv)
11428 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11429 + xlogue_layout::MIN_REGS;
11430 if (xlogue_layout::is_stub_managed_reg (regno, count))
11431 return false;
11434 if (crtl->drap_reg
11435 && regno == REGNO (crtl->drap_reg)
11436 && !cfun->machine->no_drap_save_restore)
11437 return true;
11439 return (df_regs_ever_live_p (regno)
11440 && !call_used_regs[regno]
11441 && !fixed_regs[regno]
11442 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11445 /* Return number of saved general purpose registers. */
11447 static int
11448 ix86_nsaved_regs (void)
11450 int nregs = 0;
11451 int regno;
11453 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11454 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11455 nregs ++;
11456 return nregs;
11459 /* Return number of saved SSE registers. */
11461 static int
11462 ix86_nsaved_sseregs (void)
11464 int nregs = 0;
11465 int regno;
11467 if (!TARGET_64BIT_MS_ABI)
11468 return 0;
11469 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11470 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11471 nregs ++;
11472 return nregs;
11475 /* Given FROM and TO register numbers, say whether this elimination is
11476 allowed. If stack alignment is needed, we can only replace argument
11477 pointer with hard frame pointer, or replace frame pointer with stack
11478 pointer. Otherwise, frame pointer elimination is automatically
11479 handled and all other eliminations are valid. */
11481 static bool
11482 ix86_can_eliminate (const int from, const int to)
11484 if (stack_realign_fp)
11485 return ((from == ARG_POINTER_REGNUM
11486 && to == HARD_FRAME_POINTER_REGNUM)
11487 || (from == FRAME_POINTER_REGNUM
11488 && to == STACK_POINTER_REGNUM));
11489 else
11490 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11493 /* Return the offset between two registers, one to be eliminated, and the other
11494 its replacement, at the start of a routine. */
11496 HOST_WIDE_INT
11497 ix86_initial_elimination_offset (int from, int to)
11499 struct ix86_frame &frame = cfun->machine->frame;
11501 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11502 return frame.hard_frame_pointer_offset;
11503 else if (from == FRAME_POINTER_REGNUM
11504 && to == HARD_FRAME_POINTER_REGNUM)
11505 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11506 else
11508 gcc_assert (to == STACK_POINTER_REGNUM);
11510 if (from == ARG_POINTER_REGNUM)
11511 return frame.stack_pointer_offset;
11513 gcc_assert (from == FRAME_POINTER_REGNUM);
11514 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11518 /* In a dynamically-aligned function, we can't know the offset from
11519 stack pointer to frame pointer, so we must ensure that setjmp
11520 eliminates fp against the hard fp (%ebp) rather than trying to
11521 index from %esp up to the top of the frame across a gap that is
11522 of unknown (at compile-time) size. */
11523 static rtx
11524 ix86_builtin_setjmp_frame_value (void)
11526 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11529 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11530 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11532 static bool warned_once = false;
11533 if (!warned_once)
11535 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11536 feature);
11537 warned_once = true;
11541 /* Return the probing interval for -fstack-clash-protection. */
11543 static HOST_WIDE_INT
11544 get_probe_interval (void)
11546 if (flag_stack_clash_protection)
11547 return (HOST_WIDE_INT_1U
11548 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11549 else
11550 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
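/* Editor's note: assuming the usual default of 12 for both the stack-clash
   probe-interval parameter and STACK_CHECK_PROBE_INTERVAL_EXP, this yields
   a probing interval of 1 << 12 = 4096 bytes, i.e. one page.  */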
11553 /* When using -fsplit-stack, the allocation routines set a field in
11554 the TCB to the bottom of the stack plus this much space, measured
11555 in bytes. */
11557 #define SPLIT_STACK_AVAILABLE 256
11559 /* Fill the ix86_frame structure describing the frame of the current function. */
11561 static void
11562 ix86_compute_frame_layout (void)
11564 struct ix86_frame *frame = &cfun->machine->frame;
11565 struct machine_function *m = cfun->machine;
11566 unsigned HOST_WIDE_INT stack_alignment_needed;
11567 HOST_WIDE_INT offset;
11568 unsigned HOST_WIDE_INT preferred_alignment;
11569 HOST_WIDE_INT size = get_frame_size ();
11570 HOST_WIDE_INT to_allocate;
11572 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11573 * ms_abi functions that call a sysv function. We now need to prune away
11574 * cases where it should be disabled. */
11575 if (TARGET_64BIT && m->call_ms2sysv)
11577 gcc_assert (TARGET_64BIT_MS_ABI);
11578 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11579 gcc_assert (!TARGET_SEH);
11580 gcc_assert (TARGET_SSE);
11581 gcc_assert (!ix86_using_red_zone ());
11583 if (crtl->calls_eh_return)
11585 gcc_assert (!reload_completed);
11586 m->call_ms2sysv = false;
11587 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11590 else if (ix86_static_chain_on_stack)
11592 gcc_assert (!reload_completed);
11593 m->call_ms2sysv = false;
11594 warn_once_call_ms2sysv_xlogues ("static call chains");
11597 /* Finally, compute which registers the stub will manage. */
11598 else
11600 unsigned count = xlogue_layout::count_stub_managed_regs ();
11601 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11602 m->call_ms2sysv_pad_in = 0;
11606 frame->nregs = ix86_nsaved_regs ();
11607 frame->nsseregs = ix86_nsaved_sseregs ();
11609 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11610 except for function prologues, leaf functions and when the default
11611 incoming stack boundary is overridden at the command line or via the
11612 force_align_arg_pointer attribute. */
11613 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11614 && (!crtl->is_leaf || cfun->calls_alloca != 0
11615 || ix86_current_function_calls_tls_descriptor
11616 || ix86_incoming_stack_boundary < 128))
11618 crtl->preferred_stack_boundary = 128;
11619 crtl->stack_alignment_needed = 128;
11622 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11623 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11625 gcc_assert (!size || stack_alignment_needed);
11626 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11627 gcc_assert (preferred_alignment <= stack_alignment_needed);
11629 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11630 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11631 if (TARGET_64BIT && m->call_ms2sysv)
11633 gcc_assert (stack_alignment_needed >= 16);
11634 gcc_assert (!frame->nsseregs);
11637 /* For SEH we have to limit the amount of code movement into the prologue.
11638 At present we do this via a BLOCKAGE, at which point there's very little
11639 scheduling that can be done, which means that there's very little point
11640 in doing anything except PUSHs. */
11641 if (TARGET_SEH)
11642 m->use_fast_prologue_epilogue = false;
11643 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11645 int count = frame->nregs;
11646 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11648 /* The fast prologue uses move instead of push to save registers. This
11649 is significantly longer, but also executes faster as modern hardware
11650 can execute the moves in parallel, but can't do that for push/pop.
11652 Be careful about choosing which prologue to emit: when the function
11653 takes many instructions to execute, we may use the slow version, and
11654 likewise when the function is known to be outside a hot spot (known
11655 with feedback only). Weight the size of the function by the number of
11656 registers to save, as it is cheap to use one or two push instructions
11657 but very slow to use many of them. */
11658 if (count)
11659 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11660 if (node->frequency < NODE_FREQUENCY_NORMAL
11661 || (flag_branch_probabilities
11662 && node->frequency < NODE_FREQUENCY_HOT))
11663 m->use_fast_prologue_epilogue = false;
11664 else
11665 m->use_fast_prologue_epilogue
11666 = !expensive_function_p (count);
11669 frame->save_regs_using_mov
11670 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11671 /* If static stack checking is enabled and done with probes,
11672 the registers need to be saved before allocating the frame. */
11673 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11675 /* Skip return address and error code in exception handler. */
11676 offset = INCOMING_FRAME_SP_OFFSET;
11678 /* Skip pushed static chain. */
11679 if (ix86_static_chain_on_stack)
11680 offset += UNITS_PER_WORD;
11682 /* Skip saved base pointer. */
11683 if (frame_pointer_needed)
11684 offset += UNITS_PER_WORD;
11685 frame->hfp_save_offset = offset;
11687 /* The traditional frame pointer location is at the top of the frame. */
11688 frame->hard_frame_pointer_offset = offset;
11690 /* Register save area */
11691 offset += frame->nregs * UNITS_PER_WORD;
11692 frame->reg_save_offset = offset;
11694 /* On SEH target, registers are pushed just before the frame pointer
11695 location. */
11696 if (TARGET_SEH)
11697 frame->hard_frame_pointer_offset = offset;
11699 /* Calculate the size of the va-arg area (not including padding, if any). */
11700 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11702 /* Also adjust stack_realign_offset for the largest alignment of
11703 stack slot actually used. */
11704 if (stack_realign_fp
11705 || (cfun->machine->max_used_stack_alignment != 0
11706 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11708 /* We may need a 16-byte aligned stack for the remainder of the
11709 register save area, but the stack frame for the local function
11710 may require a greater alignment if using AVX/2/512. In order
11711 to avoid wasting space, we first calculate the space needed for
11712 the rest of the register saves, add that to the stack pointer,
11713 and then realign the stack to the boundary of the start of the
11714 frame for the local function. */
11715 HOST_WIDE_INT space_needed = 0;
11716 HOST_WIDE_INT sse_reg_space_needed = 0;
11718 if (TARGET_64BIT)
11720 if (m->call_ms2sysv)
11722 m->call_ms2sysv_pad_in = 0;
11723 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11726 else if (frame->nsseregs)
11727 /* The only ABI that has saved SSE registers (Win64) also has a
11728 16-byte aligned default stack. However, many programs violate
11729 the ABI, and Wine64 forces stack realignment to compensate. */
11730 space_needed = frame->nsseregs * 16;
11732 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11734 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11735 round anyway, to be pedantic. */
11736 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11738 else
11739 space_needed = frame->va_arg_size;
11741 /* Record the allocation size required prior to the realignment AND. */
11742 frame->stack_realign_allocate = space_needed;
11744 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11745 before this point are not directly comparable with values below
11746 this point. Use sp_valid_at to determine if the stack pointer is
11747 valid for a given offset, fp_valid_at for the frame pointer, or
11748 choose_baseaddr to have a base register chosen for you.
11750 Note that the result of (frame->stack_realign_offset
11751 & (stack_alignment_needed - 1)) may not equal zero. */
11752 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11753 frame->stack_realign_offset = offset - space_needed;
11754 frame->sse_reg_save_offset = frame->stack_realign_offset
11755 + sse_reg_space_needed;
11757 else
11759 frame->stack_realign_offset = offset;
11761 if (TARGET_64BIT && m->call_ms2sysv)
11763 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11764 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11767 /* Align and set SSE register save area. */
11768 else if (frame->nsseregs)
11770 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11771 required and the DRAP re-alignment boundary is at least 16 bytes,
11772 then we want the SSE register save area properly aligned. */
11773 if (ix86_incoming_stack_boundary >= 128
11774 || (stack_realign_drap && stack_alignment_needed >= 16))
11775 offset = ROUND_UP (offset, 16);
11776 offset += frame->nsseregs * 16;
11778 frame->sse_reg_save_offset = offset;
11779 offset += frame->va_arg_size;
11782 /* Align the start of the frame for the local function. When a function
11783 call is removed, it may become a leaf function. But if arguments may
11784 be passed on the stack, we still need to align the stack when there is
11785 no tail call. */
11786 if (m->call_ms2sysv
11787 || frame->va_arg_size != 0
11788 || size != 0
11789 || !crtl->is_leaf
11790 || (!crtl->tail_call_emit
11791 && cfun->machine->outgoing_args_on_stack)
11792 || cfun->calls_alloca
11793 || ix86_current_function_calls_tls_descriptor)
11794 offset = ROUND_UP (offset, stack_alignment_needed);
11796 /* Frame pointer points here. */
11797 frame->frame_pointer_offset = offset;
11799 offset += size;
11801 /* Add the outgoing arguments area. It can be skipped if we eliminated
11802 all the function calls as dead code.
11803 Skipping is however impossible when the function calls alloca: the
11804 alloca expander assumes that the last crtl->outgoing_args_size bytes
11805 of the stack frame are unused. */
11806 if (ACCUMULATE_OUTGOING_ARGS
11807 && (!crtl->is_leaf || cfun->calls_alloca
11808 || ix86_current_function_calls_tls_descriptor))
11810 offset += crtl->outgoing_args_size;
11811 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11813 else
11814 frame->outgoing_arguments_size = 0;
11816 /* Align stack boundary. Only needed if we're calling another function
11817 or using alloca. */
11818 if (!crtl->is_leaf || cfun->calls_alloca
11819 || ix86_current_function_calls_tls_descriptor)
11820 offset = ROUND_UP (offset, preferred_alignment);
11822 /* We've reached end of stack frame. */
11823 frame->stack_pointer_offset = offset;
11825 /* Size prologue needs to allocate. */
11826 to_allocate = offset - frame->sse_reg_save_offset;
11828 if ((!to_allocate && frame->nregs <= 1)
11829 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11830 /* If stack clash probing needs a loop, then it needs a
11831 scratch register. But the returned register is only guaranteed
11832 to be safe to use after register saves are complete. So if
11833 stack clash protections are enabled and the allocated frame is
11834 larger than the probe interval, then use pushes to save
11835 callee saved registers. */
11836 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11837 frame->save_regs_using_mov = false;
11839 if (ix86_using_red_zone ()
11840 && crtl->sp_is_unchanging
11841 && crtl->is_leaf
11842 && !ix86_pc_thunk_call_expanded
11843 && !ix86_current_function_calls_tls_descriptor)
11845 frame->red_zone_size = to_allocate;
11846 if (frame->save_regs_using_mov)
11847 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11848 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11849 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11851 else
11852 frame->red_zone_size = 0;
11853 frame->stack_pointer_offset -= frame->red_zone_size;
11855 /* The SEH frame pointer location is near the bottom of the frame.
11856 This is enforced by the fact that the difference between the
11857 stack pointer and the frame pointer is limited to 240 bytes in
11858 the unwind data structure. */
11859 if (TARGET_SEH)
11861 HOST_WIDE_INT diff;
11863 /* If we can leave the frame pointer where it is, do so. Also, returns
11864 the establisher frame for __builtin_frame_address (0). */
11865 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11866 if (diff <= SEH_MAX_FRAME_SIZE
11867 && (diff > 240 || (diff & 15) != 0)
11868 && !crtl->accesses_prior_frames)
11870 /* Ideally we'd determine what portion of the local stack frame
11871 (within the constraint of the lowest 240) is most heavily used.
11872 But without that complication, simply bias the frame pointer
11873 by 128 bytes so as to maximize the amount of the local stack
11874 frame that is addressable with 8-bit offsets. */
11875 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
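/* Editor's sketch of the offsets computed above, each measured downward
   from the CFA and no smaller than the previous one (rough summary for the
   non-SEH case; not a normative layout):

     INCOMING_FRAME_SP_OFFSET   return address (plus error code, if any)
     hfp_save_offset            saved frame pointer, when one is pushed
     reg_save_offset            general register save area
     sse_reg_save_offset        SSE register save area, when used
     frame_pointer_offset       start of the local frame
     stack_pointer_offset       end of frame, including outgoing arguments

   with the usable red zone subtracted from stack_pointer_offset last.  */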
11880 /* This is semi-inlined memory_address_length, but simplified
11881 since we know that we're always dealing with reg+offset, and
11882 to avoid having to create and discard all that rtl. */
11884 static inline int
11885 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11887 int len = 4;
11889 if (offset == 0)
11891 /* EBP and R13 cannot be encoded without an offset. */
11892 len = (regno == BP_REG || regno == R13_REG);
11894 else if (IN_RANGE (offset, -128, 127))
11895 len = 1;
11897 /* ESP and R12 must be encoded with a SIB byte. */
11898 if (regno == SP_REG || regno == R12_REG)
11899 len++;
11901 return len;
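/* Editor's examples of the lengths computed above (following the usual
   ModRM/SIB encoding rules): 0(%eax) needs no displacement byte (0),
   0(%ebp) and 0(%r13) need a disp8 (1), 8(%esp) needs disp8 plus a SIB
   byte (2), and 0x1000(%ecx) needs a disp32 (4).  */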
11904 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11905 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11907 static bool
11908 sp_valid_at (HOST_WIDE_INT cfa_offset)
11910 const struct machine_frame_state &fs = cfun->machine->fs;
11911 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11913 /* Validate that the cfa_offset isn't in a "no-man's land". */
11914 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11915 return false;
11917 return fs.sp_valid;
11920 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11921 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11923 static inline bool
11924 fp_valid_at (HOST_WIDE_INT cfa_offset)
11926 const struct machine_frame_state &fs = cfun->machine->fs;
11927 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11929 /* Validate that the cfa_offset isn't in a "no-man's land". */
11930 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11931 return false;
11933 return fs.fp_valid;
11936 /* Choose a base register based upon alignment requested, speed and/or
11937 size. */
11939 static void
11940 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11941 HOST_WIDE_INT &base_offset,
11942 unsigned int align_requested, unsigned int *align)
11944 const struct machine_function *m = cfun->machine;
11945 unsigned int hfp_align;
11946 unsigned int drap_align;
11947 unsigned int sp_align;
11948 bool hfp_ok = fp_valid_at (cfa_offset);
11949 bool drap_ok = m->fs.drap_valid;
11950 bool sp_ok = sp_valid_at (cfa_offset);
11952 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11954 /* Filter out any registers that don't meet the requested alignment
11955 criteria. */
11956 if (align_requested)
11958 if (m->fs.realigned)
11959 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11960 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11961 notes (which we would need in order to use a realigned stack pointer),
11962 so disable on SEH targets. */
11963 else if (m->fs.sp_realigned)
11964 sp_align = crtl->stack_alignment_needed;
11966 hfp_ok = hfp_ok && hfp_align >= align_requested;
11967 drap_ok = drap_ok && drap_align >= align_requested;
11968 sp_ok = sp_ok && sp_align >= align_requested;
11971 if (m->use_fast_prologue_epilogue)
11973 /* Choose the base register most likely to allow the most scheduling
11974 opportunities. Generally FP is valid throughout the function,
11975 while DRAP must be reloaded within the epilogue. But choose either
11976 over the SP due to increased encoding size. */
11978 if (hfp_ok)
11980 base_reg = hard_frame_pointer_rtx;
11981 base_offset = m->fs.fp_offset - cfa_offset;
11983 else if (drap_ok)
11985 base_reg = crtl->drap_reg;
11986 base_offset = 0 - cfa_offset;
11988 else if (sp_ok)
11990 base_reg = stack_pointer_rtx;
11991 base_offset = m->fs.sp_offset - cfa_offset;
11994 else
11996 HOST_WIDE_INT toffset;
11997 int len = 16, tlen;
11999 /* Choose the base register with the smallest address encoding.
12000 With a tie, choose FP > DRAP > SP. */
12001 if (sp_ok)
12003 base_reg = stack_pointer_rtx;
12004 base_offset = m->fs.sp_offset - cfa_offset;
12005 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12007 if (drap_ok)
12009 toffset = 0 - cfa_offset;
12010 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12011 if (tlen <= len)
12013 base_reg = crtl->drap_reg;
12014 base_offset = toffset;
12015 len = tlen;
12018 if (hfp_ok)
12020 toffset = m->fs.fp_offset - cfa_offset;
12021 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12022 if (tlen <= len)
12024 base_reg = hard_frame_pointer_rtx;
12025 base_offset = toffset;
12026 len = tlen;
12031 /* Set the align return value. */
12032 if (align)
12034 if (base_reg == stack_pointer_rtx)
12035 *align = sp_align;
12036 else if (base_reg == crtl->drap_reg)
12037 *align = drap_align;
12038 else if (base_reg == hard_frame_pointer_rtx)
12039 *align = hfp_align;
12043 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12044 the alignment of address. If ALIGN is non-null, it should point to
12045 an alignment value (in bits) that is preferred or zero and will
12046 receive the alignment of the base register that was selected,
12047 irrespective of whether or not CFA_OFFSET is a multiple of that
12048 alignment value. If it is possible for the base register offset to be
12049 non-immediate then SCRATCH_REGNO should specify a scratch register to
12050 use.
12052 The valid base registers are taken from CFUN->MACHINE->FS. */
12054 static rtx
12055 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12056 unsigned int scratch_regno = INVALID_REGNUM)
12058 rtx base_reg = NULL;
12059 HOST_WIDE_INT base_offset = 0;
12061 /* If a specific alignment is requested, try to get a base register
12062 with that alignment first. */
12063 if (align && *align)
12064 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12066 if (!base_reg)
12067 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12069 gcc_assert (base_reg != NULL);
12071 rtx base_offset_rtx = GEN_INT (base_offset);
12073 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12075 gcc_assert (scratch_regno != INVALID_REGNUM);
12077 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12078 emit_move_insn (scratch_reg, base_offset_rtx);
12080 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12083 return plus_constant (Pmode, base_reg, base_offset);
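/* Editor's note: the common result is a simple reg+offset address such as
   (plus:DI (reg:DI bp) (const_int -16)); only when the offset is not a
   valid immediate is it first moved into SCRATCH_REGNO and a reg+reg
   address returned instead.  */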
12086 /* Emit code to save registers in the prologue. */
12088 static void
12089 ix86_emit_save_regs (void)
12091 unsigned int regno;
12092 rtx_insn *insn;
12094 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12095 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12097 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12098 RTX_FRAME_RELATED_P (insn) = 1;
12102 /* Emit a single register save at CFA - CFA_OFFSET. */
12104 static void
12105 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12106 HOST_WIDE_INT cfa_offset)
12108 struct machine_function *m = cfun->machine;
12109 rtx reg = gen_rtx_REG (mode, regno);
12110 rtx mem, addr, base, insn;
12111 unsigned int align = GET_MODE_ALIGNMENT (mode);
12113 addr = choose_baseaddr (cfa_offset, &align);
12114 mem = gen_frame_mem (mode, addr);
12116 /* The location alignment depends upon the base register. */
12117 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12118 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12119 set_mem_align (mem, align);
12121 insn = emit_insn (gen_rtx_SET (mem, reg));
12122 RTX_FRAME_RELATED_P (insn) = 1;
12124 base = addr;
12125 if (GET_CODE (base) == PLUS)
12126 base = XEXP (base, 0);
12127 gcc_checking_assert (REG_P (base));
12129 /* When saving registers into a re-aligned local stack frame, avoid
12130 any tricky guessing by dwarf2out. */
12131 if (m->fs.realigned)
12133 gcc_checking_assert (stack_realign_drap);
12135 if (regno == REGNO (crtl->drap_reg))
12137 /* A bit of a hack. We force the DRAP register to be saved in
12138 the re-aligned stack frame, which provides us with a copy
12139 of the CFA that will last past the prologue. Install it. */
12140 gcc_checking_assert (cfun->machine->fs.fp_valid);
12141 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12142 cfun->machine->fs.fp_offset - cfa_offset);
12143 mem = gen_rtx_MEM (mode, addr);
12144 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12146 else
12148 /* The frame pointer is a stable reference within the
12149 aligned frame. Use it. */
12150 gcc_checking_assert (cfun->machine->fs.fp_valid);
12151 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12152 cfun->machine->fs.fp_offset - cfa_offset);
12153 mem = gen_rtx_MEM (mode, addr);
12154 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12158 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12159 && cfa_offset >= m->fs.sp_realigned_offset)
12161 gcc_checking_assert (stack_realign_fp);
12162 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12165 /* The memory may not be relative to the current CFA register,
12166 which means that we may need to generate a new pattern for
12167 use by the unwind info. */
12168 else if (base != m->fs.cfa_reg)
12170 addr = plus_constant (Pmode, m->fs.cfa_reg,
12171 m->fs.cfa_offset - cfa_offset);
12172 mem = gen_rtx_MEM (mode, addr);
12173 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12177 /* Emit code to save registers using MOV insns.
12178 First register is stored at CFA - CFA_OFFSET. */
12179 static void
12180 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12182 unsigned int regno;
12184 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12185 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12187 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12188 cfa_offset -= UNITS_PER_WORD;
12192 /* Emit code to save SSE registers using MOV insns.
12193 First register is stored at CFA - CFA_OFFSET. */
12194 static void
12195 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12197 unsigned int regno;
12199 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12200 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12202 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12203 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12207 static GTY(()) rtx queued_cfa_restores;
12209 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
12210 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12211 Don't add the note if the previously saved value will be left untouched
12212 within stack red-zone till return, as unwinders can find the same value
12213 in the register and on the stack. */
12215 static void
12216 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12218 if (!crtl->shrink_wrapped
12219 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12220 return;
12222 if (insn)
12224 add_reg_note (insn, REG_CFA_RESTORE, reg);
12225 RTX_FRAME_RELATED_P (insn) = 1;
12227 else
12228 queued_cfa_restores
12229 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12232 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12234 static void
12235 ix86_add_queued_cfa_restore_notes (rtx insn)
12237 rtx last;
12238 if (!queued_cfa_restores)
12239 return;
12240 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12242 XEXP (last, 1) = REG_NOTES (insn);
12243 REG_NOTES (insn) = queued_cfa_restores;
12244 queued_cfa_restores = NULL_RTX;
12245 RTX_FRAME_RELATED_P (insn) = 1;
12248 /* Expand prologue or epilogue stack adjustment.
12249 The pattern exists to put a dependency on all ebp-based memory accesses.
12250 STYLE should be negative if instructions should be marked as frame related,
12251 zero if %r11 register is live and cannot be freely used and positive
12252 otherwise. */
12254 static rtx
12255 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12256 int style, bool set_cfa)
12258 struct machine_function *m = cfun->machine;
12259 rtx insn;
12260 bool add_frame_related_expr = false;
12262 if (Pmode == SImode)
12263 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12264 else if (x86_64_immediate_operand (offset, DImode))
12265 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12266 else
12268 rtx tmp;
12269 /* r11 is used by indirect sibcall return as well, set before the
12270 epilogue and used after the epilogue. */
12271 if (style)
12272 tmp = gen_rtx_REG (DImode, R11_REG);
12273 else
12275 gcc_assert (src != hard_frame_pointer_rtx
12276 && dest != hard_frame_pointer_rtx);
12277 tmp = hard_frame_pointer_rtx;
12279 insn = emit_insn (gen_rtx_SET (tmp, offset));
12280 if (style < 0)
12281 add_frame_related_expr = true;
12283 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12286 insn = emit_insn (insn);
12287 if (style >= 0)
12288 ix86_add_queued_cfa_restore_notes (insn);
12290 if (set_cfa)
12292 rtx r;
12294 gcc_assert (m->fs.cfa_reg == src);
12295 m->fs.cfa_offset += INTVAL (offset);
12296 m->fs.cfa_reg = dest;
12298 r = gen_rtx_PLUS (Pmode, src, offset);
12299 r = gen_rtx_SET (dest, r);
12300 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12301 RTX_FRAME_RELATED_P (insn) = 1;
12303 else if (style < 0)
12305 RTX_FRAME_RELATED_P (insn) = 1;
12306 if (add_frame_related_expr)
12308 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12309 r = gen_rtx_SET (dest, r);
12310 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12314 if (dest == stack_pointer_rtx)
12316 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12317 bool valid = m->fs.sp_valid;
12318 bool realigned = m->fs.sp_realigned;
12320 if (src == hard_frame_pointer_rtx)
12322 valid = m->fs.fp_valid;
12323 realigned = false;
12324 ooffset = m->fs.fp_offset;
12326 else if (src == crtl->drap_reg)
12328 valid = m->fs.drap_valid;
12329 realigned = false;
12330 ooffset = 0;
12332 else
12334 /* Else there are two possibilities: SP itself, which we set
12335 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12336 taken care of by hand along the eh_return path. */
12337 gcc_checking_assert (src == stack_pointer_rtx
12338 || offset == const0_rtx);
12341 m->fs.sp_offset = ooffset - INTVAL (offset);
12342 m->fs.sp_valid = valid;
12343 m->fs.sp_realigned = realigned;
12345 return insn;
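/* Editor's note: a representative call, as used by the probing code below,
   allocates SIZE bytes while keeping the CFA bookkeeping consistent:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-size), -1,
				m->fs.cfa_reg == stack_pointer_rtx);  */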
12348 /* Find an available register to be used as the dynamic realign argument
12349 pointer register. Such a register will be written in the prologue and
12350 used at the beginning of the body, so it must not be
12351 1. parameter passing register.
12352 2. GOT pointer.
12353 We reuse static-chain register if it is available. Otherwise, we
12354 use DI for i386 and R13 for x86-64. We chose R13 since it has
12355 shorter encoding.
12357 Return: the regno of chosen register. */
12359 static unsigned int
12360 find_drap_reg (void)
12362 tree decl = cfun->decl;
12364 /* Always use callee-saved register if there are no caller-saved
12365 registers. */
12366 if (TARGET_64BIT)
12368 /* Use R13 for a nested function or a function that needs a static chain.
12369 Since a function with a tail call may use any caller-saved
12370 register in the epilogue, DRAP must not use a caller-saved
12371 register in that case. */
12372 if (DECL_STATIC_CHAIN (decl)
12373 || cfun->machine->no_caller_saved_registers
12374 || crtl->tail_call_emit)
12375 return R13_REG;
12377 return R10_REG;
12379 else
12381 /* Use DI for a nested function or a function that needs a static chain.
12382 Since a function with a tail call may use any caller-saved
12383 register in the epilogue, DRAP must not use a caller-saved
12384 register in that case. */
12385 if (DECL_STATIC_CHAIN (decl)
12386 || cfun->machine->no_caller_saved_registers
12387 || crtl->tail_call_emit)
12388 return DI_REG;
12390 /* Reuse static chain register if it isn't used for parameter
12391 passing. */
12392 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12394 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12395 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12396 return CX_REG;
12398 return DI_REG;
12402 /* Handle a "force_align_arg_pointer" attribute. */
12404 static tree
12405 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12406 tree, int, bool *no_add_attrs)
12408 if (TREE_CODE (*node) != FUNCTION_TYPE
12409 && TREE_CODE (*node) != METHOD_TYPE
12410 && TREE_CODE (*node) != FIELD_DECL
12411 && TREE_CODE (*node) != TYPE_DECL)
12413 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12414 name);
12415 *no_add_attrs = true;
12418 return NULL_TREE;
12421 /* Return minimum incoming stack alignment. */
12423 static unsigned int
12424 ix86_minimum_incoming_stack_boundary (bool sibcall)
12426 unsigned int incoming_stack_boundary;
12428 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
12429 if (cfun->machine->func_type != TYPE_NORMAL)
12430 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12431 /* Prefer the one specified at command line. */
12432 else if (ix86_user_incoming_stack_boundary)
12433 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12434 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12435 if -mstackrealign is used, this isn't used for a sibcall check, and the
12436 estimated stack alignment is 128 bits. */
12437 else if (!sibcall
12438 && ix86_force_align_arg_pointer
12439 && crtl->stack_alignment_estimated == 128)
12440 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12441 else
12442 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12444 /* Incoming stack alignment can be changed on individual functions
12445 via force_align_arg_pointer attribute. We use the smallest
12446 incoming stack boundary. */
12447 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12448 && lookup_attribute (ix86_force_align_arg_pointer_string,
12449 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12450 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12452 /* The incoming stack frame has to be aligned at least at
12453 parm_stack_boundary. */
12454 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12455 incoming_stack_boundary = crtl->parm_stack_boundary;
12457 /* The stack at the entrance of main is aligned by the runtime. We use
12458 the smallest incoming stack boundary. */
12459 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12460 && DECL_NAME (current_function_decl)
12461 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12462 && DECL_FILE_SCOPE_P (current_function_decl))
12463 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12465 return incoming_stack_boundary;
12468 /* Update incoming stack boundary and estimated stack alignment. */
12470 static void
12471 ix86_update_stack_boundary (void)
12473 ix86_incoming_stack_boundary
12474 = ix86_minimum_incoming_stack_boundary (false);
12476 /* x86_64 vararg needs 16byte stack alignment for register save
12477 area. */
12478 if (TARGET_64BIT
12479 && cfun->stdarg
12480 && crtl->stack_alignment_estimated < 128)
12481 crtl->stack_alignment_estimated = 128;
12483 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12484 if (ix86_tls_descriptor_calls_expanded_in_cfun
12485 && crtl->preferred_stack_boundary < 128)
12486 crtl->preferred_stack_boundary = 128;
12489 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12490 needed or an rtx for DRAP otherwise. */
12492 static rtx
12493 ix86_get_drap_rtx (void)
12495 /* We must use DRAP if there are outgoing arguments on stack and
12496 ACCUMULATE_OUTGOING_ARGS is false. */
12497 if (ix86_force_drap
12498 || (cfun->machine->outgoing_args_on_stack
12499 && !ACCUMULATE_OUTGOING_ARGS))
12500 crtl->need_drap = true;
12502 if (stack_realign_drap)
12504 /* Assign DRAP to vDRAP and return vDRAP. */
12505 unsigned int regno = find_drap_reg ();
12506 rtx drap_vreg;
12507 rtx arg_ptr;
12508 rtx_insn *seq, *insn;
12510 arg_ptr = gen_rtx_REG (Pmode, regno);
12511 crtl->drap_reg = arg_ptr;
12513 start_sequence ();
12514 drap_vreg = copy_to_reg (arg_ptr);
12515 seq = get_insns ();
12516 end_sequence ();
12518 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12519 if (!optimize)
12521 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12522 RTX_FRAME_RELATED_P (insn) = 1;
12524 return drap_vreg;
12526 else
12527 return NULL;
12530 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12532 static rtx
12533 ix86_internal_arg_pointer (void)
12535 return virtual_incoming_args_rtx;
12538 struct scratch_reg {
12539 rtx reg;
12540 bool saved;
12543 /* Return a short-lived scratch register for use on function entry.
12544 In 32-bit mode, it is valid only after the registers are saved
12545 in the prologue. This register must be released by means of
12546 release_scratch_register_on_entry once it is dead. */
12548 static void
12549 get_scratch_register_on_entry (struct scratch_reg *sr)
12551 int regno;
12553 sr->saved = false;
12555 if (TARGET_64BIT)
12557 /* We always use R11 in 64-bit mode. */
12558 regno = R11_REG;
12560 else
12562 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12563 bool fastcall_p
12564 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12565 bool thiscall_p
12566 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12567 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12568 int regparm = ix86_function_regparm (fntype, decl);
12569 int drap_regno
12570 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12572 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12573 for the static chain register. */
12574 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12575 && drap_regno != AX_REG)
12576 regno = AX_REG;
12577 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12578 for the static chain register. */
12579 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12580 regno = AX_REG;
12581 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12582 regno = DX_REG;
12583 /* ecx is the static chain register. */
12584 else if (regparm < 3 && !fastcall_p && !thiscall_p
12585 && !static_chain_p
12586 && drap_regno != CX_REG)
12587 regno = CX_REG;
12588 else if (ix86_save_reg (BX_REG, true, false))
12589 regno = BX_REG;
12590 /* esi is the static chain register. */
12591 else if (!(regparm == 3 && static_chain_p)
12592 && ix86_save_reg (SI_REG, true, false))
12593 regno = SI_REG;
12594 else if (ix86_save_reg (DI_REG, true, false))
12595 regno = DI_REG;
12596 else
12598 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12599 sr->saved = true;
12603 sr->reg = gen_rtx_REG (Pmode, regno);
12604 if (sr->saved)
12606 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12607 RTX_FRAME_RELATED_P (insn) = 1;
12611 /* Release a scratch register obtained from the preceding function.
12613 If RELEASE_VIA_POP is true, we just pop the register off the stack
12614 to release it. This is what non-Linux systems use with -fstack-check.
12616 Otherwise we use OFFSET to locate the saved register and the
12617 allocated stack space becomes part of the local frame and is
12618 deallocated by the epilogue. */
12620 static void
12621 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12622 bool release_via_pop)
12624 if (sr->saved)
12626 if (release_via_pop)
12628 struct machine_function *m = cfun->machine;
12629 rtx x, insn = emit_insn (gen_pop (sr->reg));
12631 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12632 RTX_FRAME_RELATED_P (insn) = 1;
12633 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12634 x = gen_rtx_SET (stack_pointer_rtx, x);
12635 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12636 m->fs.sp_offset -= UNITS_PER_WORD;
12638 else
12640 rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12641 x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12642 emit_insn (x);
12647 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12649 This differs from the next routine in that it tries hard to prevent
12650 attacks that jump the stack guard. Thus it is never allowed to allocate
12651 more than PROBE_INTERVAL bytes of stack space without a suitable
12652 probe.
12654 INT_REGISTERS_SAVED is true if integer registers have already been
12655 pushed on the stack. */
12657 static void
12658 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12659 const bool int_registers_saved)
12661 struct machine_function *m = cfun->machine;
12663 /* If this function does not statically allocate stack space, then
12664 no probes are needed. */
12665 if (!size)
12667 /* However, the allocation of space via pushes for register
12668 saves could be viewed as allocating space, but without the
12669 need to probe. */
12670 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12671 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12672 else
12673 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12674 return;
12677 /* If we are a noreturn function, then we have to consider the
12678 possibility that we're called via a jump rather than a call.
12680 Thus we don't have the implicit probe generated by saving the
12681 return address into the stack at the call. Thus, the stack
12682 pointer could be anywhere in the guard page. The safe thing
12683 to do is emit a probe now.
12685 The probe can be avoided if we have already emitted any callee
12686 register saves into the stack or have a frame pointer (which will
12687 have been saved as well). Those saves will function as implicit
12688 probes.
12690 ?!? This should be revamped to work like aarch64 and s390 where
12691 we track the offset from the most recent probe. Normally that
12692 offset would be zero. For a noreturn function we would reset
12693 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12694 we just probe when we cross PROBE_INTERVAL. */
12695 if (TREE_THIS_VOLATILE (cfun->decl)
12696 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12698 /* We can safely use any register here since we're just going to push
12699 its value and immediately pop it back. But we do try and avoid
12700 argument passing registers so as not to introduce dependencies in
12701 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12702 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12703 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12704 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12705 m->fs.sp_offset -= UNITS_PER_WORD;
12706 if (m->fs.cfa_reg == stack_pointer_rtx)
12708 m->fs.cfa_offset -= UNITS_PER_WORD;
12709 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12710 x = gen_rtx_SET (stack_pointer_rtx, x);
12711 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12712 RTX_FRAME_RELATED_P (insn_push) = 1;
12713 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12714 x = gen_rtx_SET (stack_pointer_rtx, x);
12715 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12716 RTX_FRAME_RELATED_P (insn_pop) = 1;
12718 emit_insn (gen_blockage ());
12721 /* If we allocate less than the size of the guard statically,
12722 then no probing is necessary, but we do need to allocate
12723 the stack. */
12724 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12726 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12727 GEN_INT (-size), -1,
12728 m->fs.cfa_reg == stack_pointer_rtx);
12729 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12730 return;
12733 /* We're allocating a large enough stack frame that we need to
12734 emit probes. Either emit them inline or in a loop depending
12735 on the size. */
12736 HOST_WIDE_INT probe_interval = get_probe_interval ();
12737 if (size <= 4 * probe_interval)
12739 HOST_WIDE_INT i;
12740 for (i = probe_interval; i <= size; i += probe_interval)
12742 /* Allocate PROBE_INTERVAL bytes. */
12743 rtx insn
12744 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12745 GEN_INT (-probe_interval), -1,
12746 m->fs.cfa_reg == stack_pointer_rtx);
12747 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12749 /* And probe at *sp. */
12750 emit_stack_probe (stack_pointer_rtx);
12751 emit_insn (gen_blockage ());
12754 /* We need to allocate space for the residual, but we do not need
12755 to probe the residual. */
12756 HOST_WIDE_INT residual = (i - probe_interval - size);
12757 if (residual)
12758 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12759 GEN_INT (residual), -1,
12760 m->fs.cfa_reg == stack_pointer_rtx);
12761 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12763 else
12765 /* We expect the GP registers to be saved when probes are used
12766 as the probing sequences might need a scratch register and
12767 the routine to allocate one assumes the integer registers
12768 have already been saved. */
12769 gcc_assert (int_registers_saved);
12771 struct scratch_reg sr;
12772 get_scratch_register_on_entry (&sr);
12774 /* If we needed to save a register, then account for any space
12775 that was pushed (we are not going to pop the register when
12776 we do the restore). */
12777 if (sr.saved)
12778 size -= UNITS_PER_WORD;
12780 /* Step 1: round SIZE down to a multiple of the interval. */
12781 HOST_WIDE_INT rounded_size = size & -probe_interval;
12783 /* Step 2: compute final value of the loop counter. Use lea if
12784 possible. */
12785 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12786 rtx insn;
12787 if (address_no_seg_operand (addr, Pmode))
12788 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12789 else
12791 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12792 insn = emit_insn (gen_rtx_SET (sr.reg,
12793 gen_rtx_PLUS (Pmode, sr.reg,
12794 stack_pointer_rtx)));
12796 if (m->fs.cfa_reg == stack_pointer_rtx)
12798 add_reg_note (insn, REG_CFA_DEF_CFA,
12799 plus_constant (Pmode, sr.reg,
12800 m->fs.cfa_offset + rounded_size));
12801 RTX_FRAME_RELATED_P (insn) = 1;
12804 /* Step 3: the loop. */
12805 rtx size_rtx = GEN_INT (rounded_size);
12806 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12807 size_rtx));
12808 if (m->fs.cfa_reg == stack_pointer_rtx)
12810 m->fs.cfa_offset += rounded_size;
12811 add_reg_note (insn, REG_CFA_DEF_CFA,
12812 plus_constant (Pmode, stack_pointer_rtx,
12813 m->fs.cfa_offset));
12814 RTX_FRAME_RELATED_P (insn) = 1;
12816 m->fs.sp_offset += rounded_size;
12817 emit_insn (gen_blockage ());
12819 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12820 is equal to ROUNDED_SIZE. */
12822 if (size != rounded_size)
12823 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12824 GEN_INT (rounded_size - size), -1,
12825 m->fs.cfa_reg == stack_pointer_rtx);
12826 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12828 /* This does not deallocate the space reserved for the scratch
12829 register. That will be deallocated in the epilogue. */
12830 release_scratch_register_on_entry (&sr, size, false);
12833 /* Make sure nothing is scheduled before we are done. */
12834 emit_insn (gen_blockage ());
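/* Editor's worked example for the inline case above, assuming a 4096-byte
   probe interval: for SIZE == 12000 the loop runs for i = 4096 and 8192,
   emitting two probed 4096-byte allocations, and the residual adjustment
   of i - probe_interval - size = 12288 - 4096 - 12000 = -3808 then
   allocates the remaining 3808 bytes without a probe.  */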
12837 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12839 INT_REGISTERS_SAVED is true if integer registers have already been
12840 pushed on the stack. */
12842 static void
12843 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12844 const bool int_registers_saved)
12846 /* We skip the probe for the first interval + a small dope of 4 words and
12847 probe that many bytes past the specified size to maintain a protection
12848 area at the bottom of the stack. */
12849 const int dope = 4 * UNITS_PER_WORD;
12850 rtx size_rtx = GEN_INT (size), last;
12852 /* See if we have a constant small number of probes to generate. If so,
12853 that's the easy case. The run-time loop is made up of 9 insns in the
12854 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12855 for n # of intervals. */
12856 if (size <= 4 * get_probe_interval ())
12858 HOST_WIDE_INT i, adjust;
12859 bool first_probe = true;
12861 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12862 values of N from 1 until it exceeds SIZE. If only one probe is
12863 needed, this will not generate any code. Then adjust and probe
12864 to PROBE_INTERVAL + SIZE. */
12865 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12867 if (first_probe)
12869 adjust = 2 * get_probe_interval () + dope;
12870 first_probe = false;
12872 else
12873 adjust = get_probe_interval ();
12875 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12876 plus_constant (Pmode, stack_pointer_rtx,
12877 -adjust)));
12878 emit_stack_probe (stack_pointer_rtx);
12881 if (first_probe)
12882 adjust = size + get_probe_interval () + dope;
12883 else
12884 adjust = size + get_probe_interval () - i;
12886 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12887 plus_constant (Pmode, stack_pointer_rtx,
12888 -adjust)));
12889 emit_stack_probe (stack_pointer_rtx);
12891 /* Adjust back to account for the additional first interval. */
12892 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12893 plus_constant (Pmode, stack_pointer_rtx,
12894 (get_probe_interval ()
12895 + dope))));
12898 /* Otherwise, do the same as above, but in a loop. Note that we must be
12899 extra careful with variables wrapping around because we might be at
12900 the very top (or the very bottom) of the address space and we have
12901 to be able to handle this case properly; in particular, we use an
12902 equality test for the loop condition. */
12903 else
12905 /* We expect the GP registers to be saved when probes are used
12906 as the probing sequences might need a scratch register and
12907 the routine to allocate one assumes the integer registers
12908 have already been saved. */
12909 gcc_assert (int_registers_saved);
12911 HOST_WIDE_INT rounded_size;
12912 struct scratch_reg sr;
12914 get_scratch_register_on_entry (&sr);
12916 /* If we needed to save a register, then account for any space
12917 that was pushed (we are not going to pop the register when
12918 we do the restore). */
12919 if (sr.saved)
12920 size -= UNITS_PER_WORD;
12922 /* Step 1: round SIZE to the previous multiple of the interval. */
12924 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12927 /* Step 2: compute initial and final value of the loop counter. */
12929 /* SP = SP_0 + PROBE_INTERVAL. */
12930 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12931 plus_constant (Pmode, stack_pointer_rtx,
12932 - (get_probe_interval () + dope))));
12934 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12935 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12936 emit_insn (gen_rtx_SET (sr.reg,
12937 plus_constant (Pmode, stack_pointer_rtx,
12938 -rounded_size)));
12939 else
12941 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12942 emit_insn (gen_rtx_SET (sr.reg,
12943 gen_rtx_PLUS (Pmode, sr.reg,
12944 stack_pointer_rtx)));
12948 /* Step 3: the loop
12952 SP = SP + PROBE_INTERVAL
12953 probe at SP
12955 while (SP != LAST_ADDR)
12957 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12958 values of N from 1 until it is equal to ROUNDED_SIZE. */
12960 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12963 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12964 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12966 if (size != rounded_size)
12968 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12969 plus_constant (Pmode, stack_pointer_rtx,
12970 rounded_size - size)));
12971 emit_stack_probe (stack_pointer_rtx);
12974 /* Adjust back to account for the additional first interval. */
12975 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12976 plus_constant (Pmode, stack_pointer_rtx,
12977 (get_probe_interval ()
12978 + dope))));
12980 /* This does not deallocate the space reserved for the scratch
12981 register. That will be deallocated in the epilogue. */
12982 release_scratch_register_on_entry (&sr, size, false);
12985 /* Even if the stack pointer isn't the CFA register, we need to correctly
12986 describe the adjustments made to it, in particular differentiate the
12987 frame-related ones from the frame-unrelated ones. */
12988 if (size > 0)
12990 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12991 XVECEXP (expr, 0, 0)
12992 = gen_rtx_SET (stack_pointer_rtx,
12993 plus_constant (Pmode, stack_pointer_rtx, -size));
12994 XVECEXP (expr, 0, 1)
12995 = gen_rtx_SET (stack_pointer_rtx,
12996 plus_constant (Pmode, stack_pointer_rtx,
12997 get_probe_interval () + dope + size));
12998 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12999 RTX_FRAME_RELATED_P (last) = 1;
13001 cfun->machine->fs.sp_offset += size;
13004 /* Make sure nothing is scheduled before we are done. */
13005 emit_insn (gen_blockage ());
13008 /* Adjust the stack pointer up to REG while probing it. */
13010 const char *
13011 output_adjust_stack_and_probe (rtx reg)
13013 static int labelno = 0;
13014 char loop_lab[32];
13015 rtx xops[2];
13017 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13019 /* Loop. */
13020 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13022 /* SP = SP + PROBE_INTERVAL. */
13023 xops[0] = stack_pointer_rtx;
13024 xops[1] = GEN_INT (get_probe_interval ());
13025 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13027 /* Probe at SP. */
13028 xops[1] = const0_rtx;
13029 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13031 /* Test if SP == LAST_ADDR. */
13032 xops[0] = stack_pointer_rtx;
13033 xops[1] = reg;
13034 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13036 /* Branch. */
13037 fputs ("\tjne\t", asm_out_file);
13038 assemble_name_raw (asm_out_file, loop_lab);
13039 fputc ('\n', asm_out_file);
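/* For illustration only: assuming a 4 KiB probe interval, a 64-bit target
   and %r11 as the scratch register holding LAST_ADDR, the loop emitted
   above comes out roughly as (AT&T syntax)

	.LPSRL0:
		subq	$4096, %rsp
		orq	$0, (%rsp)
		cmpq	%r11, %rsp
		jne	.LPSRL0

   The "or" of zero is what actually touches each newly allocated page.  */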
13041 return "";
13044 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13045 inclusive. These are offsets from the current stack pointer.
13047 INT_REGISTERS_SAVED is true if integer registers have already been
13048 pushed on the stack. */
13050 static void
13051 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
13052 const bool int_registers_saved)
13054 /* See if we have a constant small number of probes to generate. If so,
13055 that's the easy case. The run-time loop is made up of 6 insns in the
13056 generic case while the compile-time loop is made up of n insns for the
13057 n intervals. */
13058 if (size <= 6 * get_probe_interval ())
13060 HOST_WIDE_INT i;
13062 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13063 it exceeds SIZE. If only one probe is needed, this will not
13064 generate any code. Then probe at FIRST + SIZE. */
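/* For example, assuming a 4 KiB probe interval, FIRST == 0 and
   SIZE == 10000, this emits probes at sp-4096, sp-8192 and finally
   sp-10000.  */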
13065 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
13066 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13067 -(first + i)));
13069 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13070 -(first + size)));
13073 /* Otherwise, do the same as above, but in a loop. Note that we must be
13074 extra careful with variables wrapping around because we might be at
13075 the very top (or the very bottom) of the address space and we have
13076 to be able to handle this case properly; in particular, we use an
13077 equality test for the loop condition. */
13078 else
13080 /* We expect the GP registers to be saved when probes are used
13081 as the probing sequences might need a scratch register and
13082 the routine to allocate one assumes the integer registers
13083 have already been saved. */
13084 gcc_assert (int_registers_saved);
13086 HOST_WIDE_INT rounded_size, last;
13087 struct scratch_reg sr;
13089 get_scratch_register_on_entry (&sr);
13092 /* Step 1: round SIZE to the previous multiple of the interval. */
13094 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13097 /* Step 2: compute initial and final value of the loop counter. */
13099 /* TEST_OFFSET = FIRST. */
13100 emit_move_insn (sr.reg, GEN_INT (-first));
13102 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13103 last = first + rounded_size;
13106 /* Step 3: the loop
13110 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13111 probe at TEST_ADDR
13113 while (TEST_ADDR != LAST_ADDR)
13115 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13116 until it is equal to ROUNDED_SIZE. */
13118 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13121 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13122 that SIZE is equal to ROUNDED_SIZE. */
13124 if (size != rounded_size)
13125 emit_stack_probe (plus_constant (Pmode,
13126 gen_rtx_PLUS (Pmode,
13127 stack_pointer_rtx,
13128 sr.reg),
13129 rounded_size - size));
13131 release_scratch_register_on_entry (&sr, size, true);
13134 /* Make sure nothing is scheduled before we are done. */
13135 emit_insn (gen_blockage ());
13138 /* Probe a range of stack addresses from REG to END, inclusive. These are
13139 offsets from the current stack pointer. */
13141 const char *
13142 output_probe_stack_range (rtx reg, rtx end)
13144 static int labelno = 0;
13145 char loop_lab[32];
13146 rtx xops[3];
13148 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13150 /* Loop. */
13151 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13153 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13154 xops[0] = reg;
13155 xops[1] = GEN_INT (get_probe_interval ());
13156 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13158 /* Probe at TEST_ADDR. */
13159 xops[0] = stack_pointer_rtx;
13160 xops[1] = reg;
13161 xops[2] = const0_rtx;
13162 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13164 /* Test if TEST_ADDR == LAST_ADDR. */
13165 xops[0] = reg;
13166 xops[1] = end;
13167 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13169 /* Branch. */
13170 fputs ("\tjne\t", asm_out_file);
13171 assemble_name_raw (asm_out_file, loop_lab);
13172 fputc ('\n', asm_out_file);
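/* For illustration only: assuming a 4 KiB probe interval, a 64-bit target
   and %r11 as REG, the loop emitted above comes out roughly as
   (AT&T syntax)

	.LPSRL1:
		subq	$4096, %r11
		orq	$0, (%rsp,%r11)
		cmpq	$END, %r11
		jne	.LPSRL1

   REG starts at -FIRST and walks down towards END (an immediate equal to
   -(FIRST + ROUNDED_SIZE) in the caller above), so each iteration touches
   the page at %rsp + REG.  */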
13174 return "";
13177 /* Return true if stack frame is required. Update STACK_ALIGNMENT
13178 to the largest alignment, in bits, of stack slot used if stack
13179 frame is required and CHECK_STACK_SLOT is true. */
13181 static bool
13182 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13183 bool check_stack_slot)
13185 HARD_REG_SET set_up_by_prologue, prologue_used;
13186 basic_block bb;
13188 CLEAR_HARD_REG_SET (prologue_used);
13189 CLEAR_HARD_REG_SET (set_up_by_prologue);
13190 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13191 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13192 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13193 HARD_FRAME_POINTER_REGNUM);
13195 /* The preferred stack alignment is the minimum stack alignment. */
13196 if (stack_alignment > crtl->preferred_stack_boundary)
13197 stack_alignment = crtl->preferred_stack_boundary;
13199 bool require_stack_frame = false;
13201 FOR_EACH_BB_FN (bb, cfun)
13203 rtx_insn *insn;
13204 FOR_BB_INSNS (bb, insn)
13205 if (NONDEBUG_INSN_P (insn)
13206 && requires_stack_frame_p (insn, prologue_used,
13207 set_up_by_prologue))
13209 require_stack_frame = true;
13211 if (check_stack_slot)
13213 /* Find the maximum stack alignment. */
13214 subrtx_iterator::array_type array;
13215 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13216 if (MEM_P (*iter)
13217 && (reg_mentioned_p (stack_pointer_rtx,
13218 *iter)
13219 || reg_mentioned_p (frame_pointer_rtx,
13220 *iter)))
13222 unsigned int alignment = MEM_ALIGN (*iter);
13223 if (alignment > stack_alignment)
13224 stack_alignment = alignment;
13230 return require_stack_frame;
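/* For illustration: if some insn in the body accesses a 16-byte-aligned
   vector slot through %rsp or %rbp, MEM_ALIGN of that access is 128, so
   the loop above raises STACK_ALIGNMENT to at least 128 bits.  */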
13233 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13234 will guide prologue/epilogue to be generated in correct form. */
13236 static void
13237 ix86_finalize_stack_frame_flags (void)
13239 /* Check if stack realignment is really needed after reload, and
13240 store the result in cfun. */
13241 unsigned int incoming_stack_boundary
13242 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13243 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13244 unsigned int stack_alignment
13245 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13246 ? crtl->max_used_stack_slot_alignment
13247 : crtl->stack_alignment_needed);
13248 unsigned int stack_realign
13249 = (incoming_stack_boundary < stack_alignment);
13250 bool recompute_frame_layout_p = false;
13252 if (crtl->stack_realign_finalized)
13254 /* After stack_realign_needed is finalized, we can no longer
13255 change it. */
13256 gcc_assert (crtl->stack_realign_needed == stack_realign);
13257 return;
13260 /* If the only reason for frame_pointer_needed is that we conservatively
13261 assumed stack realignment might be needed or -fno-omit-frame-pointer
13262 is used, but in the end nothing that needed the stack alignment was
13263 spilled and the stack was never accessed, clear frame_pointer_needed
13264 and say we don't need stack realignment. */
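/* E.g. a leaf function compiled with -fno-omit-frame-pointer that keeps
   everything in registers (no frame slots, no saved SSE registers, no
   alloca) passes all of the tests below and can drop both the frame
   pointer and the realignment.  */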
13265 if ((stack_realign || !flag_omit_frame_pointer)
13266 && frame_pointer_needed
13267 && crtl->is_leaf
13268 && crtl->sp_is_unchanging
13269 && !ix86_current_function_calls_tls_descriptor
13270 && !crtl->accesses_prior_frames
13271 && !cfun->calls_alloca
13272 && !crtl->calls_eh_return
13273 /* See ira_setup_eliminable_regset for the rationale. */
13274 && !(STACK_CHECK_MOVING_SP
13275 && flag_stack_check
13276 && flag_exceptions
13277 && cfun->can_throw_non_call_exceptions)
13278 && !ix86_frame_pointer_required ()
13279 && get_frame_size () == 0
13280 && ix86_nsaved_sseregs () == 0
13281 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13283 if (ix86_find_max_used_stack_alignment (stack_alignment,
13284 stack_realign))
13286 /* Stack frame is required. If stack alignment needed is less
13287 than incoming stack boundary, don't realign stack. */
13288 stack_realign = incoming_stack_boundary < stack_alignment;
13289 if (!stack_realign)
13291 crtl->max_used_stack_slot_alignment
13292 = incoming_stack_boundary;
13293 crtl->stack_alignment_needed
13294 = incoming_stack_boundary;
13295 /* Also update preferred_stack_boundary for leaf
13296 functions. */
13297 crtl->preferred_stack_boundary
13298 = incoming_stack_boundary;
13301 else
13303 /* If drap has been set, but it actually isn't live at the
13304 start of the function, there is no reason to set it up. */
13305 if (crtl->drap_reg)
13307 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13308 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13309 REGNO (crtl->drap_reg)))
13311 crtl->drap_reg = NULL_RTX;
13312 crtl->need_drap = false;
13315 else
13316 cfun->machine->no_drap_save_restore = true;
13318 frame_pointer_needed = false;
13319 stack_realign = false;
13320 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13321 crtl->stack_alignment_needed = incoming_stack_boundary;
13322 crtl->stack_alignment_estimated = incoming_stack_boundary;
13323 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13324 crtl->preferred_stack_boundary = incoming_stack_boundary;
13325 df_finish_pass (true);
13326 df_scan_alloc (NULL);
13327 df_scan_blocks ();
13328 df_compute_regs_ever_live (true);
13329 df_analyze ();
13331 if (flag_var_tracking)
13333 /* Since frame pointer is no longer available, replace it with
13334 stack pointer - UNITS_PER_WORD in debug insns. */
13335 df_ref ref, next;
13336 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13337 ref; ref = next)
13339 next = DF_REF_NEXT_REG (ref);
13340 if (!DF_REF_INSN_INFO (ref))
13341 continue;
13343 /* Make sure the next ref is for a different instruction,
13344 so that we're not affected by the rescan. */
13345 rtx_insn *insn = DF_REF_INSN (ref);
13346 while (next && DF_REF_INSN (next) == insn)
13347 next = DF_REF_NEXT_REG (next);
13349 if (DEBUG_INSN_P (insn))
13351 bool changed = false;
13352 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13354 rtx *loc = DF_REF_LOC (ref);
13355 if (*loc == hard_frame_pointer_rtx)
13357 *loc = plus_constant (Pmode,
13358 stack_pointer_rtx,
13359 -UNITS_PER_WORD);
13360 changed = true;
13363 if (changed)
13364 df_insn_rescan (insn);
13369 recompute_frame_layout_p = true;
13372 else if (crtl->max_used_stack_slot_alignment
13373 > crtl->preferred_stack_boundary)
13375 /* We don't need to realign stack. But we still need to keep
13376 stack frame properly aligned to satisfy the largest alignment
13377 of stack slots. */
13378 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13379 cfun->machine->max_used_stack_alignment
13380 = stack_alignment / BITS_PER_UNIT;
13383 if (crtl->stack_realign_needed != stack_realign)
13384 recompute_frame_layout_p = true;
13385 crtl->stack_realign_needed = stack_realign;
13386 crtl->stack_realign_finalized = true;
13387 if (recompute_frame_layout_p)
13388 ix86_compute_frame_layout ();
13391 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13393 static void
13394 ix86_elim_entry_set_got (rtx reg)
13396 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13397 rtx_insn *c_insn = BB_HEAD (bb);
13398 if (!NONDEBUG_INSN_P (c_insn))
13399 c_insn = next_nonnote_nondebug_insn (c_insn);
13400 if (c_insn && NONJUMP_INSN_P (c_insn))
13402 rtx pat = PATTERN (c_insn);
13403 if (GET_CODE (pat) == PARALLEL)
13405 rtx vec = XVECEXP (pat, 0, 0);
13406 if (GET_CODE (vec) == SET
13407 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13408 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13409 delete_insn (c_insn);
13414 static rtx
13415 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13417 rtx addr, mem;
13419 if (offset)
13420 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13421 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13422 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13425 static inline rtx
13426 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13428 return gen_frame_set (reg, frame_reg, offset, false);
13431 static inline rtx
13432 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13434 return gen_frame_set (reg, frame_reg, offset, true);
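/* For example, for a V4SFmode register REG and a DImode FRAME_REG,
   gen_frame_set (REG, FRAME_REG, -16, true) builds roughly
	(set (mem:V4SF (plus:DI FRAME_REG (const_int -16))) REG)
   and the mirror-image load when STORE is false; the MEM is marked as a
   frame access by gen_frame_mem.  */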
13437 static void
13438 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13440 struct machine_function *m = cfun->machine;
13441 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13442 + m->call_ms2sysv_extra_regs;
13443 rtvec v = rtvec_alloc (ncregs + 1);
13444 unsigned int align, i, vi = 0;
13445 rtx_insn *insn;
13446 rtx sym, addr;
13447 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13448 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13450 /* AL should only be live with sysv_abi. */
13451 gcc_assert (!ix86_eax_live_at_start_p ());
13452 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13454 /* Setup RAX as the stub's base pointer. We use stack_realign_offset
13455 regardless of whether we've actually realigned the stack or not. */
13456 align = GET_MODE_ALIGNMENT (V4SFmode);
13457 addr = choose_baseaddr (frame.stack_realign_offset
13458 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13459 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13461 emit_insn (gen_rtx_SET (rax, addr));
13463 /* Get the stub symbol. */
13464 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13465 : XLOGUE_STUB_SAVE);
13466 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13468 for (i = 0; i < ncregs; ++i)
13470 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13471 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13472 r.regno);
13473 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13476 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13478 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13479 RTX_FRAME_RELATED_P (insn) = true;
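/* The insn emitted above is a single PARALLEL pairing a USE of the stub
   symbol with one frame store per clobbered register, all addressed
   relative to RAX; it is presumably matched by an insn pattern that calls
   the out-of-line save stub to perform those stores.  */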
13482 /* Expand the prologue into a bunch of separate insns. */
13484 void
13485 ix86_expand_prologue (void)
13487 struct machine_function *m = cfun->machine;
13488 rtx insn, t;
13489 HOST_WIDE_INT allocate;
13490 bool int_registers_saved;
13491 bool sse_registers_saved;
13492 bool save_stub_call_needed;
13493 rtx static_chain = NULL_RTX;
13495 if (ix86_function_naked (current_function_decl))
13496 return;
13498 ix86_finalize_stack_frame_flags ();
13500 /* DRAP should not coexist with stack_realign_fp. */
13501 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13503 memset (&m->fs, 0, sizeof (m->fs));
13505 /* Initialize CFA state for before the prologue. */
13506 m->fs.cfa_reg = stack_pointer_rtx;
13507 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13509 /* Track SP offset to the CFA. We continue tracking this after we've
13510 swapped the CFA register away from SP. In the case of re-alignment
13511 this is fudged; we're interested in offsets within the local frame. */
13512 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13513 m->fs.sp_valid = true;
13514 m->fs.sp_realigned = false;
13516 const struct ix86_frame &frame = cfun->machine->frame;
13518 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13520 /* We should have already generated an error for any use of
13521 ms_hook on a nested function. */
13522 gcc_checking_assert (!ix86_static_chain_on_stack);
13524 /* Check if profiling is active and whether we shall use the
13525 profiling-before-prologue variant. If so, sorry. */
13526 if (crtl->profile && flag_fentry != 0)
13527 sorry ("ms_hook_prologue attribute isn%'t compatible "
13528 "with -mfentry for 32-bit");
13530 /* In ix86_asm_output_function_label we emitted:
13531 8b ff movl.s %edi,%edi
13532 55 push %ebp
13533 8b ec movl.s %esp,%ebp
13535 This matches the hookable function prologue in Win32 API
13536 functions in Microsoft Windows XP Service Pack 2 and newer.
13537 Wine uses this to enable Windows apps to hook the Win32 API
13538 functions provided by Wine.
13540 What that means is that we've already set up the frame pointer. */
13542 if (frame_pointer_needed
13543 && !(crtl->drap_reg && crtl->stack_realign_needed))
13545 rtx push, mov;
13547 /* We've decided to use the frame pointer already set up.
13548 Describe this to the unwinder by pretending that both
13549 push and mov insns happen right here.
13551 Putting the unwind info here at the end of the ms_hook
13552 is done so that we can make absolutely certain we get
13553 the required byte sequence at the start of the function,
13554 rather than relying on an assembler that can produce
13555 the exact encoding required.
13557 However it does mean (in the unpatched case) that we have
13558 a 1 insn window where the asynchronous unwind info is
13559 incorrect. However, if we placed the unwind info at
13560 its correct location we would have incorrect unwind info
13561 in the patched case. Which is probably all moot since
13562 I don't expect Wine generates dwarf2 unwind info for the
13563 system libraries that use this feature. */
13565 insn = emit_insn (gen_blockage ());
13567 push = gen_push (hard_frame_pointer_rtx);
13568 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13569 stack_pointer_rtx);
13570 RTX_FRAME_RELATED_P (push) = 1;
13571 RTX_FRAME_RELATED_P (mov) = 1;
13573 RTX_FRAME_RELATED_P (insn) = 1;
13574 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13575 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13577 /* Note that gen_push incremented m->fs.cfa_offset, even
13578 though we didn't emit the push insn here. */
13579 m->fs.cfa_reg = hard_frame_pointer_rtx;
13580 m->fs.fp_offset = m->fs.cfa_offset;
13581 m->fs.fp_valid = true;
13583 else
13585 /* The frame pointer is not needed so pop %ebp again.
13586 This leaves us with a pristine state. */
13587 emit_insn (gen_pop (hard_frame_pointer_rtx));
13591 /* The first insn of a function that accepts its static chain on the
13592 stack is to push the register that would be filled in by a direct
13593 call. This insn will be skipped by the trampoline. */
13594 else if (ix86_static_chain_on_stack)
13596 static_chain = ix86_static_chain (cfun->decl, false);
13597 insn = emit_insn (gen_push (static_chain));
13598 emit_insn (gen_blockage ());
13600 /* We don't want to interpret this push insn as a register save,
13601 only as a stack adjustment. The real copy of the register as
13602 a save will be done later, if needed. */
13603 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13604 t = gen_rtx_SET (stack_pointer_rtx, t);
13605 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13606 RTX_FRAME_RELATED_P (insn) = 1;
13609 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13610 DRAP is needed and stack realignment is really needed after reload. */
13611 if (stack_realign_drap)
13613 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13615 /* Can't use DRAP in interrupt function. */
13616 if (cfun->machine->func_type != TYPE_NORMAL)
13617 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13618 "in interrupt service routine. This may be worked "
13619 "around by avoiding functions with aggregate return.");
13621 /* Only need to push parameter pointer reg if it is caller saved. */
13622 if (!call_used_regs[REGNO (crtl->drap_reg)])
13624 /* Push arg pointer reg */
13625 insn = emit_insn (gen_push (crtl->drap_reg));
13626 RTX_FRAME_RELATED_P (insn) = 1;
13629 /* Grab the argument pointer. */
13630 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13631 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13632 RTX_FRAME_RELATED_P (insn) = 1;
13633 m->fs.cfa_reg = crtl->drap_reg;
13634 m->fs.cfa_offset = 0;
13636 /* Align the stack. */
13637 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13638 stack_pointer_rtx,
13639 GEN_INT (-align_bytes)));
13640 RTX_FRAME_RELATED_P (insn) = 1;
13642 /* Replicate the return address on the stack so that return
13643 address can be reached via (argp - 1) slot. This is needed
13644 to implement macro RETURN_ADDR_RTX and intrinsic function
13645 expand_builtin_return_addr etc. */
13646 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13647 t = gen_frame_mem (word_mode, t);
13648 insn = emit_insn (gen_push (t));
13649 RTX_FRAME_RELATED_P (insn) = 1;
13651 /* For the purposes of frame and register save area addressing,
13652 we've started over with a new frame. */
13653 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13654 m->fs.realigned = true;
13656 if (static_chain)
13658 /* Replicate static chain on the stack so that static chain
13659 can be reached via (argp - 2) slot. This is needed for
13660 nested function with stack realignment. */
13661 insn = emit_insn (gen_push (static_chain));
13662 RTX_FRAME_RELATED_P (insn) = 1;
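/* Roughly, for a 32-bit function realigned to 16 bytes through a call-used
   DRAP register (%ecx assumed here) with nothing pushed yet, the sequence
   emitted above is

	lea	4(%esp), %ecx		# DRAP = incoming argument pointer
	and	$-16, %esp		# realign the stack
	pushl	-4(%ecx)		# replicate the return address

   followed by another push that replicates the static chain when one is
   live.  */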
13666 int_registers_saved = (frame.nregs == 0);
13667 sse_registers_saved = (frame.nsseregs == 0);
13668 save_stub_call_needed = (m->call_ms2sysv);
13669 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13671 if (frame_pointer_needed && !m->fs.fp_valid)
13673 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13674 slower on all targets. Also sdb didn't like it. */
13675 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13676 RTX_FRAME_RELATED_P (insn) = 1;
13678 /* Push registers now, before setting the frame pointer
13679 on SEH target. */
13680 if (!int_registers_saved
13681 && TARGET_SEH
13682 && !frame.save_regs_using_mov)
13684 ix86_emit_save_regs ();
13685 int_registers_saved = true;
13686 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13689 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13691 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13692 RTX_FRAME_RELATED_P (insn) = 1;
13694 if (m->fs.cfa_reg == stack_pointer_rtx)
13695 m->fs.cfa_reg = hard_frame_pointer_rtx;
13696 m->fs.fp_offset = m->fs.sp_offset;
13697 m->fs.fp_valid = true;
13701 if (!int_registers_saved)
13703 /* If saving registers via PUSH, do so now. */
13704 if (!frame.save_regs_using_mov)
13706 ix86_emit_save_regs ();
13707 int_registers_saved = true;
13708 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13711 /* When using the red zone we may start register saving before allocating
13712 the stack frame, saving one cycle of the prologue. However, avoid
13713 doing this if we have to probe the stack; at least on x86_64 the
13714 stack probe can turn into a call that clobbers a red zone location. */
13715 else if (ix86_using_red_zone ()
13716 && (! TARGET_STACK_PROBE
13717 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13719 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13720 int_registers_saved = true;
13724 if (stack_realign_fp)
13726 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13727 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13729 /* Record last valid frame pointer offset. */
13730 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13732 /* The computation of the size of the re-aligned stack frame means
13733 that we must allocate the size of the register save area before
13734 performing the actual alignment. Otherwise we cannot guarantee
13735 that there's enough storage above the realignment point. */
13736 allocate = frame.reg_save_offset - m->fs.sp_offset
13737 + frame.stack_realign_allocate;
13738 if (allocate)
13739 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13740 GEN_INT (-allocate), -1, false);
13742 /* Align the stack. */
13743 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13744 stack_pointer_rtx,
13745 GEN_INT (-align_bytes)));
13746 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13747 m->fs.sp_realigned_offset = m->fs.sp_offset
13748 - frame.stack_realign_allocate;
13749 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13750 Beyond this point, stack access should be done via choose_baseaddr or
13751 by using sp_valid_at and fp_valid_at to determine the correct base
13752 register. Henceforth, any CFA offset should be thought of as logical
13753 and not physical. */
13754 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13755 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13756 m->fs.sp_realigned = true;
13758 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13759 is needed to describe where a register is saved using a realigned
13760 stack pointer, so we need to invalidate the stack pointer for that
13761 target. */
13762 if (TARGET_SEH)
13763 m->fs.sp_valid = false;
13765 /* If SP offset is non-immediate after allocation of the stack frame,
13766 then emit SSE saves or stub call prior to allocating the rest of the
13767 stack frame. This is less efficient for the out-of-line stub because
13768 we can't combine allocations across the call barrier, but it's better
13769 than using a scratch register. */
13770 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13771 - m->fs.sp_realigned_offset),
13772 Pmode))
13774 if (!sse_registers_saved)
13776 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13777 sse_registers_saved = true;
13779 else if (save_stub_call_needed)
13781 ix86_emit_outlined_ms2sysv_save (frame);
13782 save_stub_call_needed = false;
13787 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13789 if (flag_stack_usage_info)
13791 /* We start to count from ARG_POINTER. */
13792 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13794 /* If it was realigned, take into account the fake frame. */
13795 if (stack_realign_drap)
13797 if (ix86_static_chain_on_stack)
13798 stack_size += UNITS_PER_WORD;
13800 if (!call_used_regs[REGNO (crtl->drap_reg)])
13801 stack_size += UNITS_PER_WORD;
13803 /* This over-estimates by 1 minimal-stack-alignment-unit but
13804 mitigates that by counting in the new return address slot. */
13805 current_function_dynamic_stack_size
13806 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13809 current_function_static_stack_size = stack_size;
13812 /* On SEH target with very large frame size, allocate an area to save
13813 SSE registers (as the very large allocation won't be described). */
13814 if (TARGET_SEH
13815 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13816 && !sse_registers_saved)
13818 HOST_WIDE_INT sse_size =
13819 frame.sse_reg_save_offset - frame.reg_save_offset;
13821 gcc_assert (int_registers_saved);
13823 /* No need to do stack checking as the area will be immediately
13824 written. */
13825 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13826 GEN_INT (-sse_size), -1,
13827 m->fs.cfa_reg == stack_pointer_rtx);
13828 allocate -= sse_size;
13829 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13830 sse_registers_saved = true;
13833 /* The stack has already been decremented by the instruction calling us
13834 so probe if the size is non-negative to preserve the protection area. */
13835 if (allocate >= 0
13836 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13837 || flag_stack_clash_protection))
13839 if (flag_stack_clash_protection)
13841 ix86_adjust_stack_and_probe_stack_clash (allocate,
13842 int_registers_saved);
13843 allocate = 0;
13845 else if (STACK_CHECK_MOVING_SP)
13847 if (!(crtl->is_leaf && !cfun->calls_alloca
13848 && allocate <= get_probe_interval ()))
13850 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13851 allocate = 0;
13854 else
13856 HOST_WIDE_INT size = allocate;
13858 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13859 size = 0x80000000 - get_stack_check_protect () - 1;
13861 if (TARGET_STACK_PROBE)
13863 if (crtl->is_leaf && !cfun->calls_alloca)
13865 if (size > get_probe_interval ())
13866 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13868 else
13869 ix86_emit_probe_stack_range (0,
13870 size + get_stack_check_protect (),
13871 int_registers_saved);
13873 else
13875 if (crtl->is_leaf && !cfun->calls_alloca)
13877 if (size > get_probe_interval ()
13878 && size > get_stack_check_protect ())
13879 ix86_emit_probe_stack_range (get_stack_check_protect (),
13880 (size
13881 - get_stack_check_protect ()),
13882 int_registers_saved);
13884 else
13885 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13886 int_registers_saved);
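/* To summarize the dispatch above: -fstack-clash-protection goes through
   ix86_adjust_stack_and_probe_stack_clash, -fstack-check with a moving
   stack pointer goes through ix86_adjust_stack_and_probe, and the
   remaining cases fall back to ix86_emit_probe_stack_range, with the
   probed range offset or extended by get_stack_check_protect () bytes
   depending on TARGET_STACK_PROBE and on whether this is a leaf function
   that does not call alloca.  */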
13891 if (allocate == 0)
13893 else if (!ix86_target_stack_probe ()
13894 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13896 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13897 GEN_INT (-allocate), -1,
13898 m->fs.cfa_reg == stack_pointer_rtx);
13900 else
13902 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13903 rtx r10 = NULL;
13904 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13905 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13906 bool eax_live = ix86_eax_live_at_start_p ();
13907 bool r10_live = false;
13909 if (TARGET_64BIT)
13910 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13912 if (eax_live)
13914 insn = emit_insn (gen_push (eax));
13915 allocate -= UNITS_PER_WORD;
13916 /* Note that SEH directives need to continue tracking the stack
13917 pointer even after the frame pointer has been set up. */
13918 if (sp_is_cfa_reg || TARGET_SEH)
13920 if (sp_is_cfa_reg)
13921 m->fs.cfa_offset += UNITS_PER_WORD;
13922 RTX_FRAME_RELATED_P (insn) = 1;
13923 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13924 gen_rtx_SET (stack_pointer_rtx,
13925 plus_constant (Pmode, stack_pointer_rtx,
13926 -UNITS_PER_WORD)));
13930 if (r10_live)
13932 r10 = gen_rtx_REG (Pmode, R10_REG);
13933 insn = emit_insn (gen_push (r10));
13934 allocate -= UNITS_PER_WORD;
13935 if (sp_is_cfa_reg || TARGET_SEH)
13937 if (sp_is_cfa_reg)
13938 m->fs.cfa_offset += UNITS_PER_WORD;
13939 RTX_FRAME_RELATED_P (insn) = 1;
13940 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13941 gen_rtx_SET (stack_pointer_rtx,
13942 plus_constant (Pmode, stack_pointer_rtx,
13943 -UNITS_PER_WORD)));
13947 emit_move_insn (eax, GEN_INT (allocate));
13948 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13950 /* Use the fact that AX still contains ALLOCATE. */
13951 adjust_stack_insn = (Pmode == DImode
13952 ? gen_pro_epilogue_adjust_stack_di_sub
13953 : gen_pro_epilogue_adjust_stack_si_sub);
13955 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13956 stack_pointer_rtx, eax));
13958 if (sp_is_cfa_reg || TARGET_SEH)
13960 if (sp_is_cfa_reg)
13961 m->fs.cfa_offset += allocate;
13962 RTX_FRAME_RELATED_P (insn) = 1;
13963 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13964 gen_rtx_SET (stack_pointer_rtx,
13965 plus_constant (Pmode, stack_pointer_rtx,
13966 -allocate)));
13968 m->fs.sp_offset += allocate;
13970 /* Use stack_pointer_rtx for relative addressing so that code
13971 works for realigned stack, too. */
13972 if (r10_live && eax_live)
13974 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13975 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13976 gen_frame_mem (word_mode, t));
13977 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13978 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13979 gen_frame_mem (word_mode, t));
13981 else if (eax_live || r10_live)
13983 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13984 emit_move_insn (gen_rtx_REG (word_mode,
13985 (eax_live ? AX_REG : R10_REG)),
13986 gen_frame_mem (word_mode, t));
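/* For a large allocation with AX live at entry, the sequence above comes
   out roughly as

	push	%rax
	mov	$ALLOCATE, %rax
	call	<stack allocation worker>	# e.g. ___chkstk_ms (assumed)
	sub	%rax, %rsp
	mov	(%rsp,%rax), %rax		# reload the saved AX

   The worker is only expected to probe; the explicit sub performs the
   adjustment using the value still held in AX.  */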
13989 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13991 /* If we haven't already set up the frame pointer, do so now. */
13992 if (frame_pointer_needed && !m->fs.fp_valid)
13994 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13995 GEN_INT (frame.stack_pointer_offset
13996 - frame.hard_frame_pointer_offset));
13997 insn = emit_insn (insn);
13998 RTX_FRAME_RELATED_P (insn) = 1;
13999 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14001 if (m->fs.cfa_reg == stack_pointer_rtx)
14002 m->fs.cfa_reg = hard_frame_pointer_rtx;
14003 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14004 m->fs.fp_valid = true;
14007 if (!int_registers_saved)
14008 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14009 if (!sse_registers_saved)
14010 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14011 else if (save_stub_call_needed)
14012 ix86_emit_outlined_ms2sysv_save (frame);
14014 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14015 in PROLOGUE. */
14016 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14018 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14019 insn = emit_insn (gen_set_got (pic));
14020 RTX_FRAME_RELATED_P (insn) = 1;
14021 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14022 emit_insn (gen_prologue_use (pic));
14023 /* Delete any already emitted SET_GOT that was allocated to
14024 REAL_PIC_OFFSET_TABLE_REGNUM. */
14025 ix86_elim_entry_set_got (pic);
14028 if (crtl->drap_reg && !crtl->stack_realign_needed)
14030 /* vDRAP was set up, but after reload it turns out stack realignment
14031 isn't necessary; here we emit prologue code to set up DRAP
14032 without the stack realignment adjustment. */
14033 t = choose_baseaddr (0, NULL);
14034 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14037 /* Prevent instructions from being scheduled into register save push
14038 sequence when access to the redzone area is done through frame pointer.
14039 The offset between the frame pointer and the stack pointer is calculated
14040 relative to the value of the stack pointer at the end of the function
14041 prologue, and moving instructions that access redzone area via frame
14042 pointer inside push sequence violates this assumption. */
14043 if (frame_pointer_needed && frame.red_zone_size)
14044 emit_insn (gen_memory_blockage ());
14046 /* SEH requires that the prologue end within 256 bytes of the start of
14047 the function. Prevent instruction schedules that would extend that.
14048 Further, prevent alloca modifications to the stack pointer from being
14049 combined with prologue modifications. */
14050 if (TARGET_SEH)
14051 emit_insn (gen_prologue_use (stack_pointer_rtx));
14054 /* Emit code to restore REG using a POP insn. */
14056 static void
14057 ix86_emit_restore_reg_using_pop (rtx reg)
14059 struct machine_function *m = cfun->machine;
14060 rtx_insn *insn = emit_insn (gen_pop (reg));
14062 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14063 m->fs.sp_offset -= UNITS_PER_WORD;
14065 if (m->fs.cfa_reg == crtl->drap_reg
14066 && REGNO (reg) == REGNO (crtl->drap_reg))
14068 /* Previously we'd represented the CFA as an expression
14069 like *(%ebp - 8). We've just popped that value from
14070 the stack, which means we need to reset the CFA to
14071 the drap register. This will remain until we restore
14072 the stack pointer. */
14073 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14074 RTX_FRAME_RELATED_P (insn) = 1;
14076 /* This means that the DRAP register is valid for addressing too. */
14077 m->fs.drap_valid = true;
14078 return;
14081 if (m->fs.cfa_reg == stack_pointer_rtx)
14083 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14084 x = gen_rtx_SET (stack_pointer_rtx, x);
14085 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14086 RTX_FRAME_RELATED_P (insn) = 1;
14088 m->fs.cfa_offset -= UNITS_PER_WORD;
14091 /* When the frame pointer is the CFA, and we pop it, we are
14092 swapping back to the stack pointer as the CFA. This happens
14093 for stack frames that don't allocate other data, so we assume
14094 the stack pointer is now pointing at the return address, i.e.
14095 the function entry state, which makes the offset be 1 word. */
14096 if (reg == hard_frame_pointer_rtx)
14098 m->fs.fp_valid = false;
14099 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14101 m->fs.cfa_reg = stack_pointer_rtx;
14102 m->fs.cfa_offset -= UNITS_PER_WORD;
14104 add_reg_note (insn, REG_CFA_DEF_CFA,
14105 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14106 GEN_INT (m->fs.cfa_offset)));
14107 RTX_FRAME_RELATED_P (insn) = 1;
14112 /* Emit code to restore saved registers using POP insns. */
14114 static void
14115 ix86_emit_restore_regs_using_pop (void)
14117 unsigned int regno;
14119 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14120 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14121 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14124 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14125 omits the emit and only attaches the notes. */
14127 static void
14128 ix86_emit_leave (rtx_insn *insn)
14130 struct machine_function *m = cfun->machine;
14131 if (!insn)
14132 insn = emit_insn (ix86_gen_leave ());
14134 ix86_add_queued_cfa_restore_notes (insn);
14136 gcc_assert (m->fs.fp_valid);
14137 m->fs.sp_valid = true;
14138 m->fs.sp_realigned = false;
14139 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14140 m->fs.fp_valid = false;
14142 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14144 m->fs.cfa_reg = stack_pointer_rtx;
14145 m->fs.cfa_offset = m->fs.sp_offset;
14147 add_reg_note (insn, REG_CFA_DEF_CFA,
14148 plus_constant (Pmode, stack_pointer_rtx,
14149 m->fs.sp_offset));
14150 RTX_FRAME_RELATED_P (insn) = 1;
14152 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14153 m->fs.fp_offset);
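/* Note: "leave" is equivalent to "mov %rbp, %rsp" followed by "pop %rbp"
   (%ebp/%esp in 32-bit mode), which is why SP becomes valid again exactly
   at fp_offset - UNITS_PER_WORD.  */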
14156 /* Emit code to restore saved registers using MOV insns.
14157 First register is restored from CFA - CFA_OFFSET. */
14158 static void
14159 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14160 bool maybe_eh_return)
14162 struct machine_function *m = cfun->machine;
14163 unsigned int regno;
14165 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14166 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14168 rtx reg = gen_rtx_REG (word_mode, regno);
14169 rtx mem;
14170 rtx_insn *insn;
14172 mem = choose_baseaddr (cfa_offset, NULL);
14173 mem = gen_frame_mem (word_mode, mem);
14174 insn = emit_move_insn (reg, mem);
14176 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14178 /* Previously we'd represented the CFA as an expression
14179 like *(%ebp - 8). We've just reloaded that value from
14180 the stack, which means we need to reset the CFA to
14181 the drap register. This will remain until we restore
14182 the stack pointer. */
14183 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14184 RTX_FRAME_RELATED_P (insn) = 1;
14186 /* This means that the DRAP register is valid for addressing. */
14187 m->fs.drap_valid = true;
14189 else
14190 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14192 cfa_offset -= UNITS_PER_WORD;
14196 /* Emit code to restore saved SSE registers using MOV insns.
14197 First register is restored from CFA - CFA_OFFSET. */
14198 static void
14199 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14200 bool maybe_eh_return)
14202 unsigned int regno;
14204 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14205 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14207 rtx reg = gen_rtx_REG (V4SFmode, regno);
14208 rtx mem;
14209 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14211 mem = choose_baseaddr (cfa_offset, &align);
14212 mem = gen_rtx_MEM (V4SFmode, mem);
14214 /* The location alignment depends upon the base register. */
14215 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14216 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14217 set_mem_align (mem, align);
14218 emit_insn (gen_rtx_SET (reg, mem));
14220 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14222 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14226 static void
14227 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14228 bool use_call, int style)
14230 struct machine_function *m = cfun->machine;
14231 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14232 + m->call_ms2sysv_extra_regs;
14233 rtvec v;
14234 unsigned int elems_needed, align, i, vi = 0;
14235 rtx_insn *insn;
14236 rtx sym, tmp;
14237 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14238 rtx r10 = NULL_RTX;
14239 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14240 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14241 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14242 rtx rsi_frame_load = NULL_RTX;
14243 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14244 enum xlogue_stub stub;
14246 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14248 /* If using a realigned stack, we should never start with padding. */
14249 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14251 /* Setup RSI as the stub's base pointer. */
14252 align = GET_MODE_ALIGNMENT (V4SFmode);
14253 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14254 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14256 emit_insn (gen_rtx_SET (rsi, tmp));
14258 /* Get a symbol for the stub. */
14259 if (frame_pointer_needed)
14260 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14261 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14262 else
14263 stub = use_call ? XLOGUE_STUB_RESTORE
14264 : XLOGUE_STUB_RESTORE_TAIL;
14265 sym = xlogue.get_stub_rtx (stub);
14267 elems_needed = ncregs;
14268 if (use_call)
14269 elems_needed += 1;
14270 else
14271 elems_needed += frame_pointer_needed ? 5 : 3;
14272 v = rtvec_alloc (elems_needed);
14274 /* We call the epilogue stub when we need to pop incoming args or when a
14275 sibling call will be the tail call. Otherwise, we emit a jmp to the
14276 epilogue stub and the stub itself is the tail call. */
14277 if (use_call)
14278 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14279 else
14281 RTVEC_ELT (v, vi++) = ret_rtx;
14282 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14283 if (frame_pointer_needed)
14285 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14286 gcc_assert (m->fs.fp_valid);
14287 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14289 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14290 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14291 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14292 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14293 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14295 else
14297 /* If no hard frame pointer, we set R10 to the SP restore value. */
14298 gcc_assert (!m->fs.fp_valid);
14299 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14300 gcc_assert (m->fs.sp_valid);
14302 r10 = gen_rtx_REG (DImode, R10_REG);
14303 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14304 emit_insn (gen_rtx_SET (r10, tmp));
14306 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14310 /* Generate frame load insns and restore notes. */
14311 for (i = 0; i < ncregs; ++i)
14313 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14314 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14315 rtx reg, frame_load;
14317 reg = gen_rtx_REG (mode, r.regno);
14318 frame_load = gen_frame_load (reg, rsi, r.offset);
14320 /* Save RSI frame load insn & note to add last. */
14321 if (r.regno == SI_REG)
14323 gcc_assert (!rsi_frame_load);
14324 rsi_frame_load = frame_load;
14325 rsi_restore_offset = r.offset;
14327 else
14329 RTVEC_ELT (v, vi++) = frame_load;
14330 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14334 /* Add RSI frame load & restore note at the end. */
14335 gcc_assert (rsi_frame_load);
14336 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14337 RTVEC_ELT (v, vi++) = rsi_frame_load;
14338 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14339 rsi_restore_offset);
14341 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14342 if (!use_call && !frame_pointer_needed)
14344 gcc_assert (m->fs.sp_valid);
14345 gcc_assert (!m->fs.sp_realigned);
14347 /* At this point, R10 should point to frame.stack_realign_offset. */
14348 if (m->fs.cfa_reg == stack_pointer_rtx)
14349 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14350 m->fs.sp_offset = frame.stack_realign_offset;
14353 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14354 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14355 if (use_call)
14356 insn = emit_insn (tmp);
14357 else
14359 insn = emit_jump_insn (tmp);
14360 JUMP_LABEL (insn) = ret_rtx;
14362 if (frame_pointer_needed)
14363 ix86_emit_leave (insn);
14364 else
14366 /* Need CFA adjust note. */
14367 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14368 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14372 RTX_FRAME_RELATED_P (insn) = true;
14373 ix86_add_queued_cfa_restore_notes (insn);
14375 /* If we're not doing a tail-call, we need to adjust the stack. */
14376 if (use_call && m->fs.sp_valid)
14378 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14379 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14380 GEN_INT (dealloc), style,
14381 m->fs.cfa_reg == stack_pointer_rtx);
14385 /* Restore function stack, frame, and registers. */
14387 void
14388 ix86_expand_epilogue (int style)
14390 struct machine_function *m = cfun->machine;
14391 struct machine_frame_state frame_state_save = m->fs;
14392 bool restore_regs_via_mov;
14393 bool using_drap;
14394 bool restore_stub_is_tail = false;
14396 if (ix86_function_naked (current_function_decl))
14398 /* The program should not reach this point. */
14399 emit_insn (gen_ud2 ());
14400 return;
14403 ix86_finalize_stack_frame_flags ();
14404 const struct ix86_frame &frame = cfun->machine->frame;
14406 m->fs.sp_realigned = stack_realign_fp;
14407 m->fs.sp_valid = stack_realign_fp
14408 || !frame_pointer_needed
14409 || crtl->sp_is_unchanging;
14410 gcc_assert (!m->fs.sp_valid
14411 || m->fs.sp_offset == frame.stack_pointer_offset);
14413 /* The FP must be valid if the frame pointer is present. */
14414 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14415 gcc_assert (!m->fs.fp_valid
14416 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14418 /* We must have *some* valid pointer to the stack frame. */
14419 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14421 /* The DRAP is never valid at this point. */
14422 gcc_assert (!m->fs.drap_valid);
14424 /* See the comment about red zone and frame
14425 pointer usage in ix86_expand_prologue. */
14426 if (frame_pointer_needed && frame.red_zone_size)
14427 emit_insn (gen_memory_blockage ());
14429 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14430 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14432 /* Determine the CFA offset of the end of the red-zone. */
14433 m->fs.red_zone_offset = 0;
14434 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14436 /* The red zone begins below the return address and, in an exception
14437 handler, below the error code. */
14438 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14440 /* When the register save area is in the aligned portion of
14441 the stack, determine the maximum runtime displacement that
14442 matches up with the aligned frame. */
14443 if (stack_realign_drap)
14444 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14445 + UNITS_PER_WORD);
14448 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14450 /* Special care must be taken for the normal return case of a function
14451 using eh_return: the eax and edx registers are marked as saved, but
14452 not restored along this path. Adjust the save location to match. */
14453 if (crtl->calls_eh_return && style != 2)
14454 reg_save_offset -= 2 * UNITS_PER_WORD;
14456 /* EH_RETURN requires the use of moves to function properly. */
14457 if (crtl->calls_eh_return)
14458 restore_regs_via_mov = true;
14459 /* SEH requires the use of pops to identify the epilogue. */
14460 else if (TARGET_SEH)
14461 restore_regs_via_mov = false;
14462 /* If we're only restoring one register and sp cannot be used, then
14463 use a move instruction to restore the register, since it's
14464 less work than reloading sp and popping the register. */
14465 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14466 restore_regs_via_mov = true;
14467 else if (TARGET_EPILOGUE_USING_MOVE
14468 && cfun->machine->use_fast_prologue_epilogue
14469 && (frame.nregs > 1
14470 || m->fs.sp_offset != reg_save_offset))
14471 restore_regs_via_mov = true;
14472 else if (frame_pointer_needed
14473 && !frame.nregs
14474 && m->fs.sp_offset != reg_save_offset)
14475 restore_regs_via_mov = true;
14476 else if (frame_pointer_needed
14477 && TARGET_USE_LEAVE
14478 && cfun->machine->use_fast_prologue_epilogue
14479 && frame.nregs == 1)
14480 restore_regs_via_mov = true;
14481 else
14482 restore_regs_via_mov = false;
14484 if (restore_regs_via_mov || frame.nsseregs)
14486 /* Ensure that the entire register save area is addressable via
14487 the stack pointer, if we will restore SSE regs via sp. */
14488 if (TARGET_64BIT
14489 && m->fs.sp_offset > 0x7fffffff
14490 && sp_valid_at (frame.stack_realign_offset + 1)
14491 && (frame.nsseregs + frame.nregs) != 0)
14493 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14494 GEN_INT (m->fs.sp_offset
14495 - frame.sse_reg_save_offset),
14496 style,
14497 m->fs.cfa_reg == stack_pointer_rtx);
14501 /* If there are any SSE registers to restore, then we have to do it
14502 via moves, since there's obviously no pop for SSE regs. */
14503 if (frame.nsseregs)
14504 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14505 style == 2);
14507 if (m->call_ms2sysv)
14509 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14511 /* We cannot use a tail-call for the stub if:
14512 1. We have to pop incoming args,
14513 2. We have additional int regs to restore, or
14514 3. A sibling call will be the tail-call, or
14515 4. We are emitting an eh_return_internal epilogue.
14517 TODO: Item 4 has not yet been tested!
14519 If any of the above are true, we will call the stub rather than
14520 jump to it. */
14521 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14522 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14525 /* If using an out-of-line stub that is a tail call, then... */
14526 if (m->call_ms2sysv && restore_stub_is_tail)
14528 /* TODO: paranoid tests. (remove eventually) */
14529 gcc_assert (m->fs.sp_valid);
14530 gcc_assert (!m->fs.sp_realigned);
14531 gcc_assert (!m->fs.fp_valid);
14532 gcc_assert (!m->fs.realigned);
14533 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14534 gcc_assert (!crtl->drap_reg);
14535 gcc_assert (!frame.nregs);
14537 else if (restore_regs_via_mov)
14539 rtx t;
14541 if (frame.nregs)
14542 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14544 /* eh_return epilogues need %ecx added to the stack pointer. */
14545 if (style == 2)
14547 rtx sa = EH_RETURN_STACKADJ_RTX;
14548 rtx_insn *insn;
14550 /* %ecx can't be used for both DRAP register and eh_return. */
14551 if (crtl->drap_reg)
14552 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14554 /* regparm nested functions don't work with eh_return. */
14555 gcc_assert (!ix86_static_chain_on_stack);
14557 if (frame_pointer_needed)
14559 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14560 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14561 emit_insn (gen_rtx_SET (sa, t));
14563 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14564 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14566 /* Note that we use SA as a temporary CFA, as the return
14567 address is at the proper place relative to it. We
14568 pretend this happens at the FP restore insn because
14569 prior to this insn the FP would be stored at the wrong
14570 offset relative to SA, and after this insn we have no
14571 other reasonable register to use for the CFA. We don't
14572 bother resetting the CFA to the SP for the duration of
14573 the return insn, unless the control flow instrumentation
14574 is done. In this case the SP is used later and we have
14575 to reset CFA to SP. */
14576 add_reg_note (insn, REG_CFA_DEF_CFA,
14577 plus_constant (Pmode, sa, UNITS_PER_WORD));
14578 ix86_add_queued_cfa_restore_notes (insn);
14579 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14580 RTX_FRAME_RELATED_P (insn) = 1;
14582 m->fs.cfa_reg = sa;
14583 m->fs.cfa_offset = UNITS_PER_WORD;
14584 m->fs.fp_valid = false;
14586 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14587 const0_rtx, style,
14588 flag_cf_protection);
14590 else
14592 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14593 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14594 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14595 ix86_add_queued_cfa_restore_notes (insn);
14597 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14598 if (m->fs.cfa_offset != UNITS_PER_WORD)
14600 m->fs.cfa_offset = UNITS_PER_WORD;
14601 add_reg_note (insn, REG_CFA_DEF_CFA,
14602 plus_constant (Pmode, stack_pointer_rtx,
14603 UNITS_PER_WORD));
14604 RTX_FRAME_RELATED_P (insn) = 1;
14607 m->fs.sp_offset = UNITS_PER_WORD;
14608 m->fs.sp_valid = true;
14609 m->fs.sp_realigned = false;
14612 else
14614 /* SEH requires that the function end with (1) a stack adjustment
14615 if necessary, (2) a sequence of pops, and (3) a return or
14616 jump instruction. Prevent insns from the function body from
14617 being scheduled into this sequence. */
14618 if (TARGET_SEH)
14620 /* Prevent a catch region from being adjacent to the standard
14621 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14622 several other flags that would be interesting to test are
14623 set up yet. */
14624 if (flag_non_call_exceptions)
14625 emit_insn (gen_nops (const1_rtx));
14626 else
14627 emit_insn (gen_blockage ());
14630 /* First step is to deallocate the stack frame so that we can
14631 pop the registers. If the stack pointer was realigned, it needs
14632 to be restored now. Also do it on SEH target for very large
14633 frame as the emitted instructions aren't allowed by the ABI
14634 in epilogues. */
14635 if (!m->fs.sp_valid || m->fs.sp_realigned
14636 || (TARGET_SEH
14637 && (m->fs.sp_offset - reg_save_offset
14638 >= SEH_MAX_FRAME_SIZE)))
14640 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14641 GEN_INT (m->fs.fp_offset
14642 - reg_save_offset),
14643 style, false);
14645 else if (m->fs.sp_offset != reg_save_offset)
14647 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14648 GEN_INT (m->fs.sp_offset
14649 - reg_save_offset),
14650 style,
14651 m->fs.cfa_reg == stack_pointer_rtx);
14654 ix86_emit_restore_regs_using_pop ();
14657 /* If we used a frame pointer and haven't already got rid of it,
14658 then do so now. */
14659 if (m->fs.fp_valid)
14661 /* If the stack pointer is valid and pointing at the frame
14662 pointer store address, then we only need a pop. */
14663 if (sp_valid_at (frame.hfp_save_offset)
14664 && m->fs.sp_offset == frame.hfp_save_offset)
14665 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14666 /* Leave results in shorter dependency chains on CPUs that are
14667 able to grok it fast. */
14668 else if (TARGET_USE_LEAVE
14669 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14670 || !cfun->machine->use_fast_prologue_epilogue)
14671 ix86_emit_leave (NULL);
14672 else
14674 pro_epilogue_adjust_stack (stack_pointer_rtx,
14675 hard_frame_pointer_rtx,
14676 const0_rtx, style, !using_drap);
14677 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14681 if (using_drap)
14683 int param_ptr_offset = UNITS_PER_WORD;
14684 rtx_insn *insn;
14686 gcc_assert (stack_realign_drap);
14688 if (ix86_static_chain_on_stack)
14689 param_ptr_offset += UNITS_PER_WORD;
14690 if (!call_used_regs[REGNO (crtl->drap_reg)])
14691 param_ptr_offset += UNITS_PER_WORD;
14693 insn = emit_insn (gen_rtx_SET
14694 (stack_pointer_rtx,
14695 gen_rtx_PLUS (Pmode,
14696 crtl->drap_reg,
14697 GEN_INT (-param_ptr_offset))));
14698 m->fs.cfa_reg = stack_pointer_rtx;
14699 m->fs.cfa_offset = param_ptr_offset;
14700 m->fs.sp_offset = param_ptr_offset;
14701 m->fs.realigned = false;
14703 add_reg_note (insn, REG_CFA_DEF_CFA,
14704 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14705 GEN_INT (param_ptr_offset)));
14706 RTX_FRAME_RELATED_P (insn) = 1;
14708 if (!call_used_regs[REGNO (crtl->drap_reg)])
14709 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14712 /* At this point the stack pointer must be valid, and we must have
14713 restored all of the registers. We may not have deallocated the
14714 entire stack frame. We've delayed this until now because it may
14715 be possible to merge the local stack deallocation with the
14716 deallocation forced by ix86_static_chain_on_stack. */
14717 gcc_assert (m->fs.sp_valid);
14718 gcc_assert (!m->fs.sp_realigned);
14719 gcc_assert (!m->fs.fp_valid);
14720 gcc_assert (!m->fs.realigned);
14721 if (m->fs.sp_offset != UNITS_PER_WORD)
14723 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14724 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14725 style, true);
14727 else
14728 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14730 /* Sibcall epilogues don't want a return instruction. */
14731 if (style == 0)
14733 m->fs = frame_state_save;
14734 return;
14737 if (cfun->machine->func_type != TYPE_NORMAL)
14738 emit_jump_insn (gen_interrupt_return ());
14739 else if (crtl->args.pops_args && crtl->args.size)
14741 rtx popc = GEN_INT (crtl->args.pops_args);
14743 /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
14744 address, do an explicit add, and jump indirectly to the caller.  */
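  /* The 64K limit comes from the 16-bit immediate operand of the
     "ret imm16" instruction emitted by simple_return_pop_internal below.  */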
14746 if (crtl->args.pops_args >= 65536)
14748 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14749 rtx_insn *insn;
14751 /* There is no "pascal" calling convention in any 64bit ABI. */
14752 gcc_assert (!TARGET_64BIT);
14754 insn = emit_insn (gen_pop (ecx));
14755 m->fs.cfa_offset -= UNITS_PER_WORD;
14756 m->fs.sp_offset -= UNITS_PER_WORD;
14758 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14759 x = gen_rtx_SET (stack_pointer_rtx, x);
14760 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14761 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14762 RTX_FRAME_RELATED_P (insn) = 1;
14764 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14765 popc, -1, true);
14766 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14768 else
14769 emit_jump_insn (gen_simple_return_pop_internal (popc));
14771 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14773 /* In case of return from EH a simple return cannot be used
14774 as a return address will be compared with a shadow stack
14775 return address. Use indirect jump instead. */
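  /* With CET shadow stacks the "ret" instruction itself performs the
     shadow-stack comparison, while an indirect "jmp" through a register
     does not, so popping the return address into a register and jumping
     to it avoids the mismatch after an EH return.  */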
14776 if (style == 2 && flag_cf_protection)
14778 /* Register used in indirect jump must be in word_mode. But
14779 Pmode may not be the same as word_mode for x32. */
14780 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14781 rtx_insn *insn;
14783 insn = emit_insn (gen_pop (ecx));
14784 m->fs.cfa_offset -= UNITS_PER_WORD;
14785 m->fs.sp_offset -= UNITS_PER_WORD;
14787 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14788 x = gen_rtx_SET (stack_pointer_rtx, x);
14789 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14790 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14791 RTX_FRAME_RELATED_P (insn) = 1;
14793 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14795 else
14796 emit_jump_insn (gen_simple_return_internal ());
14799 /* Restore the state back to the state from the prologue,
14800 so that it's correct for the next epilogue. */
14801 m->fs = frame_state_save;
14804 /* Reset from the function's potential modifications. */
14806 static void
14807 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14809 if (pic_offset_table_rtx
14810 && !ix86_use_pseudo_pic_reg ())
14811 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14813 if (TARGET_MACHO)
14815 rtx_insn *insn = get_last_insn ();
14816 rtx_insn *deleted_debug_label = NULL;
14818 /* Mach-O doesn't support labels at the end of objects, so if
14819 it looks like we might want one, take special action.
14820 First, collect any sequence of deleted debug labels. */
14821 while (insn
14822 && NOTE_P (insn)
14823 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14825 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14826 notes; instead set their CODE_LABEL_NUMBER to -1,
14827 otherwise there would be code generation differences
14828 between -g and -g0.  */
14829 if (NOTE_P (insn) && NOTE_KIND (insn)
14830 == NOTE_INSN_DELETED_DEBUG_LABEL)
14831 deleted_debug_label = insn;
14832 insn = PREV_INSN (insn);
14835 /* If we have:
14836 label:
14837 barrier
14838 then this needs to be detected, so skip past the barrier. */
14840 if (insn && BARRIER_P (insn))
14841 insn = PREV_INSN (insn);
14843 /* Up to now we've only seen notes or barriers. */
14844 if (insn)
14846 if (LABEL_P (insn)
14847 || (NOTE_P (insn)
14848 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14849 /* Trailing label. */
14850 fputs ("\tnop\n", file);
14851 else if (cfun && ! cfun->is_thunk)
14853 /* See if we have a completely empty function body, skipping
14854 the special case of the picbase thunk emitted as asm. */
14855 while (insn && ! INSN_P (insn))
14856 insn = PREV_INSN (insn);
14857 /* If we don't find any insns, we've got an empty function body;
14858 i.e. completely empty - without a return or branch.  This is
14859 taken as the case where a function body has been removed
14860 because it contains an inline __builtin_unreachable(). GCC
14861 declares that reaching __builtin_unreachable() means UB so
14862 we're not obliged to do anything special; however, we want
14863 non-zero-sized function bodies. To meet this, and help the
14864 user out, let's trap the case. */
14865 if (insn == NULL)
14866 fputs ("\tud2\n", file);
14869 else if (deleted_debug_label)
14870 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14871 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14872 CODE_LABEL_NUMBER (insn) = -1;
14876 /* Return a scratch register to use in the split stack prologue. The
14877 split stack prologue is used for -fsplit-stack.  It consists of the first
14878 instructions in the function, even before the regular prologue.
14879 The scratch register can be any caller-saved register which is not
14880 used for parameters or for the static chain. */
14882 static unsigned int
14883 split_stack_prologue_scratch_regno (void)
14885 if (TARGET_64BIT)
14886 return R11_REG;
14887 else
14889 bool is_fastcall, is_thiscall;
14890 int regparm;
14892 is_fastcall = (lookup_attribute ("fastcall",
14893 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14894 != NULL);
14895 is_thiscall = (lookup_attribute ("thiscall",
14896 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14897 != NULL);
14898 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14900 if (is_fastcall)
14902 if (DECL_STATIC_CHAIN (cfun->decl))
14904 sorry ("-fsplit-stack does not support fastcall with "
14905 "nested function");
14906 return INVALID_REGNUM;
14908 return AX_REG;
14910 else if (is_thiscall)
14912 if (!DECL_STATIC_CHAIN (cfun->decl))
14913 return DX_REG;
14914 return AX_REG;
14916 else if (regparm < 3)
14918 if (!DECL_STATIC_CHAIN (cfun->decl))
14919 return CX_REG;
14920 else
14922 if (regparm >= 2)
14924 sorry ("-fsplit-stack does not support 2 register "
14925 "parameters for a nested function");
14926 return INVALID_REGNUM;
14928 return DX_REG;
14931 else
14933 /* FIXME: We could make this work by pushing a register
14934 around the addition and comparison. */
14935 sorry ("-fsplit-stack does not support 3 register parameters");
14936 return INVALID_REGNUM;
14941 /* A SYMBOL_REF for the function which allocates new stack space for
14942 -fsplit-stack. */
14944 static GTY(()) rtx split_stack_fn;
14946 /* A SYMBOL_REF for the __morestack function when using the large
14947 model. */
14949 static GTY(()) rtx split_stack_fn_large;
14951 /* Return location of the stack guard value in the TLS block. */
14953 static rtx
14954 ix86_split_stack_guard (void)
14956 int offset;
14957 addr_space_t as = DEFAULT_TLS_SEG_REG;
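  /* DEFAULT_TLS_SEG_REG is the thread segment: typically %fs in 64-bit
     mode and %gs in 32-bit mode on GNU/Linux, so the MEM built below is a
     segment-relative access to the stack-guard field of the TCB.  */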
14958 rtx r;
14960 gcc_assert (flag_split_stack);
14962 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14963 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14964 #else
14965 gcc_unreachable ();
14966 #endif
14968 r = GEN_INT (offset);
14969 r = gen_const_mem (Pmode, r);
14970 set_mem_addr_space (r, as);
14972 return r;
14975 /* Handle -fsplit-stack. These are the first instructions in the
14976 function, even before the regular prologue. */
14978 void
14979 ix86_expand_split_stack_prologue (void)
14981 HOST_WIDE_INT allocate;
14982 unsigned HOST_WIDE_INT args_size;
14983 rtx_code_label *label;
14984 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14985 rtx scratch_reg = NULL_RTX;
14986 rtx_code_label *varargs_label = NULL;
14987 rtx fn;
14989 gcc_assert (flag_split_stack && reload_completed);
14991 ix86_finalize_stack_frame_flags ();
14992 struct ix86_frame &frame = cfun->machine->frame;
14993 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14995 /* This is the label we will branch to if we have enough stack
14996 space. We expect the basic block reordering pass to reverse this
14997 branch if optimizing, so that we branch in the unlikely case. */
14998 label = gen_label_rtx ();
15000 /* We need to compare the stack pointer minus the frame size with
15001 the stack boundary in the TCB. The stack boundary always gives
15002 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15003 can compare directly. Otherwise we need to do an addition. */
15005 limit = ix86_split_stack_guard ();
15007 if (allocate < SPLIT_STACK_AVAILABLE)
15008 current = stack_pointer_rtx;
15009 else
15011 unsigned int scratch_regno;
15012 rtx offset;
15014 /* We need a scratch register to hold the stack pointer minus
15015 the required frame size. Since this is the very start of the
15016 function, the scratch register can be any caller-saved
15017 register which is not used for parameters. */
15018 offset = GEN_INT (- allocate);
15019 scratch_regno = split_stack_prologue_scratch_regno ();
15020 if (scratch_regno == INVALID_REGNUM)
15021 return;
15022 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15023 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15025 /* We don't use ix86_gen_add3 in this case because it will
15026 want to split to lea, but when not optimizing the insn
15027 will not be split after this point. */
15028 emit_insn (gen_rtx_SET (scratch_reg,
15029 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15030 offset)));
15032 else
15034 emit_move_insn (scratch_reg, offset);
15035 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15036 stack_pointer_rtx));
15038 current = scratch_reg;
15041 ix86_expand_branch (GEU, current, limit, label);
15042 rtx_insn *jump_insn = get_last_insn ();
15043 JUMP_LABEL (jump_insn) = label;
15045 /* Mark the jump as very likely to be taken. */
15046 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15048 if (split_stack_fn == NULL_RTX)
15050 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15051 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15053 fn = split_stack_fn;
15055 /* Get more stack space. We pass in the desired stack space and the
15056 size of the arguments to copy to the new stack. In 32-bit mode
15057 we push the parameters; __morestack will return on a new stack
15058 anyhow. In 64-bit mode we pass the parameters in r10 and
15059 r11. */
15060 allocate_rtx = GEN_INT (allocate);
15061 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
15062 call_fusage = NULL_RTX;
15063 rtx pop = NULL_RTX;
15064 if (TARGET_64BIT)
15066 rtx reg10, reg11;
15068 reg10 = gen_rtx_REG (Pmode, R10_REG);
15069 reg11 = gen_rtx_REG (Pmode, R11_REG);
15071 /* If this function uses a static chain, it will be in %r10.
15072 Preserve it across the call to __morestack. */
15073 if (DECL_STATIC_CHAIN (cfun->decl))
15075 rtx rax;
15077 rax = gen_rtx_REG (word_mode, AX_REG);
15078 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15079 use_reg (&call_fusage, rax);
15082 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15083 && !TARGET_PECOFF)
15085 HOST_WIDE_INT argval;
15087 gcc_assert (Pmode == DImode);
15088 /* When using the large model we need to load the address
15089 into a register, and we've run out of registers. So we
15090 switch to a different calling convention, and we call a
15091 different function: __morestack_large. We pass the
15092 argument size in the upper 32 bits of r10 and pass the
15093 frame size in the lower 32 bits. */
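      /* For example, args_size == 0x20 and allocate == 0x1000 produce
	 r10 == 0x0000002000001000.  */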
15094 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15095 gcc_assert ((args_size & 0xffffffff) == args_size);
15097 if (split_stack_fn_large == NULL_RTX)
15099 split_stack_fn_large =
15100 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15101 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15103 if (ix86_cmodel == CM_LARGE_PIC)
15105 rtx_code_label *label;
15106 rtx x;
15108 label = gen_label_rtx ();
15109 emit_label (label);
15110 LABEL_PRESERVE_P (label) = 1;
15111 emit_insn (gen_set_rip_rex64 (reg10, label));
15112 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15113 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15114 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15115 UNSPEC_GOT);
15116 x = gen_rtx_CONST (Pmode, x);
15117 emit_move_insn (reg11, x);
15118 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15119 x = gen_const_mem (Pmode, x);
15120 emit_move_insn (reg11, x);
15122 else
15123 emit_move_insn (reg11, split_stack_fn_large);
15125 fn = reg11;
15127 argval = ((args_size << 16) << 16) + allocate;
15128 emit_move_insn (reg10, GEN_INT (argval));
15130 else
15132 emit_move_insn (reg10, allocate_rtx);
15133 emit_move_insn (reg11, GEN_INT (args_size));
15134 use_reg (&call_fusage, reg11);
15137 use_reg (&call_fusage, reg10);
15139 else
15141 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15142 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15143 insn = emit_insn (gen_push (allocate_rtx));
15144 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15145 pop = GEN_INT (2 * UNITS_PER_WORD);
15147 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15148 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15149 pop, false);
15150 add_function_usage_to (call_insn, call_fusage);
15151 if (!TARGET_64BIT)
15152 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15153 /* Indicate that this function can't jump to non-local gotos. */
15154 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15156 /* In order to make call/return prediction work right, we now need
15157 to execute a return instruction. See
15158 libgcc/config/i386/morestack.S for the details on how this works.
15160 For flow purposes gcc must not see this as a return
15161 instruction--we need control flow to continue at the subsequent
15162 label. Therefore, we use an unspec. */
15163 gcc_assert (crtl->args.pops_args < 65536);
15164 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15166 /* If we are in 64-bit mode and this function uses a static chain,
15167 we saved %r10 in %rax before calling __morestack.  */
15168 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15169 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15170 gen_rtx_REG (word_mode, AX_REG));
15172 /* If this function calls va_start, we need to store a pointer to
15173 the arguments on the old stack, because they may not have been
15174 all copied to the new stack. At this point the old stack can be
15175 found at the frame pointer value used by __morestack, because
15176 __morestack has set that up before calling back to us. Here we
15177 store that pointer in a scratch register, and in
15178 ix86_expand_prologue we store the scratch register in a stack
15179 slot. */
15180 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15182 unsigned int scratch_regno;
15183 rtx frame_reg;
15184 int words;
15186 scratch_regno = split_stack_prologue_scratch_regno ();
15187 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15188 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15190 /* 64-bit:
15191 fp -> old fp value
15192 return address within this function
15193 return address of caller of this function
15194 stack arguments
15195 So we add three words to get to the stack arguments.
15197 32-bit:
15198 fp -> old fp value
15199 return address within this function
15200 first argument to __morestack
15201 second argument to __morestack
15202 return address of caller of this function
15203 stack arguments
15204 So we add five words to get to the stack arguments.  */
15206 words = TARGET_64BIT ? 3 : 5;
15207 emit_insn (gen_rtx_SET (scratch_reg,
15208 gen_rtx_PLUS (Pmode, frame_reg,
15209 GEN_INT (words * UNITS_PER_WORD))));
15211 varargs_label = gen_label_rtx ();
15212 emit_jump_insn (gen_jump (varargs_label));
15213 JUMP_LABEL (get_last_insn ()) = varargs_label;
15215 emit_barrier ();
15218 emit_label (label);
15219 LABEL_NUSES (label) = 1;
15221 /* If this function calls va_start, we now have to set the scratch
15222 register for the case where we do not call __morestack. In this
15223 case we need to set it based on the stack pointer. */
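  /* When __morestack is not called nothing has been pushed yet, so the
     incoming stack arguments still sit one word above the stack pointer,
     just past the return address; hence the UNITS_PER_WORD offset below.  */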
15224 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15226 emit_insn (gen_rtx_SET (scratch_reg,
15227 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15228 GEN_INT (UNITS_PER_WORD))));
15230 emit_label (varargs_label);
15231 LABEL_NUSES (varargs_label) = 1;
15235 /* We may have to tell the dataflow pass that the split stack prologue
15236 is initializing a scratch register. */
15238 static void
15239 ix86_live_on_entry (bitmap regs)
15241 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15243 gcc_assert (flag_split_stack);
15244 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15248 /* Extract the parts of an RTL expression that is a valid memory address
15249 for an instruction. Return 0 if the structure of the address is
15250 grossly off. Return -1 if the address contains ASHIFT, so it is not
15251 strictly valid, but still used for computing the length of the lea instruction.  */
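/* The canonical form is base + index*scale + disp, optionally with a
   segment override.  For example,
     (plus:SI (plus:SI (mult:SI (reg:SI ax) (const_int 4)) (reg:SI bx))
	      (const_int 12))
   decomposes into index %eax, scale 4, base %ebx and disp 12.  */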
15253 int
15254 ix86_decompose_address (rtx addr, struct ix86_address *out)
15256 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15257 rtx base_reg, index_reg;
15258 HOST_WIDE_INT scale = 1;
15259 rtx scale_rtx = NULL_RTX;
15260 rtx tmp;
15261 int retval = 1;
15262 addr_space_t seg = ADDR_SPACE_GENERIC;
15264 /* Allow zero-extended SImode addresses,
15265 they will be emitted with addr32 prefix. */
15266 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15268 if (GET_CODE (addr) == ZERO_EXTEND
15269 && GET_MODE (XEXP (addr, 0)) == SImode)
15271 addr = XEXP (addr, 0);
15272 if (CONST_INT_P (addr))
15273 return 0;
15275 else if (GET_CODE (addr) == AND
15276 && const_32bit_mask (XEXP (addr, 1), DImode))
15278 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15279 if (addr == NULL_RTX)
15280 return 0;
15282 if (CONST_INT_P (addr))
15283 return 0;
15287 /* Allow SImode subregs of DImode addresses,
15288 they will be emitted with addr32 prefix. */
15289 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15291 if (SUBREG_P (addr)
15292 && GET_MODE (SUBREG_REG (addr)) == DImode)
15294 addr = SUBREG_REG (addr);
15295 if (CONST_INT_P (addr))
15296 return 0;
15300 if (REG_P (addr))
15301 base = addr;
15302 else if (SUBREG_P (addr))
15304 if (REG_P (SUBREG_REG (addr)))
15305 base = addr;
15306 else
15307 return 0;
15309 else if (GET_CODE (addr) == PLUS)
15311 rtx addends[4], op;
15312 int n = 0, i;
15314 op = addr;
15317 if (n >= 4)
15318 return 0;
15319 addends[n++] = XEXP (op, 1);
15320 op = XEXP (op, 0);
15322 while (GET_CODE (op) == PLUS);
15323 if (n >= 4)
15324 return 0;
15325 addends[n] = op;
15327 for (i = n; i >= 0; --i)
15329 op = addends[i];
15330 switch (GET_CODE (op))
15332 case MULT:
15333 if (index)
15334 return 0;
15335 index = XEXP (op, 0);
15336 scale_rtx = XEXP (op, 1);
15337 break;
15339 case ASHIFT:
15340 if (index)
15341 return 0;
15342 index = XEXP (op, 0);
15343 tmp = XEXP (op, 1);
15344 if (!CONST_INT_P (tmp))
15345 return 0;
15346 scale = INTVAL (tmp);
15347 if ((unsigned HOST_WIDE_INT) scale > 3)
15348 return 0;
15349 scale = 1 << scale;
15350 break;
15352 case ZERO_EXTEND:
15353 op = XEXP (op, 0);
15354 if (GET_CODE (op) != UNSPEC)
15355 return 0;
15356 /* FALLTHRU */
15358 case UNSPEC:
15359 if (XINT (op, 1) == UNSPEC_TP
15360 && TARGET_TLS_DIRECT_SEG_REFS
15361 && seg == ADDR_SPACE_GENERIC)
15362 seg = DEFAULT_TLS_SEG_REG;
15363 else
15364 return 0;
15365 break;
15367 case SUBREG:
15368 if (!REG_P (SUBREG_REG (op)))
15369 return 0;
15370 /* FALLTHRU */
15372 case REG:
15373 if (!base)
15374 base = op;
15375 else if (!index)
15376 index = op;
15377 else
15378 return 0;
15379 break;
15381 case CONST:
15382 case CONST_INT:
15383 case SYMBOL_REF:
15384 case LABEL_REF:
15385 if (disp)
15386 return 0;
15387 disp = op;
15388 break;
15390 default:
15391 return 0;
15395 else if (GET_CODE (addr) == MULT)
15397 index = XEXP (addr, 0); /* index*scale */
15398 scale_rtx = XEXP (addr, 1);
15400 else if (GET_CODE (addr) == ASHIFT)
15402 /* We're called for lea too, which implements ashift on occasion. */
15403 index = XEXP (addr, 0);
15404 tmp = XEXP (addr, 1);
15405 if (!CONST_INT_P (tmp))
15406 return 0;
15407 scale = INTVAL (tmp);
15408 if ((unsigned HOST_WIDE_INT) scale > 3)
15409 return 0;
15410 scale = 1 << scale;
15411 retval = -1;
15413 else
15414 disp = addr; /* displacement */
15416 if (index)
15418 if (REG_P (index))
15420 else if (SUBREG_P (index)
15421 && REG_P (SUBREG_REG (index)))
15423 else
15424 return 0;
15427 /* Extract the integral value of scale. */
15428 if (scale_rtx)
15430 if (!CONST_INT_P (scale_rtx))
15431 return 0;
15432 scale = INTVAL (scale_rtx);
15435 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15436 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15438 /* Avoid useless 0 displacement. */
15439 if (disp == const0_rtx && (base || index))
15440 disp = NULL_RTX;
15442 /* Allow arg pointer and stack pointer as index if there is no scaling.  */
15443 if (base_reg && index_reg && scale == 1
15444 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15445 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15446 || REGNO (index_reg) == SP_REG))
15448 std::swap (base, index);
15449 std::swap (base_reg, index_reg);
15452 /* Special case: %ebp cannot be encoded as a base without a displacement.
15453 Similarly %r13. */
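  /* In the ModRM/SIB encoding, mod == 00 with a base field of 0b101 (%ebp,
     or %r13 with REX.B) does not mean "base register, no displacement";
     it selects a disp32 (or RIP-relative) form instead, so these registers
     can only be used as a base together with an explicit displacement.  */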
15454 if (!disp && base_reg
15455 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15456 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15457 || REGNO (base_reg) == BP_REG
15458 || REGNO (base_reg) == R13_REG))
15459 disp = const0_rtx;
15461 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15462 Avoid this by transforming to [%esi+0].
15463 Reload calls address legitimization without cfun defined, so we need
15464 to test cfun for being non-NULL. */
15465 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15466 && base_reg && !index_reg && !disp
15467 && REGNO (base_reg) == SI_REG)
15468 disp = const0_rtx;
15470 /* Special case: encode reg+reg instead of reg*2. */
15471 if (!base && index && scale == 2)
15472 base = index, base_reg = index_reg, scale = 1;
15474 /* Special case: scaling cannot be encoded without base or displacement. */
15475 if (!base && !disp && index && scale != 1)
15476 disp = const0_rtx;
15478 out->base = base;
15479 out->index = index;
15480 out->disp = disp;
15481 out->scale = scale;
15482 out->seg = seg;
15484 return retval;
15487 /* Return cost of the memory address x.
15488 For i386, it is better to use a complex address than let gcc copy
15489 the address into a reg and make a new pseudo. But not if the address
15490 requires two regs - that would mean more pseudos with longer
15491 lifetimes. */
15492 static int
15493 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15495 struct ix86_address parts;
15496 int cost = 1;
15497 int ok = ix86_decompose_address (x, &parts);
15499 gcc_assert (ok);
15501 if (parts.base && SUBREG_P (parts.base))
15502 parts.base = SUBREG_REG (parts.base);
15503 if (parts.index && SUBREG_P (parts.index))
15504 parts.index = SUBREG_REG (parts.index);
15506 /* Attempt to minimize number of registers in the address by increasing
15507 address cost for each used register. We don't increase address cost
15508 for "pic_offset_table_rtx".  When a memory operand with "pic_offset_table_rtx"
15509 is not invariant itself, it most likely means that the base or index is not
15510 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15511 which is not profitable for x86. */
15512 if (parts.base
15513 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15514 && (current_pass->type == GIMPLE_PASS
15515 || !pic_offset_table_rtx
15516 || !REG_P (parts.base)
15517 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15518 cost++;
15520 if (parts.index
15521 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15522 && (current_pass->type == GIMPLE_PASS
15523 || !pic_offset_table_rtx
15524 || !REG_P (parts.index)
15525 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15526 cost++;
15528 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15529 since its predecode logic can't detect the length of such instructions
15530 and they degenerate to vector decode.  Increase the cost of such
15531 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
15532 to split such addresses or even refuse such addresses at all.
15534 The following addressing modes are affected:
15535 [base+scale*index]
15536 [scale*index+disp]
15537 [base+index]
15539 The first and last case may be avoidable by explicitly coding the zero in
15540 the memory address, but I don't have an AMD-K6 machine handy to check this
15541 theory. */
15543 if (TARGET_K6
15544 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15545 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15546 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15547 cost += 10;
15549 return cost;
15552 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15553 this is used to form addresses to local data when -fPIC is in
15554 use. */
15556 static bool
15557 darwin_local_data_pic (rtx disp)
15559 return (GET_CODE (disp) == UNSPEC
15560 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15563 /* True if operand X should be loaded from GOT. */
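/* This covers calls and loads of non-local functions under -fno-plt without
   PIC: the address is loaded from the symbol's GOT slot rather than going
   through a PLT stub.  In 32-bit mode this relies on assembler support for
   the GOT32X relocation (HAVE_AS_IX86_GOT32X).  */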
15565 bool
15566 ix86_force_load_from_GOT_p (rtx x)
15568 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15569 && !TARGET_PECOFF && !TARGET_MACHO
15570 && !flag_plt && !flag_pic
15571 && ix86_cmodel != CM_LARGE
15572 && GET_CODE (x) == SYMBOL_REF
15573 && SYMBOL_REF_FUNCTION_P (x)
15574 && !SYMBOL_REF_LOCAL_P (x));
15577 /* Determine if a given RTX is a valid constant. We already know this
15578 satisfies CONSTANT_P. */
15580 static bool
15581 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15583 /* Pointer bounds constants are not valid. */
15584 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15585 return false;
15587 switch (GET_CODE (x))
15589 case CONST:
15590 x = XEXP (x, 0);
15592 if (GET_CODE (x) == PLUS)
15594 if (!CONST_INT_P (XEXP (x, 1)))
15595 return false;
15596 x = XEXP (x, 0);
15599 if (TARGET_MACHO && darwin_local_data_pic (x))
15600 return true;
15602 /* Only some unspecs are valid as "constants". */
15603 if (GET_CODE (x) == UNSPEC)
15604 switch (XINT (x, 1))
15606 case UNSPEC_GOT:
15607 case UNSPEC_GOTOFF:
15608 case UNSPEC_PLTOFF:
15609 return TARGET_64BIT;
15610 case UNSPEC_TPOFF:
15611 case UNSPEC_NTPOFF:
15612 x = XVECEXP (x, 0, 0);
15613 return (GET_CODE (x) == SYMBOL_REF
15614 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15615 case UNSPEC_DTPOFF:
15616 x = XVECEXP (x, 0, 0);
15617 return (GET_CODE (x) == SYMBOL_REF
15618 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15619 default:
15620 return false;
15623 /* We must have drilled down to a symbol. */
15624 if (GET_CODE (x) == LABEL_REF)
15625 return true;
15626 if (GET_CODE (x) != SYMBOL_REF)
15627 return false;
15628 /* FALLTHRU */
15630 case SYMBOL_REF:
15631 /* TLS symbols are never valid. */
15632 if (SYMBOL_REF_TLS_MODEL (x))
15633 return false;
15635 /* DLLIMPORT symbols are never valid. */
15636 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15637 && SYMBOL_REF_DLLIMPORT_P (x))
15638 return false;
15640 #if TARGET_MACHO
15641 /* mdynamic-no-pic */
15642 if (MACHO_DYNAMIC_NO_PIC_P)
15643 return machopic_symbol_defined_p (x);
15644 #endif
15646 /* External function address should be loaded
15647 via the GOT slot to avoid PLT. */
15648 if (ix86_force_load_from_GOT_p (x))
15649 return false;
15651 break;
15653 CASE_CONST_SCALAR_INT:
15654 switch (mode)
15656 case E_TImode:
15657 if (TARGET_64BIT)
15658 return true;
15659 /* FALLTHRU */
15660 case E_OImode:
15661 case E_XImode:
15662 if (!standard_sse_constant_p (x, mode))
15663 return false;
15664 default:
15665 break;
15667 break;
15669 case CONST_VECTOR:
15670 if (!standard_sse_constant_p (x, mode))
15671 return false;
15673 default:
15674 break;
15677 /* Otherwise we handle everything else in the move patterns. */
15678 return true;
15681 /* Determine if it's legal to put X into the constant pool. This
15682 is not possible for the address of thread-local symbols, which
15683 is checked above. */
15685 static bool
15686 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15688 /* We can put any immediate constant in memory. */
15689 switch (GET_CODE (x))
15691 CASE_CONST_ANY:
15692 return false;
15694 default:
15695 break;
15698 return !ix86_legitimate_constant_p (mode, x);
15701 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15702 otherwise zero. */
15704 static bool
15705 is_imported_p (rtx x)
15707 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15708 || GET_CODE (x) != SYMBOL_REF)
15709 return false;
15711 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15715 /* Nonzero if the constant value X is a legitimate general operand
15716 when generating PIC code. It is given that flag_pic is on and
15717 that X satisfies CONSTANT_P. */
15719 bool
15720 legitimate_pic_operand_p (rtx x)
15722 rtx inner;
15724 switch (GET_CODE (x))
15726 case CONST:
15727 inner = XEXP (x, 0);
15728 if (GET_CODE (inner) == PLUS
15729 && CONST_INT_P (XEXP (inner, 1)))
15730 inner = XEXP (inner, 0);
15732 /* Only some unspecs are valid as "constants". */
15733 if (GET_CODE (inner) == UNSPEC)
15734 switch (XINT (inner, 1))
15736 case UNSPEC_GOT:
15737 case UNSPEC_GOTOFF:
15738 case UNSPEC_PLTOFF:
15739 return TARGET_64BIT;
15740 case UNSPEC_TPOFF:
15741 x = XVECEXP (inner, 0, 0);
15742 return (GET_CODE (x) == SYMBOL_REF
15743 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15744 case UNSPEC_MACHOPIC_OFFSET:
15745 return legitimate_pic_address_disp_p (x);
15746 default:
15747 return false;
15749 /* FALLTHRU */
15751 case SYMBOL_REF:
15752 case LABEL_REF:
15753 return legitimate_pic_address_disp_p (x);
15755 default:
15756 return true;
15760 /* Determine if a given CONST RTX is a valid memory displacement
15761 in PIC mode. */
15763 bool
15764 legitimate_pic_address_disp_p (rtx disp)
15766 bool saw_plus;
15768 /* In 64bit mode we can allow direct addresses of symbols and labels
15769 when they are not dynamic symbols. */
15770 if (TARGET_64BIT)
15772 rtx op0 = disp, op1;
15774 switch (GET_CODE (disp))
15776 case LABEL_REF:
15777 return true;
15779 case CONST:
15780 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15781 break;
15782 op0 = XEXP (XEXP (disp, 0), 0);
15783 op1 = XEXP (XEXP (disp, 0), 1);
15784 if (!CONST_INT_P (op1))
15785 break;
15786 if (GET_CODE (op0) == UNSPEC
15787 && (XINT (op0, 1) == UNSPEC_DTPOFF
15788 || XINT (op0, 1) == UNSPEC_NTPOFF)
15789 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15790 return true;
15791 if (INTVAL (op1) >= 16*1024*1024
15792 || INTVAL (op1) < -16*1024*1024)
15793 break;
15794 if (GET_CODE (op0) == LABEL_REF)
15795 return true;
15796 if (GET_CODE (op0) == CONST
15797 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15798 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15799 return true;
15800 if (GET_CODE (op0) == UNSPEC
15801 && XINT (op0, 1) == UNSPEC_PCREL)
15802 return true;
15803 if (GET_CODE (op0) != SYMBOL_REF)
15804 break;
15805 /* FALLTHRU */
15807 case SYMBOL_REF:
15808 /* TLS references should always be enclosed in UNSPEC.
15809 The dllimported symbol always needs to be resolved.  */
15810 if (SYMBOL_REF_TLS_MODEL (op0)
15811 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15812 return false;
15814 if (TARGET_PECOFF)
15816 if (is_imported_p (op0))
15817 return true;
15819 if (SYMBOL_REF_FAR_ADDR_P (op0)
15820 || !SYMBOL_REF_LOCAL_P (op0))
15821 break;
15823 /* Function-symbols need to be resolved only for
15824 large-model.
15825 For the small-model we don't need to resolve anything
15826 here. */
15827 if ((ix86_cmodel != CM_LARGE_PIC
15828 && SYMBOL_REF_FUNCTION_P (op0))
15829 || ix86_cmodel == CM_SMALL_PIC)
15830 return true;
15831 /* Non-external symbols don't need to be resolved for
15832 the large and medium models.  */
15833 if ((ix86_cmodel == CM_LARGE_PIC
15834 || ix86_cmodel == CM_MEDIUM_PIC)
15835 && !SYMBOL_REF_EXTERNAL_P (op0))
15836 return true;
15838 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15839 && (SYMBOL_REF_LOCAL_P (op0)
15840 || (HAVE_LD_PIE_COPYRELOC
15841 && flag_pie
15842 && !SYMBOL_REF_WEAK (op0)
15843 && !SYMBOL_REF_FUNCTION_P (op0)))
15844 && ix86_cmodel != CM_LARGE_PIC)
15845 return true;
15846 break;
15848 default:
15849 break;
15852 if (GET_CODE (disp) != CONST)
15853 return false;
15854 disp = XEXP (disp, 0);
15856 if (TARGET_64BIT)
15858 /* It is not safe to allow PLUS expressions here; that would extend the
15859 allowed distance of GOT references.  We should not need these anyway.  */
15860 if (GET_CODE (disp) != UNSPEC
15861 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15862 && XINT (disp, 1) != UNSPEC_GOTOFF
15863 && XINT (disp, 1) != UNSPEC_PCREL
15864 && XINT (disp, 1) != UNSPEC_PLTOFF))
15865 return false;
15867 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15868 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15869 return false;
15870 return true;
15873 saw_plus = false;
15874 if (GET_CODE (disp) == PLUS)
15876 if (!CONST_INT_P (XEXP (disp, 1)))
15877 return false;
15878 disp = XEXP (disp, 0);
15879 saw_plus = true;
15882 if (TARGET_MACHO && darwin_local_data_pic (disp))
15883 return true;
15885 if (GET_CODE (disp) != UNSPEC)
15886 return false;
15888 switch (XINT (disp, 1))
15890 case UNSPEC_GOT:
15891 if (saw_plus)
15892 return false;
15893 /* We need to check for both symbols and labels because VxWorks loads
15894 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15895 details. */
15896 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15897 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15898 case UNSPEC_GOTOFF:
15899 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15900 While the ABI also specifies a 32bit relocation, we don't produce it in
15901 the small PIC model at all.  */
15902 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15903 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15904 && !TARGET_64BIT)
15905 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15906 return false;
15907 case UNSPEC_GOTTPOFF:
15908 case UNSPEC_GOTNTPOFF:
15909 case UNSPEC_INDNTPOFF:
15910 if (saw_plus)
15911 return false;
15912 disp = XVECEXP (disp, 0, 0);
15913 return (GET_CODE (disp) == SYMBOL_REF
15914 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15915 case UNSPEC_NTPOFF:
15916 disp = XVECEXP (disp, 0, 0);
15917 return (GET_CODE (disp) == SYMBOL_REF
15918 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15919 case UNSPEC_DTPOFF:
15920 disp = XVECEXP (disp, 0, 0);
15921 return (GET_CODE (disp) == SYMBOL_REF
15922 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15925 return false;
15928 /* Determine if OP is a suitable RTX for an address register.
15929 Return the naked register if a register or a register subreg is
15930 found, otherwise return NULL_RTX.  */
15932 static rtx
15933 ix86_validate_address_register (rtx op)
15935 machine_mode mode = GET_MODE (op);
15937 /* Only SImode or DImode registers can form the address. */
15938 if (mode != SImode && mode != DImode)
15939 return NULL_RTX;
15941 if (REG_P (op))
15942 return op;
15943 else if (SUBREG_P (op))
15945 rtx reg = SUBREG_REG (op);
15947 if (!REG_P (reg))
15948 return NULL_RTX;
15950 mode = GET_MODE (reg);
15952 /* Don't allow SUBREGs that span more than a word. It can
15953 lead to spill failures when the register is one word out
15954 of a two word structure. */
15955 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15956 return NULL_RTX;
15958 /* Allow only SUBREGs of non-eliminable hard registers. */
15959 if (register_no_elim_operand (reg, mode))
15960 return reg;
15963 /* Op is not a register. */
15964 return NULL_RTX;
15967 /* Recognizes RTL expressions that are valid memory addresses for an
15968 instruction. The MODE argument is the machine mode for the MEM
15969 expression that wants to use this address.
15971 It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
15972 convert common non-canonical forms to canonical form so that they will
15973 be recognized. */
15975 static bool
15976 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15978 struct ix86_address parts;
15979 rtx base, index, disp;
15980 HOST_WIDE_INT scale;
15981 addr_space_t seg;
15983 if (ix86_decompose_address (addr, &parts) <= 0)
15984 /* Decomposition failed. */
15985 return false;
15987 base = parts.base;
15988 index = parts.index;
15989 disp = parts.disp;
15990 scale = parts.scale;
15991 seg = parts.seg;
15993 /* Validate base register. */
15994 if (base)
15996 rtx reg = ix86_validate_address_register (base);
15998 if (reg == NULL_RTX)
15999 return false;
16001 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16002 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16003 /* Base is not valid. */
16004 return false;
16007 /* Validate index register. */
16008 if (index)
16010 rtx reg = ix86_validate_address_register (index);
16012 if (reg == NULL_RTX)
16013 return false;
16015 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16016 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16017 /* Index is not valid. */
16018 return false;
16021 /* Index and base should have the same mode. */
16022 if (base && index
16023 && GET_MODE (base) != GET_MODE (index))
16024 return false;
16026 /* Address override works only on the (%reg) part of %fs:(%reg). */
16027 if (seg != ADDR_SPACE_GENERIC
16028 && ((base && GET_MODE (base) != word_mode)
16029 || (index && GET_MODE (index) != word_mode)))
16030 return false;
16032 /* Validate scale factor. */
16033 if (scale != 1)
16035 if (!index)
16036 /* Scale without index. */
16037 return false;
16039 if (scale != 2 && scale != 4 && scale != 8)
16040 /* Scale is not a valid multiplier. */
16041 return false;
16044 /* Validate displacement. */
16045 if (disp)
16047 if (GET_CODE (disp) == CONST
16048 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16049 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16050 switch (XINT (XEXP (disp, 0), 1))
16052 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16053 when used.  While the ABI also specifies 32bit relocations, we
16054 don't produce them at all and use IP-relative addressing instead.
16055 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16056 should be loaded via GOT. */
16057 case UNSPEC_GOT:
16058 if (!TARGET_64BIT
16059 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16060 goto is_legitimate_pic;
16061 /* FALLTHRU */
16062 case UNSPEC_GOTOFF:
16063 gcc_assert (flag_pic);
16064 if (!TARGET_64BIT)
16065 goto is_legitimate_pic;
16067 /* 64bit address unspec. */
16068 return false;
16070 case UNSPEC_GOTPCREL:
16071 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16072 goto is_legitimate_pic;
16073 /* FALLTHRU */
16074 case UNSPEC_PCREL:
16075 gcc_assert (flag_pic);
16076 goto is_legitimate_pic;
16078 case UNSPEC_GOTTPOFF:
16079 case UNSPEC_GOTNTPOFF:
16080 case UNSPEC_INDNTPOFF:
16081 case UNSPEC_NTPOFF:
16082 case UNSPEC_DTPOFF:
16083 break;
16085 default:
16086 /* Invalid address unspec. */
16087 return false;
16090 else if (SYMBOLIC_CONST (disp)
16091 && (flag_pic
16092 || (TARGET_MACHO
16093 #if TARGET_MACHO
16094 && MACHOPIC_INDIRECT
16095 && !machopic_operand_p (disp)
16096 #endif
16100 is_legitimate_pic:
16101 if (TARGET_64BIT && (index || base))
16103 /* foo@dtpoff(%rX) is ok. */
16104 if (GET_CODE (disp) != CONST
16105 || GET_CODE (XEXP (disp, 0)) != PLUS
16106 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16107 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16108 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16109 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16110 /* Non-constant pic memory reference. */
16111 return false;
16113 else if ((!TARGET_MACHO || flag_pic)
16114 && ! legitimate_pic_address_disp_p (disp))
16115 /* Displacement is an invalid pic construct. */
16116 return false;
16117 #if TARGET_MACHO
16118 else if (MACHO_DYNAMIC_NO_PIC_P
16119 && !ix86_legitimate_constant_p (Pmode, disp))
16120 /* displacement must be referenced via non_lazy_pointer */
16121 return false;
16122 #endif
16124 /* This code used to verify that a symbolic pic displacement
16125 includes the pic_offset_table_rtx register.
16127 While this is a good idea, unfortunately these constructs may
16128 be created by the "adds using lea" optimization for incorrect
16129 code like:
16131 int a;
16132 int foo(int i)
16134 return *(&a+i);
16137 This code is nonsensical, but results in addressing the
16138 GOT table with a pic_offset_table_rtx base.  We can't
16139 just refuse it easily, since it gets matched by the
16140 "addsi3" pattern, which later gets split to lea when the
16141 output register differs from the input.  While this
16142 could be handled by a separate addsi pattern for this case
16143 that never results in lea, disabling this test seems to be
16144 the easier and correct fix for the crash.  */
16146 else if (GET_CODE (disp) != LABEL_REF
16147 && !CONST_INT_P (disp)
16148 && (GET_CODE (disp) != CONST
16149 || !ix86_legitimate_constant_p (Pmode, disp))
16150 && (GET_CODE (disp) != SYMBOL_REF
16151 || !ix86_legitimate_constant_p (Pmode, disp)))
16152 /* Displacement is not constant. */
16153 return false;
16154 else if (TARGET_64BIT
16155 && !x86_64_immediate_operand (disp, VOIDmode))
16156 /* Displacement is out of range. */
16157 return false;
16158 /* In x32 mode, constant addresses are sign extended to 64bit, so
16159 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16160 else if (TARGET_X32 && !(index || base)
16161 && CONST_INT_P (disp)
16162 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16163 return false;
16166 /* Everything looks valid. */
16167 return true;
16170 /* Determine if a given RTX is a valid constant address. */
16172 bool
16173 constant_address_p (rtx x)
16175 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16178 /* Return a unique alias set for the GOT. */
16180 static alias_set_type
16181 ix86_GOT_alias_set (void)
16183 static alias_set_type set = -1;
16184 if (set == -1)
16185 set = new_alias_set ();
16186 return set;
16189 /* Return a legitimate reference for ORIG (an address) using the
16190 register REG. If REG is 0, a new pseudo is generated.
16192 There are two types of references that must be handled:
16194 1. Global data references must load the address from the GOT, via
16195 the PIC reg. An insn is emitted to do this load, and the reg is
16196 returned.
16198 2. Static data references, constant pool addresses, and code labels
16199 compute the address as an offset from the GOT, whose base is in
16200 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16201 differentiate them from global data objects. The returned
16202 address is the PIC reg + an unspec constant.
16204 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16205 reg also appears in the address. */
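/* In 32-bit PIC code with %ebx as the PIC register the two cases come out
   roughly as
     movl foo@GOT(%ebx), %reg      (case 1: load the address from the GOT)
     leal bar@GOTOFF(%ebx), %reg   (case 2: PIC register plus offset).  */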
16207 static rtx
16208 legitimize_pic_address (rtx orig, rtx reg)
16210 rtx addr = orig;
16211 rtx new_rtx = orig;
16213 #if TARGET_MACHO
16214 if (TARGET_MACHO && !TARGET_64BIT)
16216 if (reg == 0)
16217 reg = gen_reg_rtx (Pmode);
16218 /* Use the generic Mach-O PIC machinery. */
16219 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16221 #endif
16223 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16225 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16226 if (tmp)
16227 return tmp;
16230 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16231 new_rtx = addr;
16232 else if ((!TARGET_64BIT
16233 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16234 && !TARGET_PECOFF
16235 && gotoff_operand (addr, Pmode))
16237 /* This symbol may be referenced via a displacement
16238 from the PIC base address (@GOTOFF). */
16239 if (GET_CODE (addr) == CONST)
16240 addr = XEXP (addr, 0);
16242 if (GET_CODE (addr) == PLUS)
16244 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16245 UNSPEC_GOTOFF);
16246 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16248 else
16249 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16251 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16253 if (TARGET_64BIT)
16254 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16256 if (reg != 0)
16258 gcc_assert (REG_P (reg));
16259 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16260 new_rtx, reg, 1, OPTAB_DIRECT);
16262 else
16263 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16265 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16266 /* We can't use @GOTOFF for text labels
16267 on VxWorks, see gotoff_operand. */
16268 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16270 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16271 if (tmp)
16272 return tmp;
16274 /* For x64 PE-COFF there is no GOT table,
16275 so we use address directly. */
16276 if (TARGET_64BIT && TARGET_PECOFF)
16278 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16279 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16281 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16283 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16284 UNSPEC_GOTPCREL);
16285 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16286 new_rtx = gen_const_mem (Pmode, new_rtx);
16287 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16289 else
16291 /* This symbol must be referenced via a load
16292 from the Global Offset Table (@GOT). */
16293 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16294 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16295 if (TARGET_64BIT)
16296 new_rtx = force_reg (Pmode, new_rtx);
16297 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16298 new_rtx = gen_const_mem (Pmode, new_rtx);
16299 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16302 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16304 else
16306 if (CONST_INT_P (addr)
16307 && !x86_64_immediate_operand (addr, VOIDmode))
16308 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16309 else if (GET_CODE (addr) == CONST)
16311 addr = XEXP (addr, 0);
16313 /* We must match stuff we generate before. Assume the only
16314 unspecs that can get here are ours. Not that we could do
16315 anything with them anyway.... */
16316 if (GET_CODE (addr) == UNSPEC
16317 || (GET_CODE (addr) == PLUS
16318 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16319 return orig;
16320 gcc_assert (GET_CODE (addr) == PLUS);
16323 if (GET_CODE (addr) == PLUS)
16325 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16327 /* Check first to see if this is a constant
16328 offset from a @GOTOFF symbol reference. */
16329 if (!TARGET_PECOFF
16330 && gotoff_operand (op0, Pmode)
16331 && CONST_INT_P (op1))
16333 if (!TARGET_64BIT)
16335 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16336 UNSPEC_GOTOFF);
16337 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16338 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16340 if (reg != 0)
16342 gcc_assert (REG_P (reg));
16343 new_rtx = expand_simple_binop (Pmode, PLUS,
16344 pic_offset_table_rtx,
16345 new_rtx, reg, 1,
16346 OPTAB_DIRECT);
16348 else
16349 new_rtx
16350 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16352 else
16354 if (INTVAL (op1) < -16*1024*1024
16355 || INTVAL (op1) >= 16*1024*1024)
16357 if (!x86_64_immediate_operand (op1, Pmode))
16358 op1 = force_reg (Pmode, op1);
16360 new_rtx
16361 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16365 else
16367 rtx base = legitimize_pic_address (op0, reg);
16368 machine_mode mode = GET_MODE (base);
16369 new_rtx
16370 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16372 if (CONST_INT_P (new_rtx))
16374 if (INTVAL (new_rtx) < -16*1024*1024
16375 || INTVAL (new_rtx) >= 16*1024*1024)
16377 if (!x86_64_immediate_operand (new_rtx, mode))
16378 new_rtx = force_reg (mode, new_rtx);
16380 new_rtx
16381 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16383 else
16384 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16386 else
16388 /* For %rip addressing, we have to use
16389 just disp32, not base nor index. */
16390 if (TARGET_64BIT
16391 && (GET_CODE (base) == SYMBOL_REF
16392 || GET_CODE (base) == LABEL_REF))
16393 base = force_reg (mode, base);
16394 if (GET_CODE (new_rtx) == PLUS
16395 && CONSTANT_P (XEXP (new_rtx, 1)))
16397 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16398 new_rtx = XEXP (new_rtx, 1);
16400 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16405 return new_rtx;
16408 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16410 static rtx
16411 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16413 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16415 if (GET_MODE (tp) != tp_mode)
16417 gcc_assert (GET_MODE (tp) == SImode);
16418 gcc_assert (tp_mode == DImode);
16420 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16423 if (to_reg)
16424 tp = copy_to_mode_reg (tp_mode, tp);
16426 return tp;
16429 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16431 static GTY(()) rtx ix86_tls_symbol;
16433 static rtx
16434 ix86_tls_get_addr (void)
16436 if (!ix86_tls_symbol)
16438 const char *sym
16439 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16440 ? "___tls_get_addr" : "__tls_get_addr");
16442 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16445 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16447 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16448 UNSPEC_PLTOFF);
16449 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16450 gen_rtx_CONST (Pmode, unspec));
16453 return ix86_tls_symbol;
16456 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16458 static GTY(()) rtx ix86_tls_module_base_symbol;
16460 static rtx
16461 ix86_tls_module_base (void)
16463 if (!ix86_tls_module_base_symbol)
16465 ix86_tls_module_base_symbol
16466 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16468 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16469 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16472 return ix86_tls_module_base_symbol;
16475 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16476 false if we expect this to be used for a memory address and true if
16477 we expect to load the address into a register. */
16479 static rtx
16480 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16482 rtx dest, base, off;
16483 rtx pic = NULL_RTX, tp = NULL_RTX;
16484 machine_mode tp_mode = Pmode;
16485 int type;
16487 /* Fall back to global dynamic model if tool chain cannot support local
16488 dynamic. */
16489 if (TARGET_SUN_TLS && !TARGET_64BIT
16490 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16491 && model == TLS_MODEL_LOCAL_DYNAMIC)
16492 model = TLS_MODEL_GLOBAL_DYNAMIC;
16494 switch (model)
16496 case TLS_MODEL_GLOBAL_DYNAMIC:
16497 dest = gen_reg_rtx (Pmode);
16499 if (!TARGET_64BIT)
16501 if (flag_pic && !TARGET_PECOFF)
16502 pic = pic_offset_table_rtx;
16503 else
16505 pic = gen_reg_rtx (Pmode);
16506 emit_insn (gen_set_got (pic));
16510 if (TARGET_GNU2_TLS)
16512 if (TARGET_64BIT)
16513 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16514 else
16515 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16517 tp = get_thread_pointer (Pmode, true);
16518 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16520 if (GET_MODE (x) != Pmode)
16521 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16523 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16525 else
16527 rtx caddr = ix86_tls_get_addr ();
16529 if (TARGET_64BIT)
16531 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16532 rtx_insn *insns;
16534 start_sequence ();
16535 emit_call_insn
16536 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16537 insns = get_insns ();
16538 end_sequence ();
16540 if (GET_MODE (x) != Pmode)
16541 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16543 RTL_CONST_CALL_P (insns) = 1;
16544 emit_libcall_block (insns, dest, rax, x);
16546 else
16547 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16549 break;
16551 case TLS_MODEL_LOCAL_DYNAMIC:
16552 base = gen_reg_rtx (Pmode);
16554 if (!TARGET_64BIT)
16556 if (flag_pic)
16557 pic = pic_offset_table_rtx;
16558 else
16560 pic = gen_reg_rtx (Pmode);
16561 emit_insn (gen_set_got (pic));
16565 if (TARGET_GNU2_TLS)
16567 rtx tmp = ix86_tls_module_base ();
16569 if (TARGET_64BIT)
16570 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16571 else
16572 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16574 tp = get_thread_pointer (Pmode, true);
16575 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16576 gen_rtx_MINUS (Pmode, tmp, tp));
16578 else
16580 rtx caddr = ix86_tls_get_addr ();
16582 if (TARGET_64BIT)
16584 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16585 rtx_insn *insns;
16586 rtx eqv;
16588 start_sequence ();
16589 emit_call_insn
16590 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16591 insns = get_insns ();
16592 end_sequence ();
16594 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16595 share the LD_BASE result with other LD model accesses. */
16596 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16597 UNSPEC_TLS_LD_BASE);
16599 RTL_CONST_CALL_P (insns) = 1;
16600 emit_libcall_block (insns, base, rax, eqv);
16602 else
16603 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16606 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16607 off = gen_rtx_CONST (Pmode, off);
16609 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16611 if (TARGET_GNU2_TLS)
16613 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16615 if (GET_MODE (x) != Pmode)
16616 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16618 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16620 break;
16622 case TLS_MODEL_INITIAL_EXEC:
16623 if (TARGET_64BIT)
16625 if (TARGET_SUN_TLS && !TARGET_X32)
16627 /* The Sun linker took the AMD64 TLS spec literally
16628 and can only handle %rax as destination of the
16629 initial executable code sequence. */
16631 dest = gen_reg_rtx (DImode);
16632 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16633 return dest;
16636 /* Generate DImode references to avoid %fs:(%reg32)
16637 problems and linker IE->LE relaxation bug. */
16638 tp_mode = DImode;
16639 pic = NULL;
16640 type = UNSPEC_GOTNTPOFF;
16642 else if (flag_pic)
16644 pic = pic_offset_table_rtx;
16645 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16647 else if (!TARGET_ANY_GNU_TLS)
16649 pic = gen_reg_rtx (Pmode);
16650 emit_insn (gen_set_got (pic));
16651 type = UNSPEC_GOTTPOFF;
16653 else
16655 pic = NULL;
16656 type = UNSPEC_INDNTPOFF;
16659 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16660 off = gen_rtx_CONST (tp_mode, off);
16661 if (pic)
16662 off = gen_rtx_PLUS (tp_mode, pic, off);
16663 off = gen_const_mem (tp_mode, off);
16664 set_mem_alias_set (off, ix86_GOT_alias_set ());
16666 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16668 base = get_thread_pointer (tp_mode,
16669 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16670 off = force_reg (tp_mode, off);
16671 dest = gen_rtx_PLUS (tp_mode, base, off);
16672 if (tp_mode != Pmode)
16673 dest = convert_to_mode (Pmode, dest, 1);
16675 else
16677 base = get_thread_pointer (Pmode, true);
16678 dest = gen_reg_rtx (Pmode);
16679 emit_insn (ix86_gen_sub3 (dest, base, off));
16681 break;
16683 case TLS_MODEL_LOCAL_EXEC:
16684 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16685 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16686 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16687 off = gen_rtx_CONST (Pmode, off);
16689 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16691 base = get_thread_pointer (Pmode,
16692 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16693 return gen_rtx_PLUS (Pmode, base, off);
16695 else
16697 base = get_thread_pointer (Pmode, true);
16698 dest = gen_reg_rtx (Pmode);
16699 emit_insn (ix86_gen_sub3 (dest, base, off));
16701 break;
16703 default:
16704 gcc_unreachable ();
16707 return dest;
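/* Illustrative only: on x86-64 with GNU TLS, the models handled by
   legitimize_tls_address above typically correspond to assembly sequences
   along these lines (exact output depends on options and the assembler):

     general dynamic:  leaq  x@tlsgd(%rip), %rdi
                       call  __tls_get_addr@PLT
     initial exec:     movq  x@gottpoff(%rip), %rax
                       movq  %fs:(%rax), %rax
     local exec:       movq  %fs:x@tpoff, %rax

   The local-exec form assumes TARGET_TLS_DIRECT_SEG_REFS; otherwise the
   thread pointer is loaded separately and the @tpoff offset is added.  */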
16710 /* Return true if OP refers to a TLS address. */
16711 bool
16712 ix86_tls_address_pattern_p (rtx op)
16714 subrtx_var_iterator::array_type array;
16715 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16717 rtx op = *iter;
16718 if (MEM_P (op))
16720 rtx *x = &XEXP (op, 0);
16721 while (GET_CODE (*x) == PLUS)
16723 int i;
16724 for (i = 0; i < 2; i++)
16726 rtx u = XEXP (*x, i);
16727 if (GET_CODE (u) == ZERO_EXTEND)
16728 u = XEXP (u, 0);
16729 if (GET_CODE (u) == UNSPEC
16730 && XINT (u, 1) == UNSPEC_TP)
16731 return true;
16733 x = &XEXP (*x, 0);
16736 iter.skip_subrtxes ();
16740 return false;
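/* A sketch of the shape recognized above: the walk reports any MEM whose
   address is a chain of PLUS terms containing the thread-pointer unspec,
   possibly under a ZERO_EXTEND, e.g.

     (mem (plus (unspec [(const_int 0)] UNSPEC_TP)
                (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))))

   as produced for a local-exec access by legitimize_tls_address.  */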
16743 /* Rewrite *LOC so that it refers to a default TLS address space. */
16744 void
16745 ix86_rewrite_tls_address_1 (rtx *loc)
16747 subrtx_ptr_iterator::array_type array;
16748 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16750 rtx *loc = *iter;
16751 if (MEM_P (*loc))
16753 rtx addr = XEXP (*loc, 0);
16754 rtx *x = &addr;
16755 while (GET_CODE (*x) == PLUS)
16757 int i;
16758 for (i = 0; i < 2; i++)
16760 rtx u = XEXP (*x, i);
16761 if (GET_CODE (u) == ZERO_EXTEND)
16762 u = XEXP (u, 0);
16763 if (GET_CODE (u) == UNSPEC
16764 && XINT (u, 1) == UNSPEC_TP)
16766 addr_space_t as = DEFAULT_TLS_SEG_REG;
16768 *x = XEXP (*x, 1 - i);
16770 *loc = replace_equiv_address_nv (*loc, addr, true);
16771 set_mem_addr_space (*loc, as);
16772 return;
16775 x = &XEXP (*x, 0);
16778 iter.skip_subrtxes ();
16783 /* Rewrite an instruction pattern involving a TLS address
16784 so that it refers to a default TLS address space. */
16786 ix86_rewrite_tls_address (rtx pattern)
16788 pattern = copy_insn (pattern);
16789 ix86_rewrite_tls_address_1 (&pattern);
16790 return pattern;
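/* Rough before/after for the rewrite above, assuming the pattern from the
   previous sketch: ix86_rewrite_tls_address_1 drops the UNSPEC_TP term from
   the PLUS chain and retargets the MEM to DEFAULT_TLS_SEG_REG, so the
   access becomes a MEM of just the @tpoff/@ntpoff constant in the %fs or
   %gs address space.  */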
16793 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16794 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16795 unique refptr-DECL symbol corresponding to symbol DECL. */
16797 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16799 static inline hashval_t hash (tree_map *m) { return m->hash; }
16800 static inline bool
16801 equal (tree_map *a, tree_map *b)
16803 return a->base.from == b->base.from;
16806 static int
16807 keep_cache_entry (tree_map *&m)
16809 return ggc_marked_p (m->base.from);
16813 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16815 static tree
16816 get_dllimport_decl (tree decl, bool beimport)
16818 struct tree_map *h, in;
16819 const char *name;
16820 const char *prefix;
16821 size_t namelen, prefixlen;
16822 char *imp_name;
16823 tree to;
16824 rtx rtl;
16826 if (!dllimport_map)
16827 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16829 in.hash = htab_hash_pointer (decl);
16830 in.base.from = decl;
16831 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16832 h = *loc;
16833 if (h)
16834 return h->to;
16836 *loc = h = ggc_alloc<tree_map> ();
16837 h->hash = in.hash;
16838 h->base.from = decl;
16839 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16840 VAR_DECL, NULL, ptr_type_node);
16841 DECL_ARTIFICIAL (to) = 1;
16842 DECL_IGNORED_P (to) = 1;
16843 DECL_EXTERNAL (to) = 1;
16844 TREE_READONLY (to) = 1;
16846 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16847 name = targetm.strip_name_encoding (name);
16848 if (beimport)
16849 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16850 ? "*__imp_" : "*__imp__";
16851 else
16852 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16853 namelen = strlen (name);
16854 prefixlen = strlen (prefix);
16855 imp_name = (char *) alloca (namelen + prefixlen + 1);
16856 memcpy (imp_name, prefix, prefixlen);
16857 memcpy (imp_name + prefixlen, name, namelen + 1);
16859 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16860 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16861 SET_SYMBOL_REF_DECL (rtl, to);
16862 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16863 if (!beimport)
16865 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16866 #ifdef SUB_TARGET_RECORD_STUB
16867 SUB_TARGET_RECORD_STUB (name);
16868 #endif
16871 rtl = gen_const_mem (Pmode, rtl);
16872 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16874 SET_DECL_RTL (to, rtl);
16875 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16877 return to;
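/* For illustration (assuming no user label prefix): for a dllimported
   symbol "foo" the code above builds the name "*__imp_foo" (or
   "*.refptr.foo" when BEIMPORT is false), wraps it in a constant MEM with
   the GOT alias set, and hangs that RTL off an artificial read-only
   VAR_DECL, so later references load the pointer from the import table.  */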
16880 /* Expand SYMBOL into its corresponding far-address symbol.
16881 WANT_REG is true if we require the result be a register. */
16883 static rtx
16884 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16886 tree imp_decl;
16887 rtx x;
16889 gcc_assert (SYMBOL_REF_DECL (symbol));
16890 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16892 x = DECL_RTL (imp_decl);
16893 if (want_reg)
16894 x = force_reg (Pmode, x);
16895 return x;
16898 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16899 true if we require the result be a register. */
16901 static rtx
16902 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16904 tree imp_decl;
16905 rtx x;
16907 gcc_assert (SYMBOL_REF_DECL (symbol));
16908 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16910 x = DECL_RTL (imp_decl);
16911 if (want_reg)
16912 x = force_reg (Pmode, x);
16913 return x;
16916 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16917 is true if we require the result be a register. */
16919 static rtx
16920 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16922 if (!TARGET_PECOFF)
16923 return NULL_RTX;
16925 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16927 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16928 return legitimize_dllimport_symbol (addr, inreg);
16929 if (GET_CODE (addr) == CONST
16930 && GET_CODE (XEXP (addr, 0)) == PLUS
16931 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16932 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16934 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16935 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16939 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16940 return NULL_RTX;
16941 if (GET_CODE (addr) == SYMBOL_REF
16942 && !is_imported_p (addr)
16943 && SYMBOL_REF_EXTERNAL_P (addr)
16944 && SYMBOL_REF_DECL (addr))
16945 return legitimize_pe_coff_extern_decl (addr, inreg);
16947 if (GET_CODE (addr) == CONST
16948 && GET_CODE (XEXP (addr, 0)) == PLUS
16949 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16950 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16951 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16952 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16954 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16955 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16957 return NULL_RTX;
16960 /* Try machine-dependent ways of modifying an illegitimate address
16961 to be legitimate. If we find one, return the new, valid address.
16962 This macro is used in only one place: `memory_address' in explow.c.
16964 OLDX is the address as it was before break_out_memory_refs was called.
16965 In some cases it is useful to look at this to decide what needs to be done.
16967 It is always safe for this macro to do nothing. It exists to recognize
16968 opportunities to optimize the output.
16970 For the 80386, we handle X+REG by loading X into a register R and
16971 using R+REG. R will go in a general reg and indexing will be used.
16972 However, if REG is a broken-out memory address or multiplication,
16973 nothing needs to be done because REG can certainly go in a general reg.
16975 When -fpic is used, special handling is needed for symbolic references.
16976 See comments by legitimize_pic_address in i386.c for details. */
16978 static rtx
16979 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16981 bool changed = false;
16982 unsigned log;
16984 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16985 if (log)
16986 return legitimize_tls_address (x, (enum tls_model) log, false);
16987 if (GET_CODE (x) == CONST
16988 && GET_CODE (XEXP (x, 0)) == PLUS
16989 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16990 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16992 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16993 (enum tls_model) log, false);
16994 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16997 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16999 rtx tmp = legitimize_pe_coff_symbol (x, true);
17000 if (tmp)
17001 return tmp;
17004 if (flag_pic && SYMBOLIC_CONST (x))
17005 return legitimize_pic_address (x, 0);
17007 #if TARGET_MACHO
17008 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17009 return machopic_indirect_data_reference (x, 0);
17010 #endif
17012 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17013 if (GET_CODE (x) == ASHIFT
17014 && CONST_INT_P (XEXP (x, 1))
17015 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17017 changed = true;
17018 log = INTVAL (XEXP (x, 1));
17019 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17020 GEN_INT (1 << log));
17023 if (GET_CODE (x) == PLUS)
17025 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17027 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17028 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17029 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17031 changed = true;
17032 log = INTVAL (XEXP (XEXP (x, 0), 1));
17033 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17034 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17035 GEN_INT (1 << log));
17038 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17039 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17040 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17042 changed = true;
17043 log = INTVAL (XEXP (XEXP (x, 1), 1));
17044 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17045 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17046 GEN_INT (1 << log));
17049 /* Put multiply first if it isn't already. */
17050 if (GET_CODE (XEXP (x, 1)) == MULT)
17052 std::swap (XEXP (x, 0), XEXP (x, 1));
17053 changed = true;
17056 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17057 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17058 created by virtual register instantiation, register elimination, and
17059 similar optimizations. */
17060 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17062 changed = true;
17063 x = gen_rtx_PLUS (Pmode,
17064 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17065 XEXP (XEXP (x, 1), 0)),
17066 XEXP (XEXP (x, 1), 1));
17069 /* Canonicalize
17070 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17071 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17072 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17073 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17074 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17075 && CONSTANT_P (XEXP (x, 1)))
17077 rtx constant;
17078 rtx other = NULL_RTX;
17080 if (CONST_INT_P (XEXP (x, 1)))
17082 constant = XEXP (x, 1);
17083 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17085 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17087 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17088 other = XEXP (x, 1);
17090 else
17091 constant = 0;
17093 if (constant)
17095 changed = true;
17096 x = gen_rtx_PLUS (Pmode,
17097 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17098 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17099 plus_constant (Pmode, other,
17100 INTVAL (constant)));
17104 if (changed && ix86_legitimate_address_p (mode, x, false))
17105 return x;
17107 if (GET_CODE (XEXP (x, 0)) == MULT)
17109 changed = true;
17110 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17113 if (GET_CODE (XEXP (x, 1)) == MULT)
17115 changed = true;
17116 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17119 if (changed
17120 && REG_P (XEXP (x, 1))
17121 && REG_P (XEXP (x, 0)))
17122 return x;
17124 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17126 changed = true;
17127 x = legitimize_pic_address (x, 0);
17130 if (changed && ix86_legitimate_address_p (mode, x, false))
17131 return x;
17133 if (REG_P (XEXP (x, 0)))
17135 rtx temp = gen_reg_rtx (Pmode);
17136 rtx val = force_operand (XEXP (x, 1), temp);
17137 if (val != temp)
17139 val = convert_to_mode (Pmode, val, 1);
17140 emit_move_insn (temp, val);
17143 XEXP (x, 1) = temp;
17144 return x;
17147 else if (REG_P (XEXP (x, 1)))
17149 rtx temp = gen_reg_rtx (Pmode);
17150 rtx val = force_operand (XEXP (x, 0), temp);
17151 if (val != temp)
17153 val = convert_to_mode (Pmode, val, 1);
17154 emit_move_insn (temp, val);
17157 XEXP (x, 0) = temp;
17158 return x;
17162 return x;
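/* A small example of the canonicalizations performed above: an address
   such as

     (plus (ashift (reg A) (const_int 2)) (reg B))

   is rewritten into

     (plus (mult (reg A) (const_int 4)) (reg B))

   matching the hardware base + index*scale form, and the MULT is swapped
   into operand 0 so the later checks only have to look in one place.  */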
17165 /* Print an integer constant expression in assembler syntax. Addition
17166 and subtraction are the only arithmetic that may appear in these
17167 expressions. FILE is the stdio stream to write to, X is the rtx, and
17168 CODE is the operand print code from the output string. */
17170 static void
17171 output_pic_addr_const (FILE *file, rtx x, int code)
17173 char buf[256];
17175 switch (GET_CODE (x))
17177 case PC:
17178 gcc_assert (flag_pic);
17179 putc ('.', file);
17180 break;
17182 case SYMBOL_REF:
17183 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17184 output_addr_const (file, x);
17185 else
17187 const char *name = XSTR (x, 0);
17189 /* Mark the decl as referenced so that cgraph will
17190 output the function. */
17191 if (SYMBOL_REF_DECL (x))
17192 mark_decl_referenced (SYMBOL_REF_DECL (x));
17194 #if TARGET_MACHO
17195 if (MACHOPIC_INDIRECT
17196 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17197 name = machopic_indirection_name (x, /*stub_p=*/true);
17198 #endif
17199 assemble_name (file, name);
17201 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17202 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17203 fputs ("@PLT", file);
17204 break;
17206 case LABEL_REF:
17207 x = XEXP (x, 0);
17208 /* FALLTHRU */
17209 case CODE_LABEL:
17210 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17211 assemble_name (asm_out_file, buf);
17212 break;
17214 case CONST_INT:
17215 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17216 break;
17218 case CONST:
17219 /* This used to output parentheses around the expression,
17220 but that does not work on the 386 (either ATT or BSD assembler). */
17221 output_pic_addr_const (file, XEXP (x, 0), code);
17222 break;
17224 case CONST_DOUBLE:
17225 /* We can't handle floating point constants;
17226 TARGET_PRINT_OPERAND must handle them. */
17227 output_operand_lossage ("floating constant misused");
17228 break;
17230 case PLUS:
17231 /* Some assemblers need integer constants to appear first. */
17232 if (CONST_INT_P (XEXP (x, 0)))
17234 output_pic_addr_const (file, XEXP (x, 0), code);
17235 putc ('+', file);
17236 output_pic_addr_const (file, XEXP (x, 1), code);
17238 else
17240 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17241 output_pic_addr_const (file, XEXP (x, 1), code);
17242 putc ('+', file);
17243 output_pic_addr_const (file, XEXP (x, 0), code);
17245 break;
17247 case MINUS:
17248 if (!TARGET_MACHO)
17249 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17250 output_pic_addr_const (file, XEXP (x, 0), code);
17251 putc ('-', file);
17252 output_pic_addr_const (file, XEXP (x, 1), code);
17253 if (!TARGET_MACHO)
17254 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17255 break;
17257 case UNSPEC:
17258 gcc_assert (XVECLEN (x, 0) == 1);
17259 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17260 switch (XINT (x, 1))
17262 case UNSPEC_GOT:
17263 fputs ("@GOT", file);
17264 break;
17265 case UNSPEC_GOTOFF:
17266 fputs ("@GOTOFF", file);
17267 break;
17268 case UNSPEC_PLTOFF:
17269 fputs ("@PLTOFF", file);
17270 break;
17271 case UNSPEC_PCREL:
17272 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17273 "(%rip)" : "[rip]", file);
17274 break;
17275 case UNSPEC_GOTPCREL:
17276 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17277 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17278 break;
17279 case UNSPEC_GOTTPOFF:
17280 /* FIXME: This might be @TPOFF in Sun ld too. */
17281 fputs ("@gottpoff", file);
17282 break;
17283 case UNSPEC_TPOFF:
17284 fputs ("@tpoff", file);
17285 break;
17286 case UNSPEC_NTPOFF:
17287 if (TARGET_64BIT)
17288 fputs ("@tpoff", file);
17289 else
17290 fputs ("@ntpoff", file);
17291 break;
17292 case UNSPEC_DTPOFF:
17293 fputs ("@dtpoff", file);
17294 break;
17295 case UNSPEC_GOTNTPOFF:
17296 if (TARGET_64BIT)
17297 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17298 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17299 else
17300 fputs ("@gotntpoff", file);
17301 break;
17302 case UNSPEC_INDNTPOFF:
17303 fputs ("@indntpoff", file);
17304 break;
17305 #if TARGET_MACHO
17306 case UNSPEC_MACHOPIC_OFFSET:
17307 putc ('-', file);
17308 machopic_output_function_base_name (file);
17309 break;
17310 #endif
17311 default:
17312 output_operand_lossage ("invalid UNSPEC as operand");
17313 break;
17315 break;
17317 default:
17318 output_operand_lossage ("invalid expression as operand");
17322 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17323 We need to emit DTP-relative relocations. */
17325 static void ATTRIBUTE_UNUSED
17326 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17328 fputs (ASM_LONG, file);
17329 output_addr_const (file, x);
17330 fputs ("@dtpoff", file);
17331 switch (size)
17333 case 4:
17334 break;
17335 case 8:
17336 fputs (", 0", file);
17337 break;
17338 default:
17339 gcc_unreachable ();
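/* Sketch of the output produced above: a 4-byte DTP-relative reference to
   "x" is emitted as roughly ".long x@dtpoff", and an 8-byte reference as
   ".long x@dtpoff, 0", with the upper 32 bits filled with zero.  */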
17343 /* Return true if X is a representation of the PIC register. This copes
17344 with calls from ix86_find_base_term, where the register might have
17345 been replaced by a cselib value. */
17347 static bool
17348 ix86_pic_register_p (rtx x)
17350 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17351 return (pic_offset_table_rtx
17352 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17353 else if (!REG_P (x))
17354 return false;
17355 else if (pic_offset_table_rtx)
17357 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17358 return true;
17359 if (HARD_REGISTER_P (x)
17360 && !HARD_REGISTER_P (pic_offset_table_rtx)
17361 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17362 return true;
17363 return false;
17365 else
17366 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17369 /* Helper function for ix86_delegitimize_address.
17370 Attempt to delegitimize TLS local-exec accesses. */
17372 static rtx
17373 ix86_delegitimize_tls_address (rtx orig_x)
17375 rtx x = orig_x, unspec;
17376 struct ix86_address addr;
17378 if (!TARGET_TLS_DIRECT_SEG_REFS)
17379 return orig_x;
17380 if (MEM_P (x))
17381 x = XEXP (x, 0);
17382 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17383 return orig_x;
17384 if (ix86_decompose_address (x, &addr) == 0
17385 || addr.seg != DEFAULT_TLS_SEG_REG
17386 || addr.disp == NULL_RTX
17387 || GET_CODE (addr.disp) != CONST)
17388 return orig_x;
17389 unspec = XEXP (addr.disp, 0);
17390 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17391 unspec = XEXP (unspec, 0);
17392 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17393 return orig_x;
17394 x = XVECEXP (unspec, 0, 0);
17395 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17396 if (unspec != XEXP (addr.disp, 0))
17397 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17398 if (addr.index)
17400 rtx idx = addr.index;
17401 if (addr.scale != 1)
17402 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17403 x = gen_rtx_PLUS (Pmode, idx, x);
17405 if (addr.base)
17406 x = gen_rtx_PLUS (Pmode, addr.base, x);
17407 if (MEM_P (orig_x))
17408 x = replace_equiv_address_nv (orig_x, x);
17409 return x;
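/* Sketch of the delegitimization above: a local-exec access whose
   decomposed address has the TLS segment (DEFAULT_TLS_SEG_REG) and a
   (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)) displacement is turned
   back into plain base + index*scale + "x", i.e. the original symbol,
   which is what the debug and delegitimization callers want to see.  */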
17412 /* In the name of slightly smaller debug output, and to cater to
17413 general assembler lossage, recognize PIC+GOTOFF and turn it back
17414 into a direct symbol reference.
17416 On Darwin, this is necessary to avoid a crash, because Darwin
17417 has a different PIC label for each routine but the DWARF debugging
17418 information is not associated with any particular routine, so it's
17419 necessary to remove references to the PIC label from RTL stored by
17420 the DWARF output code.
17422 This helper is used in the normal ix86_delegitimize_address
17423 entrypoint (e.g. used in the target delegitimization hook) and
17424 in ix86_find_base_term. As compile time memory optimization, we
17425 avoid allocating rtxes that will not change anything on the outcome
17426 of the callers (find_base_value and find_base_term). */
17428 static inline rtx
17429 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17431 rtx orig_x = delegitimize_mem_from_attrs (x);
17432 /* addend is NULL or some rtx if x is something+GOTOFF where
17433 something doesn't include the PIC register. */
17434 rtx addend = NULL_RTX;
17435 /* reg_addend is NULL or a multiple of some register. */
17436 rtx reg_addend = NULL_RTX;
17437 /* const_addend is NULL or a const_int. */
17438 rtx const_addend = NULL_RTX;
17439 /* This is the result, or NULL. */
17440 rtx result = NULL_RTX;
17442 x = orig_x;
17444 if (MEM_P (x))
17445 x = XEXP (x, 0);
17447 if (TARGET_64BIT)
17449 if (GET_CODE (x) == CONST
17450 && GET_CODE (XEXP (x, 0)) == PLUS
17451 && GET_MODE (XEXP (x, 0)) == Pmode
17452 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17453 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17454 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17456 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17457 base. A CONST can't be arg_pointer_rtx based. */
17458 if (base_term_p && MEM_P (orig_x))
17459 return orig_x;
17460 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17461 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17462 if (MEM_P (orig_x))
17463 x = replace_equiv_address_nv (orig_x, x);
17464 return x;
17467 if (GET_CODE (x) == CONST
17468 && GET_CODE (XEXP (x, 0)) == UNSPEC
17469 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17470 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17471 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17473 x = XVECEXP (XEXP (x, 0), 0, 0);
17474 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17476 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17477 if (x == NULL_RTX)
17478 return orig_x;
17480 return x;
17483 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17484 return ix86_delegitimize_tls_address (orig_x);
17486 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17487 and -mcmodel=medium -fpic. */
17490 if (GET_CODE (x) != PLUS
17491 || GET_CODE (XEXP (x, 1)) != CONST)
17492 return ix86_delegitimize_tls_address (orig_x);
17494 if (ix86_pic_register_p (XEXP (x, 0)))
17495 /* %ebx + GOT/GOTOFF */
17497 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17499 /* %ebx + %reg * scale + GOT/GOTOFF */
17500 reg_addend = XEXP (x, 0);
17501 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17502 reg_addend = XEXP (reg_addend, 1);
17503 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17504 reg_addend = XEXP (reg_addend, 0);
17505 else
17507 reg_addend = NULL_RTX;
17508 addend = XEXP (x, 0);
17511 else
17512 addend = XEXP (x, 0);
17514 x = XEXP (XEXP (x, 1), 0);
17515 if (GET_CODE (x) == PLUS
17516 && CONST_INT_P (XEXP (x, 1)))
17518 const_addend = XEXP (x, 1);
17519 x = XEXP (x, 0);
17522 if (GET_CODE (x) == UNSPEC
17523 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17524 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17525 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17526 && !MEM_P (orig_x) && !addend)))
17527 result = XVECEXP (x, 0, 0);
17529 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17530 && !MEM_P (orig_x))
17531 result = XVECEXP (x, 0, 0);
17533 if (! result)
17534 return ix86_delegitimize_tls_address (orig_x);
17536 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17537 recurse on the first operand. */
17538 if (const_addend && !base_term_p)
17539 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17540 if (reg_addend)
17541 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17542 if (addend)
17544 /* If the rest of original X doesn't involve the PIC register, add
17545 addend and subtract pic_offset_table_rtx. This can happen e.g.
17546 for code like:
17547 leal (%ebx, %ecx, 4), %ecx
17549 movl foo@GOTOFF(%ecx), %edx
17550 in which case we return (%ecx - %ebx) + foo
17551 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17552 and reload has completed. Don't do the latter for debug,
17553 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17554 if (pic_offset_table_rtx
17555 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17556 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17557 pic_offset_table_rtx),
17558 result);
17559 else if (base_term_p
17560 && pic_offset_table_rtx
17561 && !TARGET_MACHO
17562 && !TARGET_VXWORKS_RTP)
17564 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17565 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17566 result = gen_rtx_PLUS (Pmode, tmp, result);
17568 else
17569 return orig_x;
17571 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17573 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17574 if (result == NULL_RTX)
17575 return orig_x;
17577 return result;
17580 /* The normal instantiation of the above template. */
17582 static rtx
17583 ix86_delegitimize_address (rtx x)
17585 return ix86_delegitimize_address_1 (x, false);
17588 /* If X is a machine specific address (i.e. a symbol or label being
17589 referenced as a displacement from the GOT implemented using an
17590 UNSPEC), then return the base term. Otherwise return X. */
17593 ix86_find_base_term (rtx x)
17595 rtx term;
17597 if (TARGET_64BIT)
17599 if (GET_CODE (x) != CONST)
17600 return x;
17601 term = XEXP (x, 0);
17602 if (GET_CODE (term) == PLUS
17603 && CONST_INT_P (XEXP (term, 1)))
17604 term = XEXP (term, 0);
17605 if (GET_CODE (term) != UNSPEC
17606 || (XINT (term, 1) != UNSPEC_GOTPCREL
17607 && XINT (term, 1) != UNSPEC_PCREL))
17608 return x;
17610 return XVECEXP (term, 0, 0);
17613 return ix86_delegitimize_address_1 (x, true);
17616 /* Return true if X shouldn't be emitted into the debug info.
17617 Disallow UNSPECs other than @gotoff - we can't easily emit the
17618 _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so we do not
17619 delegitimize it, but instead assemble it as @gotoff.
17620 Disallow a _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17621 assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
17623 static bool
17624 ix86_const_not_ok_for_debug_p (rtx x)
17626 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17627 return true;
17629 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17630 return true;
17632 return false;
17635 static void
17636 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17637 bool fp, FILE *file)
17639 const char *suffix;
17641 if (mode == CCFPmode)
17643 code = ix86_fp_compare_code_to_integer (code);
17644 mode = CCmode;
17646 if (reverse)
17647 code = reverse_condition (code);
17649 switch (code)
17651 case EQ:
17652 gcc_assert (mode != CCGZmode);
17653 switch (mode)
17655 case E_CCAmode:
17656 suffix = "a";
17657 break;
17658 case E_CCCmode:
17659 suffix = "c";
17660 break;
17661 case E_CCOmode:
17662 suffix = "o";
17663 break;
17664 case E_CCPmode:
17665 suffix = "p";
17666 break;
17667 case E_CCSmode:
17668 suffix = "s";
17669 break;
17670 default:
17671 suffix = "e";
17672 break;
17674 break;
17675 case NE:
17676 gcc_assert (mode != CCGZmode);
17677 switch (mode)
17679 case E_CCAmode:
17680 suffix = "na";
17681 break;
17682 case E_CCCmode:
17683 suffix = "nc";
17684 break;
17685 case E_CCOmode:
17686 suffix = "no";
17687 break;
17688 case E_CCPmode:
17689 suffix = "np";
17690 break;
17691 case E_CCSmode:
17692 suffix = "ns";
17693 break;
17694 default:
17695 suffix = "ne";
17696 break;
17698 break;
17699 case GT:
17700 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17701 suffix = "g";
17702 break;
17703 case GTU:
17704 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17705 Those same assemblers have the same but opposite lossage on cmov. */
17706 if (mode == CCmode)
17707 suffix = fp ? "nbe" : "a";
17708 else
17709 gcc_unreachable ();
17710 break;
17711 case LT:
17712 switch (mode)
17714 case E_CCNOmode:
17715 case E_CCGOCmode:
17716 suffix = "s";
17717 break;
17719 case E_CCmode:
17720 case E_CCGCmode:
17721 case E_CCGZmode:
17722 suffix = "l";
17723 break;
17725 default:
17726 gcc_unreachable ();
17728 break;
17729 case LTU:
17730 if (mode == CCmode || mode == CCGZmode)
17731 suffix = "b";
17732 else if (mode == CCCmode)
17733 suffix = fp ? "b" : "c";
17734 else
17735 gcc_unreachable ();
17736 break;
17737 case GE:
17738 switch (mode)
17740 case E_CCNOmode:
17741 case E_CCGOCmode:
17742 suffix = "ns";
17743 break;
17745 case E_CCmode:
17746 case E_CCGCmode:
17747 case E_CCGZmode:
17748 suffix = "ge";
17749 break;
17751 default:
17752 gcc_unreachable ();
17754 break;
17755 case GEU:
17756 if (mode == CCmode || mode == CCGZmode)
17757 suffix = "nb";
17758 else if (mode == CCCmode)
17759 suffix = fp ? "nb" : "nc";
17760 else
17761 gcc_unreachable ();
17762 break;
17763 case LE:
17764 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17765 suffix = "le";
17766 break;
17767 case LEU:
17768 if (mode == CCmode)
17769 suffix = "be";
17770 else
17771 gcc_unreachable ();
17772 break;
17773 case UNORDERED:
17774 suffix = fp ? "u" : "p";
17775 break;
17776 case ORDERED:
17777 suffix = fp ? "nu" : "np";
17778 break;
17779 default:
17780 gcc_unreachable ();
17782 fputs (suffix, file);
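/* A few examples of the mapping implemented above: (EQ, CCmode) prints
   "e", (LTU, CCmode) prints "b", (GEU, CCCmode) prints "nc" for integer
   compares, and with REVERSE set the condition is inverted first, so
   (EQ, CCmode) prints "ne" instead.  */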
17785 /* Print the name of register X to FILE based on its machine mode and number.
17786 If CODE is 'w', pretend the mode is HImode.
17787 If CODE is 'b', pretend the mode is QImode.
17788 If CODE is 'k', pretend the mode is SImode.
17789 If CODE is 'q', pretend the mode is DImode.
17790 If CODE is 'x', pretend the mode is V4SFmode.
17791 If CODE is 't', pretend the mode is V8SFmode.
17792 If CODE is 'g', pretend the mode is V16SFmode.
17793 If CODE is 'h', pretend the reg is the 'high' byte register.
17794 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17795 If CODE is 'd', duplicate the operand for AVX instruction.
17796 If CODE is 'V', print naked full integer register name without %.
17799 void
17800 print_reg (rtx x, int code, FILE *file)
17802 const char *reg;
17803 int msize;
17804 unsigned int regno;
17805 bool duplicated;
17807 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17808 putc ('%', file);
17810 if (x == pc_rtx)
17812 gcc_assert (TARGET_64BIT);
17813 fputs ("rip", file);
17814 return;
17817 if (code == 'y' && STACK_TOP_P (x))
17819 fputs ("st(0)", file);
17820 return;
17823 if (code == 'w')
17824 msize = 2;
17825 else if (code == 'b')
17826 msize = 1;
17827 else if (code == 'k')
17828 msize = 4;
17829 else if (code == 'q')
17830 msize = 8;
17831 else if (code == 'h')
17832 msize = 0;
17833 else if (code == 'x')
17834 msize = 16;
17835 else if (code == 't')
17836 msize = 32;
17837 else if (code == 'g')
17838 msize = 64;
17839 else
17840 msize = GET_MODE_SIZE (GET_MODE (x));
17842 regno = REGNO (x);
17844 if (regno == ARG_POINTER_REGNUM
17845 || regno == FRAME_POINTER_REGNUM
17846 || regno == FPSR_REG
17847 || regno == FPCR_REG)
17849 output_operand_lossage
17850 ("invalid use of register '%s'", reg_names[regno]);
17851 return;
17853 else if (regno == FLAGS_REG)
17855 output_operand_lossage ("invalid use of asm flag output");
17856 return;
17859 if (code == 'V')
17861 if (GENERAL_REGNO_P (regno))
17862 msize = GET_MODE_SIZE (word_mode);
17863 else
17864 error ("'V' modifier on non-integer register");
17867 duplicated = code == 'd' && TARGET_AVX;
17869 switch (msize)
17871 case 16:
17872 case 12:
17873 case 8:
17874 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17875 warning (0, "unsupported size for integer register");
17876 /* FALLTHRU */
17877 case 4:
17878 if (LEGACY_INT_REGNO_P (regno))
17879 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17880 /* FALLTHRU */
17881 case 2:
17882 normal:
17883 reg = hi_reg_name[regno];
17884 break;
17885 case 1:
17886 if (regno >= ARRAY_SIZE (qi_reg_name))
17887 goto normal;
17888 if (!ANY_QI_REGNO_P (regno))
17889 error ("unsupported size for integer register");
17890 reg = qi_reg_name[regno];
17891 break;
17892 case 0:
17893 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17894 goto normal;
17895 reg = qi_high_reg_name[regno];
17896 break;
17897 case 32:
17898 case 64:
17899 if (SSE_REGNO_P (regno))
17901 gcc_assert (!duplicated);
17902 putc (msize == 32 ? 'y' : 'z', file);
17903 reg = hi_reg_name[regno] + 1;
17904 break;
17906 goto normal;
17907 default:
17908 gcc_unreachable ();
17911 fputs (reg, file);
17913 /* Irritatingly, AMD extended registers use a
17914 different naming convention: "r%d[bwd]". */
17915 if (REX_INT_REGNO_P (regno))
17917 gcc_assert (TARGET_64BIT);
17918 switch (msize)
17920 case 0:
17921 error ("extended registers have no high halves");
17922 break;
17923 case 1:
17924 putc ('b', file);
17925 break;
17926 case 2:
17927 putc ('w', file);
17928 break;
17929 case 4:
17930 putc ('d', file);
17931 break;
17932 case 8:
17933 /* no suffix */
17934 break;
17935 default:
17936 error ("unsupported operand size for extended register");
17937 break;
17939 return;
17942 if (duplicated)
17944 if (ASSEMBLER_DIALECT == ASM_ATT)
17945 fprintf (file, ", %%%s", reg);
17946 else
17947 fprintf (file, ", %s", reg);
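/* Examples of the size codes handled above (AT&T syntax, register number
   0, i.e. the AX family): 'b' prints "%al", 'w' prints "%ax", 'k' prints
   "%eax", and on 64-bit targets 'q' prints "%rax"; for the AMD extended
   registers the suffix is appended instead, giving "%r8b", "%r8w", "%r8d"
   or plain "%r8".  */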
17951 /* Meaning of CODE:
17952 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17953 C -- print opcode suffix for set/cmov insn.
17954 c -- like C, but print reversed condition
17955 F,f -- likewise, but for floating-point.
17956 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17957 otherwise nothing
17958 R -- print embedded rounding and sae.
17959 r -- print only sae.
17960 z -- print the opcode suffix for the size of the current operand.
17961 Z -- likewise, with special suffixes for x87 instructions.
17962 * -- print a star (in certain assembler syntax)
17963 A -- print an absolute memory reference.
17964 E -- print address with DImode register names if TARGET_64BIT.
17965 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17966 s -- print a shift double count, followed by the assembler's argument
17967 delimiter.
17968 b -- print the QImode name of the register for the indicated operand.
17969 %b0 would print %al if operands[0] is reg 0.
17970 w -- likewise, print the HImode name of the register.
17971 k -- likewise, print the SImode name of the register.
17972 q -- likewise, print the DImode name of the register.
17973 x -- likewise, print the V4SFmode name of the register.
17974 t -- likewise, print the V8SFmode name of the register.
17975 g -- likewise, print the V16SFmode name of the register.
17976 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17977 y -- print "st(0)" instead of "st" as a register.
17978 d -- print duplicated register operand for AVX instruction.
17979 D -- print condition for SSE cmp instruction.
17980 P -- if PIC, print an @PLT suffix.
17981 p -- print raw symbol name.
17982 X -- don't print any sort of PIC '@' suffix for a symbol.
17983 & -- print some in-use local-dynamic symbol name.
17984 H -- print a memory address offset by 8; used for sse high-parts
17985 Y -- print condition for XOP pcom* instruction.
17986 V -- print naked full integer register name without %.
17987 + -- print a branch hint as 'cs' or 'ds' prefix
17988 ; -- print a semicolon (after prefixes due to bug in older gas).
17989 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17990 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17991 ! -- print MPX prefix for jxx/call/ret instructions if required.
17994 void
17995 ix86_print_operand (FILE *file, rtx x, int code)
17997 if (code)
17999 switch (code)
18001 case 'A':
18002 switch (ASSEMBLER_DIALECT)
18004 case ASM_ATT:
18005 putc ('*', file);
18006 break;
18008 case ASM_INTEL:
18009 /* Intel syntax. For absolute addresses, registers should not
18010 be surrounded by brackets. */
18011 if (!REG_P (x))
18013 putc ('[', file);
18014 ix86_print_operand (file, x, 0);
18015 putc (']', file);
18016 return;
18018 break;
18020 default:
18021 gcc_unreachable ();
18024 ix86_print_operand (file, x, 0);
18025 return;
18027 case 'E':
18028 /* Wrap address in an UNSPEC to declare special handling. */
18029 if (TARGET_64BIT)
18030 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18032 output_address (VOIDmode, x);
18033 return;
18035 case 'L':
18036 if (ASSEMBLER_DIALECT == ASM_ATT)
18037 putc ('l', file);
18038 return;
18040 case 'W':
18041 if (ASSEMBLER_DIALECT == ASM_ATT)
18042 putc ('w', file);
18043 return;
18045 case 'B':
18046 if (ASSEMBLER_DIALECT == ASM_ATT)
18047 putc ('b', file);
18048 return;
18050 case 'Q':
18051 if (ASSEMBLER_DIALECT == ASM_ATT)
18052 putc ('l', file);
18053 return;
18055 case 'S':
18056 if (ASSEMBLER_DIALECT == ASM_ATT)
18057 putc ('s', file);
18058 return;
18060 case 'T':
18061 if (ASSEMBLER_DIALECT == ASM_ATT)
18062 putc ('t', file);
18063 return;
18065 case 'O':
18066 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18067 if (ASSEMBLER_DIALECT != ASM_ATT)
18068 return;
18070 switch (GET_MODE_SIZE (GET_MODE (x)))
18072 case 2:
18073 putc ('w', file);
18074 break;
18076 case 4:
18077 putc ('l', file);
18078 break;
18080 case 8:
18081 putc ('q', file);
18082 break;
18084 default:
18085 output_operand_lossage ("invalid operand size for operand "
18086 "code 'O'");
18087 return;
18090 putc ('.', file);
18091 #endif
18092 return;
18094 case 'z':
18095 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18097 /* Opcodes don't get size suffixes if using Intel syntax. */
18098 if (ASSEMBLER_DIALECT == ASM_INTEL)
18099 return;
18101 switch (GET_MODE_SIZE (GET_MODE (x)))
18103 case 1:
18104 putc ('b', file);
18105 return;
18107 case 2:
18108 putc ('w', file);
18109 return;
18111 case 4:
18112 putc ('l', file);
18113 return;
18115 case 8:
18116 putc ('q', file);
18117 return;
18119 default:
18120 output_operand_lossage ("invalid operand size for operand "
18121 "code 'z'");
18122 return;
18126 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18127 warning (0, "non-integer operand used with operand code 'z'");
18128 /* FALLTHRU */
18130 case 'Z':
18131 /* 387 opcodes don't get size suffixes if using Intel syntax. */
18132 if (ASSEMBLER_DIALECT == ASM_INTEL)
18133 return;
18135 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18137 switch (GET_MODE_SIZE (GET_MODE (x)))
18139 case 2:
18140 #ifdef HAVE_AS_IX86_FILDS
18141 putc ('s', file);
18142 #endif
18143 return;
18145 case 4:
18146 putc ('l', file);
18147 return;
18149 case 8:
18150 #ifdef HAVE_AS_IX86_FILDQ
18151 putc ('q', file);
18152 #else
18153 fputs ("ll", file);
18154 #endif
18155 return;
18157 default:
18158 break;
18161 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18163 /* 387 opcodes don't get size suffixes
18164 if the operands are registers. */
18165 if (STACK_REG_P (x))
18166 return;
18168 switch (GET_MODE_SIZE (GET_MODE (x)))
18170 case 4:
18171 putc ('s', file);
18172 return;
18174 case 8:
18175 putc ('l', file);
18176 return;
18178 case 12:
18179 case 16:
18180 putc ('t', file);
18181 return;
18183 default:
18184 break;
18187 else
18189 output_operand_lossage ("invalid operand type used with "
18190 "operand code 'Z'");
18191 return;
18194 output_operand_lossage ("invalid operand size for operand code 'Z'");
18195 return;
18197 case 'd':
18198 case 'b':
18199 case 'w':
18200 case 'k':
18201 case 'q':
18202 case 'h':
18203 case 't':
18204 case 'g':
18205 case 'y':
18206 case 'x':
18207 case 'X':
18208 case 'P':
18209 case 'p':
18210 case 'V':
18211 break;
18213 case 's':
18214 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18216 ix86_print_operand (file, x, 0);
18217 fputs (", ", file);
18219 return;
18221 case 'Y':
18222 switch (GET_CODE (x))
18224 case NE:
18225 fputs ("neq", file);
18226 break;
18227 case EQ:
18228 fputs ("eq", file);
18229 break;
18230 case GE:
18231 case GEU:
18232 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18233 break;
18234 case GT:
18235 case GTU:
18236 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18237 break;
18238 case LE:
18239 case LEU:
18240 fputs ("le", file);
18241 break;
18242 case LT:
18243 case LTU:
18244 fputs ("lt", file);
18245 break;
18246 case UNORDERED:
18247 fputs ("unord", file);
18248 break;
18249 case ORDERED:
18250 fputs ("ord", file);
18251 break;
18252 case UNEQ:
18253 fputs ("ueq", file);
18254 break;
18255 case UNGE:
18256 fputs ("nlt", file);
18257 break;
18258 case UNGT:
18259 fputs ("nle", file);
18260 break;
18261 case UNLE:
18262 fputs ("ule", file);
18263 break;
18264 case UNLT:
18265 fputs ("ult", file);
18266 break;
18267 case LTGT:
18268 fputs ("une", file);
18269 break;
18270 default:
18271 output_operand_lossage ("operand is not a condition code, "
18272 "invalid operand code 'Y'");
18273 return;
18275 return;
18277 case 'D':
18278 /* A little bit of braindamage here. The SSE compare instructions
18279 use completely different names for the comparisons than the
18280 fp conditional moves do. */
18281 switch (GET_CODE (x))
18283 case UNEQ:
18284 if (TARGET_AVX)
18286 fputs ("eq_us", file);
18287 break;
18289 /* FALLTHRU */
18290 case EQ:
18291 fputs ("eq", file);
18292 break;
18293 case UNLT:
18294 if (TARGET_AVX)
18296 fputs ("nge", file);
18297 break;
18299 /* FALLTHRU */
18300 case LT:
18301 fputs ("lt", file);
18302 break;
18303 case UNLE:
18304 if (TARGET_AVX)
18306 fputs ("ngt", file);
18307 break;
18309 /* FALLTHRU */
18310 case LE:
18311 fputs ("le", file);
18312 break;
18313 case UNORDERED:
18314 fputs ("unord", file);
18315 break;
18316 case LTGT:
18317 if (TARGET_AVX)
18319 fputs ("neq_oq", file);
18320 break;
18322 /* FALLTHRU */
18323 case NE:
18324 fputs ("neq", file);
18325 break;
18326 case GE:
18327 if (TARGET_AVX)
18329 fputs ("ge", file);
18330 break;
18332 /* FALLTHRU */
18333 case UNGE:
18334 fputs ("nlt", file);
18335 break;
18336 case GT:
18337 if (TARGET_AVX)
18339 fputs ("gt", file);
18340 break;
18342 /* FALLTHRU */
18343 case UNGT:
18344 fputs ("nle", file);
18345 break;
18346 case ORDERED:
18347 fputs ("ord", file);
18348 break;
18349 default:
18350 output_operand_lossage ("operand is not a condition code, "
18351 "invalid operand code 'D'");
18352 return;
18354 return;
18356 case 'F':
18357 case 'f':
18358 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18359 if (ASSEMBLER_DIALECT == ASM_ATT)
18360 putc ('.', file);
18361 gcc_fallthrough ();
18362 #endif
18364 case 'C':
18365 case 'c':
18366 if (!COMPARISON_P (x))
18368 output_operand_lossage ("operand is not a condition code, "
18369 "invalid operand code '%c'", code);
18370 return;
18372 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18373 code == 'c' || code == 'f',
18374 code == 'F' || code == 'f',
18375 file);
18376 return;
18378 case 'H':
18379 if (!offsettable_memref_p (x))
18381 output_operand_lossage ("operand is not an offsettable memory "
18382 "reference, invalid operand code 'H'");
18383 return;
18385 /* It doesn't actually matter what mode we use here, as we're
18386 only going to use this for printing. */
18387 x = adjust_address_nv (x, DImode, 8);
18388 /* Output 'qword ptr' for intel assembler dialect. */
18389 if (ASSEMBLER_DIALECT == ASM_INTEL)
18390 code = 'q';
18391 break;
18393 case 'K':
18394 if (!CONST_INT_P (x))
18396 output_operand_lossage ("operand is not an integer, invalid "
18397 "operand code 'K'");
18398 return;
18401 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18402 #ifdef HAVE_AS_IX86_HLE
18403 fputs ("xacquire ", file);
18404 #else
18405 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18406 #endif
18407 else if (INTVAL (x) & IX86_HLE_RELEASE)
18408 #ifdef HAVE_AS_IX86_HLE
18409 fputs ("xrelease ", file);
18410 #else
18411 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18412 #endif
18413 /* We do not want to print value of the operand. */
18414 return;
18416 case 'N':
18417 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18418 fputs ("{z}", file);
18419 return;
18421 case 'r':
18422 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18424 output_operand_lossage ("operand is not a specific integer, "
18425 "invalid operand code 'r'");
18426 return;
18429 if (ASSEMBLER_DIALECT == ASM_INTEL)
18430 fputs (", ", file);
18432 fputs ("{sae}", file);
18434 if (ASSEMBLER_DIALECT == ASM_ATT)
18435 fputs (", ", file);
18437 return;
18439 case 'R':
18440 if (!CONST_INT_P (x))
18442 output_operand_lossage ("operand is not an integer, invalid "
18443 "operand code 'R'");
18444 return;
18447 if (ASSEMBLER_DIALECT == ASM_INTEL)
18448 fputs (", ", file);
18450 switch (INTVAL (x))
18452 case ROUND_NEAREST_INT | ROUND_SAE:
18453 fputs ("{rn-sae}", file);
18454 break;
18455 case ROUND_NEG_INF | ROUND_SAE:
18456 fputs ("{rd-sae}", file);
18457 break;
18458 case ROUND_POS_INF | ROUND_SAE:
18459 fputs ("{ru-sae}", file);
18460 break;
18461 case ROUND_ZERO | ROUND_SAE:
18462 fputs ("{rz-sae}", file);
18463 break;
18464 default:
18465 output_operand_lossage ("operand is not a specific integer, "
18466 "invalid operand code 'R'");
18469 if (ASSEMBLER_DIALECT == ASM_ATT)
18470 fputs (", ", file);
18472 return;
18474 case '*':
18475 if (ASSEMBLER_DIALECT == ASM_ATT)
18476 putc ('*', file);
18477 return;
18479 case '&':
18481 const char *name = get_some_local_dynamic_name ();
18482 if (name == NULL)
18483 output_operand_lossage ("'%%&' used without any "
18484 "local dynamic TLS references");
18485 else
18486 assemble_name (file, name);
18487 return;
18490 case '+':
18492 rtx x;
18494 if (!optimize
18495 || optimize_function_for_size_p (cfun)
18496 || !TARGET_BRANCH_PREDICTION_HINTS)
18497 return;
18499 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18500 if (x)
18502 int pred_val = profile_probability::from_reg_br_prob_note
18503 (XINT (x, 0)).to_reg_br_prob_base ();
18505 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18506 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18508 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18509 bool cputaken
18510 = final_forward_branch_p (current_output_insn) == 0;
18512 /* Emit hints only when the default branch prediction
18513 heuristics would fail. */
18514 if (taken != cputaken)
18516 /* We use 3e (DS) prefix for taken branches and
18517 2e (CS) prefix for not taken branches. */
18518 if (taken)
18519 fputs ("ds ; ", file);
18520 else
18521 fputs ("cs ; ", file);
18525 return;
18528 case ';':
18529 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18530 putc (';', file);
18531 #endif
18532 return;
18534 case '~':
18535 putc (TARGET_AVX2 ? 'i' : 'f', file);
18536 return;
18538 case '^':
18539 if (TARGET_64BIT && Pmode != word_mode)
18540 fputs ("addr32 ", file);
18541 return;
18543 case '!':
18544 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18545 fputs ("bnd ", file);
18546 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18547 fputs ("notrack ", file);
18548 return;
18550 default:
18551 output_operand_lossage ("invalid operand code '%c'", code);
18555 if (REG_P (x))
18556 print_reg (x, code, file);
18558 else if (MEM_P (x))
18560 rtx addr = XEXP (x, 0);
18562 /* No `byte ptr' prefix for call instructions ... */
18563 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18565 machine_mode mode = GET_MODE (x);
18566 const char *size;
18568 /* Check for explicit size override codes. */
18569 if (code == 'b')
18570 size = "BYTE";
18571 else if (code == 'w')
18572 size = "WORD";
18573 else if (code == 'k')
18574 size = "DWORD";
18575 else if (code == 'q')
18576 size = "QWORD";
18577 else if (code == 'x')
18578 size = "XMMWORD";
18579 else if (code == 't')
18580 size = "YMMWORD";
18581 else if (code == 'g')
18582 size = "ZMMWORD";
18583 else if (mode == BLKmode)
18584 /* ... or BLKmode operands, when not overridden. */
18585 size = NULL;
18586 else
18587 switch (GET_MODE_SIZE (mode))
18589 case 1: size = "BYTE"; break;
18590 case 2: size = "WORD"; break;
18591 case 4: size = "DWORD"; break;
18592 case 8: size = "QWORD"; break;
18593 case 12: size = "TBYTE"; break;
18594 case 16:
18595 if (mode == XFmode)
18596 size = "TBYTE";
18597 else
18598 size = "XMMWORD";
18599 break;
18600 case 32: size = "YMMWORD"; break;
18601 case 64: size = "ZMMWORD"; break;
18602 default:
18603 gcc_unreachable ();
18605 if (size)
18607 fputs (size, file);
18608 fputs (" PTR ", file);
18612 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18613 output_operand_lossage ("invalid constraints for operand");
18614 else
18615 ix86_print_operand_address_as
18616 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18619 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18621 long l;
18623 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18625 if (ASSEMBLER_DIALECT == ASM_ATT)
18626 putc ('$', file);
18627 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18628 if (code == 'q')
18629 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18630 (unsigned long long) (int) l);
18631 else
18632 fprintf (file, "0x%08x", (unsigned int) l);
18635 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18637 long l[2];
18639 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18641 if (ASSEMBLER_DIALECT == ASM_ATT)
18642 putc ('$', file);
18643 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18646 /* These float cases don't actually occur as immediate operands. */
18647 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18649 char dstr[30];
18651 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18652 fputs (dstr, file);
18655 else
18657 /* We have patterns that allow zero sets of memory, for instance.
18658 In 64-bit mode, we should probably support all 8-byte vectors,
18659 since we can in fact encode that into an immediate. */
18660 if (GET_CODE (x) == CONST_VECTOR)
18662 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18663 x = const0_rtx;
18666 if (code != 'P' && code != 'p')
18668 if (CONST_INT_P (x))
18670 if (ASSEMBLER_DIALECT == ASM_ATT)
18671 putc ('$', file);
18673 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18674 || GET_CODE (x) == LABEL_REF)
18676 if (ASSEMBLER_DIALECT == ASM_ATT)
18677 putc ('$', file);
18678 else
18679 fputs ("OFFSET FLAT:", file);
18682 if (CONST_INT_P (x))
18683 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18684 else if (flag_pic || MACHOPIC_INDIRECT)
18685 output_pic_addr_const (file, x, code);
18686 else
18687 output_addr_const (file, x);
18691 static bool
18692 ix86_print_operand_punct_valid_p (unsigned char code)
18694 return (code == '*' || code == '+' || code == '&' || code == ';'
18695 || code == '~' || code == '^' || code == '!');
18698 /* Print a memory operand whose address is ADDR. */
18700 static void
18701 ix86_print_operand_address_as (FILE *file, rtx addr,
18702 addr_space_t as, bool no_rip)
18704 struct ix86_address parts;
18705 rtx base, index, disp;
18706 int scale;
18707 int ok;
18708 bool vsib = false;
18709 int code = 0;
18711 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18713 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18714 gcc_assert (parts.index == NULL_RTX);
18715 parts.index = XVECEXP (addr, 0, 1);
18716 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18717 addr = XVECEXP (addr, 0, 0);
18718 vsib = true;
18720 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18722 gcc_assert (TARGET_64BIT);
18723 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18724 code = 'q';
18726 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18728 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18729 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18730 if (parts.base != NULL_RTX)
18732 parts.index = parts.base;
18733 parts.scale = 1;
18735 parts.base = XVECEXP (addr, 0, 0);
18736 addr = XVECEXP (addr, 0, 0);
18738 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18740 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18741 gcc_assert (parts.index == NULL_RTX);
18742 parts.index = XVECEXP (addr, 0, 1);
18743 addr = XVECEXP (addr, 0, 0);
18745 else
18746 ok = ix86_decompose_address (addr, &parts);
18748 gcc_assert (ok);
18750 base = parts.base;
18751 index = parts.index;
18752 disp = parts.disp;
18753 scale = parts.scale;
18755 if (ADDR_SPACE_GENERIC_P (as))
18756 as = parts.seg;
18757 else
18758 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18760 if (!ADDR_SPACE_GENERIC_P (as))
18762 const char *string;
18764 if (as == ADDR_SPACE_SEG_FS)
18765 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18766 else if (as == ADDR_SPACE_SEG_GS)
18767 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18768 else
18769 gcc_unreachable ();
18770 fputs (string, file);
18773 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18774 if (TARGET_64BIT && !base && !index && !no_rip)
18776 rtx symbol = disp;
18778 if (GET_CODE (disp) == CONST
18779 && GET_CODE (XEXP (disp, 0)) == PLUS
18780 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18781 symbol = XEXP (XEXP (disp, 0), 0);
18783 if (GET_CODE (symbol) == LABEL_REF
18784 || (GET_CODE (symbol) == SYMBOL_REF
18785 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18786 base = pc_rtx;
18789 if (!base && !index)
18791 /* A displacement-only address requires special attention. */
18792 if (CONST_INT_P (disp))
18794 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18795 fputs ("ds:", file);
18796 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18798 /* Load the external function address via the GOT slot to avoid PLT. */
18799 else if (GET_CODE (disp) == CONST
18800 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18801 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18802 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18803 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18804 output_pic_addr_const (file, disp, 0);
18805 else if (flag_pic)
18806 output_pic_addr_const (file, disp, 0);
18807 else
18808 output_addr_const (file, disp);
18810 else
18812 /* Print SImode register names to force addr32 prefix. */
18813 if (SImode_address_operand (addr, VOIDmode))
18815 if (flag_checking)
18817 gcc_assert (TARGET_64BIT);
18818 switch (GET_CODE (addr))
18820 case SUBREG:
18821 gcc_assert (GET_MODE (addr) == SImode);
18822 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18823 break;
18824 case ZERO_EXTEND:
18825 case AND:
18826 gcc_assert (GET_MODE (addr) == DImode);
18827 break;
18828 default:
18829 gcc_unreachable ();
18832 gcc_assert (!code);
18833 code = 'k';
18835 else if (code == 0
18836 && TARGET_X32
18837 && disp
18838 && CONST_INT_P (disp)
18839 && INTVAL (disp) < -16*1024*1024)
18841 /* X32 runs in 64-bit mode, where the displacement DISP in an
18842 address DISP(%r64) is encoded as a 32-bit immediate that is
18843 sign-extended to 64 bits. For -0x40000300(%r64), the
18844 address is %r64 + 0xffffffffbffffd00. When %r64 <
18845 0x40000300, e.g. 0x37ffe064, the address is 0xfffffffff7ffdd64,
18846 which is invalid for x32. The correct address is %r64
18847 - 0x40000300 == 0xf7ffdd64. To properly encode
18848 -0x40000300(%r64) for x32, we zero-extend the negative
18849 displacement by forcing the addr32 prefix, which truncates
18850 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18851 zero-extend all negative displacements, including -1(%rsp).
18852 However, for small negative displacements, sign extension
18853 won't cause overflow. We only zero-extend negative
18854 displacements if they are < -16*1024*1024, which is also the bound
18855 used to check legitimate address displacements for PIC. */
18856 code = 'k';
18859 /* Since the upper 32 bits of RSP are always zero for x32,
18860 we can encode %esp as %rsp to avoid the 0x67 prefix if
18861 there is no index register. */
18862 if (TARGET_X32 && Pmode == SImode
18863 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18864 code = 'q';
18866 if (ASSEMBLER_DIALECT == ASM_ATT)
18868 if (disp)
18870 if (flag_pic)
18871 output_pic_addr_const (file, disp, 0);
18872 else if (GET_CODE (disp) == LABEL_REF)
18873 output_asm_label (disp);
18874 else
18875 output_addr_const (file, disp);
18878 putc ('(', file);
18879 if (base)
18880 print_reg (base, code, file);
18881 if (index)
18883 putc (',', file);
18884 print_reg (index, vsib ? 0 : code, file);
18885 if (scale != 1 || vsib)
18886 fprintf (file, ",%d", scale);
18888 putc (')', file);
18890 else
18892 rtx offset = NULL_RTX;
18894 if (disp)
18896 /* Pull out the offset of a symbol; print any symbol itself. */
18897 if (GET_CODE (disp) == CONST
18898 && GET_CODE (XEXP (disp, 0)) == PLUS
18899 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18901 offset = XEXP (XEXP (disp, 0), 1);
18902 disp = gen_rtx_CONST (VOIDmode,
18903 XEXP (XEXP (disp, 0), 0));
18906 if (flag_pic)
18907 output_pic_addr_const (file, disp, 0);
18908 else if (GET_CODE (disp) == LABEL_REF)
18909 output_asm_label (disp);
18910 else if (CONST_INT_P (disp))
18911 offset = disp;
18912 else
18913 output_addr_const (file, disp);
18916 putc ('[', file);
18917 if (base)
18919 print_reg (base, code, file);
18920 if (offset)
18922 if (INTVAL (offset) >= 0)
18923 putc ('+', file);
18924 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18927 else if (offset)
18928 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18929 else
18930 putc ('0', file);
18932 if (index)
18934 putc ('+', file);
18935 print_reg (index, vsib ? 0 : code, file);
18936 if (scale != 1 || vsib)
18937 fprintf (file, "*%d", scale);
18939 putc (']', file);
18944 static void
18945 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18947 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18950 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18952 static bool
18953 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18955 rtx op;
18957 if (GET_CODE (x) != UNSPEC)
18958 return false;
18960 op = XVECEXP (x, 0, 0);
18961 switch (XINT (x, 1))
18963 case UNSPEC_GOTOFF:
18964 output_addr_const (file, op);
18965 fputs ("@gotoff", file);
18966 break;
18967 case UNSPEC_GOTTPOFF:
18968 output_addr_const (file, op);
18969 /* FIXME: This might be @TPOFF in Sun ld. */
18970 fputs ("@gottpoff", file);
18971 break;
18972 case UNSPEC_TPOFF:
18973 output_addr_const (file, op);
18974 fputs ("@tpoff", file);
18975 break;
18976 case UNSPEC_NTPOFF:
18977 output_addr_const (file, op);
18978 if (TARGET_64BIT)
18979 fputs ("@tpoff", file);
18980 else
18981 fputs ("@ntpoff", file);
18982 break;
18983 case UNSPEC_DTPOFF:
18984 output_addr_const (file, op);
18985 fputs ("@dtpoff", file);
18986 break;
18987 case UNSPEC_GOTNTPOFF:
18988 output_addr_const (file, op);
18989 if (TARGET_64BIT)
18990 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18991 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18992 else
18993 fputs ("@gotntpoff", file);
18994 break;
18995 case UNSPEC_INDNTPOFF:
18996 output_addr_const (file, op);
18997 fputs ("@indntpoff", file);
18998 break;
18999 #if TARGET_MACHO
19000 case UNSPEC_MACHOPIC_OFFSET:
19001 output_addr_const (file, op);
19002 putc ('-', file);
19003 machopic_output_function_base_name (file);
19004 break;
19005 #endif
19007 default:
19008 return false;
19011 return true;
19014 /* Split one or more double-mode RTL references into pairs of half-mode
19015 references. The RTL can be REG, offsettable MEM, integer constant, or
19016 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19017 split and "num" is its length. lo_half and hi_half are output arrays
19018 that parallel "operands". */
19020 void
19021 split_double_mode (machine_mode mode, rtx operands[],
19022 int num, rtx lo_half[], rtx hi_half[])
19024 machine_mode half_mode;
19025 unsigned int byte;
19027 switch (mode)
19029 case E_TImode:
19030 half_mode = DImode;
19031 break;
19032 case E_DImode:
19033 half_mode = SImode;
19034 break;
19035 default:
19036 gcc_unreachable ();
19039 byte = GET_MODE_SIZE (half_mode);
19041 while (num--)
19043 rtx op = operands[num];
19045 /* simplify_subreg refuses to split volatile memory addresses,
19046 but we still have to handle them. */
19047 if (MEM_P (op))
19049 lo_half[num] = adjust_address (op, half_mode, 0);
19050 hi_half[num] = adjust_address (op, half_mode, byte);
19052 else
19054 lo_half[num] = simplify_gen_subreg (half_mode, op,
19055 GET_MODE (op) == VOIDmode
19056 ? mode : GET_MODE (op), 0);
19057 hi_half[num] = simplify_gen_subreg (half_mode, op,
19058 GET_MODE (op) == VOIDmode
19059 ? mode : GET_MODE (op), byte);
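/* Example use (illustrative sketch, assuming OPERANDS holds two DImode
   rtxes):

     rtx lo[2], hi[2];
     split_double_mode (DImode, operands, 2, lo, hi);

   afterwards lo[i] and hi[i] hold the low and high SImode halves of
   operands[i].  */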
19064 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19065 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19066 is the expression of the binary operation. The output may either be
19067 emitted here, or returned to the caller, like all output_* functions.
19069 There is no guarantee that the operands are the same mode, as they
19070 might be within FLOAT or FLOAT_EXTEND expressions. */
19072 #ifndef SYSV386_COMPAT
19073 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19074 wants to fix the assemblers because that causes incompatibility
19075 with gcc. No-one wants to fix gcc because that causes
19076 incompatibility with assemblers... You can use the option of
19077 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19078 #define SYSV386_COMPAT 1
19079 #endif
19081 const char *
19082 output_387_binary_op (rtx_insn *insn, rtx *operands)
19084 static char buf[40];
19085 const char *p;
19086 bool is_sse
19087 = (SSE_REG_P (operands[0])
19088 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19090 if (is_sse)
19091 p = "%v";
19092 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19093 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19094 p = "fi";
19095 else
19096 p = "f";
19098 strcpy (buf, p);
19100 switch (GET_CODE (operands[3]))
19102 case PLUS:
19103 p = "add"; break;
19104 case MINUS:
19105 p = "sub"; break;
19106 case MULT:
19107 p = "mul"; break;
19108 case DIV:
19109 p = "div"; break;
19110 default:
19111 gcc_unreachable ();
19114 strcat (buf, p);
19116 if (is_sse)
19118 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19119 strcat (buf, p);
19121 if (TARGET_AVX)
19122 p = "\t{%2, %1, %0|%0, %1, %2}";
19123 else
19124 p = "\t{%2, %0|%0, %2}";
19126 strcat (buf, p);
19127 return buf;
19130 /* Even if we do not want to check the inputs, this documents the input
19131 constraints, which helps in understanding the following code. */
19132 if (flag_checking)
19134 if (STACK_REG_P (operands[0])
19135 && ((REG_P (operands[1])
19136 && REGNO (operands[0]) == REGNO (operands[1])
19137 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19138 || (REG_P (operands[2])
19139 && REGNO (operands[0]) == REGNO (operands[2])
19140 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19141 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19142 ; /* ok */
19143 else
19144 gcc_unreachable ();
19147 switch (GET_CODE (operands[3]))
19149 case MULT:
19150 case PLUS:
19151 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19152 std::swap (operands[1], operands[2]);
19154 /* We now know operands[0] == operands[1]. */
19156 if (MEM_P (operands[2]))
19158 p = "%Z2\t%2";
19159 break;
19162 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19164 if (STACK_TOP_P (operands[0]))
19165 /* How is it that we are storing to a dead operand[2]?
19166 Well, presumably operands[1] is dead too. We can't
19167 store the result to st(0) as st(0) gets popped on this
19168 instruction. Instead store to operands[2] (which I
19169 think has to be st(1)). st(1) will be popped later.
19170 gcc <= 2.8.1 didn't have this check and generated
19171 assembly code that the Unixware assembler rejected. */
19172 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19173 else
19174 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19175 break;
19178 if (STACK_TOP_P (operands[0]))
19179 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19180 else
19181 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19182 break;
19184 case MINUS:
19185 case DIV:
19186 if (MEM_P (operands[1]))
19188 p = "r%Z1\t%1";
19189 break;
19192 if (MEM_P (operands[2]))
19194 p = "%Z2\t%2";
19195 break;
19198 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19200 #if SYSV386_COMPAT
19201 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19202 derived assemblers, confusingly reverse the direction of
19203 the operation for fsub{r} and fdiv{r} when the
19204 destination register is not st(0). The Intel assembler
19205 doesn't have this brain damage. Read !SYSV386_COMPAT to
19206 figure out what the hardware really does. */
19207 if (STACK_TOP_P (operands[0]))
19208 p = "{p\t%0, %2|rp\t%2, %0}";
19209 else
19210 p = "{rp\t%2, %0|p\t%0, %2}";
19211 #else
19212 if (STACK_TOP_P (operands[0]))
19213 /* As above for fmul/fadd, we can't store to st(0). */
19214 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19215 else
19216 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19217 #endif
19218 break;
19221 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19223 #if SYSV386_COMPAT
19224 if (STACK_TOP_P (operands[0]))
19225 p = "{rp\t%0, %1|p\t%1, %0}";
19226 else
19227 p = "{p\t%1, %0|rp\t%0, %1}";
19228 #else
19229 if (STACK_TOP_P (operands[0]))
19230 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19231 else
19232 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19233 #endif
19234 break;
19237 if (STACK_TOP_P (operands[0]))
19239 if (STACK_TOP_P (operands[1]))
19240 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19241 else
19242 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19243 break;
19245 else if (STACK_TOP_P (operands[1]))
19247 #if SYSV386_COMPAT
19248 p = "{\t%1, %0|r\t%0, %1}";
19249 #else
19250 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19251 #endif
19253 else
19255 #if SYSV386_COMPAT
19256 p = "{r\t%2, %0|\t%0, %2}";
19257 #else
19258 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19259 #endif
19261 break;
19263 default:
19264 gcc_unreachable ();
19267 strcat (buf, p);
19268 return buf;
19271 /* Return needed mode for entity in optimize_mode_switching pass. */
19273 static int
19274 ix86_dirflag_mode_needed (rtx_insn *insn)
19276 if (CALL_P (insn))
19278 if (cfun->machine->func_type == TYPE_NORMAL)
19279 return X86_DIRFLAG_ANY;
19280 else
19281 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19282 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19285 if (recog_memoized (insn) < 0)
19286 return X86_DIRFLAG_ANY;
19288 if (get_attr_type (insn) == TYPE_STR)
19290 /* Emit cld instruction if stringops are used in the function. */
19291 if (cfun->machine->func_type == TYPE_NORMAL)
19292 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19293 else
19294 return X86_DIRFLAG_RESET;
19297 return X86_DIRFLAG_ANY;
19300 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
19302 static bool
19303 ix86_check_avx_upper_register (const_rtx exp)
19305 if (SUBREG_P (exp))
19306 exp = SUBREG_REG (exp);
19308 return (REG_P (exp)
19309 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19310 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19313 /* Return needed mode for entity in optimize_mode_switching pass. */
19315 static int
19316 ix86_avx_u128_mode_needed (rtx_insn *insn)
19318 if (CALL_P (insn))
19320 rtx link;
19322 /* Needed mode is set to AVX_U128_CLEAN if there are
19323 no 256bit or 512bit modes used in function arguments. */
19324 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19325 link;
19326 link = XEXP (link, 1))
19328 if (GET_CODE (XEXP (link, 0)) == USE)
19330 rtx arg = XEXP (XEXP (link, 0), 0);
19332 if (ix86_check_avx_upper_register (arg))
19333 return AVX_U128_DIRTY;
19337 return AVX_U128_CLEAN;
19340 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19341 Hardware changes state only when a 256bit register is written to,
19342 but we need to prevent the compiler from moving the optimal insertion
19343 point above an eventual read from a 256bit or 512bit register. */
19344 subrtx_iterator::array_type array;
19345 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19346 if (ix86_check_avx_upper_register (*iter))
19347 return AVX_U128_DIRTY;
19349 return AVX_U128_ANY;
19352 /* Return mode that i387 must be switched into
19353 prior to the execution of insn. */
19355 static int
19356 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19358 enum attr_i387_cw mode;
19360 /* The mode UNINITIALIZED is used to store the control word after a
19361 function call or ASM pattern. The mode ANY specifies that the function
19362 has no requirements on the control word and makes no changes in the
19363 bits we are interested in. */
19365 if (CALL_P (insn)
19366 || (NONJUMP_INSN_P (insn)
19367 && (asm_noperands (PATTERN (insn)) >= 0
19368 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19369 return I387_CW_UNINITIALIZED;
19371 if (recog_memoized (insn) < 0)
19372 return I387_CW_ANY;
19374 mode = get_attr_i387_cw (insn);
19376 switch (entity)
19378 case I387_TRUNC:
19379 if (mode == I387_CW_TRUNC)
19380 return mode;
19381 break;
19383 case I387_FLOOR:
19384 if (mode == I387_CW_FLOOR)
19385 return mode;
19386 break;
19388 case I387_CEIL:
19389 if (mode == I387_CW_CEIL)
19390 return mode;
19391 break;
19393 case I387_MASK_PM:
19394 if (mode == I387_CW_MASK_PM)
19395 return mode;
19396 break;
19398 default:
19399 gcc_unreachable ();
19402 return I387_CW_ANY;
19405 /* Return mode that entity must be switched into
19406 prior to the execution of insn. */
19408 static int
19409 ix86_mode_needed (int entity, rtx_insn *insn)
19411 switch (entity)
19413 case X86_DIRFLAG:
19414 return ix86_dirflag_mode_needed (insn);
19415 case AVX_U128:
19416 return ix86_avx_u128_mode_needed (insn);
19417 case I387_TRUNC:
19418 case I387_FLOOR:
19419 case I387_CEIL:
19420 case I387_MASK_PM:
19421 return ix86_i387_mode_needed (entity, insn);
19422 default:
19423 gcc_unreachable ();
19425 return 0;
19428 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19430 static void
19431 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19433 if (ix86_check_avx_upper_register (dest))
19435 bool *used = (bool *) data;
19436 *used = true;
19440 /* Calculate mode of upper 128bit AVX registers after the insn. */
19442 static int
19443 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19445 rtx pat = PATTERN (insn);
19447 if (vzeroupper_operation (pat, VOIDmode)
19448 || vzeroall_operation (pat, VOIDmode))
19449 return AVX_U128_CLEAN;
19451 /* We know that the state is clean after a CALL insn if there are no
19452 256bit or 512bit AVX registers used for the function return value. */
19453 if (CALL_P (insn))
19455 bool avx_upper_reg_found = false;
19456 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19458 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19461 /* Otherwise, return current mode. Remember that if insn
19462 references AVX 256bit or 512bit registers, the mode was already
19463 changed to DIRTY from MODE_NEEDED. */
19464 return mode;
19467 /* Return the mode that an insn results in. */
19469 static int
19470 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19472 switch (entity)
19474 case X86_DIRFLAG:
19475 return mode;
19476 case AVX_U128:
19477 return ix86_avx_u128_mode_after (mode, insn);
19478 case I387_TRUNC:
19479 case I387_FLOOR:
19480 case I387_CEIL:
19481 case I387_MASK_PM:
19482 return mode;
19483 default:
19484 gcc_unreachable ();
19488 static int
19489 ix86_dirflag_mode_entry (void)
19491 /* For TARGET_CLD or in the interrupt handler we can't assume
19492 direction flag state at function entry. */
19493 if (TARGET_CLD
19494 || cfun->machine->func_type != TYPE_NORMAL)
19495 return X86_DIRFLAG_ANY;
19497 return X86_DIRFLAG_RESET;
19500 static int
19501 ix86_avx_u128_mode_entry (void)
19503 tree arg;
19505 /* Entry mode is set to AVX_U128_DIRTY if there are
19506 256bit or 512bit modes used in function arguments. */
19507 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19508 arg = TREE_CHAIN (arg))
19510 rtx incoming = DECL_INCOMING_RTL (arg);
19512 if (incoming && ix86_check_avx_upper_register (incoming))
19513 return AVX_U128_DIRTY;
19516 return AVX_U128_CLEAN;
19519 /* Return a mode that ENTITY is assumed to be
19520 switched to at function entry. */
19522 static int
19523 ix86_mode_entry (int entity)
19525 switch (entity)
19527 case X86_DIRFLAG:
19528 return ix86_dirflag_mode_entry ();
19529 case AVX_U128:
19530 return ix86_avx_u128_mode_entry ();
19531 case I387_TRUNC:
19532 case I387_FLOOR:
19533 case I387_CEIL:
19534 case I387_MASK_PM:
19535 return I387_CW_ANY;
19536 default:
19537 gcc_unreachable ();
19541 static int
19542 ix86_avx_u128_mode_exit (void)
19544 rtx reg = crtl->return_rtx;
19546 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19547 or 512bit modes used in the function return register. */
19548 if (reg && ix86_check_avx_upper_register (reg))
19549 return AVX_U128_DIRTY;
19551 return AVX_U128_CLEAN;
19554 /* Return a mode that ENTITY is assumed to be
19555 switched to at function exit. */
19557 static int
19558 ix86_mode_exit (int entity)
19560 switch (entity)
19562 case X86_DIRFLAG:
19563 return X86_DIRFLAG_ANY;
19564 case AVX_U128:
19565 return ix86_avx_u128_mode_exit ();
19566 case I387_TRUNC:
19567 case I387_FLOOR:
19568 case I387_CEIL:
19569 case I387_MASK_PM:
19570 return I387_CW_ANY;
19571 default:
19572 gcc_unreachable ();
19576 static int
19577 ix86_mode_priority (int, int n)
19579 return n;
19582 /* Output code to initialize control word copies used by trunc?f?i and
19583 rounding patterns. The current control word is saved in SLOT_CW_STORED,
19584 and a copy modified for MODE is stored in the stack slot for MODE. */
19586 static void
19587 emit_i387_cw_initialization (int mode)
19589 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19590 rtx new_mode;
19592 enum ix86_stack_slot slot;
19594 rtx reg = gen_reg_rtx (HImode);
19596 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19597 emit_move_insn (reg, copy_rtx (stored_mode));
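/* In the x87 control word, bits 10-11 (mask 0x0c00) select the rounding
   mode: 00 = to nearest, 01 = down (floor), 10 = up (ceil), 11 = toward
   zero (truncate); bit 5 (0x0020) masks the precision exception.  The
   sequences below set those fields in the copied word.  */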
19599 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19600 || optimize_insn_for_size_p ())
19602 switch (mode)
19604 case I387_CW_TRUNC:
19605 /* round toward zero (truncate) */
19606 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19607 slot = SLOT_CW_TRUNC;
19608 break;
19610 case I387_CW_FLOOR:
19611 /* round down toward -oo */
19612 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19613 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19614 slot = SLOT_CW_FLOOR;
19615 break;
19617 case I387_CW_CEIL:
19618 /* round up toward +oo */
19619 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19620 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19621 slot = SLOT_CW_CEIL;
19622 break;
19624 case I387_CW_MASK_PM:
19625 /* mask precision exception for nearbyint() */
19626 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19627 slot = SLOT_CW_MASK_PM;
19628 break;
19630 default:
19631 gcc_unreachable ();
19634 else
19636 switch (mode)
19638 case I387_CW_TRUNC:
19639 /* round toward zero (truncate) */
19640 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19641 slot = SLOT_CW_TRUNC;
19642 break;
19644 case I387_CW_FLOOR:
19645 /* round down toward -oo */
19646 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19647 slot = SLOT_CW_FLOOR;
19648 break;
19650 case I387_CW_CEIL:
19651 /* round up toward +oo */
19652 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19653 slot = SLOT_CW_CEIL;
19654 break;
19656 case I387_CW_MASK_PM:
19657 /* mask precision exception for nearbyint() */
19658 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19659 slot = SLOT_CW_MASK_PM;
19660 break;
19662 default:
19663 gcc_unreachable ();
19667 gcc_assert (slot < MAX_386_STACK_LOCALS);
19669 new_mode = assign_386_stack_local (HImode, slot);
19670 emit_move_insn (new_mode, reg);
19673 /* Emit vzeroupper. */
19675 void
19676 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19678 int i;
19680 /* Cancel automatic vzeroupper insertion if there are
19681 live call-saved SSE registers at the insertion point. */
19683 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19684 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19685 return;
19687 if (TARGET_64BIT)
19688 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19689 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19690 return;
19692 emit_insn (gen_avx_vzeroupper ());
19697 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19698 is the set of hard registers live at the point where the insn(s)
19699 are to be inserted. */
19701 static void
19702 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19703 HARD_REG_SET regs_live)
19705 switch (entity)
19707 case X86_DIRFLAG:
19708 if (mode == X86_DIRFLAG_RESET)
19709 emit_insn (gen_cld ());
19710 break;
19711 case AVX_U128:
19712 if (mode == AVX_U128_CLEAN)
19713 ix86_avx_emit_vzeroupper (regs_live);
19714 break;
19715 case I387_TRUNC:
19716 case I387_FLOOR:
19717 case I387_CEIL:
19718 case I387_MASK_PM:
19719 if (mode != I387_CW_ANY
19720 && mode != I387_CW_UNINITIALIZED)
19721 emit_i387_cw_initialization (mode);
19722 break;
19723 default:
19724 gcc_unreachable ();
19728 /* Output code for INSN to convert a float to a signed int. OPERANDS
19729 are the insn operands. The output may be [HSD]Imode and the input
19730 operand may be [SDX]Fmode. */
19732 const char *
19733 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19735 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19736 bool dimode_p = GET_MODE (operands[0]) == DImode;
19737 int round_mode = get_attr_i387_cw (insn);
19739 static char buf[40];
19740 const char *p;
19742 /* Jump through a hoop or two for DImode, since the hardware has no
19743 non-popping instruction. We used to do this a different way, but
19744 that was somewhat fragile and broke with post-reload splitters. */
19745 if ((dimode_p || fisttp) && !stack_top_dies)
19746 output_asm_insn ("fld\t%y1", operands);
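/* The fld pushes a copy of the operand, so the popping fist/fisttp
   emitted below consumes the copy and the original value stays on the
   register stack.  */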
19748 gcc_assert (STACK_TOP_P (operands[1]));
19749 gcc_assert (MEM_P (operands[0]));
19750 gcc_assert (GET_MODE (operands[1]) != TFmode);
19752 if (fisttp)
19753 return "fisttp%Z0\t%0";
19755 strcpy (buf, "fist");
19757 if (round_mode != I387_CW_ANY)
19758 output_asm_insn ("fldcw\t%3", operands);
19760 p = "p%Z0\t%0";
19761 strcat (buf, p + !(stack_top_dies || dimode_p));
19763 output_asm_insn (buf, operands);
19765 if (round_mode != I387_CW_ANY)
19766 output_asm_insn ("fldcw\t%2", operands);
19768 return "";
19771 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19772 have the values zero or one, indicates the ffreep insn's operand
19773 from the OPERANDS array. */
19775 static const char *
19776 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19778 if (TARGET_USE_FFREEP)
19779 #ifdef HAVE_AS_IX86_FFREEP
19780 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19781 #else
19783 static char retval[32];
19784 int regno = REGNO (operands[opno]);
19786 gcc_assert (STACK_REGNO_P (regno));
19788 regno -= FIRST_STACK_REG;
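/* ffreep %st(N) encodes as the bytes 0xdf, 0xc0+N; emitted as a
   little-endian short this is the 0xc<N>df constant built below.  */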
19790 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19791 return retval;
19793 #endif
19795 return opno ? "fstp\t%y1" : "fstp\t%y0";
19799 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19800 should be used. UNORDERED_P is true when fucom should be used. */
19802 const char *
19803 output_fp_compare (rtx_insn *insn, rtx *operands,
19804 bool eflags_p, bool unordered_p)
19806 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19807 bool stack_top_dies;
19809 static char buf[40];
19810 const char *p;
19812 gcc_assert (STACK_TOP_P (xops[0]));
19814 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19816 if (eflags_p)
19818 p = unordered_p ? "fucomi" : "fcomi";
19819 strcpy (buf, p);
19821 p = "p\t{%y1, %0|%0, %y1}";
19822 strcat (buf, p + !stack_top_dies);
19824 return buf;
19827 if (STACK_REG_P (xops[1])
19828 && stack_top_dies
19829 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19831 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19833 /* If the top of the 387 stack dies, and the other operand
19834 is also a stack register that dies, then this must be an
19835 `fcompp' float compare. */
19836 p = unordered_p ? "fucompp" : "fcompp";
19837 strcpy (buf, p);
19839 else if (const0_operand (xops[1], VOIDmode))
19841 gcc_assert (!unordered_p);
19842 strcpy (buf, "ftst");
19844 else
19846 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19848 gcc_assert (!unordered_p);
19849 p = "ficom";
19851 else
19852 p = unordered_p ? "fucom" : "fcom";
19854 strcpy (buf, p);
19856 p = "p%Z2\t%y2";
19857 strcat (buf, p + !stack_top_dies);
19860 output_asm_insn (buf, operands);
19861 return "fnstsw\t%0";
19864 void
19865 ix86_output_addr_vec_elt (FILE *file, int value)
19867 const char *directive = ASM_LONG;
19869 #ifdef ASM_QUAD
19870 if (TARGET_LP64)
19871 directive = ASM_QUAD;
19872 #else
19873 gcc_assert (!TARGET_64BIT);
19874 #endif
19876 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19879 void
19880 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19882 const char *directive = ASM_LONG;
19884 #ifdef ASM_QUAD
19885 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19886 directive = ASM_QUAD;
19887 #else
19888 gcc_assert (!TARGET_64BIT);
19889 #endif
19890 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19891 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19892 fprintf (file, "%s%s%d-%s%d\n",
19893 directive, LPREFIX, value, LPREFIX, rel);
19894 else if (HAVE_AS_GOTOFF_IN_DATA)
19895 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19896 #if TARGET_MACHO
19897 else if (TARGET_MACHO)
19899 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19900 machopic_output_function_base_name (file);
19901 putc ('\n', file);
19903 #endif
19904 else
19905 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19906 GOT_SYMBOL_NAME, LPREFIX, value);
19909 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19910 for the target. */
19912 void
19913 ix86_expand_clear (rtx dest)
19915 rtx tmp;
19917 /* We play register width games, which are only valid after reload. */
19918 gcc_assert (reload_completed);
19920 /* Avoid HImode and its attendant prefix byte. */
19921 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19922 dest = gen_rtx_REG (SImode, REGNO (dest));
19923 tmp = gen_rtx_SET (dest, const0_rtx);
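/* xor reg,reg clobbers the flags, so when that form is chosen below an
   explicit flags clobber is attached; mov $0,reg leaves the flags
   intact.  */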
19925 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19927 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19928 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19931 emit_insn (tmp);
19934 void
19935 ix86_expand_move (machine_mode mode, rtx operands[])
19937 rtx op0, op1;
19938 rtx tmp, addend = NULL_RTX;
19939 enum tls_model model;
19941 op0 = operands[0];
19942 op1 = operands[1];
19944 switch (GET_CODE (op1))
19946 case CONST:
19947 tmp = XEXP (op1, 0);
19949 if (GET_CODE (tmp) != PLUS
19950 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19951 break;
19953 op1 = XEXP (tmp, 0);
19954 addend = XEXP (tmp, 1);
19955 /* FALLTHRU */
19957 case SYMBOL_REF:
19958 model = SYMBOL_REF_TLS_MODEL (op1);
19960 if (model)
19961 op1 = legitimize_tls_address (op1, model, true);
19962 else if (ix86_force_load_from_GOT_p (op1))
19964 /* Load the external function address via GOT slot to avoid PLT. */
19965 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19966 (TARGET_64BIT
19967 ? UNSPEC_GOTPCREL
19968 : UNSPEC_GOT));
19969 op1 = gen_rtx_CONST (Pmode, op1);
19970 op1 = gen_const_mem (Pmode, op1);
19971 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19973 else
19975 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19976 if (tmp)
19978 op1 = tmp;
19979 if (!addend)
19980 break;
19982 else
19984 op1 = operands[1];
19985 break;
19989 if (addend)
19991 op1 = force_operand (op1, NULL_RTX);
19992 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19993 op0, 1, OPTAB_DIRECT);
19995 else
19996 op1 = force_operand (op1, op0);
19998 if (op1 == op0)
19999 return;
20001 op1 = convert_to_mode (mode, op1, 1);
20003 default:
20004 break;
20007 if ((flag_pic || MACHOPIC_INDIRECT)
20008 && symbolic_operand (op1, mode))
20010 if (TARGET_MACHO && !TARGET_64BIT)
20012 #if TARGET_MACHO
20013 /* dynamic-no-pic */
20014 if (MACHOPIC_INDIRECT)
20016 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20017 ? op0 : gen_reg_rtx (Pmode);
20018 op1 = machopic_indirect_data_reference (op1, temp);
20019 if (MACHOPIC_PURE)
20020 op1 = machopic_legitimize_pic_address (op1, mode,
20021 temp == op1 ? 0 : temp);
20023 if (op0 != op1 && GET_CODE (op0) != MEM)
20025 rtx insn = gen_rtx_SET (op0, op1);
20026 emit_insn (insn);
20027 return;
20029 if (GET_CODE (op0) == MEM)
20030 op1 = force_reg (Pmode, op1);
20031 else
20033 rtx temp = op0;
20034 if (GET_CODE (temp) != REG)
20035 temp = gen_reg_rtx (Pmode);
20036 temp = legitimize_pic_address (op1, temp);
20037 if (temp == op0)
20038 return;
20039 op1 = temp;
20041 /* dynamic-no-pic */
20042 #endif
20044 else
20046 if (MEM_P (op0))
20047 op1 = force_reg (mode, op1);
20048 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20050 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20051 op1 = legitimize_pic_address (op1, reg);
20052 if (op0 == op1)
20053 return;
20054 op1 = convert_to_mode (mode, op1, 1);
20058 else
20060 if (MEM_P (op0)
20061 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20062 || !push_operand (op0, mode))
20063 && MEM_P (op1))
20064 op1 = force_reg (mode, op1);
20066 if (push_operand (op0, mode)
20067 && ! general_no_elim_operand (op1, mode))
20068 op1 = copy_to_mode_reg (mode, op1);
20070 /* Force large constants in 64bit compilation into register
20071 to get them CSEed. */
20072 if (can_create_pseudo_p ()
20073 && (mode == DImode) && TARGET_64BIT
20074 && immediate_operand (op1, mode)
20075 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20076 && !register_operand (op0, mode)
20077 && optimize)
20078 op1 = copy_to_mode_reg (mode, op1);
20080 if (can_create_pseudo_p ()
20081 && CONST_DOUBLE_P (op1))
20083 /* If we are loading a floating point constant to a register,
20084 force the value to memory now, since we'll get better code
20085 out of the back end. */
20087 op1 = validize_mem (force_const_mem (mode, op1));
20088 if (!register_operand (op0, mode))
20090 rtx temp = gen_reg_rtx (mode);
20091 emit_insn (gen_rtx_SET (temp, op1));
20092 emit_move_insn (op0, temp);
20093 return;
20098 emit_insn (gen_rtx_SET (op0, op1));
20101 void
20102 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20104 rtx op0 = operands[0], op1 = operands[1];
20105 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20106 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
20107 unsigned int align = (TARGET_IAMCU
20108 ? GET_MODE_BITSIZE (mode)
20109 : GET_MODE_ALIGNMENT (mode));
20111 if (push_operand (op0, VOIDmode))
20112 op0 = emit_move_resolve_push (mode, op0);
20114 /* Force constants other than zero into memory. We do not know how
20115 the instructions used to build constants modify the upper 64 bits
20116 of the register; once we have that information we may be able
20117 to handle some of them more efficiently. */
20118 if (can_create_pseudo_p ()
20119 && (CONSTANT_P (op1)
20120 || (SUBREG_P (op1)
20121 && CONSTANT_P (SUBREG_REG (op1))))
20122 && ((register_operand (op0, mode)
20123 && !standard_sse_constant_p (op1, mode))
20124 /* ix86_expand_vector_move_misalign() does not like constants. */
20125 || (SSE_REG_MODE_P (mode)
20126 && MEM_P (op0)
20127 && MEM_ALIGN (op0) < align)))
20129 if (SUBREG_P (op1))
20131 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20132 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20133 if (r)
20134 r = validize_mem (r);
20135 else
20136 r = force_reg (imode, SUBREG_REG (op1));
20137 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20139 else
20140 op1 = validize_mem (force_const_mem (mode, op1));
20143 /* We need to check memory alignment for SSE modes since attributes
20144 can make operands unaligned. */
20145 if (can_create_pseudo_p ()
20146 && SSE_REG_MODE_P (mode)
20147 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20148 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20150 rtx tmp[2];
20152 /* ix86_expand_vector_move_misalign() does not like both
20153 arguments in memory. */
20154 if (!register_operand (op0, mode)
20155 && !register_operand (op1, mode))
20156 op1 = force_reg (mode, op1);
20158 tmp[0] = op0; tmp[1] = op1;
20159 ix86_expand_vector_move_misalign (mode, tmp);
20160 return;
20163 /* Make operand1 a register if it isn't already. */
20164 if (can_create_pseudo_p ()
20165 && !register_operand (op0, mode)
20166 && !register_operand (op1, mode))
20168 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20169 return;
20172 emit_insn (gen_rtx_SET (op0, op1));
20175 /* Split 32-byte AVX unaligned load and store if needed. */
20177 static void
20178 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20180 rtx m;
20181 rtx (*extract) (rtx, rtx, rtx);
20182 machine_mode mode;
20184 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20185 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20187 emit_insn (gen_rtx_SET (op0, op1));
20188 return;
20191 rtx orig_op0 = NULL_RTX;
20192 mode = GET_MODE (op0);
20193 switch (GET_MODE_CLASS (mode))
20195 case MODE_VECTOR_INT:
20196 case MODE_INT:
20197 if (mode != V32QImode)
20199 if (!MEM_P (op0))
20201 orig_op0 = op0;
20202 op0 = gen_reg_rtx (V32QImode);
20204 else
20205 op0 = gen_lowpart (V32QImode, op0);
20206 op1 = gen_lowpart (V32QImode, op1);
20207 mode = V32QImode;
20209 break;
20210 case MODE_VECTOR_FLOAT:
20211 break;
20212 default:
20213 gcc_unreachable ();
20216 switch (mode)
20218 default:
20219 gcc_unreachable ();
20220 case E_V32QImode:
20221 extract = gen_avx_vextractf128v32qi;
20222 mode = V16QImode;
20223 break;
20224 case E_V8SFmode:
20225 extract = gen_avx_vextractf128v8sf;
20226 mode = V4SFmode;
20227 break;
20228 case E_V4DFmode:
20229 extract = gen_avx_vextractf128v4df;
20230 mode = V2DFmode;
20231 break;
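/* Rough shape of the split (the exact insns depend on the patterns
   chosen): an unaligned 32-byte load becomes a 16-byte load of the low
   half plus a vinsertf128 of the high half; an unaligned 32-byte store
   becomes two vextractf128 stores of the 16-byte halves.  */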
20234 if (MEM_P (op1))
20236 rtx r = gen_reg_rtx (mode);
20237 m = adjust_address (op1, mode, 0);
20238 emit_move_insn (r, m);
20239 m = adjust_address (op1, mode, 16);
20240 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20241 emit_move_insn (op0, r);
20243 else if (MEM_P (op0))
20245 m = adjust_address (op0, mode, 0);
20246 emit_insn (extract (m, op1, const0_rtx));
20247 m = adjust_address (op0, mode, 16);
20248 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20250 else
20251 gcc_unreachable ();
20253 if (orig_op0)
20254 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20257 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20258 straight to ix86_expand_vector_move. */
20259 /* Code generation for scalar reg-reg moves of single and double precision data:
20260 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20261 movaps reg, reg
20262 else
20263 movss reg, reg
20264 if (x86_sse_partial_reg_dependency == true)
20265 movapd reg, reg
20266 else
20267 movsd reg, reg
20269 Code generation for scalar loads of double precision data:
20270 if (x86_sse_split_regs == true)
20271 movlpd mem, reg (gas syntax)
20272 else
20273 movsd mem, reg
20275 Code generation for unaligned packed loads of single precision data
20276 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20277 if (x86_sse_unaligned_move_optimal)
20278 movups mem, reg
20280 if (x86_sse_partial_reg_dependency == true)
20282 xorps reg, reg
20283 movlps mem, reg
20284 movhps mem+8, reg
20286 else
20288 movlps mem, reg
20289 movhps mem+8, reg
20292 Code generation for unaligned packed loads of double precision data
20293 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20294 if (x86_sse_unaligned_move_optimal)
20295 movupd mem, reg
20297 if (x86_sse_split_regs == true)
20299 movlpd mem, reg
20300 movhpd mem+8, reg
20302 else
20304 movsd mem, reg
20305 movhpd mem+8, reg
20309 void
20310 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20312 rtx op0, op1, m;
20314 op0 = operands[0];
20315 op1 = operands[1];
20317 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20318 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20320 emit_insn (gen_rtx_SET (op0, op1));
20321 return;
20324 if (TARGET_AVX)
20326 if (GET_MODE_SIZE (mode) == 32)
20327 ix86_avx256_split_vector_move_misalign (op0, op1);
20328 else
20329 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20330 emit_insn (gen_rtx_SET (op0, op1));
20331 return;
20334 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20335 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20337 emit_insn (gen_rtx_SET (op0, op1));
20338 return;
20341 /* ??? If we have typed data, then it would appear that using
20342 movdqu is the only way to get unaligned data loaded with
20343 integer type. */
20344 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20346 emit_insn (gen_rtx_SET (op0, op1));
20347 return;
20350 if (MEM_P (op1))
20352 if (TARGET_SSE2 && mode == V2DFmode)
20354 rtx zero;
20356 /* When SSE registers are split into halves, we can avoid
20357 writing to the top half twice. */
20358 if (TARGET_SSE_SPLIT_REGS)
20360 emit_clobber (op0);
20361 zero = op0;
20363 else
20365 /* ??? Not sure about the best option for the Intel chips.
20366 The following would seem to satisfy; the register is
20367 entirely cleared, breaking the dependency chain. We
20368 then store to the upper half, with a dependency depth
20369 of one. A rumor has it that Intel recommends two movsd
20370 followed by an unpacklpd, but this is unconfirmed. And
20371 given that the dependency depth of the unpacklpd would
20372 still be one, I'm not sure why this would be better. */
20373 zero = CONST0_RTX (V2DFmode);
20376 m = adjust_address (op1, DFmode, 0);
20377 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20378 m = adjust_address (op1, DFmode, 8);
20379 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20381 else
20383 rtx t;
20385 if (mode != V4SFmode)
20386 t = gen_reg_rtx (V4SFmode);
20387 else
20388 t = op0;
20390 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20391 emit_move_insn (t, CONST0_RTX (V4SFmode));
20392 else
20393 emit_clobber (t);
20395 m = adjust_address (op1, V2SFmode, 0);
20396 emit_insn (gen_sse_loadlps (t, t, m));
20397 m = adjust_address (op1, V2SFmode, 8);
20398 emit_insn (gen_sse_loadhps (t, t, m));
20399 if (mode != V4SFmode)
20400 emit_move_insn (op0, gen_lowpart (mode, t));
20403 else if (MEM_P (op0))
20405 if (TARGET_SSE2 && mode == V2DFmode)
20407 m = adjust_address (op0, DFmode, 0);
20408 emit_insn (gen_sse2_storelpd (m, op1));
20409 m = adjust_address (op0, DFmode, 8);
20410 emit_insn (gen_sse2_storehpd (m, op1));
20412 else
20414 if (mode != V4SFmode)
20415 op1 = gen_lowpart (V4SFmode, op1);
20417 m = adjust_address (op0, V2SFmode, 0);
20418 emit_insn (gen_sse_storelps (m, op1));
20419 m = adjust_address (op0, V2SFmode, 8);
20420 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20423 else
20424 gcc_unreachable ();
20427 /* Helper function of ix86_fixup_binary_operands to canonicalize
20428 operand order. Returns true if the operands should be swapped. */
20430 static bool
20431 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20432 rtx operands[])
20434 rtx dst = operands[0];
20435 rtx src1 = operands[1];
20436 rtx src2 = operands[2];
20438 /* If the operation is not commutative, we can't do anything. */
20439 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20440 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20441 return false;
20443 /* Highest priority is that src1 should match dst. */
20444 if (rtx_equal_p (dst, src1))
20445 return false;
20446 if (rtx_equal_p (dst, src2))
20447 return true;
20449 /* Next highest priority is that immediate constants come second. */
20450 if (immediate_operand (src2, mode))
20451 return false;
20452 if (immediate_operand (src1, mode))
20453 return true;
20455 /* Lowest priority is that memory references should come second. */
20456 if (MEM_P (src2))
20457 return false;
20458 if (MEM_P (src1))
20459 return true;
20461 return false;
20465 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20466 destination to use for the operation. If different from the true
20467 destination in operands[0], a copy operation will be required. */
20470 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20471 rtx operands[])
20473 rtx dst = operands[0];
20474 rtx src1 = operands[1];
20475 rtx src2 = operands[2];
20477 /* Canonicalize operand order. */
20478 if (ix86_swap_binary_operands_p (code, mode, operands))
20480 /* It is invalid to swap operands of different modes. */
20481 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20483 std::swap (src1, src2);
20486 /* Both source operands cannot be in memory. */
20487 if (MEM_P (src1) && MEM_P (src2))
20489 /* Optimization: Only read from memory once. */
20490 if (rtx_equal_p (src1, src2))
20492 src2 = force_reg (mode, src2);
20493 src1 = src2;
20495 else if (rtx_equal_p (dst, src1))
20496 src2 = force_reg (mode, src2);
20497 else
20498 src1 = force_reg (mode, src1);
20501 /* If the destination is memory, and we do not have matching source
20502 operands, do things in registers. */
20503 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20504 dst = gen_reg_rtx (mode);
20506 /* Source 1 cannot be a constant. */
20507 if (CONSTANT_P (src1))
20508 src1 = force_reg (mode, src1);
20510 /* Source 1 cannot be a non-matching memory. */
20511 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20512 src1 = force_reg (mode, src1);
20514 /* Improve address combine. */
20515 if (code == PLUS
20516 && GET_MODE_CLASS (mode) == MODE_INT
20517 && MEM_P (src2))
20518 src2 = force_reg (mode, src2);
20520 operands[1] = src1;
20521 operands[2] = src2;
20522 return dst;
20525 /* Similarly, but assume that the destination has already been
20526 set up properly. */
20528 void
20529 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20530 machine_mode mode, rtx operands[])
20532 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20533 gcc_assert (dst == operands[0]);
20536 /* Attempt to expand a binary operator. Make the expansion closer to the
20537 actual machine, than just general_operand, which would allow 3 separate
20538 memory references (one output, two input) in a single insn. */
20540 void
20541 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20542 rtx operands[])
20544 rtx src1, src2, dst, op, clob;
20546 dst = ix86_fixup_binary_operands (code, mode, operands);
20547 src1 = operands[1];
20548 src2 = operands[2];
20550 /* Emit the instruction. */
20552 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20554 if (reload_completed
20555 && code == PLUS
20556 && !rtx_equal_p (dst, src1))
20558 /* This is going to be an LEA; avoid splitting it later. */
20559 emit_insn (op);
20561 else
20563 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20564 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20567 /* Fix up the destination if needed. */
20568 if (dst != operands[0])
20569 emit_move_insn (operands[0], dst);
20572 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20573 the given OPERANDS. */
20575 void
20576 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20577 rtx operands[])
20579 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20580 if (SUBREG_P (operands[1]))
20582 op1 = operands[1];
20583 op2 = operands[2];
20585 else if (SUBREG_P (operands[2]))
20587 op1 = operands[2];
20588 op2 = operands[1];
20590 /* Optimize (__m128i) d | (__m128i) e and similar code
20591 when d and e are float vectors into float vector logical
20592 insn. In C/C++ without using intrinsics there is no other way
20593 to express vector logical operation on float vectors than
20594 to cast them temporarily to integer vectors. */
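/* For instance (with the intrinsics types from <emmintrin.h> and GNU C's
   vector operators):

     __m128 a, b;
     __m128i r = (__m128i) a | (__m128i) b;

   can stay in the float domain and be emitted as a single orps instead
   of crossing into the integer domain for a por.  */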
20595 if (op1
20596 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20597 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20598 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20599 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20600 && SUBREG_BYTE (op1) == 0
20601 && (GET_CODE (op2) == CONST_VECTOR
20602 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20603 && SUBREG_BYTE (op2) == 0))
20604 && can_create_pseudo_p ())
20606 rtx dst;
20607 switch (GET_MODE (SUBREG_REG (op1)))
20609 case E_V4SFmode:
20610 case E_V8SFmode:
20611 case E_V16SFmode:
20612 case E_V2DFmode:
20613 case E_V4DFmode:
20614 case E_V8DFmode:
20615 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20616 if (GET_CODE (op2) == CONST_VECTOR)
20618 op2 = gen_lowpart (GET_MODE (dst), op2);
20619 op2 = force_reg (GET_MODE (dst), op2);
20621 else
20623 op1 = operands[1];
20624 op2 = SUBREG_REG (operands[2]);
20625 if (!vector_operand (op2, GET_MODE (dst)))
20626 op2 = force_reg (GET_MODE (dst), op2);
20628 op1 = SUBREG_REG (op1);
20629 if (!vector_operand (op1, GET_MODE (dst)))
20630 op1 = force_reg (GET_MODE (dst), op1);
20631 emit_insn (gen_rtx_SET (dst,
20632 gen_rtx_fmt_ee (code, GET_MODE (dst),
20633 op1, op2)));
20634 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20635 return;
20636 default:
20637 break;
20640 if (!vector_operand (operands[1], mode))
20641 operands[1] = force_reg (mode, operands[1]);
20642 if (!vector_operand (operands[2], mode))
20643 operands[2] = force_reg (mode, operands[2]);
20644 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20645 emit_insn (gen_rtx_SET (operands[0],
20646 gen_rtx_fmt_ee (code, mode, operands[1],
20647 operands[2])));
20650 /* Return TRUE or FALSE depending on whether the binary operator meets the
20651 appropriate constraints. */
20653 bool
20654 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20655 rtx operands[3])
20657 rtx dst = operands[0];
20658 rtx src1 = operands[1];
20659 rtx src2 = operands[2];
20661 /* Both source operands cannot be in memory. */
20662 if (MEM_P (src1) && MEM_P (src2))
20663 return false;
20665 /* Canonicalize operand order for commutative operators. */
20666 if (ix86_swap_binary_operands_p (code, mode, operands))
20667 std::swap (src1, src2);
20669 /* If the destination is memory, we must have a matching source operand. */
20670 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20671 return false;
20673 /* Source 1 cannot be a constant. */
20674 if (CONSTANT_P (src1))
20675 return false;
20677 /* Source 1 cannot be a non-matching memory. */
20678 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20679 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20680 return (code == AND
20681 && (mode == HImode
20682 || mode == SImode
20683 || (TARGET_64BIT && mode == DImode))
20684 && satisfies_constraint_L (src2));
20686 return true;
20689 /* Attempt to expand a unary operator. Make the expansion closer to the
20690 actual machine, than just general_operand, which would allow 2 separate
20691 memory references (one output, one input) in a single insn. */
20693 void
20694 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20695 rtx operands[])
20697 bool matching_memory = false;
20698 rtx src, dst, op, clob;
20700 dst = operands[0];
20701 src = operands[1];
20703 /* If the destination is memory, and we do not have matching source
20704 operands, do things in registers. */
20705 if (MEM_P (dst))
20707 if (rtx_equal_p (dst, src))
20708 matching_memory = true;
20709 else
20710 dst = gen_reg_rtx (mode);
20713 /* When source operand is memory, destination must match. */
20714 if (MEM_P (src) && !matching_memory)
20715 src = force_reg (mode, src);
20717 /* Emit the instruction. */
20719 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20721 if (code == NOT)
20722 emit_insn (op);
20723 else
20725 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20726 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20729 /* Fix up the destination if needed. */
20730 if (dst != operands[0])
20731 emit_move_insn (operands[0], dst);
20734 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20735 divisor are within the range [0-255]. */
20737 void
20738 ix86_split_idivmod (machine_mode mode, rtx operands[],
20739 bool signed_p)
20741 rtx_code_label *end_label, *qimode_label;
20742 rtx div, mod;
20743 rtx_insn *insn;
20744 rtx scratch, tmp0, tmp1, tmp2;
20745 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20746 rtx (*gen_zero_extend) (rtx, rtx);
20747 rtx (*gen_test_ccno_1) (rtx, rtx);
20749 switch (mode)
20751 case E_SImode:
20752 if (GET_MODE (operands[0]) == SImode)
20754 if (GET_MODE (operands[1]) == SImode)
20755 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20756 else
20757 gen_divmod4_1
20758 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20759 gen_zero_extend = gen_zero_extendqisi2;
20761 else
20763 gen_divmod4_1
20764 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20765 gen_zero_extend = gen_zero_extendqidi2;
20767 gen_test_ccno_1 = gen_testsi_ccno_1;
20768 break;
20769 case E_DImode:
20770 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20771 gen_test_ccno_1 = gen_testdi_ccno_1;
20772 gen_zero_extend = gen_zero_extendqidi2;
20773 break;
20774 default:
20775 gcc_unreachable ();
20778 end_label = gen_label_rtx ();
20779 qimode_label = gen_label_rtx ();
20781 scratch = gen_reg_rtx (mode);
20783 /* Use 8bit unsigned divmod if the dividend and divisor are within
20784 the range [0-255]. */
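/* Rough shape of the emitted sequence (illustrative, AT&T syntax):

       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch
       je     .Lqimode            # both operands fit in 8 bits
       <full signed/unsigned divmod>
       jmp    .Lend
     .Lqimode:
       <8-bit unsigned div; AL = quotient, AH = remainder>
     .Lend:  */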
20785 emit_move_insn (scratch, operands[2]);
20786 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20787 scratch, 1, OPTAB_DIRECT);
20788 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20789 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20790 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20791 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20792 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20793 pc_rtx);
20794 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20795 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20796 JUMP_LABEL (insn) = qimode_label;
20798 /* Generate the original signed/unsigned divmod. */
20799 div = gen_divmod4_1 (operands[0], operands[1],
20800 operands[2], operands[3]);
20801 emit_insn (div);
20803 /* Branch to the end. */
20804 emit_jump_insn (gen_jump (end_label));
20805 emit_barrier ();
20807 /* Generate 8bit unsigned divide. */
20808 emit_label (qimode_label);
20809 /* Don't use operands[0] for result of 8bit divide since not all
20810 registers support QImode ZERO_EXTRACT. */
20811 tmp0 = lowpart_subreg (HImode, scratch, mode);
20812 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20813 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20814 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20816 if (signed_p)
20818 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20819 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20821 else
20823 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20824 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20826 if (mode == SImode)
20828 if (GET_MODE (operands[0]) != SImode)
20829 div = gen_rtx_ZERO_EXTEND (DImode, div);
20830 if (GET_MODE (operands[1]) != SImode)
20831 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20834 /* Extract remainder from AH. */
20835 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20836 tmp0, GEN_INT (8), GEN_INT (8));
20837 if (REG_P (operands[1]))
20838 insn = emit_move_insn (operands[1], tmp1);
20839 else
20841 /* Need a new scratch register since the old one has result
20842 of 8bit divide. */
20843 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20844 emit_move_insn (scratch, tmp1);
20845 insn = emit_move_insn (operands[1], scratch);
20847 set_unique_reg_note (insn, REG_EQUAL, mod);
20849 /* Zero extend quotient from AL. */
20850 tmp1 = gen_lowpart (QImode, tmp0);
20851 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20852 set_unique_reg_note (insn, REG_EQUAL, div);
20854 emit_label (end_label);
20857 #define LEA_MAX_STALL (3)
20858 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20860 /* Increase given DISTANCE in half-cycles according to
20861 dependencies between PREV and NEXT instructions.
20862 Add 1 half-cycle if there is no dependency and
20863 go to the next cycle if there is some dependency. */
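/* E.g. with DISTANCE == 3, a dependency between PREV and NEXT yields
   3 + (3 & 1) + 2 == 6 half-cycles, while no dependency yields
   3 + 1 == 4.  */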
20865 static unsigned int
20866 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20868 df_ref def, use;
20870 if (!prev || !next)
20871 return distance + (distance & 1) + 2;
20873 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20874 return distance + 1;
20876 FOR_EACH_INSN_USE (use, next)
20877 FOR_EACH_INSN_DEF (def, prev)
20878 if (!DF_REF_IS_ARTIFICIAL (def)
20879 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20880 return distance + (distance & 1) + 2;
20882 return distance + 1;
20885 /* Function checks if instruction INSN defines register number
20886 REGNO1 or REGNO2. */
20888 static bool
20889 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20890 rtx_insn *insn)
20892 df_ref def;
20894 FOR_EACH_INSN_DEF (def, insn)
20895 if (DF_REF_REG_DEF_P (def)
20896 && !DF_REF_IS_ARTIFICIAL (def)
20897 && (regno1 == DF_REF_REGNO (def)
20898 || regno2 == DF_REF_REGNO (def)))
20899 return true;
20901 return false;
20904 /* Function checks if instruction INSN uses register number
20905 REGNO as a part of address expression. */
20907 static bool
20908 insn_uses_reg_mem (unsigned int regno, rtx insn)
20910 df_ref use;
20912 FOR_EACH_INSN_USE (use, insn)
20913 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20914 return true;
20916 return false;
20919 /* Search backward for non-agu definition of register number REGNO1
20920 or register number REGNO2 in basic block starting from instruction
20921 START up to head of basic block or instruction INSN.
20923 The function stores true in *FOUND if a definition was found
20924 and false otherwise.
20926 The distance in half-cycles between START and the found instruction
20927 (or the head of the BB) is added to DISTANCE and returned. */
20929 static int
20930 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20931 rtx_insn *insn, int distance,
20932 rtx_insn *start, bool *found)
20934 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20935 rtx_insn *prev = start;
20936 rtx_insn *next = NULL;
20938 *found = false;
20940 while (prev
20941 && prev != insn
20942 && distance < LEA_SEARCH_THRESHOLD)
20944 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20946 distance = increase_distance (prev, next, distance);
20947 if (insn_defines_reg (regno1, regno2, prev))
20949 if (recog_memoized (prev) < 0
20950 || get_attr_type (prev) != TYPE_LEA)
20952 *found = true;
20953 return distance;
20957 next = prev;
20959 if (prev == BB_HEAD (bb))
20960 break;
20962 prev = PREV_INSN (prev);
20965 return distance;
20968 /* Search backward for non-agu definition of register number REGNO1
20969 or register number REGNO2 in INSN's basic block until
20970 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20971 2. Reach neighbor BBs boundary, or
20972 3. Reach agu definition.
20973 Returns the distance between the non-agu definition point and INSN.
20974 If no definition point, returns -1. */
20976 static int
20977 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20978 rtx_insn *insn)
20980 basic_block bb = BLOCK_FOR_INSN (insn);
20981 int distance = 0;
20982 bool found = false;
20984 if (insn != BB_HEAD (bb))
20985 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20986 distance, PREV_INSN (insn),
20987 &found);
20989 if (!found && distance < LEA_SEARCH_THRESHOLD)
20991 edge e;
20992 edge_iterator ei;
20993 bool simple_loop = false;
20995 FOR_EACH_EDGE (e, ei, bb->preds)
20996 if (e->src == bb)
20998 simple_loop = true;
20999 break;
21002 if (simple_loop)
21003 distance = distance_non_agu_define_in_bb (regno1, regno2,
21004 insn, distance,
21005 BB_END (bb), &found);
21006 else
21008 int shortest_dist = -1;
21009 bool found_in_bb = false;
21011 FOR_EACH_EDGE (e, ei, bb->preds)
21013 int bb_dist
21014 = distance_non_agu_define_in_bb (regno1, regno2,
21015 insn, distance,
21016 BB_END (e->src),
21017 &found_in_bb);
21018 if (found_in_bb)
21020 if (shortest_dist < 0)
21021 shortest_dist = bb_dist;
21022 else if (bb_dist > 0)
21023 shortest_dist = MIN (bb_dist, shortest_dist);
21025 found = true;
21029 distance = shortest_dist;
21033 /* get_attr_type may modify recog data. We want to make sure
21034 that recog data is valid for instruction INSN, on which
21035 distance_non_agu_define is called. INSN is unchanged here. */
21036 extract_insn_cached (insn);
21038 if (!found)
21039 return -1;
21041 return distance >> 1;
21044 /* Return the distance in half-cycles between INSN and the next
21045 insn that uses register number REGNO in a memory address, added
21046 to DISTANCE. Return -1 if REGNO is set.
21048 Put true value into *FOUND if register usage was found and
21049 false otherwise.
21050 Put true value into *REDEFINED if register redefinition was
21051 found and false otherwise. */
21053 static int
21054 distance_agu_use_in_bb (unsigned int regno,
21055 rtx_insn *insn, int distance, rtx_insn *start,
21056 bool *found, bool *redefined)
21058 basic_block bb = NULL;
21059 rtx_insn *next = start;
21060 rtx_insn *prev = NULL;
21062 *found = false;
21063 *redefined = false;
21065 if (start != NULL_RTX)
21067 bb = BLOCK_FOR_INSN (start);
21068 if (start != BB_HEAD (bb))
21069 /* If insn and start belong to the same bb, set prev to insn,
21070 so the call to increase_distance will increase the distance
21071 between insns by 1. */
21072 prev = insn;
21075 while (next
21076 && next != insn
21077 && distance < LEA_SEARCH_THRESHOLD)
21079 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21081 distance = increase_distance (prev, next, distance);
21082 if (insn_uses_reg_mem (regno, next))
21084 /* Return DISTANCE if OP0 is used in memory
21085 address in NEXT. */
21086 *found = true;
21087 return distance;
21090 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21092 /* Return -1 if OP0 is set in NEXT. */
21093 *redefined = true;
21094 return -1;
21097 prev = next;
21100 if (next == BB_END (bb))
21101 break;
21103 next = NEXT_INSN (next);
21106 return distance;
21109 /* Return the distance between INSN and the next insn that uses
21110 register number REGNO0 in a memory address. Return -1 if no such
21111 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
21113 static int
21114 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21116 basic_block bb = BLOCK_FOR_INSN (insn);
21117 int distance = 0;
21118 bool found = false;
21119 bool redefined = false;
21121 if (insn != BB_END (bb))
21122 distance = distance_agu_use_in_bb (regno0, insn, distance,
21123 NEXT_INSN (insn),
21124 &found, &redefined);
21126 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21128 edge e;
21129 edge_iterator ei;
21130 bool simple_loop = false;
21132 FOR_EACH_EDGE (e, ei, bb->succs)
21133 if (e->dest == bb)
21135 simple_loop = true;
21136 break;
21139 if (simple_loop)
21140 distance = distance_agu_use_in_bb (regno0, insn,
21141 distance, BB_HEAD (bb),
21142 &found, &redefined);
21143 else
21145 int shortest_dist = -1;
21146 bool found_in_bb = false;
21147 bool redefined_in_bb = false;
21149 FOR_EACH_EDGE (e, ei, bb->succs)
21151 int bb_dist
21152 = distance_agu_use_in_bb (regno0, insn,
21153 distance, BB_HEAD (e->dest),
21154 &found_in_bb, &redefined_in_bb);
21155 if (found_in_bb)
21157 if (shortest_dist < 0)
21158 shortest_dist = bb_dist;
21159 else if (bb_dist > 0)
21160 shortest_dist = MIN (bb_dist, shortest_dist);
21162 found = true;
21166 distance = shortest_dist;
21170 if (!found || redefined)
21171 return -1;
21173 return distance >> 1;
21176 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
21177 there is a dilemma of choosing LEA or ADD.
21178 Negative value: ADD is preferred over LEA
21179 Zero: Neutral
21180 Positive value: LEA is preferred over ADD */
21181 #define IX86_LEA_PRIORITY 0
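/* As an illustration of the trade-off being tuned here, for a
   three-operand add r0 = r1 + r2 the two candidate sequences are, e.g.,

       lea    (%rsi,%rdi), %rax       vs.     mov    %rsi, %rax
                                              add    %rdi, %rax

   A positive IX86_LEA_PRIORITY biases ix86_lea_outperforms towards keeping
   the single lea, a negative value towards the mov/add sequence.  */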
21183 /* Return true if using lea INSN has a performance advantage
21184 over a sequence of instructions. The instruction sequence has
21185 SPLIT_COST cycles higher latency than the lea latency. */
21187 static bool
21188 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21189 unsigned int regno2, int split_cost, bool has_scale)
21191 int dist_define, dist_use;
21193 /* For Silvermont, if a 2-source or 3-source LEA is used for
21194 non-destructive destination purposes, or for the ability
21195 to use a SCALE, the use of LEA is justified. */
21196 if (TARGET_SILVERMONT || TARGET_INTEL)
21198 if (has_scale)
21199 return true;
21200 if (split_cost < 1)
21201 return false;
21202 if (regno0 == regno1 || regno0 == regno2)
21203 return false;
21204 return true;
21207 dist_define = distance_non_agu_define (regno1, regno2, insn);
21208 dist_use = distance_agu_use (regno0, insn);
21210 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21212 /* If there is no non-AGU operand definition, no AGU
21213 operand use, and the split cost is 0, then both the lea
21214 and non-lea variants have the same priority. Currently
21215 we prefer lea for 64-bit code and non-lea for 32-bit
21216 code. */
21217 if (dist_use < 0 && split_cost == 0)
21218 return TARGET_64BIT || IX86_LEA_PRIORITY;
21219 else
21220 return true;
21223 /* The longer the distance to the definition, the more preferable lea is.
21224 Here we adjust it to take into account the splitting cost and the
21225 lea priority. */
21226 dist_define += split_cost + IX86_LEA_PRIORITY;
21228 /* If there is no use in a memory address then we just check
21229 that the split cost exceeds the AGU stall. */
21230 if (dist_use < 0)
21231 return dist_define > LEA_MAX_STALL;
21233 /* If this insn has both a backward non-agu dependence and a forward
21234 agu dependence, the one with the shorter distance takes effect. */
21235 return dist_define >= dist_use;
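/* For instance, with a dist_define of 2 cycles (assuming that is below
   LEA_MAX_STALL) and a split_cost of 1, the adjusted dist_define is 3; a
   use of the result in an address 1 cycle ahead (dist_use 1) then keeps
   the lea, while a use 4 cycles ahead would favour splitting it.  */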
21238 /* Return true if it is legal for INSN to clobber the flags register and
21239 false otherwise. */
21241 static bool
21242 ix86_ok_to_clobber_flags (rtx_insn *insn)
21244 basic_block bb = BLOCK_FOR_INSN (insn);
21245 df_ref use;
21246 bitmap live;
21248 while (insn)
21250 if (NONDEBUG_INSN_P (insn))
21252 FOR_EACH_INSN_USE (use, insn)
21253 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21254 return false;
21256 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21257 return true;
21260 if (insn == BB_END (bb))
21261 break;
21263 insn = NEXT_INSN (insn);
21266 live = df_get_live_out (bb);
21267 return !REGNO_REG_SET_P (live, FLAGS_REG);
21270 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21271 move and add to avoid AGU stalls. */
21273 bool
21274 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21276 unsigned int regno0, regno1, regno2;
21278 /* Check if we need to optimize. */
21279 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21280 return false;
21282 /* Check it is correct to split here. */
21283 if (!ix86_ok_to_clobber_flags (insn))
21284 return false;
21286 regno0 = true_regnum (operands[0]);
21287 regno1 = true_regnum (operands[1]);
21288 regno2 = true_regnum (operands[2]);
21290 /* We need to split only adds with a non-destructive
21291 destination operand. */
21292 if (regno0 == regno1 || regno0 == regno2)
21293 return false;
21294 else
21295 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21298 /* Return true if we should emit lea instruction instead of mov
21299 instruction. */
21301 bool
21302 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21304 unsigned int regno0, regno1;
21306 /* Check if we need to optimize. */
21307 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21308 return false;
21310 /* Use lea for reg to reg moves only. */
21311 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21312 return false;
21314 regno0 = true_regnum (operands[0]);
21315 regno1 = true_regnum (operands[1]);
21317 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21320 /* Return true if we need to split lea into a sequence of
21321 instructions to avoid AGU stalls. */
21323 bool
21324 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21326 unsigned int regno0, regno1, regno2;
21327 int split_cost;
21328 struct ix86_address parts;
21329 int ok;
21331 /* Check we need to optimize. */
21332 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21333 return false;
21335 /* The "at least two components" test below might not catch simple
21336 move or zero extension insns if parts.base is non-NULL and parts.disp
21337 is const0_rtx as the only components in the address, e.g. if the
21338 register is %rbp or %r13. As this test is much cheaper and moves or
21339 zero extensions are the common case, do this check first. */
21340 if (REG_P (operands[1])
21341 || (SImode_address_operand (operands[1], VOIDmode)
21342 && REG_P (XEXP (operands[1], 0))))
21343 return false;
21345 /* Check if it is OK to split here. */
21346 if (!ix86_ok_to_clobber_flags (insn))
21347 return false;
21349 ok = ix86_decompose_address (operands[1], &parts);
21350 gcc_assert (ok);
21352 /* There should be at least two components in the address. */
21353 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21354 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21355 return false;
21357 /* We should not split into add if a non-legitimate PIC
21358 operand is used as the displacement. */
21359 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21360 return false;
21362 regno0 = true_regnum (operands[0]);
21363 regno1 = INVALID_REGNUM;
21364 regno2 = INVALID_REGNUM;
21366 if (parts.base)
21367 regno1 = true_regnum (parts.base);
21368 if (parts.index)
21369 regno2 = true_regnum (parts.index);
21371 split_cost = 0;
21373 /* Compute how many cycles we will add to the execution time
21374 if we split the lea into a sequence of instructions. */
21375 if (parts.base || parts.index)
21377 /* Have to use a mov instruction if the non-destructive
21378 destination form is used. */
21379 if (regno1 != regno0 && regno2 != regno0)
21380 split_cost += 1;
21382 /* Have to add index to base if both exist. */
21383 if (parts.base && parts.index)
21384 split_cost += 1;
21386 /* Have to use shift and adds if scale is 2 or greater. */
21387 if (parts.scale > 1)
21389 if (regno0 != regno1)
21390 split_cost += 1;
21391 else if (regno2 == regno0)
21392 split_cost += 4;
21393 else
21394 split_cost += parts.scale;
21397 /* Have to use an add instruction with an immediate if
21398 disp is non-zero. */
21399 if (parts.disp && parts.disp != const0_rtx)
21400 split_cost += 1;
21402 /* Subtract the price of lea. */
21403 split_cost -= 1;
21406 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21407 parts.scale > 1);
21410 /* Emit the x86 binary operation CODE in mode MODE, where the first operand
21411 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
21413 static void
21414 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21415 rtx dst, rtx src)
21417 rtx op, clob;
21419 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21420 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21422 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
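/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
       (parallel [(set dst (plus:SI dst src))
                  (clobber (reg:CC flags))])
   which is the usual two-address add shape with a flags clobber.  */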
21425 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2. */
21427 static bool
21428 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21430 rtx_insn *prev = insn;
21431 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21433 if (insn == start)
21434 return false;
21435 while (prev && prev != start)
21437 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21439 prev = PREV_INSN (prev);
21440 continue;
21442 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21443 return true;
21444 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21445 return false;
21446 prev = PREV_INSN (prev);
21449 /* None of the regs is defined in the bb. */
21450 return false;
21453 /* Split a lea instruction into a sequence of instructions
21454 which are executed on the ALU, to avoid AGU stalls.
21455 It is assumed that it is allowed to clobber the flags register
21456 at the lea position. */
21458 void
21459 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21461 unsigned int regno0, regno1, regno2;
21462 struct ix86_address parts;
21463 rtx target, tmp;
21464 int ok, adds;
21466 ok = ix86_decompose_address (operands[1], &parts);
21467 gcc_assert (ok);
21469 target = gen_lowpart (mode, operands[0]);
21471 regno0 = true_regnum (target);
21472 regno1 = INVALID_REGNUM;
21473 regno2 = INVALID_REGNUM;
21475 if (parts.base)
21477 parts.base = gen_lowpart (mode, parts.base);
21478 regno1 = true_regnum (parts.base);
21481 if (parts.index)
21483 parts.index = gen_lowpart (mode, parts.index);
21484 regno2 = true_regnum (parts.index);
21487 if (parts.disp)
21488 parts.disp = gen_lowpart (mode, parts.disp);
21490 if (parts.scale > 1)
21492 /* Case r1 = r1 + ... */
21493 if (regno1 == regno0)
21495 /* If we have a case r1 = r1 + C * r2 then we
21496 would have to use multiplication, which is very
21497 expensive. Assume the cost model is wrong if we
21498 get such a case here. */
21499 gcc_assert (regno2 != regno0);
21501 for (adds = parts.scale; adds > 0; adds--)
21502 ix86_emit_binop (PLUS, mode, target, parts.index);
21504 else
21506 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21507 if (regno0 != regno2)
21508 emit_insn (gen_rtx_SET (target, parts.index));
21510 /* Use shift for scaling. */
21511 ix86_emit_binop (ASHIFT, mode, target,
21512 GEN_INT (exact_log2 (parts.scale)));
21514 if (parts.base)
21515 ix86_emit_binop (PLUS, mode, target, parts.base);
21517 if (parts.disp && parts.disp != const0_rtx)
21518 ix86_emit_binop (PLUS, mode, target, parts.disp);
21521 else if (!parts.base && !parts.index)
21523 gcc_assert (parts.disp);
21524 emit_insn (gen_rtx_SET (target, parts.disp));
21526 else
21528 if (!parts.base)
21530 if (regno0 != regno2)
21531 emit_insn (gen_rtx_SET (target, parts.index));
21533 else if (!parts.index)
21535 if (regno0 != regno1)
21536 emit_insn (gen_rtx_SET (target, parts.base));
21538 else
21540 if (regno0 == regno1)
21541 tmp = parts.index;
21542 else if (regno0 == regno2)
21543 tmp = parts.base;
21544 else
21546 rtx tmp1;
21548 /* Find better operand for SET instruction, depending
21549 on which definition is farther from the insn. */
21550 if (find_nearest_reg_def (insn, regno1, regno2))
21551 tmp = parts.index, tmp1 = parts.base;
21552 else
21553 tmp = parts.base, tmp1 = parts.index;
21555 emit_insn (gen_rtx_SET (target, tmp));
21557 if (parts.disp && parts.disp != const0_rtx)
21558 ix86_emit_binop (PLUS, mode, target, parts.disp);
21560 ix86_emit_binop (PLUS, mode, target, tmp1);
21561 return;
21564 ix86_emit_binop (PLUS, mode, target, tmp);
21567 if (parts.disp && parts.disp != const0_rtx)
21568 ix86_emit_binop (PLUS, mode, target, parts.disp);
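/* As an illustration, an insn such as

       lea    0x10(%rbx,%rcx,4), %rax

   is split by the code above into roughly

       mov    %rcx, %rax
       shl    $2, %rax
       add    %rbx, %rax
       add    $0x10, %rax

   with the exact sequence depending on which of the base, index and
   displacement are present and on whether one of them already lives in
   the destination register.  */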
21572 /* Return true if it is OK to optimize an ADD operation to an LEA
21573 operation to avoid flag register consumption. For most processors,
21574 ADD is faster than LEA. For processors like BONNELL, if the
21575 destination register of the LEA holds an actual address which will be
21576 used soon, LEA is better and otherwise ADD is better. */
21578 bool
21579 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21581 unsigned int regno0 = true_regnum (operands[0]);
21582 unsigned int regno1 = true_regnum (operands[1]);
21583 unsigned int regno2 = true_regnum (operands[2]);
21585 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21586 if (regno0 != regno1 && regno0 != regno2)
21587 return true;
21589 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21590 return false;
21592 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
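/* The first check above reflects that add is a two-address instruction:
   for a = b + c with a distinct from both b and c (e.g. r8 = r9 + r10)
   only the three-operand lea can produce the result without an extra
   move, so the lea form is kept regardless of the AGU heuristics.  */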
21595 /* Return true if destination reg of SET_BODY is shift count of
21596 USE_BODY. */
21598 static bool
21599 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21601 rtx set_dest;
21602 rtx shift_rtx;
21603 int i;
21605 /* Retrieve destination of SET_BODY. */
21606 switch (GET_CODE (set_body))
21608 case SET:
21609 set_dest = SET_DEST (set_body);
21610 if (!set_dest || !REG_P (set_dest))
21611 return false;
21612 break;
21613 case PARALLEL:
21614 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21615 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21616 use_body))
21617 return true;
21618 /* FALLTHROUGH */
21619 default:
21620 return false;
21623 /* Retrieve shift count of USE_BODY. */
21624 switch (GET_CODE (use_body))
21626 case SET:
21627 shift_rtx = XEXP (use_body, 1);
21628 break;
21629 case PARALLEL:
21630 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21631 if (ix86_dep_by_shift_count_body (set_body,
21632 XVECEXP (use_body, 0, i)))
21633 return true;
21634 /* FALLTHROUGH */
21635 default:
21636 return false;
21639 if (shift_rtx
21640 && (GET_CODE (shift_rtx) == ASHIFT
21641 || GET_CODE (shift_rtx) == LSHIFTRT
21642 || GET_CODE (shift_rtx) == ASHIFTRT
21643 || GET_CODE (shift_rtx) == ROTATE
21644 || GET_CODE (shift_rtx) == ROTATERT))
21646 rtx shift_count = XEXP (shift_rtx, 1);
21648 /* Return true if shift count is dest of SET_BODY. */
21649 if (REG_P (shift_count))
21651 /* Add check since it can be invoked before register
21652 allocation in pre-reload schedule. */
21653 if (reload_completed
21654 && true_regnum (set_dest) == true_regnum (shift_count))
21655 return true;
21656 else if (REGNO (set_dest) == REGNO (shift_count))
21657 return true;
21661 return false;
21664 /* Return true if destination reg of SET_INSN is shift count of
21665 USE_INSN. */
21667 bool
21668 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21670 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21671 PATTERN (use_insn));
21674 /* Return TRUE or FALSE depending on whether the unary operator meets the
21675 appropriate constraints. */
21677 bool
21678 ix86_unary_operator_ok (enum rtx_code,
21679 machine_mode,
21680 rtx operands[2])
21682 /* If one of operands is memory, source and destination must match. */
21683 if ((MEM_P (operands[0])
21684 || MEM_P (operands[1]))
21685 && ! rtx_equal_p (operands[0], operands[1]))
21686 return false;
21687 return true;
21690 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21691 are ok, keeping in mind the possible movddup alternative. */
21693 bool
21694 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21696 if (MEM_P (operands[0]))
21697 return rtx_equal_p (operands[0], operands[1 + high]);
21698 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21699 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21700 return true;
21703 /* Post-reload splitter for converting an SFmode or DFmode value in an
21704 SSE register into an unsigned SImode value. */
21706 void
21707 ix86_split_convert_uns_si_sse (rtx operands[])
21709 machine_mode vecmode;
21710 rtx value, large, zero_or_two31, input, two31, x;
21712 large = operands[1];
21713 zero_or_two31 = operands[2];
21714 input = operands[3];
21715 two31 = operands[4];
21716 vecmode = GET_MODE (large);
21717 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21719 /* Load up the value into the low element. We must ensure that the other
21720 elements are valid floats -- zero is the easiest such value. */
21721 if (MEM_P (input))
21723 if (vecmode == V4SFmode)
21724 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21725 else
21726 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21728 else
21730 input = gen_rtx_REG (vecmode, REGNO (input));
21731 emit_move_insn (value, CONST0_RTX (vecmode));
21732 if (vecmode == V4SFmode)
21733 emit_insn (gen_sse_movss (value, value, input));
21734 else
21735 emit_insn (gen_sse2_movsd (value, value, input));
21738 emit_move_insn (large, two31);
21739 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21741 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21742 emit_insn (gen_rtx_SET (large, x));
21744 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21745 emit_insn (gen_rtx_SET (zero_or_two31, x));
21747 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21748 emit_insn (gen_rtx_SET (value, x));
21750 large = gen_rtx_REG (V4SImode, REGNO (large));
21751 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21753 x = gen_rtx_REG (V4SImode, REGNO (value));
21754 if (vecmode == V4SFmode)
21755 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21756 else
21757 emit_insn (gen_sse2_cvttpd2dq (x, value));
21758 value = x;
21760 emit_insn (gen_xorv4si3 (value, value, large));
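/* The net effect for a scalar input X is: if X < 2^31 it is converted
   directly, the mask LARGE is zero and the final xor is a no-op; if
   X >= 2^31, 2^31 is subtracted first, the signed conversion yields
   X - 2^31, and xoring with LARGE (now 0x80000000 in that lane) sets the
   top bit again, giving the correct unsigned result.  */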
21763 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21764 Expects the 64-bit DImode to be supplied in a pair of integral
21765 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21766 -mfpmath=sse, !optimize_size only. */
21768 void
21769 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21771 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21772 rtx int_xmm, fp_xmm;
21773 rtx biases, exponents;
21774 rtx x;
21776 int_xmm = gen_reg_rtx (V4SImode);
21777 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21778 emit_insn (gen_movdi_to_sse (int_xmm, input));
21779 else if (TARGET_SSE_SPLIT_REGS)
21781 emit_clobber (int_xmm);
21782 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21784 else
21786 x = gen_reg_rtx (V2DImode);
21787 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21788 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21791 x = gen_rtx_CONST_VECTOR (V4SImode,
21792 gen_rtvec (4, GEN_INT (0x43300000UL),
21793 GEN_INT (0x45300000UL),
21794 const0_rtx, const0_rtx));
21795 exponents = validize_mem (force_const_mem (V4SImode, x));
21797 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21798 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21800 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21801 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21802 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21803 (0x1.0p84 + double(fp_value_hi_xmm)).
21804 Note these exponents differ by 32. */
21806 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21808 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21809 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21810 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21811 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21812 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21813 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21814 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21815 biases = validize_mem (force_const_mem (V2DFmode, biases));
21816 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21818 /* Add the upper and lower DFmode values together. */
21819 if (TARGET_SSE3)
21820 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21821 else
21823 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21824 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21825 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21828 ix86_expand_vector_extract (false, target, fp_xmm, 0);
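/* Worked example: for the input 0x100000005 (2^32 + 5) the low word 5
   paired with the 0x43300000 exponent reads as 0x1.0p52 + 5, and the high
   word 1 paired with 0x45300000 reads as 0x1.0p84 + 2^32; after the biases
   are subtracted the two doubles are 5.0 and 4294967296.0, and their sum,
   4294967301.0, is the exact DFmode value of the unsigned input.  */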
21831 /* Not used, but eases macroization of patterns. */
21832 void
21833 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21835 gcc_unreachable ();
21838 /* Convert an unsigned SImode value into a DFmode. Only currently used
21839 for SSE, but applicable anywhere. */
21841 void
21842 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21844 REAL_VALUE_TYPE TWO31r;
21845 rtx x, fp;
21847 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21848 NULL, 1, OPTAB_DIRECT);
21850 fp = gen_reg_rtx (DFmode);
21851 emit_insn (gen_floatsidf2 (fp, x));
21853 real_ldexp (&TWO31r, &dconst1, 31);
21854 x = const_double_from_real_value (TWO31r, DFmode);
21856 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21857 if (x != target)
21858 emit_move_insn (target, x);
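/* I.e. the unsigned value is biased into the signed range by a wrapping
   addition of -2147483648, converted with the ordinary signed SImode to
   DFmode conversion, and then 2^31.0 is added back in the DFmode domain,
   which is exact.  For example 3000000000 becomes 852516352 after the
   bias, converts to 852516352.0 and ends up as 3000000000.0.  */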
21861 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21862 32-bit mode; otherwise we have a direct convert instruction. */
21864 void
21865 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21867 REAL_VALUE_TYPE TWO32r;
21868 rtx fp_lo, fp_hi, x;
21870 fp_lo = gen_reg_rtx (DFmode);
21871 fp_hi = gen_reg_rtx (DFmode);
21873 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21875 real_ldexp (&TWO32r, &dconst1, 32);
21876 x = const_double_from_real_value (TWO32r, DFmode);
21877 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21879 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21881 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21882 0, OPTAB_DIRECT);
21883 if (x != target)
21884 emit_move_insn (target, x);
21887 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21888 For x86_32, -mfpmath=sse, !optimize_size only. */
21889 void
21890 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21892 REAL_VALUE_TYPE ONE16r;
21893 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21895 real_ldexp (&ONE16r, &dconst1, 16);
21896 x = const_double_from_real_value (ONE16r, SFmode);
21897 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21898 NULL, 0, OPTAB_DIRECT);
21899 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21900 NULL, 0, OPTAB_DIRECT);
21901 fp_hi = gen_reg_rtx (SFmode);
21902 fp_lo = gen_reg_rtx (SFmode);
21903 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21904 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21905 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21906 0, OPTAB_DIRECT);
21907 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21908 0, OPTAB_DIRECT);
21909 if (!rtx_equal_p (target, fp_hi))
21910 emit_move_insn (target, fp_hi);
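/* Since SFmode cannot represent every 32-bit value exactly, the input is
   split into 16-bit halves, each of which converts exactly, and the result
   is computed as hi * 65536.0 + lo with only the final addition rounding.
   E.g. for the input 0x12345678 this is 4660 * 65536.0 + 22136.0.  */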
21913 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21914 a vector of unsigned ints VAL to vector of floats TARGET. */
21916 void
21917 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21919 rtx tmp[8];
21920 REAL_VALUE_TYPE TWO16r;
21921 machine_mode intmode = GET_MODE (val);
21922 machine_mode fltmode = GET_MODE (target);
21923 rtx (*cvt) (rtx, rtx);
21925 if (intmode == V4SImode)
21926 cvt = gen_floatv4siv4sf2;
21927 else
21928 cvt = gen_floatv8siv8sf2;
21929 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21930 tmp[0] = force_reg (intmode, tmp[0]);
21931 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21932 OPTAB_DIRECT);
21933 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21934 NULL_RTX, 1, OPTAB_DIRECT);
21935 tmp[3] = gen_reg_rtx (fltmode);
21936 emit_insn (cvt (tmp[3], tmp[1]));
21937 tmp[4] = gen_reg_rtx (fltmode);
21938 emit_insn (cvt (tmp[4], tmp[2]));
21939 real_ldexp (&TWO16r, &dconst1, 16);
21940 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21941 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21942 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21943 OPTAB_DIRECT);
21944 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21945 OPTAB_DIRECT);
21946 if (tmp[7] != target)
21947 emit_move_insn (target, tmp[7]);
21950 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21951 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21952 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21953 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21955 rtx
21956 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21958 REAL_VALUE_TYPE TWO31r;
21959 rtx two31r, tmp[4];
21960 machine_mode mode = GET_MODE (val);
21961 machine_mode scalarmode = GET_MODE_INNER (mode);
21962 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21963 rtx (*cmp) (rtx, rtx, rtx, rtx);
21964 int i;
21966 for (i = 0; i < 3; i++)
21967 tmp[i] = gen_reg_rtx (mode);
21968 real_ldexp (&TWO31r, &dconst1, 31);
21969 two31r = const_double_from_real_value (TWO31r, scalarmode);
21970 two31r = ix86_build_const_vector (mode, 1, two31r);
21971 two31r = force_reg (mode, two31r);
21972 switch (mode)
21974 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21975 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21976 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21977 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21978 default: gcc_unreachable ();
21980 tmp[3] = gen_rtx_LE (mode, two31r, val);
21981 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21982 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21983 0, OPTAB_DIRECT);
21984 if (intmode == V4SImode || TARGET_AVX2)
21985 *xorp = expand_simple_binop (intmode, ASHIFT,
21986 gen_lowpart (intmode, tmp[0]),
21987 GEN_INT (31), NULL_RTX, 0,
21988 OPTAB_DIRECT);
21989 else
21991 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21992 two31 = ix86_build_const_vector (intmode, 1, two31);
21993 *xorp = expand_simple_binop (intmode, AND,
21994 gen_lowpart (intmode, tmp[0]),
21995 two31, NULL_RTX, 0,
21996 OPTAB_DIRECT);
21998 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21999 0, OPTAB_DIRECT);
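/* For a lane with the value 3000000000.0, for example, 0x1.0p31 is
   subtracted to give 852516352.0, the caller's signed truncation produces
   852516352, and xoring with the 0x80000000 recorded in *XORP restores the
   unsigned result 3000000000; lanes below 0x1.0p31 are left untouched and
   get a zero in *XORP.  */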
22002 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22003 then replicate the value for all elements of the vector
22004 register. */
22006 rtx
22007 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22009 int i, n_elt;
22010 rtvec v;
22011 machine_mode scalar_mode;
22013 switch (mode)
22015 case E_V64QImode:
22016 case E_V32QImode:
22017 case E_V16QImode:
22018 case E_V32HImode:
22019 case E_V16HImode:
22020 case E_V8HImode:
22021 case E_V16SImode:
22022 case E_V8SImode:
22023 case E_V4SImode:
22024 case E_V8DImode:
22025 case E_V4DImode:
22026 case E_V2DImode:
22027 gcc_assert (vect);
22028 /* FALLTHRU */
22029 case E_V16SFmode:
22030 case E_V8SFmode:
22031 case E_V4SFmode:
22032 case E_V8DFmode:
22033 case E_V4DFmode:
22034 case E_V2DFmode:
22035 n_elt = GET_MODE_NUNITS (mode);
22036 v = rtvec_alloc (n_elt);
22037 scalar_mode = GET_MODE_INNER (mode);
22039 RTVEC_ELT (v, 0) = value;
22041 for (i = 1; i < n_elt; ++i)
22042 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22044 return gen_rtx_CONST_VECTOR (mode, v);
22046 default:
22047 gcc_unreachable ();
22051 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22052 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22053 for an SSE register. If VECT is true, then replicate the mask for
22054 all elements of the vector register. If INVERT is true, then create
22055 a mask excluding the sign bit. */
22057 rtx
22058 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22060 machine_mode vec_mode, imode;
22061 wide_int w;
22062 rtx mask, v;
22064 switch (mode)
22066 case E_V16SImode:
22067 case E_V16SFmode:
22068 case E_V8SImode:
22069 case E_V4SImode:
22070 case E_V8SFmode:
22071 case E_V4SFmode:
22072 vec_mode = mode;
22073 imode = SImode;
22074 break;
22076 case E_V8DImode:
22077 case E_V4DImode:
22078 case E_V2DImode:
22079 case E_V8DFmode:
22080 case E_V4DFmode:
22081 case E_V2DFmode:
22082 vec_mode = mode;
22083 imode = DImode;
22084 break;
22086 case E_TImode:
22087 case E_TFmode:
22088 vec_mode = VOIDmode;
22089 imode = TImode;
22090 break;
22092 default:
22093 gcc_unreachable ();
22096 machine_mode inner_mode = GET_MODE_INNER (mode);
22097 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22098 GET_MODE_BITSIZE (inner_mode));
22099 if (invert)
22100 w = wi::bit_not (w);
22102 /* Force this value into the low part of a fp vector constant. */
22103 mask = immed_wide_int_const (w, imode);
22104 mask = gen_lowpart (inner_mode, mask);
22106 if (vec_mode == VOIDmode)
22107 return force_reg (inner_mode, mask);
22109 v = ix86_build_const_vector (vec_mode, vect, mask);
22110 return force_reg (vec_mode, v);
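/* E.g. for V4SFmode this produces the vector constant
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }, or 0x7fffffff in
   each element when INVERT; when VECT is false only the low element
   carries the mask and the remaining elements are zero, which is what the
   scalar SFmode/DFmode uses need.  */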
22113 /* Generate code for floating point ABS or NEG. */
22115 void
22116 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22117 rtx operands[])
22119 rtx mask, set, dst, src;
22120 bool use_sse = false;
22121 bool vector_mode = VECTOR_MODE_P (mode);
22122 machine_mode vmode = mode;
22124 if (vector_mode)
22125 use_sse = true;
22126 else if (mode == TFmode)
22127 use_sse = true;
22128 else if (TARGET_SSE_MATH)
22130 use_sse = SSE_FLOAT_MODE_P (mode);
22131 if (mode == SFmode)
22132 vmode = V4SFmode;
22133 else if (mode == DFmode)
22134 vmode = V2DFmode;
22137 /* NEG and ABS performed with SSE use bitwise mask operations.
22138 Create the appropriate mask now. */
22139 if (use_sse)
22140 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22141 else
22142 mask = NULL_RTX;
22144 dst = operands[0];
22145 src = operands[1];
22147 set = gen_rtx_fmt_e (code, mode, src);
22148 set = gen_rtx_SET (dst, set);
22150 if (mask)
22152 rtx use, clob;
22153 rtvec par;
22155 use = gen_rtx_USE (VOIDmode, mask);
22156 if (vector_mode)
22157 par = gen_rtvec (2, set, use);
22158 else
22160 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22161 par = gen_rtvec (3, set, use, clob);
22163 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22165 else
22166 emit_insn (set);
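/* With SSE, MASK carries the constant used by the pattern this expands
   to: negation xors the value with the sign-bit mask and absolute value
   ands it with the inverted mask, e.g. for DFmode roughly

       xorpd  signbit_constant(%rip), %xmm0     # NEG
       andpd  notsignbit_constant(%rip), %xmm0  # ABS

   Without SSE, MASK stays NULL and the plain x87 fchs/fabs patterns are
   typically used instead.  */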
22169 /* Expand a copysign operation. Special case operand 0 being a constant. */
22171 void
22172 ix86_expand_copysign (rtx operands[])
22174 machine_mode mode, vmode;
22175 rtx dest, op0, op1, mask, nmask;
22177 dest = operands[0];
22178 op0 = operands[1];
22179 op1 = operands[2];
22181 mode = GET_MODE (dest);
22183 if (mode == SFmode)
22184 vmode = V4SFmode;
22185 else if (mode == DFmode)
22186 vmode = V2DFmode;
22187 else
22188 vmode = mode;
22190 if (CONST_DOUBLE_P (op0))
22192 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22194 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22195 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22197 if (mode == SFmode || mode == DFmode)
22199 if (op0 == CONST0_RTX (mode))
22200 op0 = CONST0_RTX (vmode);
22201 else
22203 rtx v = ix86_build_const_vector (vmode, false, op0);
22205 op0 = force_reg (vmode, v);
22208 else if (op0 != CONST0_RTX (mode))
22209 op0 = force_reg (mode, op0);
22211 mask = ix86_build_signbit_mask (vmode, 0, 0);
22213 if (mode == SFmode)
22214 copysign_insn = gen_copysignsf3_const;
22215 else if (mode == DFmode)
22216 copysign_insn = gen_copysigndf3_const;
22217 else
22218 copysign_insn = gen_copysigntf3_const;
22220 emit_insn (copysign_insn (dest, op0, op1, mask));
22222 else
22224 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22226 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22227 mask = ix86_build_signbit_mask (vmode, 0, 0);
22229 if (mode == SFmode)
22230 copysign_insn = gen_copysignsf3_var;
22231 else if (mode == DFmode)
22232 copysign_insn = gen_copysigndf3_var;
22233 else
22234 copysign_insn = gen_copysigntf3_var;
22236 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22240 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22241 be a constant, and so has already been expanded into a vector constant. */
22243 void
22244 ix86_split_copysign_const (rtx operands[])
22246 machine_mode mode, vmode;
22247 rtx dest, op0, mask, x;
22249 dest = operands[0];
22250 op0 = operands[1];
22251 mask = operands[3];
22253 mode = GET_MODE (dest);
22254 vmode = GET_MODE (mask);
22256 dest = lowpart_subreg (vmode, dest, mode);
22257 x = gen_rtx_AND (vmode, dest, mask);
22258 emit_insn (gen_rtx_SET (dest, x));
22260 if (op0 != CONST0_RTX (vmode))
22262 x = gen_rtx_IOR (vmode, dest, op0);
22263 emit_insn (gen_rtx_SET (dest, x));
22267 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22268 so we have to do two masks. */
22270 void
22271 ix86_split_copysign_var (rtx operands[])
22273 machine_mode mode, vmode;
22274 rtx dest, scratch, op0, op1, mask, nmask, x;
22276 dest = operands[0];
22277 scratch = operands[1];
22278 op0 = operands[2];
22279 op1 = operands[3];
22280 nmask = operands[4];
22281 mask = operands[5];
22283 mode = GET_MODE (dest);
22284 vmode = GET_MODE (mask);
22286 if (rtx_equal_p (op0, op1))
22288 /* Shouldn't happen often (it's useless, obviously), but when it does
22289 we'd generate incorrect code if we continue below. */
22290 emit_move_insn (dest, op0);
22291 return;
22294 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22296 gcc_assert (REGNO (op1) == REGNO (scratch));
22298 x = gen_rtx_AND (vmode, scratch, mask);
22299 emit_insn (gen_rtx_SET (scratch, x));
22301 dest = mask;
22302 op0 = lowpart_subreg (vmode, op0, mode);
22303 x = gen_rtx_NOT (vmode, dest);
22304 x = gen_rtx_AND (vmode, x, op0);
22305 emit_insn (gen_rtx_SET (dest, x));
22307 else
22309 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22311 x = gen_rtx_AND (vmode, scratch, mask);
22313 else /* alternative 2,4 */
22315 gcc_assert (REGNO (mask) == REGNO (scratch));
22316 op1 = lowpart_subreg (vmode, op1, mode);
22317 x = gen_rtx_AND (vmode, scratch, op1);
22319 emit_insn (gen_rtx_SET (scratch, x));
22321 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22323 dest = lowpart_subreg (vmode, op0, mode);
22324 x = gen_rtx_AND (vmode, dest, nmask);
22326 else /* alternative 3,4 */
22328 gcc_assert (REGNO (nmask) == REGNO (dest));
22329 dest = nmask;
22330 op0 = lowpart_subreg (vmode, op0, mode);
22331 x = gen_rtx_AND (vmode, dest, op0);
22333 emit_insn (gen_rtx_SET (dest, x));
22336 x = gen_rtx_IOR (vmode, dest, scratch);
22337 emit_insn (gen_rtx_SET (dest, x));
22340 /* Return TRUE or FALSE depending on whether the first SET in INSN
22341 has source and destination with matching CC modes and the
22342 CC mode is at least as constrained as REQ_MODE. */
22344 bool
22345 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22347 rtx set;
22348 machine_mode set_mode;
22350 set = PATTERN (insn);
22351 if (GET_CODE (set) == PARALLEL)
22352 set = XVECEXP (set, 0, 0);
22353 gcc_assert (GET_CODE (set) == SET);
22354 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22356 set_mode = GET_MODE (SET_DEST (set));
22357 switch (set_mode)
22359 case E_CCNOmode:
22360 if (req_mode != CCNOmode
22361 && (req_mode != CCmode
22362 || XEXP (SET_SRC (set), 1) != const0_rtx))
22363 return false;
22364 break;
22365 case E_CCmode:
22366 if (req_mode == CCGCmode)
22367 return false;
22368 /* FALLTHRU */
22369 case E_CCGCmode:
22370 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22371 return false;
22372 /* FALLTHRU */
22373 case E_CCGOCmode:
22374 if (req_mode == CCZmode)
22375 return false;
22376 /* FALLTHRU */
22377 case E_CCZmode:
22378 break;
22380 case E_CCGZmode:
22382 case E_CCAmode:
22383 case E_CCCmode:
22384 case E_CCOmode:
22385 case E_CCPmode:
22386 case E_CCSmode:
22387 if (set_mode != req_mode)
22388 return false;
22389 break;
22391 default:
22392 gcc_unreachable ();
22395 return GET_MODE (SET_SRC (set)) == set_mode;
22398 /* Generate insn patterns to do an integer compare of OPERANDS. */
22400 static rtx
22401 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22403 machine_mode cmpmode;
22404 rtx tmp, flags;
22406 cmpmode = SELECT_CC_MODE (code, op0, op1);
22407 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22409 /* This is very simple, but making the interface the same as in the
22410 FP case makes the rest of the code easier. */
22411 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22412 emit_insn (gen_rtx_SET (flags, tmp));
22414 /* Return the test that should be put into the flags user, i.e.
22415 the bcc, scc, or cmov instruction. */
22416 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22419 /* Figure out whether to use unordered fp comparisons. */
22421 static bool
22422 ix86_unordered_fp_compare (enum rtx_code code)
22424 if (!TARGET_IEEE_FP)
22425 return false;
22427 switch (code)
22429 case GT:
22430 case GE:
22431 case LT:
22432 case LE:
22433 return false;
22435 case EQ:
22436 case NE:
22438 case LTGT:
22439 case UNORDERED:
22440 case ORDERED:
22441 case UNLT:
22442 case UNLE:
22443 case UNGT:
22444 case UNGE:
22445 case UNEQ:
22446 return true;
22448 default:
22449 gcc_unreachable ();
22453 machine_mode
22454 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22456 machine_mode mode = GET_MODE (op0);
22458 if (SCALAR_FLOAT_MODE_P (mode))
22460 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22461 return CCFPmode;
22464 switch (code)
22466 /* Only zero flag is needed. */
22467 case EQ: /* ZF=0 */
22468 case NE: /* ZF!=0 */
22469 return CCZmode;
22470 /* Codes needing carry flag. */
22471 case GEU: /* CF=0 */
22472 case LTU: /* CF=1 */
22473 /* Detect overflow checks. They need just the carry flag. */
22474 if (GET_CODE (op0) == PLUS
22475 && (rtx_equal_p (op1, XEXP (op0, 0))
22476 || rtx_equal_p (op1, XEXP (op0, 1))))
22477 return CCCmode;
22478 else
22479 return CCmode;
22480 case GTU: /* CF=0 & ZF=0 */
22481 case LEU: /* CF=1 | ZF=1 */
22482 return CCmode;
22483 /* Codes possibly doable only with sign flag when
22484 comparing against zero. */
22485 case GE: /* SF=OF or SF=0 */
22486 case LT: /* SF<>OF or SF=1 */
22487 if (op1 == const0_rtx)
22488 return CCGOCmode;
22489 else
22490 /* For other cases Carry flag is not required. */
22491 return CCGCmode;
22492 /* Codes doable only with the sign flag when comparing
22493 against zero, but we miss the jump instruction for it,
22494 so we need to use relational tests against the overflow
22495 flag, which thus needs to be zero. */
22496 case GT: /* ZF=0 & SF=OF */
22497 case LE: /* ZF=1 | SF<>OF */
22498 if (op1 == const0_rtx)
22499 return CCNOmode;
22500 else
22501 return CCGCmode;
22502 /* The strcmp pattern does (use flags) and combine may ask us for the
22503 proper mode. */
22504 case USE:
22505 return CCmode;
22506 default:
22507 gcc_unreachable ();
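/* As an example of the CCCmode case above: for an overflow check written
   as  if (a + b < a)  the comparison is LTU of (plus a b) against a, and
   only the carry flag is needed, so the compare can be reduced to the
   flags the addition already sets.  */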
22511 /* Return the fixed registers used for condition codes. */
22513 static bool
22514 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22516 *p1 = FLAGS_REG;
22517 *p2 = FPSR_REG;
22518 return true;
22521 /* If two condition code modes are compatible, return a condition code
22522 mode which is compatible with both. Otherwise, return
22523 VOIDmode. */
22525 static machine_mode
22526 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22528 if (m1 == m2)
22529 return m1;
22531 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22532 return VOIDmode;
22534 if ((m1 == CCGCmode && m2 == CCGOCmode)
22535 || (m1 == CCGOCmode && m2 == CCGCmode))
22536 return CCGCmode;
22538 if ((m1 == CCNOmode && m2 == CCGOCmode)
22539 || (m1 == CCGOCmode && m2 == CCNOmode))
22540 return CCNOmode;
22542 if (m1 == CCZmode
22543 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22544 return m2;
22545 else if (m2 == CCZmode
22546 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22547 return m1;
22549 switch (m1)
22551 default:
22552 gcc_unreachable ();
22554 case E_CCmode:
22555 case E_CCGCmode:
22556 case E_CCGOCmode:
22557 case E_CCNOmode:
22558 case E_CCAmode:
22559 case E_CCCmode:
22560 case E_CCOmode:
22561 case E_CCPmode:
22562 case E_CCSmode:
22563 case E_CCZmode:
22564 switch (m2)
22566 default:
22567 return VOIDmode;
22569 case E_CCmode:
22570 case E_CCGCmode:
22571 case E_CCGOCmode:
22572 case E_CCNOmode:
22573 case E_CCAmode:
22574 case E_CCCmode:
22575 case E_CCOmode:
22576 case E_CCPmode:
22577 case E_CCSmode:
22578 case E_CCZmode:
22579 return CCmode;
22582 case E_CCFPmode:
22583 /* These are only compatible with themselves, which we already
22584 checked above. */
22585 return VOIDmode;
22590 /* Return a comparison we can do that is equivalent to
22591 swap_condition (code), possibly apart from orderedness.
22592 But never change orderedness if TARGET_IEEE_FP, returning
22593 UNKNOWN in that case if necessary. */
22595 static enum rtx_code
22596 ix86_fp_swap_condition (enum rtx_code code)
22598 switch (code)
22600 case GT: /* GTU - CF=0 & ZF=0 */
22601 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22602 case GE: /* GEU - CF=0 */
22603 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22604 case UNLT: /* LTU - CF=1 */
22605 return TARGET_IEEE_FP ? UNKNOWN : GT;
22606 case UNLE: /* LEU - CF=1 | ZF=1 */
22607 return TARGET_IEEE_FP ? UNKNOWN : GE;
22608 default:
22609 return swap_condition (code);
22613 /* Return the cost of comparison CODE using the best strategy for performance.
22614 All of the following functions use the number of instructions as a cost metric.
22615 In the future this should be tweaked to compute bytes for optimize_size and to
22616 take into account the performance of various instructions on various CPUs. */
22618 static int
22619 ix86_fp_comparison_cost (enum rtx_code code)
22621 int arith_cost;
22623 /* The cost of code using bit-twiddling on %ah. */
22624 switch (code)
22626 case UNLE:
22627 case UNLT:
22628 case LTGT:
22629 case GT:
22630 case GE:
22631 case UNORDERED:
22632 case ORDERED:
22633 case UNEQ:
22634 arith_cost = 4;
22635 break;
22636 case LT:
22637 case NE:
22638 case EQ:
22639 case UNGE:
22640 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22641 break;
22642 case LE:
22643 case UNGT:
22644 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22645 break;
22646 default:
22647 gcc_unreachable ();
22650 switch (ix86_fp_comparison_strategy (code))
22652 case IX86_FPCMP_COMI:
22653 return arith_cost > 4 ? 3 : 2;
22654 case IX86_FPCMP_SAHF:
22655 return arith_cost > 4 ? 4 : 3;
22656 default:
22657 return arith_cost;
22661 /* Return the strategy to use for a floating-point comparison. We assume that
22662 fcomi is always preferable where available, since that is also true when looking
22663 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22665 enum ix86_fpcmp_strategy
22666 ix86_fp_comparison_strategy (enum rtx_code)
22668 /* Do fcomi/sahf based test when profitable. */
22670 if (TARGET_CMOVE)
22671 return IX86_FPCMP_COMI;
22673 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22674 return IX86_FPCMP_SAHF;
22676 return IX86_FPCMP_ARITH;
22679 /* Swap, force into registers, or otherwise massage the two operands
22680 to a fp comparison. The operands are updated in place; the new
22681 comparison code is returned. */
22683 static enum rtx_code
22684 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22686 bool unordered_compare = ix86_unordered_fp_compare (code);
22687 rtx op0 = *pop0, op1 = *pop1;
22688 machine_mode op_mode = GET_MODE (op0);
22689 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22691 /* All of the unordered compare instructions only work on registers.
22692 The same is true of the fcomi compare instructions. The XFmode
22693 compare instructions require registers except when comparing
22694 against zero or when converting operand 1 from fixed point to
22695 floating point. */
22697 if (!is_sse
22698 && (unordered_compare
22699 || (op_mode == XFmode
22700 && ! (standard_80387_constant_p (op0) == 1
22701 || standard_80387_constant_p (op1) == 1)
22702 && GET_CODE (op1) != FLOAT)
22703 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22705 op0 = force_reg (op_mode, op0);
22706 op1 = force_reg (op_mode, op1);
22708 else
22710 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22711 things around if they appear profitable, otherwise force op0
22712 into a register. */
22714 if (standard_80387_constant_p (op0) == 0
22715 || (MEM_P (op0)
22716 && ! (standard_80387_constant_p (op1) == 0
22717 || MEM_P (op1))))
22719 enum rtx_code new_code = ix86_fp_swap_condition (code);
22720 if (new_code != UNKNOWN)
22722 std::swap (op0, op1);
22723 code = new_code;
22727 if (!REG_P (op0))
22728 op0 = force_reg (op_mode, op0);
22730 if (CONSTANT_P (op1))
22732 int tmp = standard_80387_constant_p (op1);
22733 if (tmp == 0)
22734 op1 = validize_mem (force_const_mem (op_mode, op1));
22735 else if (tmp == 1)
22737 if (TARGET_CMOVE)
22738 op1 = force_reg (op_mode, op1);
22740 else
22741 op1 = force_reg (op_mode, op1);
22745 /* Try to rearrange the comparison to make it cheaper. */
22746 if (ix86_fp_comparison_cost (code)
22747 > ix86_fp_comparison_cost (swap_condition (code))
22748 && (REG_P (op1) || can_create_pseudo_p ()))
22750 std::swap (op0, op1);
22751 code = swap_condition (code);
22752 if (!REG_P (op0))
22753 op0 = force_reg (op_mode, op0);
22756 *pop0 = op0;
22757 *pop1 = op1;
22758 return code;
22761 /* Convert the comparison codes we use to represent an FP comparison to the
22762 integer code that will result in a proper branch. Return UNKNOWN if no such
22763 code is available. */
22765 enum rtx_code
22766 ix86_fp_compare_code_to_integer (enum rtx_code code)
22768 switch (code)
22770 case GT:
22771 return GTU;
22772 case GE:
22773 return GEU;
22774 case ORDERED:
22775 case UNORDERED:
22776 return code;
22777 case UNEQ:
22778 return EQ;
22779 case UNLT:
22780 return LTU;
22781 case UNLE:
22782 return LEU;
22783 case LTGT:
22784 return NE;
22785 default:
22786 return UNKNOWN;
22790 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22792 static rtx
22793 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22795 bool unordered_compare = ix86_unordered_fp_compare (code);
22796 machine_mode intcmp_mode;
22797 rtx tmp, tmp2;
22799 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22801 /* Do fcomi/sahf based test when profitable. */
22802 switch (ix86_fp_comparison_strategy (code))
22804 case IX86_FPCMP_COMI:
22805 intcmp_mode = CCFPmode;
22806 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22807 if (unordered_compare)
22808 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22809 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22810 break;
22812 case IX86_FPCMP_SAHF:
22813 intcmp_mode = CCFPmode;
22814 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22815 if (unordered_compare)
22816 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22817 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22818 if (!scratch)
22819 scratch = gen_reg_rtx (HImode);
22820 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22821 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22822 break;
22824 case IX86_FPCMP_ARITH:
22825 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22826 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22827 if (unordered_compare)
22828 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22829 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22830 if (!scratch)
22831 scratch = gen_reg_rtx (HImode);
22832 emit_insn (gen_rtx_SET (scratch, tmp));
22834 /* In the unordered case, we have to check C2 for NaN's, which
22835 doesn't happen to work out to anything nice combination-wise.
22836 So do some bit twiddling on the value we've got in AH to come
22837 up with an appropriate set of condition codes. */
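/* The UNSPEC_FNSTSW above leaves the FPU status word in SCRATCH, and the
   *_ext patterns below test its high byte (AH), where C0 is bit 0 (0x01),
   C2 is bit 2 (0x04) and C3 is bit 6 (0x40).  For an fcom-style compare
   C3/C2/C0 are 000 for >, 001 for <, 100 for == and 111 for unordered,
   which is why the tests use masks such as 0x45 (C3|C2|C0), 0x05 (C2|C0),
   0x40 (C3) and 0x04 (C2).  */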
22839 intcmp_mode = CCNOmode;
22840 switch (code)
22842 case GT:
22843 case UNGT:
22844 if (code == GT || !TARGET_IEEE_FP)
22846 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22847 code = EQ;
22849 else
22851 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22852 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22853 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22854 intcmp_mode = CCmode;
22855 code = GEU;
22857 break;
22858 case LT:
22859 case UNLT:
22860 if (code == LT && TARGET_IEEE_FP)
22862 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22863 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22864 intcmp_mode = CCmode;
22865 code = EQ;
22867 else
22869 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22870 code = NE;
22872 break;
22873 case GE:
22874 case UNGE:
22875 if (code == GE || !TARGET_IEEE_FP)
22877 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22878 code = EQ;
22880 else
22882 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22883 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22884 code = NE;
22886 break;
22887 case LE:
22888 case UNLE:
22889 if (code == LE && TARGET_IEEE_FP)
22891 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22892 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22893 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22894 intcmp_mode = CCmode;
22895 code = LTU;
22897 else
22899 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22900 code = NE;
22902 break;
22903 case EQ:
22904 case UNEQ:
22905 if (code == EQ && TARGET_IEEE_FP)
22907 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22908 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22909 intcmp_mode = CCmode;
22910 code = EQ;
22912 else
22914 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22915 code = NE;
22917 break;
22918 case NE:
22919 case LTGT:
22920 if (code == NE && TARGET_IEEE_FP)
22922 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22923 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22924 GEN_INT (0x40)));
22925 code = NE;
22927 else
22929 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22930 code = EQ;
22932 break;
22934 case UNORDERED:
22935 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22936 code = NE;
22937 break;
22938 case ORDERED:
22939 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22940 code = EQ;
22941 break;
22943 default:
22944 gcc_unreachable ();
22946 break;
22948 default:
22949 gcc_unreachable ();
22952 /* Return the test that should be put into the flags user, i.e.
22953 the bcc, scc, or cmov instruction. */
22954 return gen_rtx_fmt_ee (code, VOIDmode,
22955 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22956 const0_rtx);
22959 static rtx
22960 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22962 rtx ret;
22964 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22965 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22967 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22969 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22970 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22972 else
22973 ret = ix86_expand_int_compare (code, op0, op1);
22975 return ret;
22978 void
22979 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22981 machine_mode mode = GET_MODE (op0);
22982 rtx tmp;
22984 /* Handle the special case of a vector comparison with a boolean result;
22985 transform it using the ptest instruction. */
22986 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22988 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22989 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22991 gcc_assert (code == EQ || code == NE);
22992 /* Generate XOR since we can't check that one operand is the zero vector. */
22993 tmp = gen_reg_rtx (mode);
22994 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22995 tmp = gen_lowpart (p_mode, tmp);
22996 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22997 gen_rtx_UNSPEC (CCmode,
22998 gen_rtvec (2, tmp, tmp),
22999 UNSPEC_PTEST)));
23000 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23001 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23002 gen_rtx_LABEL_REF (VOIDmode, label),
23003 pc_rtx);
23004 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23005 return;
23008 switch (mode)
23010 case E_SFmode:
23011 case E_DFmode:
23012 case E_XFmode:
23013 case E_QImode:
23014 case E_HImode:
23015 case E_SImode:
23016 simple:
23017 tmp = ix86_expand_compare (code, op0, op1);
23018 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23019 gen_rtx_LABEL_REF (VOIDmode, label),
23020 pc_rtx);
23021 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23022 return;
23024 case E_DImode:
23025 if (TARGET_64BIT)
23026 goto simple;
23027 /* For a 32-bit target a DImode comparison may be performed on
23028 SSE registers. To allow this we should avoid the split
23029 into SImode, which is achieved by doing the xor in DImode
23030 and then comparing with zero (which is recognized by the
23031 STV pass). We don't compare using xor when optimizing
23032 for size. */
23033 if (!optimize_insn_for_size_p ()
23034 && TARGET_STV
23035 && (code == EQ || code == NE))
23037 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23038 op1 = const0_rtx;
23040 /* FALLTHRU */
23041 case E_TImode:
23042 /* Expand DImode branch into multiple compare+branch. */
23044 rtx lo[2], hi[2];
23045 rtx_code_label *label2;
23046 enum rtx_code code1, code2, code3;
23047 machine_mode submode;
23049 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23051 std::swap (op0, op1);
23052 code = swap_condition (code);
23055 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23056 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23058 submode = mode == DImode ? SImode : DImode;
23060 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23061 avoid two branches. This costs one extra insn, so disable when
23062 optimizing for size. */
23064 if ((code == EQ || code == NE)
23065 && (!optimize_insn_for_size_p ()
23066 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23068 rtx xor0, xor1;
23070 xor1 = hi[0];
23071 if (hi[1] != const0_rtx)
23072 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23073 NULL_RTX, 0, OPTAB_WIDEN);
23075 xor0 = lo[0];
23076 if (lo[1] != const0_rtx)
23077 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23078 NULL_RTX, 0, OPTAB_WIDEN);
23080 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23081 NULL_RTX, 0, OPTAB_WIDEN);
23083 ix86_expand_branch (code, tmp, const0_rtx, label);
23084 return;
23087 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23088 op1 is a constant, and the low word is zero, then we can just
23089 examine the high word. Similarly for a low word of -1 and
23090 less-or-equal-than or greater-than. */
23092 if (CONST_INT_P (hi[1]))
23093 switch (code)
23095 case LT: case LTU: case GE: case GEU:
23096 if (lo[1] == const0_rtx)
23098 ix86_expand_branch (code, hi[0], hi[1], label);
23099 return;
23101 break;
23102 case LE: case LEU: case GT: case GTU:
23103 if (lo[1] == constm1_rtx)
23105 ix86_expand_branch (code, hi[0], hi[1], label);
23106 return;
23108 break;
23109 default:
23110 break;
23113 /* Emulate comparisons that do not depend on Zero flag with
23114 double-word subtraction. Note that only Overflow, Sign
23115 and Carry flags are valid, so swap arguments and condition
23116 of comparisons that would otherwise test Zero flag. */
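	/* Illustrative sequence for signed "a < b" on a 32-bit target:
	       cmp  lo(a), lo(b)	; compute the low-word borrow
	       sbb  hi(a), hi(b)	; discard the result, keep the flags
	       jl   label		; Sign/Overflow (Carry for unsigned)
	   The Zero flag after the sbb does not describe the full double-word
	   result, which is why ZF-dependent codes are swapped away above.  */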
23118 switch (code)
23120 case LE: case LEU: case GT: case GTU:
23121 std::swap (lo[0], lo[1]);
23122 std::swap (hi[0], hi[1]);
23123 code = swap_condition (code);
23124 /* FALLTHRU */
23126 case LT: case LTU: case GE: case GEU:
23128 rtx (*cmp_insn) (rtx, rtx);
23129 rtx (*sbb_insn) (rtx, rtx, rtx);
23130 bool uns = (code == LTU || code == GEU);
23132 if (TARGET_64BIT)
23134 cmp_insn = gen_cmpdi_1;
23135 sbb_insn
23136 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23138 else
23140 cmp_insn = gen_cmpsi_1;
23141 sbb_insn
23142 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23145 if (!nonimmediate_operand (lo[0], submode))
23146 lo[0] = force_reg (submode, lo[0]);
23147 if (!x86_64_general_operand (lo[1], submode))
23148 lo[1] = force_reg (submode, lo[1]);
23150 if (!register_operand (hi[0], submode))
23151 hi[0] = force_reg (submode, hi[0]);
23152 if ((uns && !nonimmediate_operand (hi[1], submode))
23153 || (!uns && !x86_64_general_operand (hi[1], submode)))
23154 hi[1] = force_reg (submode, hi[1]);
23156 emit_insn (cmp_insn (lo[0], lo[1]));
23157 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23159 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23161 ix86_expand_branch (code, tmp, const0_rtx, label);
23162 return;
23165 default:
23166 break;
23169 /* Otherwise, we need two or three jumps. */
23171 label2 = gen_label_rtx ();
23173 code1 = code;
23174 code2 = swap_condition (code);
23175 code3 = unsigned_condition (code);
23177 switch (code)
23179 case LT: case GT: case LTU: case GTU:
23180 break;
23182 case LE: code1 = LT; code2 = GT; break;
23183 case GE: code1 = GT; code2 = LT; break;
23184 case LEU: code1 = LTU; code2 = GTU; break;
23185 case GEU: code1 = GTU; code2 = LTU; break;
23187 case EQ: code1 = UNKNOWN; code2 = NE; break;
23188 case NE: code2 = UNKNOWN; break;
23190 default:
23191 gcc_unreachable ();
23195 * a < b =>
23196 * if (hi(a) < hi(b)) goto true;
23197 * if (hi(a) > hi(b)) goto false;
23198 * if (lo(a) < lo(b)) goto true;
23199 * false:
23202 if (code1 != UNKNOWN)
23203 ix86_expand_branch (code1, hi[0], hi[1], label);
23204 if (code2 != UNKNOWN)
23205 ix86_expand_branch (code2, hi[0], hi[1], label2);
23207 ix86_expand_branch (code3, lo[0], lo[1], label);
23209 if (code2 != UNKNOWN)
23210 emit_label (label2);
23211 return;
23214 default:
23215 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23216 goto simple;
23220 void
23221 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23223 rtx ret;
23225 gcc_assert (GET_MODE (dest) == QImode);
23227 ret = ix86_expand_compare (code, op0, op1);
23228 PUT_MODE (ret, QImode);
23229 emit_insn (gen_rtx_SET (dest, ret));
23232 /* Expand a comparison setting or clearing the carry flag.  Return true when
23233 successful and set *POP to the comparison operation.  */
23234 static bool
23235 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23237 machine_mode mode =
23238 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23240 /* Do not handle double-mode compares that go through special path. */
23241 if (mode == (TARGET_64BIT ? TImode : DImode))
23242 return false;
23244 if (SCALAR_FLOAT_MODE_P (mode))
23246 rtx compare_op;
23247 rtx_insn *compare_seq;
23249 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23251 /* Shortcut: the following common codes never translate
23252 into carry-flag compares.  */
23253 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23254 || code == ORDERED || code == UNORDERED)
23255 return false;
23257 /* These comparisons require zero flag; swap operands so they won't. */
23258 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23259 && !TARGET_IEEE_FP)
23261 std::swap (op0, op1);
23262 code = swap_condition (code);
23265 /* Try to expand the comparison and verify that we end up with
23266 a carry-flag-based comparison.  This fails only when we decide
23267 to expand the comparison using arithmetic, which is not a
23268 common scenario.  */
23269 start_sequence ();
23270 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23271 compare_seq = get_insns ();
23272 end_sequence ();
23274 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23275 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23276 else
23277 code = GET_CODE (compare_op);
23279 if (code != LTU && code != GEU)
23280 return false;
23282 emit_insn (compare_seq);
23283 *pop = compare_op;
23284 return true;
23287 if (!INTEGRAL_MODE_P (mode))
23288 return false;
23290 switch (code)
23292 case LTU:
23293 case GEU:
23294 break;
23296 /* Convert a==0 into (unsigned)a<1. */
23297 case EQ:
23298 case NE:
23299 if (op1 != const0_rtx)
23300 return false;
23301 op1 = const1_rtx;
23302 code = (code == EQ ? LTU : GEU);
23303 break;
23305 /* Convert a>b into b<a or a>=b+1. */
23306 case GTU:
23307 case LEU:
23308 if (CONST_INT_P (op1))
23310 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23311 /* Bail out on overflow.  We could still swap the operands, but that
23312 would force loading the constant into a register.  */
23313 if (op1 == const0_rtx
23314 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23315 return false;
23316 code = (code == GTU ? GEU : LTU);
23318 else
23320 std::swap (op0, op1);
23321 code = (code == GTU ? LTU : GEU);
23323 break;
23325 /* Convert a>=0 into (unsigned)a<0x80000000. */
23326 case LT:
23327 case GE:
23328 if (mode == DImode || op1 != const0_rtx)
23329 return false;
23330 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23331 code = (code == LT ? GEU : LTU);
23332 break;
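      /* For example, in SImode:
	     a >= 0   becomes   (unsigned) a < 0x80000000
	 which sets the carry flag exactly when the sign bit of A is clear;
	 a < 0 uses the reversed (GEU) form.  */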
23333 case LE:
23334 case GT:
23335 if (mode == DImode || op1 != constm1_rtx)
23336 return false;
23337 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23338 code = (code == LE ? GEU : LTU);
23339 break;
23341 default:
23342 return false;
23344 /* Swapping operands may cause a constant to appear as the first operand. */
23345 if (!nonimmediate_operand (op0, VOIDmode))
23347 if (!can_create_pseudo_p ())
23348 return false;
23349 op0 = force_reg (mode, op0);
23351 *pop = ix86_expand_compare (code, op0, op1);
23352 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23353 return true;
23356 bool
23357 ix86_expand_int_movcc (rtx operands[])
23359 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23360 rtx_insn *compare_seq;
23361 rtx compare_op;
23362 machine_mode mode = GET_MODE (operands[0]);
23363 bool sign_bit_compare_p = false;
23364 rtx op0 = XEXP (operands[1], 0);
23365 rtx op1 = XEXP (operands[1], 1);
23367 if (GET_MODE (op0) == TImode
23368 || (GET_MODE (op0) == DImode
23369 && !TARGET_64BIT))
23370 return false;
23372 start_sequence ();
23373 compare_op = ix86_expand_compare (code, op0, op1);
23374 compare_seq = get_insns ();
23375 end_sequence ();
23377 compare_code = GET_CODE (compare_op);
23379 if ((op1 == const0_rtx && (code == GE || code == LT))
23380 || (op1 == constm1_rtx && (code == GT || code == LE)))
23381 sign_bit_compare_p = true;
23383 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23384 HImode insns, we'd be swallowed in word prefix ops. */
23386 if ((mode != HImode || TARGET_FAST_PREFIX)
23387 && (mode != (TARGET_64BIT ? TImode : DImode))
23388 && CONST_INT_P (operands[2])
23389 && CONST_INT_P (operands[3]))
23391 rtx out = operands[0];
23392 HOST_WIDE_INT ct = INTVAL (operands[2]);
23393 HOST_WIDE_INT cf = INTVAL (operands[3]);
23394 HOST_WIDE_INT diff;
23396 diff = ct - cf;
23397 /* Sign-bit compares are better done using shifts than using
23398 sbb.  */
23399 if (sign_bit_compare_p
23400 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23402 /* Detect overlap between destination and compare sources. */
23403 rtx tmp = out;
23405 if (!sign_bit_compare_p)
23407 rtx flags;
23408 bool fpcmp = false;
23410 compare_code = GET_CODE (compare_op);
23412 flags = XEXP (compare_op, 0);
23414 if (GET_MODE (flags) == CCFPmode)
23416 fpcmp = true;
23417 compare_code
23418 = ix86_fp_compare_code_to_integer (compare_code);
23421 /* To simplify the rest of the code, restrict to the GEU case. */
23422 if (compare_code == LTU)
23424 std::swap (ct, cf);
23425 compare_code = reverse_condition (compare_code);
23426 code = reverse_condition (code);
23428 else
23430 if (fpcmp)
23431 PUT_CODE (compare_op,
23432 reverse_condition_maybe_unordered
23433 (GET_CODE (compare_op)));
23434 else
23435 PUT_CODE (compare_op,
23436 reverse_condition (GET_CODE (compare_op)));
23438 diff = ct - cf;
23440 if (reg_overlap_mentioned_p (out, op0)
23441 || reg_overlap_mentioned_p (out, op1))
23442 tmp = gen_reg_rtx (mode);
23444 if (mode == DImode)
23445 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23446 else
23447 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23448 flags, compare_op));
23450 else
23452 if (code == GT || code == GE)
23453 code = reverse_condition (code);
23454 else
23456 std::swap (ct, cf);
23457 diff = ct - cf;
23459 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23462 if (diff == 1)
23465 * cmpl op0,op1
23466 * sbbl dest,dest
23467 * [addl dest, ct]
23469 * Size 5 - 8.
23471 if (ct)
23472 tmp = expand_simple_binop (mode, PLUS,
23473 tmp, GEN_INT (ct),
23474 copy_rtx (tmp), 1, OPTAB_DIRECT);
23476 else if (cf == -1)
23479 * cmpl op0,op1
23480 * sbbl dest,dest
23481 * orl $ct, dest
23483 * Size 8.
23485 tmp = expand_simple_binop (mode, IOR,
23486 tmp, GEN_INT (ct),
23487 copy_rtx (tmp), 1, OPTAB_DIRECT);
23489 else if (diff == -1 && ct)
23492 * cmpl op0,op1
23493 * sbbl dest,dest
23494 * notl dest
23495 * [addl dest, cf]
23497 * Size 8 - 11.
23499 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23500 if (cf)
23501 tmp = expand_simple_binop (mode, PLUS,
23502 copy_rtx (tmp), GEN_INT (cf),
23503 copy_rtx (tmp), 1, OPTAB_DIRECT);
23505 else
23508 * cmpl op0,op1
23509 * sbbl dest,dest
23510 * [notl dest]
23511 * andl cf - ct, dest
23512 * [addl dest, ct]
23514 * Size 8 - 11.
23517 if (cf == 0)
23519 cf = ct;
23520 ct = 0;
23521 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23524 tmp = expand_simple_binop (mode, AND,
23525 copy_rtx (tmp),
23526 gen_int_mode (cf - ct, mode),
23527 copy_rtx (tmp), 1, OPTAB_DIRECT);
23528 if (ct)
23529 tmp = expand_simple_binop (mode, PLUS,
23530 copy_rtx (tmp), GEN_INT (ct),
23531 copy_rtx (tmp), 1, OPTAB_DIRECT);
23534 if (!rtx_equal_p (tmp, out))
23535 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23537 return true;
23540 if (diff < 0)
23542 machine_mode cmp_mode = GET_MODE (op0);
23543 enum rtx_code new_code;
23545 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23547 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23549 /* We may be reversing an unordered compare to a normal compare, which
23550 is not valid in general (we may convert a non-trapping condition
23551 into a trapping one); however, on i386 we currently emit all
23552 comparisons unordered.  */
23553 new_code = reverse_condition_maybe_unordered (code);
23555 else
23556 new_code = ix86_reverse_condition (code, cmp_mode);
23557 if (new_code != UNKNOWN)
23559 std::swap (ct, cf);
23560 diff = -diff;
23561 code = new_code;
23565 compare_code = UNKNOWN;
23566 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23567 && CONST_INT_P (op1))
23569 if (op1 == const0_rtx
23570 && (code == LT || code == GE))
23571 compare_code = code;
23572 else if (op1 == constm1_rtx)
23574 if (code == LE)
23575 compare_code = LT;
23576 else if (code == GT)
23577 compare_code = GE;
23581 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23582 if (compare_code != UNKNOWN
23583 && GET_MODE (op0) == GET_MODE (out)
23584 && (cf == -1 || ct == -1))
23586 /* If lea code below could be used, only optimize
23587 if it results in a 2 insn sequence. */
23589 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23590 || diff == 3 || diff == 5 || diff == 9)
23591 || (compare_code == LT && ct == -1)
23592 || (compare_code == GE && cf == -1))
23595 * notl op1 (if necessary)
23596 * sarl $31, op1
23597 * orl cf, op1
23599 if (ct != -1)
23601 cf = ct;
23602 ct = -1;
23603 code = reverse_condition (code);
23606 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23608 out = expand_simple_binop (mode, IOR,
23609 out, GEN_INT (cf),
23610 out, 1, OPTAB_DIRECT);
23611 if (out != operands[0])
23612 emit_move_insn (operands[0], out);
23614 return true;
23619 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23620 || diff == 3 || diff == 5 || diff == 9)
23621 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23622 && (mode != DImode
23623 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23626 * xorl dest,dest
23627 * cmpl op1,op2
23628 * setcc dest
23629 * lea cf(dest*(ct-cf)),dest
23631 * Size 14.
23633 * This also catches the degenerate setcc-only case.
23636 rtx tmp;
23637 int nops;
23639 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23641 nops = 0;
23642 /* On x86_64 the lea instruction operates on Pmode, so we need
23643 to get the arithmetic done in the proper mode to match.  */
23644 if (diff == 1)
23645 tmp = copy_rtx (out);
23646 else
23648 rtx out1;
23649 out1 = copy_rtx (out);
23650 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23651 nops++;
23652 if (diff & 1)
23654 tmp = gen_rtx_PLUS (mode, tmp, out1);
23655 nops++;
23658 if (cf != 0)
23660 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23661 nops++;
23663 if (!rtx_equal_p (tmp, out))
23665 if (nops == 1)
23666 out = force_operand (tmp, copy_rtx (out));
23667 else
23668 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23670 if (!rtx_equal_p (out, operands[0]))
23671 emit_move_insn (operands[0], copy_rtx (out));
23673 return true;
23677 * General case: Jumpful:
23678 * xorl dest,dest cmpl op1, op2
23679 * cmpl op1, op2 movl ct, dest
23680 * setcc dest jcc 1f
23681 * decl dest movl cf, dest
23682 * andl (cf-ct),dest 1:
23683 * addl ct,dest
23685 * Size 20. Size 14.
23687 * This is reasonably steep, but branch mispredict costs are
23688 * high on modern CPUs, so consider failing only if optimizing
23689 * for space.
23692 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23693 && BRANCH_COST (optimize_insn_for_speed_p (),
23694 false) >= 2)
23696 if (cf == 0)
23698 machine_mode cmp_mode = GET_MODE (op0);
23699 enum rtx_code new_code;
23701 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23703 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23705 /* We may be reversing an unordered compare to a normal compare,
23706 which is not valid in general (we may convert a non-trapping
23707 condition into a trapping one); however, on i386 we currently
23708 emit all comparisons unordered.  */
23709 new_code = reverse_condition_maybe_unordered (code);
23711 else
23713 new_code = ix86_reverse_condition (code, cmp_mode);
23714 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23715 compare_code = reverse_condition (compare_code);
23718 if (new_code != UNKNOWN)
23720 cf = ct;
23721 ct = 0;
23722 code = new_code;
23726 if (compare_code != UNKNOWN)
23728 /* notl op1 (if needed)
23729 sarl $31, op1
23730 andl (cf-ct), op1
23731 addl ct, op1
23733 For x < 0 (resp. x <= -1) there will be no notl,
23734 so if possible swap the constants to get rid of the
23735 complement.
23736 True/false will be -1/0 while code below (store flag
23737 followed by decrement) is 0/-1, so the constants need
23738 to be exchanged once more. */
23740 if (compare_code == GE || !cf)
23742 code = reverse_condition (code);
23743 compare_code = LT;
23745 else
23746 std::swap (ct, cf);
23748 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23750 else
23752 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23754 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23755 constm1_rtx,
23756 copy_rtx (out), 1, OPTAB_DIRECT);
23759 out = expand_simple_binop (mode, AND, copy_rtx (out),
23760 gen_int_mode (cf - ct, mode),
23761 copy_rtx (out), 1, OPTAB_DIRECT);
23762 if (ct)
23763 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23764 copy_rtx (out), 1, OPTAB_DIRECT);
23765 if (!rtx_equal_p (out, operands[0]))
23766 emit_move_insn (operands[0], copy_rtx (out));
23768 return true;
23772 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23774 /* Try a few things more with specific constants and a variable. */
23776 optab op;
23777 rtx var, orig_out, out, tmp;
23779 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23780 return false;
23782 /* If one of the two operands is an interesting constant, load a
23783 constant with the above and mask it in with a logical operation. */
23785 if (CONST_INT_P (operands[2]))
23787 var = operands[3];
23788 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23789 operands[3] = constm1_rtx, op = and_optab;
23790 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23791 operands[3] = const0_rtx, op = ior_optab;
23792 else
23793 return false;
23795 else if (CONST_INT_P (operands[3]))
23797 var = operands[2];
23798 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23799 operands[2] = constm1_rtx, op = and_optab;
23800 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23801 operands[2] = const0_rtx, op = ior_optab;
23802 else
23803 return false;
23805 else
23806 return false;
23808 orig_out = operands[0];
23809 tmp = gen_reg_rtx (mode);
23810 operands[0] = tmp;
23812 /* Recurse to get the constant loaded. */
23813 if (!ix86_expand_int_movcc (operands))
23814 return false;
23816 /* Mask in the interesting variable. */
23817 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23818 OPTAB_WIDEN);
23819 if (!rtx_equal_p (out, orig_out))
23820 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23822 return true;
23826 * For comparison with above,
23828 * movl cf,dest
23829 * movl ct,tmp
23830 * cmpl op1,op2
23831 * cmovcc tmp,dest
23833 * Size 15.
23836 if (! nonimmediate_operand (operands[2], mode))
23837 operands[2] = force_reg (mode, operands[2]);
23838 if (! nonimmediate_operand (operands[3], mode))
23839 operands[3] = force_reg (mode, operands[3]);
23841 if (! register_operand (operands[2], VOIDmode)
23842 && (mode == QImode
23843 || ! register_operand (operands[3], VOIDmode)))
23844 operands[2] = force_reg (mode, operands[2]);
23846 if (mode == QImode
23847 && ! register_operand (operands[3], VOIDmode))
23848 operands[3] = force_reg (mode, operands[3]);
23850 emit_insn (compare_seq);
23851 emit_insn (gen_rtx_SET (operands[0],
23852 gen_rtx_IF_THEN_ELSE (mode,
23853 compare_op, operands[2],
23854 operands[3])));
23855 return true;
23858 /* Swap, force into registers, or otherwise massage the two operands
23859 to an SSE comparison with a mask result.  Thus we differ a bit from
23860 ix86_prepare_fp_compare_args which expects to produce a flags result.
23862 The DEST operand exists to help determine whether to commute commutative
23863 operators. The POP0/POP1 operands are updated in place. The new
23864 comparison code is returned, or UNKNOWN if not implementable. */
23866 static enum rtx_code
23867 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23868 rtx *pop0, rtx *pop1)
23870 switch (code)
23872 case LTGT:
23873 case UNEQ:
23874 /* AVX supports all the needed comparisons. */
23875 if (TARGET_AVX)
23876 break;
23877 /* We have no LTGT as an operator. We could implement it with
23878 NE & ORDERED, but this requires an extra temporary. It's
23879 not clear that it's worth it. */
23880 return UNKNOWN;
23882 case LT:
23883 case LE:
23884 case UNGT:
23885 case UNGE:
23886 /* These are supported directly. */
23887 break;
23889 case EQ:
23890 case NE:
23891 case UNORDERED:
23892 case ORDERED:
23893 /* AVX has 3 operand comparisons, no need to swap anything. */
23894 if (TARGET_AVX)
23895 break;
23896 /* For commutative operators, try to canonicalize the destination
23897 operand to be first in the comparison - this helps reload to
23898 avoid extra moves. */
23899 if (!dest || !rtx_equal_p (dest, *pop1))
23900 break;
23901 /* FALLTHRU */
23903 case GE:
23904 case GT:
23905 case UNLE:
23906 case UNLT:
23907 /* These are not supported directly before AVX, and furthermore
23908 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23909 comparison operands to transform into something that is
23910 supported. */
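      /* E.g. the SSE cmpps/cmppd comparison predicates include LT and LE
	 but not GT or GE, so "a > b" is emitted as "b < a" via the swap
	 below.  */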
23911 std::swap (*pop0, *pop1);
23912 code = swap_condition (code);
23913 break;
23915 default:
23916 gcc_unreachable ();
23919 return code;
23922 /* Detect conditional moves that exactly match min/max operational
23923 semantics. Note that this is IEEE safe, as long as we don't
23924 interchange the operands.
23926 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23927 and TRUE if the operation is successful and instructions are emitted. */
23929 static bool
23930 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23931 rtx cmp_op1, rtx if_true, rtx if_false)
23933 machine_mode mode;
23934 bool is_min;
23935 rtx tmp;
23937 if (code == LT)
23939 else if (code == UNGE)
23940 std::swap (if_true, if_false);
23941 else
23942 return false;
23944 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23945 is_min = true;
23946 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23947 is_min = false;
23948 else
23949 return false;
23951 mode = GET_MODE (dest);
23953 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23954 but MODE may be a vector mode and thus not appropriate. */
23955 if (!flag_finite_math_only || flag_signed_zeros)
23957 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23958 rtvec v;
23960 if_true = force_reg (mode, if_true);
23961 v = gen_rtvec (2, if_true, if_false);
23962 tmp = gen_rtx_UNSPEC (mode, v, u);
23964 else
23966 code = is_min ? SMIN : SMAX;
23967 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23970 emit_insn (gen_rtx_SET (dest, tmp));
23971 return true;
23974 /* Expand an sse vector comparison. Return the register with the result. */
23976 static rtx
23977 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23978 rtx op_true, rtx op_false)
23980 machine_mode mode = GET_MODE (dest);
23981 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23983 /* In the general case the mode of the comparison result can differ from that of the operands. */
23984 machine_mode cmp_mode;
23986 /* In AVX512F the result of comparison is an integer mask. */
23987 bool maskcmp = false;
23988 rtx x;
23990 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23992 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23993 cmp_mode = int_mode_for_size (nbits, 0).require ();
23994 maskcmp = true;
23996 else
23997 cmp_mode = cmp_ops_mode;
24000 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24001 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24002 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24004 if (optimize
24005 || (maskcmp && cmp_mode != mode)
24006 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24007 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24008 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24010 /* Compare patterns for int modes are unspec in AVX512F only. */
24011 if (maskcmp && (code == GT || code == EQ))
24013 rtx (*gen)(rtx, rtx, rtx);
24015 switch (cmp_ops_mode)
24017 case E_V64QImode:
24018 gcc_assert (TARGET_AVX512BW);
24019 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24020 break;
24021 case E_V32HImode:
24022 gcc_assert (TARGET_AVX512BW);
24023 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24024 break;
24025 case E_V16SImode:
24026 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24027 break;
24028 case E_V8DImode:
24029 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24030 break;
24031 default:
24032 gen = NULL;
24035 if (gen)
24037 emit_insn (gen (dest, cmp_op0, cmp_op1));
24038 return dest;
24041 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24043 if (cmp_mode != mode && !maskcmp)
24045 x = force_reg (cmp_ops_mode, x);
24046 convert_move (dest, x, false);
24048 else
24049 emit_insn (gen_rtx_SET (dest, x));
24051 return dest;
24054 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24055 operations. This is used for both scalar and vector conditional moves. */
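   /* When no blend instruction applies, the fallback at the bottom computes
	  dest = (cmp & op_true) | (~cmp & op_false)
      with and/andnot/or; the earlier special cases are simplifications of
      this identity (op_false == 0, op_true == all-ones, and so on).  */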
24057 void
24058 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24060 machine_mode mode = GET_MODE (dest);
24061 machine_mode cmpmode = GET_MODE (cmp);
24063 /* In AVX512F the result of comparison is an integer mask. */
24064 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24066 rtx t2, t3, x;
24068 /* If we have an integer mask and FP value then we need
24069 to cast mask to FP mode. */
24070 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24072 cmp = force_reg (cmpmode, cmp);
24073 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24076 if (vector_all_ones_operand (op_true, mode)
24077 && rtx_equal_p (op_false, CONST0_RTX (mode))
24078 && !maskcmp)
24080 emit_insn (gen_rtx_SET (dest, cmp));
24082 else if (op_false == CONST0_RTX (mode)
24083 && !maskcmp)
24085 op_true = force_reg (mode, op_true);
24086 x = gen_rtx_AND (mode, cmp, op_true);
24087 emit_insn (gen_rtx_SET (dest, x));
24089 else if (op_true == CONST0_RTX (mode)
24090 && !maskcmp)
24092 op_false = force_reg (mode, op_false);
24093 x = gen_rtx_NOT (mode, cmp);
24094 x = gen_rtx_AND (mode, x, op_false);
24095 emit_insn (gen_rtx_SET (dest, x));
24097 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24098 && !maskcmp)
24100 op_false = force_reg (mode, op_false);
24101 x = gen_rtx_IOR (mode, cmp, op_false);
24102 emit_insn (gen_rtx_SET (dest, x));
24104 else if (TARGET_XOP
24105 && !maskcmp)
24107 op_true = force_reg (mode, op_true);
24109 if (!nonimmediate_operand (op_false, mode))
24110 op_false = force_reg (mode, op_false);
24112 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24113 op_true,
24114 op_false)));
24116 else
24118 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24119 rtx d = dest;
24121 if (!nonimmediate_operand (op_true, mode))
24122 op_true = force_reg (mode, op_true);
24124 op_false = force_reg (mode, op_false);
24126 switch (mode)
24128 case E_V4SFmode:
24129 if (TARGET_SSE4_1)
24130 gen = gen_sse4_1_blendvps;
24131 break;
24132 case E_V2DFmode:
24133 if (TARGET_SSE4_1)
24134 gen = gen_sse4_1_blendvpd;
24135 break;
24136 case E_V16QImode:
24137 case E_V8HImode:
24138 case E_V4SImode:
24139 case E_V2DImode:
24140 if (TARGET_SSE4_1)
24142 gen = gen_sse4_1_pblendvb;
24143 if (mode != V16QImode)
24144 d = gen_reg_rtx (V16QImode);
24145 op_false = gen_lowpart (V16QImode, op_false);
24146 op_true = gen_lowpart (V16QImode, op_true);
24147 cmp = gen_lowpart (V16QImode, cmp);
24149 break;
24150 case E_V8SFmode:
24151 if (TARGET_AVX)
24152 gen = gen_avx_blendvps256;
24153 break;
24154 case E_V4DFmode:
24155 if (TARGET_AVX)
24156 gen = gen_avx_blendvpd256;
24157 break;
24158 case E_V32QImode:
24159 case E_V16HImode:
24160 case E_V8SImode:
24161 case E_V4DImode:
24162 if (TARGET_AVX2)
24164 gen = gen_avx2_pblendvb;
24165 if (mode != V32QImode)
24166 d = gen_reg_rtx (V32QImode);
24167 op_false = gen_lowpart (V32QImode, op_false);
24168 op_true = gen_lowpart (V32QImode, op_true);
24169 cmp = gen_lowpart (V32QImode, cmp);
24171 break;
24173 case E_V64QImode:
24174 gen = gen_avx512bw_blendmv64qi;
24175 break;
24176 case E_V32HImode:
24177 gen = gen_avx512bw_blendmv32hi;
24178 break;
24179 case E_V16SImode:
24180 gen = gen_avx512f_blendmv16si;
24181 break;
24182 case E_V8DImode:
24183 gen = gen_avx512f_blendmv8di;
24184 break;
24185 case E_V8DFmode:
24186 gen = gen_avx512f_blendmv8df;
24187 break;
24188 case E_V16SFmode:
24189 gen = gen_avx512f_blendmv16sf;
24190 break;
24192 default:
24193 break;
24196 if (gen != NULL)
24198 emit_insn (gen (d, op_false, op_true, cmp));
24199 if (d != dest)
24200 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24202 else
24204 op_true = force_reg (mode, op_true);
24206 t2 = gen_reg_rtx (mode);
24207 if (optimize)
24208 t3 = gen_reg_rtx (mode);
24209 else
24210 t3 = dest;
24212 x = gen_rtx_AND (mode, op_true, cmp);
24213 emit_insn (gen_rtx_SET (t2, x));
24215 x = gen_rtx_NOT (mode, cmp);
24216 x = gen_rtx_AND (mode, x, op_false);
24217 emit_insn (gen_rtx_SET (t3, x));
24219 x = gen_rtx_IOR (mode, t3, t2);
24220 emit_insn (gen_rtx_SET (dest, x));
24225 /* Expand a floating-point conditional move. Return true if successful. */
24227 bool
24228 ix86_expand_fp_movcc (rtx operands[])
24230 machine_mode mode = GET_MODE (operands[0]);
24231 enum rtx_code code = GET_CODE (operands[1]);
24232 rtx tmp, compare_op;
24233 rtx op0 = XEXP (operands[1], 0);
24234 rtx op1 = XEXP (operands[1], 1);
24236 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24238 machine_mode cmode;
24240 /* Since we've no cmove for sse registers, don't force bad register
24241 allocation just to gain access to it. Deny movcc when the
24242 comparison mode doesn't match the move mode. */
24243 cmode = GET_MODE (op0);
24244 if (cmode == VOIDmode)
24245 cmode = GET_MODE (op1);
24246 if (cmode != mode)
24247 return false;
24249 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24250 if (code == UNKNOWN)
24251 return false;
24253 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24254 operands[2], operands[3]))
24255 return true;
24257 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24258 operands[2], operands[3]);
24259 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24260 return true;
24263 if (GET_MODE (op0) == TImode
24264 || (GET_MODE (op0) == DImode
24265 && !TARGET_64BIT))
24266 return false;
24268 /* The floating point conditional move instructions don't directly
24269 support conditions resulting from a signed integer comparison. */
24271 compare_op = ix86_expand_compare (code, op0, op1);
24272 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24274 tmp = gen_reg_rtx (QImode);
24275 ix86_expand_setcc (tmp, code, op0, op1);
24277 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24280 emit_insn (gen_rtx_SET (operands[0],
24281 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24282 operands[2], operands[3])));
24284 return true;
24287 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24289 static int
24290 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24292 switch (code)
24294 case EQ:
24295 return 0;
24296 case LT:
24297 case LTU:
24298 return 1;
24299 case LE:
24300 case LEU:
24301 return 2;
24302 case NE:
24303 return 4;
24304 case GE:
24305 case GEU:
24306 return 5;
24307 case GT:
24308 case GTU:
24309 return 6;
24310 default:
24311 gcc_unreachable ();
24315 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24317 static int
24318 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24320 switch (code)
24322 case EQ:
24323 return 0x00;
24324 case NE:
24325 return 0x04;
24326 case GT:
24327 return 0x0e;
24328 case LE:
24329 return 0x02;
24330 case GE:
24331 return 0x0d;
24332 case LT:
24333 return 0x01;
24334 case UNLE:
24335 return 0x0a;
24336 case UNLT:
24337 return 0x09;
24338 case UNGE:
24339 return 0x05;
24340 case UNGT:
24341 return 0x06;
24342 case UNEQ:
24343 return 0x18;
24344 case LTGT:
24345 return 0x0c;
24346 case ORDERED:
24347 return 0x07;
24348 case UNORDERED:
24349 return 0x03;
24350 default:
24351 gcc_unreachable ();
24355 /* Return immediate value to be used in UNSPEC_PCMP
24356 for comparison CODE in MODE. */
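   /* For floating-point modes the values returned below follow the
      vcmpps/vcmppd predicate encoding (e.g. 0x01 = LT_OS, 0x0e = GT_OS,
      0x18 = EQ_US); for integer modes they follow the vpcmp encoding
      (0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT, 6 = NLE).  */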
24358 static int
24359 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24361 if (FLOAT_MODE_P (mode))
24362 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24363 return ix86_int_cmp_code_to_pcmp_immediate (code);
24366 /* Expand AVX-512 vector comparison. */
24368 bool
24369 ix86_expand_mask_vec_cmp (rtx operands[])
24371 machine_mode mask_mode = GET_MODE (operands[0]);
24372 machine_mode cmp_mode = GET_MODE (operands[2]);
24373 enum rtx_code code = GET_CODE (operands[1]);
24374 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24375 int unspec_code;
24376 rtx unspec;
24378 switch (code)
24380 case LEU:
24381 case GTU:
24382 case GEU:
24383 case LTU:
24384 unspec_code = UNSPEC_UNSIGNED_PCMP;
24385 break;
24387 default:
24388 unspec_code = UNSPEC_PCMP;
24391 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24392 operands[3], imm),
24393 unspec_code);
24394 emit_insn (gen_rtx_SET (operands[0], unspec));
24396 return true;
24399 /* Expand fp vector comparison. */
24401 bool
24402 ix86_expand_fp_vec_cmp (rtx operands[])
24404 enum rtx_code code = GET_CODE (operands[1]);
24405 rtx cmp;
24407 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24408 &operands[2], &operands[3]);
24409 if (code == UNKNOWN)
24411 rtx temp;
24412 switch (GET_CODE (operands[1]))
24414 case LTGT:
24415 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24416 operands[3], NULL, NULL);
24417 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24418 operands[3], NULL, NULL);
24419 code = AND;
24420 break;
24421 case UNEQ:
24422 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24423 operands[3], NULL, NULL);
24424 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24425 operands[3], NULL, NULL);
24426 code = IOR;
24427 break;
24428 default:
24429 gcc_unreachable ();
24431 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24432 OPTAB_DIRECT);
24434 else
24435 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24436 operands[1], operands[2]);
24438 if (operands[0] != cmp)
24439 emit_move_insn (operands[0], cmp);
24441 return true;
24444 static rtx
24445 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24446 rtx op_true, rtx op_false, bool *negate)
24448 machine_mode data_mode = GET_MODE (dest);
24449 machine_mode mode = GET_MODE (cop0);
24450 rtx x;
24452 *negate = false;
24454 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24455 if (TARGET_XOP
24456 && (mode == V16QImode || mode == V8HImode
24457 || mode == V4SImode || mode == V2DImode))
24459 else
24461 /* Canonicalize the comparison to EQ, GT, GTU. */
24462 switch (code)
24464 case EQ:
24465 case GT:
24466 case GTU:
24467 break;
24469 case NE:
24470 case LE:
24471 case LEU:
24472 code = reverse_condition (code);
24473 *negate = true;
24474 break;
24476 case GE:
24477 case GEU:
24478 code = reverse_condition (code);
24479 *negate = true;
24480 /* FALLTHRU */
24482 case LT:
24483 case LTU:
24484 std::swap (cop0, cop1);
24485 code = swap_condition (code);
24486 break;
24488 default:
24489 gcc_unreachable ();
24492 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24493 if (mode == V2DImode)
24495 switch (code)
24497 case EQ:
24498 /* SSE4.1 supports EQ. */
24499 if (!TARGET_SSE4_1)
24500 return NULL;
24501 break;
24503 case GT:
24504 case GTU:
24505 /* SSE4.2 supports GT/GTU. */
24506 if (!TARGET_SSE4_2)
24507 return NULL;
24508 break;
24510 default:
24511 gcc_unreachable ();
24515 /* Unsigned parallel compare is not supported by the hardware.
24516 Play some tricks to turn this into a signed comparison
24517 against 0. */
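      /* Two equivalent tricks are used below (shown for unsigned SImode
	 elements):
	     a > b   <=>   (int) (a - 0x80000000) > (int) (b - 0x80000000)
	 for dword/qword elements, and
	     a > b   <=>   (a -sat b) != 0	(unsigned saturating subtract)
	 for byte/word elements, whose result is then compared against zero.  */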
24518 if (code == GTU)
24520 cop0 = force_reg (mode, cop0);
24522 switch (mode)
24524 case E_V16SImode:
24525 case E_V8DImode:
24526 case E_V8SImode:
24527 case E_V4DImode:
24528 case E_V4SImode:
24529 case E_V2DImode:
24531 rtx t1, t2, mask;
24532 rtx (*gen_sub3) (rtx, rtx, rtx);
24534 switch (mode)
24536 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24537 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24538 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24539 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24540 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24541 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24542 default:
24543 gcc_unreachable ();
24545 /* Subtract (-(INT MAX) - 1) from both operands to make
24546 them signed. */
24547 mask = ix86_build_signbit_mask (mode, true, false);
24548 t1 = gen_reg_rtx (mode);
24549 emit_insn (gen_sub3 (t1, cop0, mask));
24551 t2 = gen_reg_rtx (mode);
24552 emit_insn (gen_sub3 (t2, cop1, mask));
24554 cop0 = t1;
24555 cop1 = t2;
24556 code = GT;
24558 break;
24560 case E_V64QImode:
24561 case E_V32HImode:
24562 case E_V32QImode:
24563 case E_V16HImode:
24564 case E_V16QImode:
24565 case E_V8HImode:
24566 /* Perform a parallel unsigned saturating subtraction. */
24567 x = gen_reg_rtx (mode);
24568 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24569 cop1)));
24571 cop0 = x;
24572 cop1 = CONST0_RTX (mode);
24573 code = EQ;
24574 *negate = !*negate;
24575 break;
24577 default:
24578 gcc_unreachable ();
24583 if (*negate)
24584 std::swap (op_true, op_false);
24586 /* Allow the comparison to be done in one mode, but the movcc to
24587 happen in another mode. */
24588 if (data_mode == mode)
24590 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24591 op_true, op_false);
24593 else
24595 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24596 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24597 op_true, op_false);
24598 if (GET_MODE (x) == mode)
24599 x = gen_lowpart (data_mode, x);
24602 return x;
24605 /* Expand integer vector comparison. */
24607 bool
24608 ix86_expand_int_vec_cmp (rtx operands[])
24610 rtx_code code = GET_CODE (operands[1]);
24611 bool negate = false;
24612 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24613 operands[3], NULL, NULL, &negate);
24615 if (!cmp)
24616 return false;
24618 if (negate)
24619 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24620 CONST0_RTX (GET_MODE (cmp)),
24621 NULL, NULL, &negate);
24623 gcc_assert (!negate);
24625 if (operands[0] != cmp)
24626 emit_move_insn (operands[0], cmp);
24628 return true;
24631 /* Expand a floating-point vector conditional move; a vcond operation
24632 rather than a movcc operation. */
24634 bool
24635 ix86_expand_fp_vcond (rtx operands[])
24637 enum rtx_code code = GET_CODE (operands[3]);
24638 rtx cmp;
24640 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24641 &operands[4], &operands[5]);
24642 if (code == UNKNOWN)
24644 rtx temp;
24645 switch (GET_CODE (operands[3]))
24647 case LTGT:
24648 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24649 operands[5], operands[0], operands[0]);
24650 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24651 operands[5], operands[1], operands[2]);
24652 code = AND;
24653 break;
24654 case UNEQ:
24655 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24656 operands[5], operands[0], operands[0]);
24657 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24658 operands[5], operands[1], operands[2]);
24659 code = IOR;
24660 break;
24661 default:
24662 gcc_unreachable ();
24664 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24665 OPTAB_DIRECT);
24666 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24667 return true;
24670 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24671 operands[5], operands[1], operands[2]))
24672 return true;
24674 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24675 operands[1], operands[2]);
24676 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24677 return true;
24680 /* Expand a signed/unsigned integral vector conditional move. */
24682 bool
24683 ix86_expand_int_vcond (rtx operands[])
24685 machine_mode data_mode = GET_MODE (operands[0]);
24686 machine_mode mode = GET_MODE (operands[4]);
24687 enum rtx_code code = GET_CODE (operands[3]);
24688 bool negate = false;
24689 rtx x, cop0, cop1;
24691 cop0 = operands[4];
24692 cop1 = operands[5];
24694 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24695 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
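  /* E.g. for V4SImode, "x < 0 ? -1 : 0" becomes a single psrad $31
     (arithmetic shift of each element) and "x < 0 ? 1 : 0" a psrld $31
     (logical shift); the equivalent x >= 0 selections are handled by
     swapping the two arms.  */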
24696 if ((code == LT || code == GE)
24697 && data_mode == mode
24698 && cop1 == CONST0_RTX (mode)
24699 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24700 && GET_MODE_UNIT_SIZE (data_mode) > 1
24701 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24702 && (GET_MODE_SIZE (data_mode) == 16
24703 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24705 rtx negop = operands[2 - (code == LT)];
24706 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24707 if (negop == CONST1_RTX (data_mode))
24709 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24710 operands[0], 1, OPTAB_DIRECT);
24711 if (res != operands[0])
24712 emit_move_insn (operands[0], res);
24713 return true;
24715 else if (GET_MODE_INNER (data_mode) != DImode
24716 && vector_all_ones_operand (negop, data_mode))
24718 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24719 operands[0], 0, OPTAB_DIRECT);
24720 if (res != operands[0])
24721 emit_move_insn (operands[0], res);
24722 return true;
24726 if (!nonimmediate_operand (cop1, mode))
24727 cop1 = force_reg (mode, cop1);
24728 if (!general_operand (operands[1], data_mode))
24729 operands[1] = force_reg (data_mode, operands[1]);
24730 if (!general_operand (operands[2], data_mode))
24731 operands[2] = force_reg (data_mode, operands[2]);
24733 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24734 operands[1], operands[2], &negate);
24736 if (!x)
24737 return false;
24739 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24740 operands[2-negate]);
24741 return true;
24744 /* AVX512F does support 64-byte integer vector operations,
24745 thus the longest vector we are faced with is V64QImode. */
24746 #define MAX_VECT_LEN 64
24748 struct expand_vec_perm_d
24750 rtx target, op0, op1;
24751 unsigned char perm[MAX_VECT_LEN];
24752 machine_mode vmode;
24753 unsigned char nelt;
24754 bool one_operand_p;
24755 bool testing_p;
24758 static bool
24759 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24760 struct expand_vec_perm_d *d)
24762 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24763 expander, so args are either in d, or in op0, op1 etc. */
24764 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24765 machine_mode maskmode = mode;
24766 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24768 switch (mode)
24770 case E_V8HImode:
24771 if (TARGET_AVX512VL && TARGET_AVX512BW)
24772 gen = gen_avx512vl_vpermt2varv8hi3;
24773 break;
24774 case E_V16HImode:
24775 if (TARGET_AVX512VL && TARGET_AVX512BW)
24776 gen = gen_avx512vl_vpermt2varv16hi3;
24777 break;
24778 case E_V64QImode:
24779 if (TARGET_AVX512VBMI)
24780 gen = gen_avx512bw_vpermt2varv64qi3;
24781 break;
24782 case E_V32HImode:
24783 if (TARGET_AVX512BW)
24784 gen = gen_avx512bw_vpermt2varv32hi3;
24785 break;
24786 case E_V4SImode:
24787 if (TARGET_AVX512VL)
24788 gen = gen_avx512vl_vpermt2varv4si3;
24789 break;
24790 case E_V8SImode:
24791 if (TARGET_AVX512VL)
24792 gen = gen_avx512vl_vpermt2varv8si3;
24793 break;
24794 case E_V16SImode:
24795 if (TARGET_AVX512F)
24796 gen = gen_avx512f_vpermt2varv16si3;
24797 break;
24798 case E_V4SFmode:
24799 if (TARGET_AVX512VL)
24801 gen = gen_avx512vl_vpermt2varv4sf3;
24802 maskmode = V4SImode;
24804 break;
24805 case E_V8SFmode:
24806 if (TARGET_AVX512VL)
24808 gen = gen_avx512vl_vpermt2varv8sf3;
24809 maskmode = V8SImode;
24811 break;
24812 case E_V16SFmode:
24813 if (TARGET_AVX512F)
24815 gen = gen_avx512f_vpermt2varv16sf3;
24816 maskmode = V16SImode;
24818 break;
24819 case E_V2DImode:
24820 if (TARGET_AVX512VL)
24821 gen = gen_avx512vl_vpermt2varv2di3;
24822 break;
24823 case E_V4DImode:
24824 if (TARGET_AVX512VL)
24825 gen = gen_avx512vl_vpermt2varv4di3;
24826 break;
24827 case E_V8DImode:
24828 if (TARGET_AVX512F)
24829 gen = gen_avx512f_vpermt2varv8di3;
24830 break;
24831 case E_V2DFmode:
24832 if (TARGET_AVX512VL)
24834 gen = gen_avx512vl_vpermt2varv2df3;
24835 maskmode = V2DImode;
24837 break;
24838 case E_V4DFmode:
24839 if (TARGET_AVX512VL)
24841 gen = gen_avx512vl_vpermt2varv4df3;
24842 maskmode = V4DImode;
24844 break;
24845 case E_V8DFmode:
24846 if (TARGET_AVX512F)
24848 gen = gen_avx512f_vpermt2varv8df3;
24849 maskmode = V8DImode;
24851 break;
24852 default:
24853 break;
24856 if (gen == NULL)
24857 return false;
24859 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24860 expander, so args are either in d, or in op0, op1 etc. */
24861 if (d)
24863 rtx vec[64];
24864 target = d->target;
24865 op0 = d->op0;
24866 op1 = d->op1;
24867 for (int i = 0; i < d->nelt; ++i)
24868 vec[i] = GEN_INT (d->perm[i]);
24869 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24872 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24873 return true;
24876 /* Expand a variable vector permutation. */
24878 void
24879 ix86_expand_vec_perm (rtx operands[])
24881 rtx target = operands[0];
24882 rtx op0 = operands[1];
24883 rtx op1 = operands[2];
24884 rtx mask = operands[3];
24885 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24886 machine_mode mode = GET_MODE (op0);
24887 machine_mode maskmode = GET_MODE (mask);
24888 int w, e, i;
24889 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24891 /* Number of elements in the vector. */
24892 w = GET_MODE_NUNITS (mode);
24893 e = GET_MODE_UNIT_SIZE (mode);
24894 gcc_assert (w <= 64);
24896 if (TARGET_AVX512F && one_operand_shuffle)
24898 rtx (*gen) (rtx, rtx, rtx) = NULL;
24899 switch (mode)
24901 case E_V16SImode:
24902 gen = gen_avx512f_permvarv16si;
24903 break;
24904 case E_V16SFmode:
24905 gen = gen_avx512f_permvarv16sf;
24906 break;
24907 case E_V8DImode:
24908 gen = gen_avx512f_permvarv8di;
24909 break;
24910 case E_V8DFmode:
24911 gen = gen_avx512f_permvarv8df;
24912 break;
24913 default:
24914 break;
24916 if (gen != NULL)
24918 emit_insn (gen (target, op0, mask));
24919 return;
24923 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24924 return;
24926 if (TARGET_AVX2)
24928 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24930 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24931 a constant shuffle operand.  With a tiny bit of effort we can
24932 use VPERMD instead.  A re-interpretation stall for V4DFmode is
24933 unfortunate but there's no avoiding it.
24934 Similarly, for V16HImode we don't have instructions for variable
24935 shuffling, while for V32QImode we can, after preparing suitable
24936 masks, use vpshufb; vpshufb; vpermq; vpor.  */
24938 if (mode == V16HImode)
24940 maskmode = mode = V32QImode;
24941 w = 32;
24942 e = 1;
24944 else
24946 maskmode = mode = V8SImode;
24947 w = 8;
24948 e = 4;
24950 t1 = gen_reg_rtx (maskmode);
24952 /* Replicate the low bits of the V4DImode mask into V8SImode:
24953 mask = { A B C D }
24954 t1 = { A A B B C C D D }. */
24955 for (i = 0; i < w / 2; ++i)
24956 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24957 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24958 vt = force_reg (maskmode, vt);
24959 mask = gen_lowpart (maskmode, mask);
24960 if (maskmode == V8SImode)
24961 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24962 else
24963 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24965 /* Multiply the shuffle indices by two. */
24966 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24967 OPTAB_DIRECT);
24969 /* Add one to the odd shuffle indices:
24970 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24971 for (i = 0; i < w / 2; ++i)
24973 vec[i * 2] = const0_rtx;
24974 vec[i * 2 + 1] = const1_rtx;
24976 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24977 vt = validize_mem (force_const_mem (maskmode, vt));
24978 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24979 OPTAB_DIRECT);
24981 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24982 operands[3] = mask = t1;
24983 target = gen_reg_rtx (mode);
24984 op0 = gen_lowpart (mode, op0);
24985 op1 = gen_lowpart (mode, op1);
24988 switch (mode)
24990 case E_V8SImode:
24991 /* The VPERMD and VPERMPS instructions already properly ignore
24992 the high bits of the shuffle elements. No need for us to
24993 perform an AND ourselves. */
24994 if (one_operand_shuffle)
24996 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24997 if (target != operands[0])
24998 emit_move_insn (operands[0],
24999 gen_lowpart (GET_MODE (operands[0]), target));
25001 else
25003 t1 = gen_reg_rtx (V8SImode);
25004 t2 = gen_reg_rtx (V8SImode);
25005 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25006 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25007 goto merge_two;
25009 return;
25011 case E_V8SFmode:
25012 mask = gen_lowpart (V8SImode, mask);
25013 if (one_operand_shuffle)
25014 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25015 else
25017 t1 = gen_reg_rtx (V8SFmode);
25018 t2 = gen_reg_rtx (V8SFmode);
25019 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25020 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25021 goto merge_two;
25023 return;
25025 case E_V4SImode:
25026 /* By combining the two 128-bit input vectors into one 256-bit
25027 input vector, we can use VPERMD and VPERMPS for the full
25028 two-operand shuffle. */
25029 t1 = gen_reg_rtx (V8SImode);
25030 t2 = gen_reg_rtx (V8SImode);
25031 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25032 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25033 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25034 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25035 return;
25037 case E_V4SFmode:
25038 t1 = gen_reg_rtx (V8SFmode);
25039 t2 = gen_reg_rtx (V8SImode);
25040 mask = gen_lowpart (V4SImode, mask);
25041 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25042 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25043 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25044 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25045 return;
25047 case E_V32QImode:
25048 t1 = gen_reg_rtx (V32QImode);
25049 t2 = gen_reg_rtx (V32QImode);
25050 t3 = gen_reg_rtx (V32QImode);
25051 vt2 = GEN_INT (-128);
25052 vt = gen_const_vec_duplicate (V32QImode, vt2);
25053 vt = force_reg (V32QImode, vt);
25054 for (i = 0; i < 32; i++)
25055 vec[i] = i < 16 ? vt2 : const0_rtx;
25056 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25057 vt2 = force_reg (V32QImode, vt2);
25058 /* From mask create two adjusted masks, which contain the same
25059 bits as mask in the low 7 bits of each vector element.
25060 The first mask will have the most significant bit clear
25061 if it requests element from the same 128-bit lane
25062 and MSB set if it requests element from the other 128-bit lane.
25063 The second mask will have the opposite values of the MSB,
25064 and additionally will have its 128-bit lanes swapped.
25065 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25066 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25067 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25068 stands for other 12 bytes. */
25069 /* The bit that tells whether an element is from the same lane or the
25070 other lane is bit 4, so shift it left by 3 to the MSB position. */
25071 t5 = gen_reg_rtx (V4DImode);
25072 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25073 GEN_INT (3)));
25074 /* Clear MSB bits from the mask just in case it had them set. */
25075 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25076 /* After this t1 will have MSB set for elements from other lane. */
25077 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25078 /* Clear bits other than MSB. */
25079 emit_insn (gen_andv32qi3 (t1, t1, vt));
25080 /* Or in the lower bits from mask into t3. */
25081 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25082 /* And invert MSB bits in t1, so MSB is set for elements from the same
25083 lane. */
25084 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25085 /* Swap 128-bit lanes in t3. */
25086 t6 = gen_reg_rtx (V4DImode);
25087 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25088 const2_rtx, GEN_INT (3),
25089 const0_rtx, const1_rtx));
25090 /* And or in the lower bits from mask into t1. */
25091 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25092 if (one_operand_shuffle)
25094 /* Each of these shuffles will put 0s in places where an
25095 element from the other 128-bit lane is needed, and otherwise
25096 will shuffle in the requested value.  */
25097 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25098 gen_lowpart (V32QImode, t6)));
25099 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25100 /* For t3 the 128-bit lanes are swapped again. */
25101 t7 = gen_reg_rtx (V4DImode);
25102 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25103 const2_rtx, GEN_INT (3),
25104 const0_rtx, const1_rtx));
25105 /* And oring both together leads to the result. */
25106 emit_insn (gen_iorv32qi3 (target, t1,
25107 gen_lowpart (V32QImode, t7)));
25108 if (target != operands[0])
25109 emit_move_insn (operands[0],
25110 gen_lowpart (GET_MODE (operands[0]), target));
25111 return;
25114 t4 = gen_reg_rtx (V32QImode);
25115 /* Similar to the one_operand_shuffle code above, just
25116 repeated twice, once for each operand.  The merge_two:
25117 code will merge the two results together.  */
25118 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25119 gen_lowpart (V32QImode, t6)));
25120 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25121 gen_lowpart (V32QImode, t6)));
25122 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25123 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25124 t7 = gen_reg_rtx (V4DImode);
25125 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25126 const2_rtx, GEN_INT (3),
25127 const0_rtx, const1_rtx));
25128 t8 = gen_reg_rtx (V4DImode);
25129 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25130 const2_rtx, GEN_INT (3),
25131 const0_rtx, const1_rtx));
25132 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25133 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25134 t1 = t4;
25135 t2 = t3;
25136 goto merge_two;
25138 default:
25139 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25140 break;
25144 if (TARGET_XOP)
25146 /* The XOP VPPERM insn supports three inputs. By ignoring the
25147 one_operand_shuffle special case, we avoid creating another
25148 set of constant vectors in memory. */
25149 one_operand_shuffle = false;
25151 /* mask = mask & {2*w-1, ...} */
25152 vt = GEN_INT (2*w - 1);
25154 else
25156 /* mask = mask & {w-1, ...} */
25157 vt = GEN_INT (w - 1);
25160 vt = gen_const_vec_duplicate (maskmode, vt);
25161 mask = expand_simple_binop (maskmode, AND, mask, vt,
25162 NULL_RTX, 0, OPTAB_DIRECT);
25164 /* For non-QImode operations, convert the word permutation control
25165 into a byte permutation control. */
25166 if (mode != V16QImode)
25168 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25169 GEN_INT (exact_log2 (e)),
25170 NULL_RTX, 0, OPTAB_DIRECT);
25172 /* Convert mask to vector of chars. */
25173 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25175 /* Replicate each of the input bytes into byte positions:
25176 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25177 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25178 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25179 for (i = 0; i < 16; ++i)
25180 vec[i] = GEN_INT (i/e * e);
25181 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25182 vt = validize_mem (force_const_mem (V16QImode, vt));
25183 if (TARGET_XOP)
25184 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25185 else
25186 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25188 /* Convert it into the byte positions by doing
25189 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25190 for (i = 0; i < 16; ++i)
25191 vec[i] = GEN_INT (i % e);
25192 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25193 vt = validize_mem (force_const_mem (V16QImode, vt));
25194 emit_insn (gen_addv16qi3 (mask, mask, vt));
25197 /* The actual shuffle operations all operate on V16QImode. */
25198 op0 = gen_lowpart (V16QImode, op0);
25199 op1 = gen_lowpart (V16QImode, op1);
25201 if (TARGET_XOP)
25203 if (GET_MODE (target) != V16QImode)
25204 target = gen_reg_rtx (V16QImode);
25205 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25206 if (target != operands[0])
25207 emit_move_insn (operands[0],
25208 gen_lowpart (GET_MODE (operands[0]), target));
25210 else if (one_operand_shuffle)
25212 if (GET_MODE (target) != V16QImode)
25213 target = gen_reg_rtx (V16QImode);
25214 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25215 if (target != operands[0])
25216 emit_move_insn (operands[0],
25217 gen_lowpart (GET_MODE (operands[0]), target));
25219 else
25221 rtx xops[6];
25222 bool ok;
25224 /* Shuffle the two input vectors independently. */
25225 t1 = gen_reg_rtx (V16QImode);
25226 t2 = gen_reg_rtx (V16QImode);
25227 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25228 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25230 merge_two:
25231 /* Then merge them together. The key is whether any given control
25232 element contained a bit set that indicates the second word. */
25233 mask = operands[3];
25234 vt = GEN_INT (w);
25235 if (maskmode == V2DImode && !TARGET_SSE4_1)
25237 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25238 more shuffle to convert the V2DI input mask into a V4SI
25239 input mask. At that point the masking performed by expand_int_vcond
25240 will work as desired. */
25241 rtx t3 = gen_reg_rtx (V4SImode);
25242 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25243 const0_rtx, const0_rtx,
25244 const2_rtx, const2_rtx));
25245 mask = t3;
25246 maskmode = V4SImode;
25247 e = w = 4;
25250 vt = gen_const_vec_duplicate (maskmode, vt);
25251 vt = force_reg (maskmode, vt);
25252 mask = expand_simple_binop (maskmode, AND, mask, vt,
25253 NULL_RTX, 0, OPTAB_DIRECT);
25255 if (GET_MODE (target) != mode)
25256 target = gen_reg_rtx (mode);
25257 xops[0] = target;
25258 xops[1] = gen_lowpart (mode, t2);
25259 xops[2] = gen_lowpart (mode, t1);
25260 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25261 xops[4] = mask;
25262 xops[5] = vt;
25263 ok = ix86_expand_int_vcond (xops);
25264 gcc_assert (ok);
25265 if (target != operands[0])
25266 emit_move_insn (operands[0],
25267 gen_lowpart (GET_MODE (operands[0]), target));
25271 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25272 true if we should do zero extension, else sign extension. HIGH_P is
25273 true if we want the N/2 high elements, else the low elements. */
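/* Illustrative sketch (an assumed typical use, not from the original source):
   for a V8HImode SRC { a0, ..., a7 }, calling this with UNSIGNED_P false and
   HIGH_P false produces a V4SImode DEST holding the sign-extended low half
   { (int) a0, (int) a1, (int) a2, (int) a3 }; with HIGH_P true the upper half
   { a4, ..., a7 } is widened instead.  On SSE4.1 this maps to a
   pmovsxwd/pmovzxwd (after shifting the high half down), otherwise to a
   punpcklwd/punpckhwd against zero or against a computed sign mask.  */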
25275 void
25276 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25278 machine_mode imode = GET_MODE (src);
25279 rtx tmp;
25281 if (TARGET_SSE4_1)
25283 rtx (*unpack)(rtx, rtx);
25284 rtx (*extract)(rtx, rtx) = NULL;
25285 machine_mode halfmode = BLKmode;
25287 switch (imode)
25289 case E_V64QImode:
25290 if (unsigned_p)
25291 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25292 else
25293 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25294 halfmode = V32QImode;
25295 extract
25296 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25297 break;
25298 case E_V32QImode:
25299 if (unsigned_p)
25300 unpack = gen_avx2_zero_extendv16qiv16hi2;
25301 else
25302 unpack = gen_avx2_sign_extendv16qiv16hi2;
25303 halfmode = V16QImode;
25304 extract
25305 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25306 break;
25307 case E_V32HImode:
25308 if (unsigned_p)
25309 unpack = gen_avx512f_zero_extendv16hiv16si2;
25310 else
25311 unpack = gen_avx512f_sign_extendv16hiv16si2;
25312 halfmode = V16HImode;
25313 extract
25314 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25315 break;
25316 case E_V16HImode:
25317 if (unsigned_p)
25318 unpack = gen_avx2_zero_extendv8hiv8si2;
25319 else
25320 unpack = gen_avx2_sign_extendv8hiv8si2;
25321 halfmode = V8HImode;
25322 extract
25323 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25324 break;
25325 case E_V16SImode:
25326 if (unsigned_p)
25327 unpack = gen_avx512f_zero_extendv8siv8di2;
25328 else
25329 unpack = gen_avx512f_sign_extendv8siv8di2;
25330 halfmode = V8SImode;
25331 extract
25332 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25333 break;
25334 case E_V8SImode:
25335 if (unsigned_p)
25336 unpack = gen_avx2_zero_extendv4siv4di2;
25337 else
25338 unpack = gen_avx2_sign_extendv4siv4di2;
25339 halfmode = V4SImode;
25340 extract
25341 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25342 break;
25343 case E_V16QImode:
25344 if (unsigned_p)
25345 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25346 else
25347 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25348 break;
25349 case E_V8HImode:
25350 if (unsigned_p)
25351 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25352 else
25353 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25354 break;
25355 case E_V4SImode:
25356 if (unsigned_p)
25357 unpack = gen_sse4_1_zero_extendv2siv2di2;
25358 else
25359 unpack = gen_sse4_1_sign_extendv2siv2di2;
25360 break;
25361 default:
25362 gcc_unreachable ();
25365 if (GET_MODE_SIZE (imode) >= 32)
25367 tmp = gen_reg_rtx (halfmode);
25368 emit_insn (extract (tmp, src));
25370 else if (high_p)
25372 /* Shift higher 8 bytes to lower 8 bytes. */
25373 tmp = gen_reg_rtx (V1TImode);
25374 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25375 GEN_INT (64)));
25376 tmp = gen_lowpart (imode, tmp);
25378 else
25379 tmp = src;
25381 emit_insn (unpack (dest, tmp));
25383 else
25385 rtx (*unpack)(rtx, rtx, rtx);
25387 switch (imode)
25389 case E_V16QImode:
25390 if (high_p)
25391 unpack = gen_vec_interleave_highv16qi;
25392 else
25393 unpack = gen_vec_interleave_lowv16qi;
25394 break;
25395 case E_V8HImode:
25396 if (high_p)
25397 unpack = gen_vec_interleave_highv8hi;
25398 else
25399 unpack = gen_vec_interleave_lowv8hi;
25400 break;
25401 case E_V4SImode:
25402 if (high_p)
25403 unpack = gen_vec_interleave_highv4si;
25404 else
25405 unpack = gen_vec_interleave_lowv4si;
25406 break;
25407 default:
25408 gcc_unreachable ();
25411 if (unsigned_p)
25412 tmp = force_reg (imode, CONST0_RTX (imode));
25413 else
25414 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25415 src, pc_rtx, pc_rtx);
25417 rtx tmp2 = gen_reg_rtx (imode);
25418 emit_insn (unpack (tmp2, src, tmp));
25419 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25423 /* Expand conditional increment or decrement using adc/sbb instructions.
25424 The default case using setcc followed by the conditional move can be
25425 done by generic code. */
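/* Illustrative example (hypothetical source pattern, not from the original
   file): for

     unsigned int f (unsigned int a, unsigned int b, unsigned int c)
     {
       return a < b ? c + 1 : c;
     }

   the compare of A and B leaves the "a < b" result in the carry flag, so the
   conditional increment becomes a single "adc $0" (and a conditional
   decrement a single "sbb $0") instead of a setcc/add or cmov sequence.  */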
25426 bool
25427 ix86_expand_int_addcc (rtx operands[])
25429 enum rtx_code code = GET_CODE (operands[1]);
25430 rtx flags;
25431 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25432 rtx compare_op;
25433 rtx val = const0_rtx;
25434 bool fpcmp = false;
25435 machine_mode mode;
25436 rtx op0 = XEXP (operands[1], 0);
25437 rtx op1 = XEXP (operands[1], 1);
25439 if (operands[3] != const1_rtx
25440 && operands[3] != constm1_rtx)
25441 return false;
25442 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25443 return false;
25444 code = GET_CODE (compare_op);
25446 flags = XEXP (compare_op, 0);
25448 if (GET_MODE (flags) == CCFPmode)
25450 fpcmp = true;
25451 code = ix86_fp_compare_code_to_integer (code);
25454 if (code != LTU)
25456 val = constm1_rtx;
25457 if (fpcmp)
25458 PUT_CODE (compare_op,
25459 reverse_condition_maybe_unordered
25460 (GET_CODE (compare_op)));
25461 else
25462 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25465 mode = GET_MODE (operands[0]);
25467 /* Construct either adc or sbb insn. */
25468 if ((code == LTU) == (operands[3] == constm1_rtx))
25470 switch (mode)
25472 case E_QImode:
25473 insn = gen_subqi3_carry;
25474 break;
25475 case E_HImode:
25476 insn = gen_subhi3_carry;
25477 break;
25478 case E_SImode:
25479 insn = gen_subsi3_carry;
25480 break;
25481 case E_DImode:
25482 insn = gen_subdi3_carry;
25483 break;
25484 default:
25485 gcc_unreachable ();
25488 else
25490 switch (mode)
25492 case E_QImode:
25493 insn = gen_addqi3_carry;
25494 break;
25495 case E_HImode:
25496 insn = gen_addhi3_carry;
25497 break;
25498 case E_SImode:
25499 insn = gen_addsi3_carry;
25500 break;
25501 case E_DImode:
25502 insn = gen_adddi3_carry;
25503 break;
25504 default:
25505 gcc_unreachable ();
25508 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25510 return true;
25514 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25515 but works for floating point parameters and non-offsettable memories.
25516 For pushes, it returns just stack offsets; the values will be saved
25517 in the right order. Maximally three parts are generated. */
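/* Illustrative sketch (assuming a little-endian 32-bit target, not from the
   original source): a DFmode constant such as 1.0 comes back as two SImode
   immediates, parts[0] = 0x00000000 and parts[1] = 0x3ff00000; an XFmode
   value yields three SImode parts, and on 64-bit targets a TFmode value
   yields two DImode parts.  */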
25519 static int
25520 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25522 int size;
25524 if (!TARGET_64BIT)
25525 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25526 else
25527 size = (GET_MODE_SIZE (mode) + 4) / 8;
25529 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25530 gcc_assert (size >= 2 && size <= 4);
25532 /* Optimize constant pool reference to immediates. This is used by fp
25533 moves, that force all constants to memory to allow combining. */
25534 if (MEM_P (operand) && MEM_READONLY_P (operand))
25535 operand = avoid_constant_pool_reference (operand);
25537 if (MEM_P (operand) && !offsettable_memref_p (operand))
25539 /* The only non-offsettable memories we handle are pushes. */
25540 int ok = push_operand (operand, VOIDmode);
25542 gcc_assert (ok);
25544 operand = copy_rtx (operand);
25545 PUT_MODE (operand, word_mode);
25546 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25547 return size;
25550 if (GET_CODE (operand) == CONST_VECTOR)
25552 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25553 /* Caution: if we looked through a constant pool memory above,
25554 the operand may actually have a different mode now. That's
25555 ok, since we want to pun this all the way back to an integer. */
25556 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25557 gcc_assert (operand != NULL);
25558 mode = imode;
25561 if (!TARGET_64BIT)
25563 if (mode == DImode)
25564 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25565 else
25567 int i;
25569 if (REG_P (operand))
25571 gcc_assert (reload_completed);
25572 for (i = 0; i < size; i++)
25573 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25575 else if (offsettable_memref_p (operand))
25577 operand = adjust_address (operand, SImode, 0);
25578 parts[0] = operand;
25579 for (i = 1; i < size; i++)
25580 parts[i] = adjust_address (operand, SImode, 4 * i);
25582 else if (CONST_DOUBLE_P (operand))
25584 const REAL_VALUE_TYPE *r;
25585 long l[4];
25587 r = CONST_DOUBLE_REAL_VALUE (operand);
25588 switch (mode)
25590 case E_TFmode:
25591 real_to_target (l, r, mode);
25592 parts[3] = gen_int_mode (l[3], SImode);
25593 parts[2] = gen_int_mode (l[2], SImode);
25594 break;
25595 case E_XFmode:
25596 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25597 long double may not be 80-bit. */
25598 real_to_target (l, r, mode);
25599 parts[2] = gen_int_mode (l[2], SImode);
25600 break;
25601 case E_DFmode:
25602 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25603 break;
25604 default:
25605 gcc_unreachable ();
25607 parts[1] = gen_int_mode (l[1], SImode);
25608 parts[0] = gen_int_mode (l[0], SImode);
25610 else
25611 gcc_unreachable ();
25614 else
25616 if (mode == TImode)
25617 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25618 if (mode == XFmode || mode == TFmode)
25620 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25621 if (REG_P (operand))
25623 gcc_assert (reload_completed);
25624 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25625 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25627 else if (offsettable_memref_p (operand))
25629 operand = adjust_address (operand, DImode, 0);
25630 parts[0] = operand;
25631 parts[1] = adjust_address (operand, upper_mode, 8);
25633 else if (CONST_DOUBLE_P (operand))
25635 long l[4];
25637 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25639 /* real_to_target puts 32-bit pieces in each long. */
25640 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25641 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25642 << 32), DImode);
25644 if (upper_mode == SImode)
25645 parts[1] = gen_int_mode (l[2], SImode);
25646 else
25647 parts[1]
25648 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25649 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25650 << 32), DImode);
25652 else
25653 gcc_unreachable ();
25657 return size;
25660 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25661 Return false when normal moves are needed; true when all required
25662 insns have been emitted. Operands 2-4 contain the input values
25663 in the correct order; operands 5-7 contain the output values. */
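/* Illustrative example (hypothetical, not from the original source): a
   64-bit integer copy on a 32-bit target, e.g. assigning one 'long long'
   variable to another, reaches this splitter as a DImode move after reload
   and is emitted as two SImode moves of the low and high words; the order of
   the part moves is chosen below so that a register that is both a
   destination part and part of the source address is not clobbered before it
   is used.  */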
25665 void
25666 ix86_split_long_move (rtx operands[])
25668 rtx part[2][4];
25669 int nparts, i, j;
25670 int push = 0;
25671 int collisions = 0;
25672 machine_mode mode = GET_MODE (operands[0]);
25673 bool collisionparts[4];
25675 /* The DFmode expanders may ask us to move a double.
25676 For a 64-bit target this is a single move. By hiding the fact
25677 here we simplify the i386.md splitters. */
25678 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25680 /* Optimize constant pool reference to immediates. This is used by
25681 fp moves, that force all constants to memory to allow combining. */
25683 if (MEM_P (operands[1])
25684 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25685 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25686 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25687 if (push_operand (operands[0], VOIDmode))
25689 operands[0] = copy_rtx (operands[0]);
25690 PUT_MODE (operands[0], word_mode);
25692 else
25693 operands[0] = gen_lowpart (DImode, operands[0]);
25694 operands[1] = gen_lowpart (DImode, operands[1]);
25695 emit_move_insn (operands[0], operands[1]);
25696 return;
25699 /* The only non-offsettable memory we handle is push. */
25700 if (push_operand (operands[0], VOIDmode))
25701 push = 1;
25702 else
25703 gcc_assert (!MEM_P (operands[0])
25704 || offsettable_memref_p (operands[0]));
25706 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25707 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25709 /* When emitting a push, take care of source operands located on the stack. */
25710 if (push && MEM_P (operands[1])
25711 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25713 rtx src_base = XEXP (part[1][nparts - 1], 0);
25715 /* Compensate for the stack decrement by 4. */
25716 if (!TARGET_64BIT && nparts == 3
25717 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25718 src_base = plus_constant (Pmode, src_base, 4);
25720 /* src_base refers to the stack pointer and is
25721 automatically decreased by emitted push. */
25722 for (i = 0; i < nparts; i++)
25723 part[1][i] = change_address (part[1][i],
25724 GET_MODE (part[1][i]), src_base);
25727 /* We need to do the copy in the right order in case an address register
25728 of the source overlaps the destination. */
25729 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25731 rtx tmp;
25733 for (i = 0; i < nparts; i++)
25735 collisionparts[i]
25736 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25737 if (collisionparts[i])
25738 collisions++;
25741 /* Collision in the middle part can be handled by reordering. */
25742 if (collisions == 1 && nparts == 3 && collisionparts [1])
25744 std::swap (part[0][1], part[0][2]);
25745 std::swap (part[1][1], part[1][2]);
25747 else if (collisions == 1
25748 && nparts == 4
25749 && (collisionparts [1] || collisionparts [2]))
25751 if (collisionparts [1])
25753 std::swap (part[0][1], part[0][2]);
25754 std::swap (part[1][1], part[1][2]);
25756 else
25758 std::swap (part[0][2], part[0][3]);
25759 std::swap (part[1][2], part[1][3]);
25763 /* If there are more collisions, we can't handle it by reordering.
25764 Do an lea to the last part and use only one colliding move. */
25765 else if (collisions > 1)
25767 rtx base, addr;
25769 collisions = 1;
25771 base = part[0][nparts - 1];
25773 /* Handle the case when the last part isn't valid for lea.
25774 Happens in 64-bit mode storing the 12-byte XFmode. */
25775 if (GET_MODE (base) != Pmode)
25776 base = gen_rtx_REG (Pmode, REGNO (base));
25778 addr = XEXP (part[1][0], 0);
25779 if (TARGET_TLS_DIRECT_SEG_REFS)
25781 struct ix86_address parts;
25782 int ok = ix86_decompose_address (addr, &parts);
25783 gcc_assert (ok);
25784 /* It is not valid to use %gs: or %fs: in lea. */
25785 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25787 emit_insn (gen_rtx_SET (base, addr));
25788 part[1][0] = replace_equiv_address (part[1][0], base);
25789 for (i = 1; i < nparts; i++)
25791 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25792 part[1][i] = replace_equiv_address (part[1][i], tmp);
25797 if (push)
25799 if (!TARGET_64BIT)
25801 if (nparts == 3)
25803 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25804 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25805 stack_pointer_rtx, GEN_INT (-4)));
25806 emit_move_insn (part[0][2], part[1][2]);
25808 else if (nparts == 4)
25810 emit_move_insn (part[0][3], part[1][3]);
25811 emit_move_insn (part[0][2], part[1][2]);
25814 else
25816 /* In 64-bit mode we don't have a 32-bit push available. If the operand is a
25817 register, that is OK - we will just use the larger counterpart. We also
25818 retype memory - this comes from an attempt to avoid a REX prefix on
25819 moving the second half of a TFmode value. */
25820 if (GET_MODE (part[1][1]) == SImode)
25822 switch (GET_CODE (part[1][1]))
25824 case MEM:
25825 part[1][1] = adjust_address (part[1][1], DImode, 0);
25826 break;
25828 case REG:
25829 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25830 break;
25832 default:
25833 gcc_unreachable ();
25836 if (GET_MODE (part[1][0]) == SImode)
25837 part[1][0] = part[1][1];
25840 emit_move_insn (part[0][1], part[1][1]);
25841 emit_move_insn (part[0][0], part[1][0]);
25842 return;
25845 /* Choose correct order to not overwrite the source before it is copied. */
25846 if ((REG_P (part[0][0])
25847 && REG_P (part[1][1])
25848 && (REGNO (part[0][0]) == REGNO (part[1][1])
25849 || (nparts == 3
25850 && REGNO (part[0][0]) == REGNO (part[1][2]))
25851 || (nparts == 4
25852 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25853 || (collisions > 0
25854 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25856 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25858 operands[2 + i] = part[0][j];
25859 operands[6 + i] = part[1][j];
25862 else
25864 for (i = 0; i < nparts; i++)
25866 operands[2 + i] = part[0][i];
25867 operands[6 + i] = part[1][i];
25871 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25872 if (optimize_insn_for_size_p ())
25874 for (j = 0; j < nparts - 1; j++)
25875 if (CONST_INT_P (operands[6 + j])
25876 && operands[6 + j] != const0_rtx
25877 && REG_P (operands[2 + j]))
25878 for (i = j; i < nparts - 1; i++)
25879 if (CONST_INT_P (operands[7 + i])
25880 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25881 operands[7 + i] = operands[2 + j];
25884 for (i = 0; i < nparts; i++)
25885 emit_move_insn (operands[2 + i], operands[6 + i]);
25887 return;
25890 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25891 left shift by a constant, either using a single shift or
25892 a sequence of add instructions. */
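/* Illustrative sketch (not from the original source): a shift by 1 is always
   emitted as "add reg, reg"; shifts by larger constants become a chain of
   such adds when COUNT times the add cost does not exceed the cost of a
   constant shift and we are not optimizing for size, e.g. x <<= 2 may be two
   self-adds on cores where add is cheaper than shl.  Otherwise a single
   shift insn is emitted.  */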
25894 static void
25895 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25897 rtx (*insn)(rtx, rtx, rtx);
25899 if (count == 1
25900 || (count * ix86_cost->add <= ix86_cost->shift_const
25901 && !optimize_insn_for_size_p ()))
25903 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25904 while (count-- > 0)
25905 emit_insn (insn (operand, operand, operand));
25907 else
25909 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25910 emit_insn (insn (operand, operand, GEN_INT (count)));
25914 void
25915 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25917 rtx (*gen_ashl3)(rtx, rtx, rtx);
25918 rtx (*gen_shld)(rtx, rtx, rtx);
25919 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25921 rtx low[2], high[2];
25922 int count;
25924 if (CONST_INT_P (operands[2]))
25926 split_double_mode (mode, operands, 2, low, high);
25927 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25929 if (count >= half_width)
25931 emit_move_insn (high[0], low[1]);
25932 emit_move_insn (low[0], const0_rtx);
25934 if (count > half_width)
25935 ix86_expand_ashl_const (high[0], count - half_width, mode);
25937 else
25939 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25941 if (!rtx_equal_p (operands[0], operands[1]))
25942 emit_move_insn (operands[0], operands[1]);
25944 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25945 ix86_expand_ashl_const (low[0], count, mode);
25947 return;
25950 split_double_mode (mode, operands, 1, low, high);
25952 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25954 if (operands[1] == const1_rtx)
25956 /* Assuming we've chosen QImode-capable registers, 1 << N
25957 can be done with two 32/64-bit shifts, no branches, no cmoves. */
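/* Illustrative sketch of the branch-free sequence built below (not from the
   original source), for a DImode shift on a 32-bit target:

     xor   low, low          ; clear both halves
     xor   high, high
     test  count, 32         ; is bit 5 of the count set?
     sete  low_byte          ; low  = (count & 32) == 0
     setne high_byte         ; high = (count & 32) != 0
     shl   low, count        ; hardware masks the count to 0..31
     shl   high, count

   so exactly one half ends up holding the single set bit, in the right
   position.  */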
25958 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25960 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25962 ix86_expand_clear (low[0]);
25963 ix86_expand_clear (high[0]);
25964 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25966 d = gen_lowpart (QImode, low[0]);
25967 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25968 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25969 emit_insn (gen_rtx_SET (d, s));
25971 d = gen_lowpart (QImode, high[0]);
25972 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25973 s = gen_rtx_NE (QImode, flags, const0_rtx);
25974 emit_insn (gen_rtx_SET (d, s));
25977 /* Otherwise, we can get the same results by manually performing
25978 a bit extract operation on bit 5/6, and then performing the two
25979 shifts. The two methods of getting 0/1 into low/high are exactly
25980 the same size. Avoiding the shift in the bit extract case helps
25981 pentium4 a bit; no one else seems to care much either way. */
25982 else
25984 machine_mode half_mode;
25985 rtx (*gen_lshr3)(rtx, rtx, rtx);
25986 rtx (*gen_and3)(rtx, rtx, rtx);
25987 rtx (*gen_xor3)(rtx, rtx, rtx);
25988 HOST_WIDE_INT bits;
25989 rtx x;
25991 if (mode == DImode)
25993 half_mode = SImode;
25994 gen_lshr3 = gen_lshrsi3;
25995 gen_and3 = gen_andsi3;
25996 gen_xor3 = gen_xorsi3;
25997 bits = 5;
25999 else
26001 half_mode = DImode;
26002 gen_lshr3 = gen_lshrdi3;
26003 gen_and3 = gen_anddi3;
26004 gen_xor3 = gen_xordi3;
26005 bits = 6;
26008 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26009 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26010 else
26011 x = gen_lowpart (half_mode, operands[2]);
26012 emit_insn (gen_rtx_SET (high[0], x));
26014 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26015 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26016 emit_move_insn (low[0], high[0]);
26017 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26020 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26021 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26022 return;
26025 if (operands[1] == constm1_rtx)
26027 /* For -1 << N, we can avoid the shld instruction, because we
26028 know that we're shifting 0...31/63 ones into a -1. */
26029 emit_move_insn (low[0], constm1_rtx);
26030 if (optimize_insn_for_size_p ())
26031 emit_move_insn (high[0], low[0]);
26032 else
26033 emit_move_insn (high[0], constm1_rtx);
26035 else
26037 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26039 if (!rtx_equal_p (operands[0], operands[1]))
26040 emit_move_insn (operands[0], operands[1]);
26042 split_double_mode (mode, operands, 1, low, high);
26043 emit_insn (gen_shld (high[0], low[0], operands[2]));
26046 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26048 if (TARGET_CMOVE && scratch)
26050 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26051 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26053 ix86_expand_clear (scratch);
26054 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26056 else
26058 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26059 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26061 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26065 void
26066 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26068 rtx (*gen_ashr3)(rtx, rtx, rtx)
26069 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26070 rtx (*gen_shrd)(rtx, rtx, rtx);
26071 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26073 rtx low[2], high[2];
26074 int count;
26076 if (CONST_INT_P (operands[2]))
26078 split_double_mode (mode, operands, 2, low, high);
26079 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26081 if (count == GET_MODE_BITSIZE (mode) - 1)
26083 emit_move_insn (high[0], high[1]);
26084 emit_insn (gen_ashr3 (high[0], high[0],
26085 GEN_INT (half_width - 1)));
26086 emit_move_insn (low[0], high[0]);
26089 else if (count >= half_width)
26091 emit_move_insn (low[0], high[1]);
26092 emit_move_insn (high[0], low[0]);
26093 emit_insn (gen_ashr3 (high[0], high[0],
26094 GEN_INT (half_width - 1)));
26096 if (count > half_width)
26097 emit_insn (gen_ashr3 (low[0], low[0],
26098 GEN_INT (count - half_width)));
26100 else
26102 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26104 if (!rtx_equal_p (operands[0], operands[1]))
26105 emit_move_insn (operands[0], operands[1]);
26107 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26108 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26111 else
26113 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26115 if (!rtx_equal_p (operands[0], operands[1]))
26116 emit_move_insn (operands[0], operands[1]);
26118 split_double_mode (mode, operands, 1, low, high);
26120 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26121 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26123 if (TARGET_CMOVE && scratch)
26125 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26126 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26128 emit_move_insn (scratch, high[0]);
26129 emit_insn (gen_ashr3 (scratch, scratch,
26130 GEN_INT (half_width - 1)));
26131 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26132 scratch));
26134 else
26136 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26137 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26139 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26144 void
26145 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26147 rtx (*gen_lshr3)(rtx, rtx, rtx)
26148 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26149 rtx (*gen_shrd)(rtx, rtx, rtx);
26150 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26152 rtx low[2], high[2];
26153 int count;
26155 if (CONST_INT_P (operands[2]))
26157 split_double_mode (mode, operands, 2, low, high);
26158 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26160 if (count >= half_width)
26162 emit_move_insn (low[0], high[1]);
26163 ix86_expand_clear (high[0]);
26165 if (count > half_width)
26166 emit_insn (gen_lshr3 (low[0], low[0],
26167 GEN_INT (count - half_width)));
26169 else
26171 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26173 if (!rtx_equal_p (operands[0], operands[1]))
26174 emit_move_insn (operands[0], operands[1]);
26176 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26177 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26180 else
26182 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26184 if (!rtx_equal_p (operands[0], operands[1]))
26185 emit_move_insn (operands[0], operands[1]);
26187 split_double_mode (mode, operands, 1, low, high);
26189 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26190 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26192 if (TARGET_CMOVE && scratch)
26194 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26195 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26197 ix86_expand_clear (scratch);
26198 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26199 scratch));
26201 else
26203 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26204 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26206 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26211 /* Predict just emitted jump instruction to be taken with probability PROB. */
26212 static void
26213 predict_jump (int prob)
26215 rtx_insn *insn = get_last_insn ();
26216 gcc_assert (JUMP_P (insn));
26217 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26220 /* Helper function for the string operations below. Test VARIABLE whether
26221 it is aligned to VALUE bytes. If so, jump to the label. */
26222 static rtx_code_label *
26223 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26225 rtx_code_label *label = gen_label_rtx ();
26226 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26227 if (GET_MODE (variable) == DImode)
26228 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26229 else
26230 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26231 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26232 1, label);
26233 if (epilogue)
26234 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26235 else
26236 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26237 return label;
26240 /* Adjust COUNTREG by subtracting VALUE. */
26241 static void
26242 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26244 rtx (*gen_add)(rtx, rtx, rtx)
26245 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26247 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26250 /* Zero extend possibly SImode EXP to Pmode register. */
26252 ix86_zero_extend_to_Pmode (rtx exp)
26254 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26257 /* Divide COUNTREG by SCALE. */
26258 static rtx
26259 scale_counter (rtx countreg, int scale)
26261 rtx sc;
26263 if (scale == 1)
26264 return countreg;
26265 if (CONST_INT_P (countreg))
26266 return GEN_INT (INTVAL (countreg) / scale);
26267 gcc_assert (REG_P (countreg));
26269 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26270 GEN_INT (exact_log2 (scale)),
26271 NULL, 1, OPTAB_DIRECT);
26272 return sc;
26275 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26276 DImode for constant loop counts. */
26278 static machine_mode
26279 counter_mode (rtx count_exp)
26281 if (GET_MODE (count_exp) != VOIDmode)
26282 return GET_MODE (count_exp);
26283 if (!CONST_INT_P (count_exp))
26284 return Pmode;
26285 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26286 return DImode;
26287 return SImode;
26290 /* Copy the address to a Pmode register. This is used for x32 to
26291 truncate DImode TLS address to a SImode register. */
26293 static rtx
26294 ix86_copy_addr_to_reg (rtx addr)
26296 rtx reg;
26297 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26299 reg = copy_addr_to_reg (addr);
26300 REG_POINTER (reg) = 1;
26301 return reg;
26303 else
26305 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26306 reg = copy_to_mode_reg (DImode, addr);
26307 REG_POINTER (reg) = 1;
26308 return gen_rtx_SUBREG (SImode, reg, 0);
26312 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by SRCPTR
26313 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is COUNT,
26314 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
26315 memory with VALUE (supposed to be in MODE).
26317 The size is rounded down to whole number of chunk size moved at once.
26318 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
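/* Illustrative pseudocode for the emitted loop (a sketch, not from the
   original source), with UNROLL chunks of MODE per iteration:

     size = count & ~(unroll * chunk_size - 1);
     if (size == 0) goto out;            // emitted only for 1-byte chunks
     iter = 0;
     do {
       copy (or store VALUE into) UNROLL chunks at dest+iter [, src+iter];
       iter += unroll * chunk_size;
     } while (iter < size);
     dest += iter;  if (!ISSETMEM) src += iter;
   out:
 */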
26321 static void
26322 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26323 rtx destptr, rtx srcptr, rtx value,
26324 rtx count, machine_mode mode, int unroll,
26325 int expected_size, bool issetmem)
26327 rtx_code_label *out_label, *top_label;
26328 rtx iter, tmp;
26329 machine_mode iter_mode = counter_mode (count);
26330 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26331 rtx piece_size = GEN_INT (piece_size_n);
26332 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26333 rtx size;
26334 int i;
26336 top_label = gen_label_rtx ();
26337 out_label = gen_label_rtx ();
26338 iter = gen_reg_rtx (iter_mode);
26340 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26341 NULL, 1, OPTAB_DIRECT);
26342 /* Those two should combine. */
26343 if (piece_size == const1_rtx)
26345 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26346 true, out_label);
26347 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26349 emit_move_insn (iter, const0_rtx);
26351 emit_label (top_label);
26353 tmp = convert_modes (Pmode, iter_mode, iter, true);
26355 /* This assert could be relaxed - in that case we would need to compute
26356 the smallest power of two containing PIECE_SIZE_N and pass it to
26357 offset_address. */
26358 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26359 destmem = offset_address (destmem, tmp, piece_size_n);
26360 destmem = adjust_address (destmem, mode, 0);
26362 if (!issetmem)
26364 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26365 srcmem = adjust_address (srcmem, mode, 0);
26367 /* When unrolling for chips that reorder memory reads and writes,
26368 we can save registers by using a single temporary.
26369 Also, using 4 temporaries is overkill in 32-bit mode. */
26370 if (!TARGET_64BIT && 0)
26372 for (i = 0; i < unroll; i++)
26374 if (i)
26376 destmem =
26377 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26378 srcmem =
26379 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26381 emit_move_insn (destmem, srcmem);
26384 else
26386 rtx tmpreg[4];
26387 gcc_assert (unroll <= 4);
26388 for (i = 0; i < unroll; i++)
26390 tmpreg[i] = gen_reg_rtx (mode);
26391 if (i)
26393 srcmem =
26394 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26396 emit_move_insn (tmpreg[i], srcmem);
26398 for (i = 0; i < unroll; i++)
26400 if (i)
26402 destmem =
26403 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26405 emit_move_insn (destmem, tmpreg[i]);
26409 else
26410 for (i = 0; i < unroll; i++)
26412 if (i)
26413 destmem =
26414 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26415 emit_move_insn (destmem, value);
26418 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26419 true, OPTAB_LIB_WIDEN);
26420 if (tmp != iter)
26421 emit_move_insn (iter, tmp);
26423 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26424 true, top_label);
26425 if (expected_size != -1)
26427 expected_size /= GET_MODE_SIZE (mode) * unroll;
26428 if (expected_size == 0)
26429 predict_jump (0);
26430 else if (expected_size > REG_BR_PROB_BASE)
26431 predict_jump (REG_BR_PROB_BASE - 1);
26432 else
26433 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26435 else
26436 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26437 iter = ix86_zero_extend_to_Pmode (iter);
26438 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26439 true, OPTAB_LIB_WIDEN);
26440 if (tmp != destptr)
26441 emit_move_insn (destptr, tmp);
26442 if (!issetmem)
26444 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26445 true, OPTAB_LIB_WIDEN);
26446 if (tmp != srcptr)
26447 emit_move_insn (srcptr, tmp);
26449 emit_label (out_label);
26452 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26453 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26454 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26455 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26456 ORIG_VALUE is the original value passed to memset to fill the memory with.
26457 Other arguments have same meaning as for previous function. */
26459 static void
26460 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26461 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26462 rtx count,
26463 machine_mode mode, bool issetmem)
26465 rtx destexp;
26466 rtx srcexp;
26467 rtx countreg;
26468 HOST_WIDE_INT rounded_count;
26470 /* If possible, it is shorter to use rep movs.
26471 TODO: Maybe it is better to move this logic to decide_alg. */
26472 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26473 && (!issetmem || orig_value == const0_rtx))
26474 mode = SImode;
26476 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26477 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26479 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26480 GET_MODE_SIZE (mode)));
26481 if (mode != QImode)
26483 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26484 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26485 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26487 else
26488 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26489 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26491 rounded_count
26492 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26493 destmem = shallow_copy_rtx (destmem);
26494 set_mem_size (destmem, rounded_count);
26496 else if (MEM_SIZE_KNOWN_P (destmem))
26497 clear_mem_size (destmem);
26499 if (issetmem)
26501 value = force_reg (mode, gen_lowpart (mode, value));
26502 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26504 else
26506 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26507 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26508 if (mode != QImode)
26510 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26511 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26512 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26514 else
26515 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26516 if (CONST_INT_P (count))
26518 rounded_count
26519 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26520 srcmem = shallow_copy_rtx (srcmem);
26521 set_mem_size (srcmem, rounded_count);
26523 else
26525 if (MEM_SIZE_KNOWN_P (srcmem))
26526 clear_mem_size (srcmem);
26528 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26529 destexp, srcexp));
26533 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26534 DESTMEM.
26535 SRCMEM is passed by pointer so it can be updated on return.
26536 The return value is the updated DESTMEM. */
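/* Illustrative sketch (not from the original source): for a 16-byte
   SIZE_TO_MOVE the widest piece tried is 16 bytes; if a 16-byte integer or
   vector move pattern is available it is used as a single load/store through
   a temporary register, otherwise the piece size is halved until a supported
   mode such as DImode or the word mode is found and the copy is emitted as
   several such moves.  */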
26537 static rtx
26538 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26539 HOST_WIDE_INT size_to_move)
26541 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26542 enum insn_code code;
26543 machine_mode move_mode;
26544 int piece_size, i;
26546 /* Find the widest mode in which we could perform moves.
26547 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
26548 it until a move of that size is supported. */
26549 piece_size = 1 << floor_log2 (size_to_move);
26550 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26551 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26553 gcc_assert (piece_size > 1);
26554 piece_size >>= 1;
26557 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26558 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26559 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26561 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26562 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26563 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26565 move_mode = word_mode;
26566 piece_size = GET_MODE_SIZE (move_mode);
26567 code = optab_handler (mov_optab, move_mode);
26570 gcc_assert (code != CODE_FOR_nothing);
26572 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26573 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26575 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26576 gcc_assert (size_to_move % piece_size == 0);
26577 adjust = GEN_INT (piece_size);
26578 for (i = 0; i < size_to_move; i += piece_size)
26580 /* We move from memory to memory, so we'll need to do it via
26581 a temporary register. */
26582 tempreg = gen_reg_rtx (move_mode);
26583 emit_insn (GEN_FCN (code) (tempreg, src));
26584 emit_insn (GEN_FCN (code) (dst, tempreg));
26586 emit_move_insn (destptr,
26587 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26588 emit_move_insn (srcptr,
26589 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26591 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26592 piece_size);
26593 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26594 piece_size);
26597 /* Update DST and SRC rtx. */
26598 *srcmem = src;
26599 return dst;
26602 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
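/* Illustrative example (hypothetical numbers, not from the original source):
   with a constant COUNT of 1037 and MAX_SIZE of 16 the epilogue has to copy
   1037 % 16 = 13 = 8 + 4 + 1 bytes, so one 8-byte, one 4-byte and one 1-byte
   move are emitted.  For a non-constant COUNT the residue is handled either
   by a byte loop (MAX_SIZE > 8) or by alignment-test guarded 4/2/1-byte
   moves.  */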
26603 static void
26604 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26605 rtx destptr, rtx srcptr, rtx count, int max_size)
26607 rtx src, dest;
26608 if (CONST_INT_P (count))
26610 HOST_WIDE_INT countval = INTVAL (count);
26611 HOST_WIDE_INT epilogue_size = countval % max_size;
26612 int i;
26614 /* For now MAX_SIZE should be a power of 2. This assert could be
26615 relaxed, but it'll require a bit more complicated epilogue
26616 expanding. */
26617 gcc_assert ((max_size & (max_size - 1)) == 0);
26618 for (i = max_size; i >= 1; i >>= 1)
26620 if (epilogue_size & i)
26621 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26623 return;
26625 if (max_size > 8)
26627 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26628 count, 1, OPTAB_DIRECT);
26629 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26630 count, QImode, 1, 4, false);
26631 return;
26634 /* When there are stringops, we can cheaply increase dest and src pointers.
26635 Otherwise we save code size by maintaining offset (zero is readily
26636 available from preceding rep operation) and using x86 addressing modes.
26638 if (TARGET_SINGLE_STRINGOP)
26640 if (max_size > 4)
26642 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26643 src = change_address (srcmem, SImode, srcptr);
26644 dest = change_address (destmem, SImode, destptr);
26645 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26646 emit_label (label);
26647 LABEL_NUSES (label) = 1;
26649 if (max_size > 2)
26651 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26652 src = change_address (srcmem, HImode, srcptr);
26653 dest = change_address (destmem, HImode, destptr);
26654 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26655 emit_label (label);
26656 LABEL_NUSES (label) = 1;
26658 if (max_size > 1)
26660 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26661 src = change_address (srcmem, QImode, srcptr);
26662 dest = change_address (destmem, QImode, destptr);
26663 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26664 emit_label (label);
26665 LABEL_NUSES (label) = 1;
26668 else
26670 rtx offset = force_reg (Pmode, const0_rtx);
26671 rtx tmp;
26673 if (max_size > 4)
26675 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26676 src = change_address (srcmem, SImode, srcptr);
26677 dest = change_address (destmem, SImode, destptr);
26678 emit_move_insn (dest, src);
26679 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26680 true, OPTAB_LIB_WIDEN);
26681 if (tmp != offset)
26682 emit_move_insn (offset, tmp);
26683 emit_label (label);
26684 LABEL_NUSES (label) = 1;
26686 if (max_size > 2)
26688 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26689 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26690 src = change_address (srcmem, HImode, tmp);
26691 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26692 dest = change_address (destmem, HImode, tmp);
26693 emit_move_insn (dest, src);
26694 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26695 true, OPTAB_LIB_WIDEN);
26696 if (tmp != offset)
26697 emit_move_insn (offset, tmp);
26698 emit_label (label);
26699 LABEL_NUSES (label) = 1;
26701 if (max_size > 1)
26703 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26704 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26705 src = change_address (srcmem, QImode, tmp);
26706 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26707 dest = change_address (destmem, QImode, tmp);
26708 emit_move_insn (dest, src);
26709 emit_label (label);
26710 LABEL_NUSES (label) = 1;
26715 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26716 with value PROMOTED_VAL.
26717 DESTPTR is advanced as the stores are emitted.
26718 The return value is the updated DESTMEM. */
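/* Illustrative sketch (not from the original source): clearing 16 bytes with
   a DImode PROMOTED_VAL on a 64-bit target emits two 8-byte strset stores;
   when PROMOTED_VAL is a vector (e.g. V16QImode) a single 16-byte store is
   emitted instead.  */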
26719 static rtx
26720 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26721 HOST_WIDE_INT size_to_move)
26723 rtx dst = destmem, adjust;
26724 enum insn_code code;
26725 machine_mode move_mode;
26726 int piece_size, i;
26728 /* Find the widest mode in which we could perform moves.
26729 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
26730 it until a move of that size is supported. */
26731 move_mode = GET_MODE (promoted_val);
26732 if (move_mode == VOIDmode)
26733 move_mode = QImode;
26734 if (size_to_move < GET_MODE_SIZE (move_mode))
26736 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26737 move_mode = int_mode_for_size (move_bits, 0).require ();
26738 promoted_val = gen_lowpart (move_mode, promoted_val);
26740 piece_size = GET_MODE_SIZE (move_mode);
26741 code = optab_handler (mov_optab, move_mode);
26742 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26744 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26746 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26747 gcc_assert (size_to_move % piece_size == 0);
26748 adjust = GEN_INT (piece_size);
26749 for (i = 0; i < size_to_move; i += piece_size)
26751 if (piece_size <= GET_MODE_SIZE (word_mode))
26753 emit_insn (gen_strset (destptr, dst, promoted_val));
26754 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26755 piece_size);
26756 continue;
26759 emit_insn (GEN_FCN (code) (dst, promoted_val));
26761 emit_move_insn (destptr,
26762 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26764 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26765 piece_size);
26768 /* Update DST rtx. */
26769 return dst;
26771 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
26772 static void
26773 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26774 rtx count, int max_size)
26776 count =
26777 expand_simple_binop (counter_mode (count), AND, count,
26778 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26779 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26780 gen_lowpart (QImode, value), count, QImode,
26781 1, max_size / 2, true);
26784 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
26785 static void
26786 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26787 rtx count, int max_size)
26789 rtx dest;
26791 if (CONST_INT_P (count))
26793 HOST_WIDE_INT countval = INTVAL (count);
26794 HOST_WIDE_INT epilogue_size = countval % max_size;
26795 int i;
26797 /* For now MAX_SIZE should be a power of 2. This assert could be
26798 relaxed, but it'll require a bit more complicated epilogue
26799 expanding. */
26800 gcc_assert ((max_size & (max_size - 1)) == 0);
26801 for (i = max_size; i >= 1; i >>= 1)
26803 if (epilogue_size & i)
26805 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26806 destmem = emit_memset (destmem, destptr, vec_value, i);
26807 else
26808 destmem = emit_memset (destmem, destptr, value, i);
26811 return;
26813 if (max_size > 32)
26815 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26816 return;
26818 if (max_size > 16)
26820 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26821 if (TARGET_64BIT)
26823 dest = change_address (destmem, DImode, destptr);
26824 emit_insn (gen_strset (destptr, dest, value));
26825 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26826 emit_insn (gen_strset (destptr, dest, value));
26828 else
26830 dest = change_address (destmem, SImode, destptr);
26831 emit_insn (gen_strset (destptr, dest, value));
26832 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26833 emit_insn (gen_strset (destptr, dest, value));
26834 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26835 emit_insn (gen_strset (destptr, dest, value));
26836 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26837 emit_insn (gen_strset (destptr, dest, value));
26839 emit_label (label);
26840 LABEL_NUSES (label) = 1;
26842 if (max_size > 8)
26844 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26845 if (TARGET_64BIT)
26847 dest = change_address (destmem, DImode, destptr);
26848 emit_insn (gen_strset (destptr, dest, value));
26850 else
26852 dest = change_address (destmem, SImode, destptr);
26853 emit_insn (gen_strset (destptr, dest, value));
26854 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26855 emit_insn (gen_strset (destptr, dest, value));
26857 emit_label (label);
26858 LABEL_NUSES (label) = 1;
26860 if (max_size > 4)
26862 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26863 dest = change_address (destmem, SImode, destptr);
26864 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26865 emit_label (label);
26866 LABEL_NUSES (label) = 1;
26868 if (max_size > 2)
26870 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26871 dest = change_address (destmem, HImode, destptr);
26872 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26873 emit_label (label);
26874 LABEL_NUSES (label) = 1;
26876 if (max_size > 1)
26878 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26879 dest = change_address (destmem, QImode, destptr);
26880 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26881 emit_label (label);
26882 LABEL_NUSES (label) = 1;
26886 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26887 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26888 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26889 ignored.
26890 Return value is updated DESTMEM. */
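/* Illustrative sketch (not from the original source): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 16, four alignment tests are emitted; each one checks
   a single bit of DESTPTR and, when it is set, copies (or stores) 1, 2, 4 or
   8 bytes and decreases COUNT accordingly, so the main loop that follows
   starts on a 16-byte boundary.  */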
26891 static rtx
26892 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26893 rtx destptr, rtx srcptr, rtx value,
26894 rtx vec_value, rtx count, int align,
26895 int desired_alignment, bool issetmem)
26897 int i;
26898 for (i = 1; i < desired_alignment; i <<= 1)
26900 if (align <= i)
26902 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26903 if (issetmem)
26905 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26906 destmem = emit_memset (destmem, destptr, vec_value, i);
26907 else
26908 destmem = emit_memset (destmem, destptr, value, i);
26910 else
26911 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26912 ix86_adjust_counter (count, i);
26913 emit_label (label);
26914 LABEL_NUSES (label) = 1;
26915 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26918 return destmem;
26921 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26922 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26923 and jump to DONE_LABEL. */
26924 static void
26925 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26926 rtx destptr, rtx srcptr,
26927 rtx value, rtx vec_value,
26928 rtx count, int size,
26929 rtx done_label, bool issetmem)
26931 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26932 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26933 rtx modesize;
26934 int n;
26936 /* If we do not have a vector value to copy, we must reduce the size. */
26937 if (issetmem)
26939 if (!vec_value)
26941 if (GET_MODE (value) == VOIDmode && size > 8)
26942 mode = Pmode;
26943 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26944 mode = GET_MODE (value);
26946 else
26947 mode = GET_MODE (vec_value), value = vec_value;
26949 else
26951 /* Choose appropriate vector mode. */
26952 if (size >= 32)
26953 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26954 else if (size >= 16)
26955 mode = TARGET_SSE ? V16QImode : DImode;
26956 srcmem = change_address (srcmem, mode, srcptr);
26958 destmem = change_address (destmem, mode, destptr);
26959 modesize = GEN_INT (GET_MODE_SIZE (mode));
26960 gcc_assert (GET_MODE_SIZE (mode) <= size);
26961 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26963 if (issetmem)
26964 emit_move_insn (destmem, gen_lowpart (mode, value));
26965 else
26967 emit_move_insn (destmem, srcmem);
26968 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26970 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26973 destmem = offset_address (destmem, count, 1);
26974 destmem = offset_address (destmem, GEN_INT (-2 * size),
26975 GET_MODE_SIZE (mode));
26976 if (!issetmem)
26978 srcmem = offset_address (srcmem, count, 1);
26979 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26980 GET_MODE_SIZE (mode));
26982 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26984 if (issetmem)
26985 emit_move_insn (destmem, gen_lowpart (mode, value));
26986 else
26988 emit_move_insn (destmem, srcmem);
26989 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26991 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26993 emit_jump_insn (gen_jump (done_label));
26994 emit_barrier ();
26996 emit_label (label);
26997 LABEL_NUSES (label) = 1;
27000 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27001 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27002 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
27003 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27004 DONE_LABEL is a label after the whole copying sequence. The label is created
27005 on demand if *DONE_LABEL is NULL.
27006 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for new
27007 bounds after the initial copies.
27009 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27010 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27011 we will dispatch to a library call for large blocks.
27013 In pseudocode we do:
27015 if (COUNT < SIZE)
27017 Assume that SIZE is 4. Bigger sizes are handled analogously
27018 if (COUNT & 4)
27020 copy 4 bytes from SRCPTR to DESTPTR
27021 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27022 goto done_label
27024 if (!COUNT)
27025 goto done_label;
27026 copy 1 byte from SRCPTR to DESTPTR
27027 if (COUNT & 2)
27029 copy 2 bytes from SRCPTR to DESTPTR
27030 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27033 else
27035 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27036 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27038 OLD_DESTPTR = DESTPTR;
27039 Align DESTPTR up to DESIRED_ALIGN
27040 SRCPTR += DESTPTR - OLD_DESTPTR
27041 COUNT -= DEST_PTR - OLD_DESTPTR
27042 if (DYNAMIC_CHECK)
27043 Round COUNT down to multiple of SIZE
27044 << optional caller supplied zero size guard is here >>
27045 << optional caller supplied dynamic check is here >>
27046 << caller supplied main copy loop is here >>
27048 done_label:
27050 static void
27051 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27052 rtx *destptr, rtx *srcptr,
27053 machine_mode mode,
27054 rtx value, rtx vec_value,
27055 rtx *count,
27056 rtx_code_label **done_label,
27057 int size,
27058 int desired_align,
27059 int align,
27060 unsigned HOST_WIDE_INT *min_size,
27061 bool dynamic_check,
27062 bool issetmem)
27064 rtx_code_label *loop_label = NULL, *label;
27065 int n;
27066 rtx modesize;
27067 int prolog_size = 0;
27068 rtx mode_value;
27070 /* Choose the proper value to copy. */
27071 if (issetmem && VECTOR_MODE_P (mode))
27072 mode_value = vec_value;
27073 else
27074 mode_value = value;
27075 gcc_assert (GET_MODE_SIZE (mode) <= size);
27077 /* See if block is big or small, handle small blocks. */
27078 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27080 int size2 = size;
27081 loop_label = gen_label_rtx ();
27083 if (!*done_label)
27084 *done_label = gen_label_rtx ();
27086 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27087 1, loop_label);
27088 size2 >>= 1;
27090 /* Handle sizes > 3. */
27091 for (;size2 > 2; size2 >>= 1)
27092 expand_small_movmem_or_setmem (destmem, srcmem,
27093 *destptr, *srcptr,
27094 value, vec_value,
27095 *count,
27096 size2, *done_label, issetmem);
27097 /* Nothing to copy? Jump to DONE_LABEL if so. */
27098 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27099 1, *done_label);
27101 /* Do a byte copy. */
27102 destmem = change_address (destmem, QImode, *destptr);
27103 if (issetmem)
27104 emit_move_insn (destmem, gen_lowpart (QImode, value));
27105 else
27107 srcmem = change_address (srcmem, QImode, *srcptr);
27108 emit_move_insn (destmem, srcmem);
27111 /* Handle sizes 2 and 3. */
27112 label = ix86_expand_aligntest (*count, 2, false);
27113 destmem = change_address (destmem, HImode, *destptr);
27114 destmem = offset_address (destmem, *count, 1);
27115 destmem = offset_address (destmem, GEN_INT (-2), 2);
27116 if (issetmem)
27117 emit_move_insn (destmem, gen_lowpart (HImode, value));
27118 else
27120 srcmem = change_address (srcmem, HImode, *srcptr);
27121 srcmem = offset_address (srcmem, *count, 1);
27122 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27123 emit_move_insn (destmem, srcmem);
27126 emit_label (label);
27127 LABEL_NUSES (label) = 1;
27128 emit_jump_insn (gen_jump (*done_label));
27129 emit_barrier ();
27131 else
27132 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27133 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27135 /* Start memcpy for COUNT >= SIZE. */
27136 if (loop_label)
27138 emit_label (loop_label);
27139 LABEL_NUSES (loop_label) = 1;
27142 /* Copy first desired_align bytes. */
27143 if (!issetmem)
27144 srcmem = change_address (srcmem, mode, *srcptr);
27145 destmem = change_address (destmem, mode, *destptr);
27146 modesize = GEN_INT (GET_MODE_SIZE (mode));
27147 for (n = 0; prolog_size < desired_align - align; n++)
27149 if (issetmem)
27150 emit_move_insn (destmem, mode_value);
27151 else
27153 emit_move_insn (destmem, srcmem);
27154 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27156 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27157 prolog_size += GET_MODE_SIZE (mode);
27161 /* Copy last SIZE bytes. */
27162 destmem = offset_address (destmem, *count, 1);
27163 destmem = offset_address (destmem,
27164 GEN_INT (-size - prolog_size),
27166 if (issetmem)
27167 emit_move_insn (destmem, mode_value);
27168 else
27170 srcmem = offset_address (srcmem, *count, 1);
27171 srcmem = offset_address (srcmem,
27172 GEN_INT (-size - prolog_size),
27174 emit_move_insn (destmem, srcmem);
27176 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27178 destmem = offset_address (destmem, modesize, 1);
27179 if (issetmem)
27180 emit_move_insn (destmem, mode_value);
27181 else
27183 srcmem = offset_address (srcmem, modesize, 1);
27184 emit_move_insn (destmem, srcmem);
27188 /* Align destination. */
27189 if (desired_align > 1 && desired_align > align)
27191 rtx saveddest = *destptr;
27193 gcc_assert (desired_align <= size);
27194 /* Align destptr up, place it in a new register. */
27195 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27196 GEN_INT (prolog_size),
27197 NULL_RTX, 1, OPTAB_DIRECT);
27198 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27199 REG_POINTER (*destptr) = 1;
27200 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27201 GEN_INT (-desired_align),
27202 *destptr, 1, OPTAB_DIRECT);
27203 /* See how many bytes we skipped. */
27204 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27205 *destptr,
27206 saveddest, 1, OPTAB_DIRECT);
27207 /* Adjust srcptr and count. */
27208 if (!issetmem)
27209 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27210 saveddest, *srcptr, 1, OPTAB_DIRECT);
27211 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27212 saveddest, *count, 1, OPTAB_DIRECT);
27213 /* We copied at most size + prolog_size. */
27214 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27215 *min_size
27216 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27217 else
27218 *min_size = 0;
27220 /* Our loops always round down the block size, but for dispatch to
27221 library we need the precise value. */
27222 if (dynamic_check)
27223 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27224 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27226 else
27228 gcc_assert (prolog_size == 0);
27229 /* Decrease count, so we won't end up copying last word twice. */
27230 if (!CONST_INT_P (*count))
27231 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27232 constm1_rtx, *count, 1, OPTAB_DIRECT);
27233 else
27234 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27235 (unsigned HOST_WIDE_INT)size));
27236 if (*min_size)
27237 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27242 /* This function is like the previous one, except here we know how many bytes
27243 need to be copied. That allows us to update alignment not only of DST, which
27244 is returned, but also of SRC, which is passed as a pointer for that
27245 reason. */
27246 static rtx
27247 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27248 rtx srcreg, rtx value, rtx vec_value,
27249 int desired_align, int align_bytes,
27250 bool issetmem)
27252 rtx src = NULL;
27253 rtx orig_dst = dst;
27254 rtx orig_src = NULL;
27255 int piece_size = 1;
27256 int copied_bytes = 0;
27258 if (!issetmem)
27260 gcc_assert (srcp != NULL);
27261 src = *srcp;
27262 orig_src = src;
27265 for (piece_size = 1;
27266 piece_size <= desired_align && copied_bytes < align_bytes;
27267 piece_size <<= 1)
27269 if (align_bytes & piece_size)
27271 if (issetmem)
27273 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27274 dst = emit_memset (dst, destreg, vec_value, piece_size);
27275 else
27276 dst = emit_memset (dst, destreg, value, piece_size);
27278 else
27279 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27280 copied_bytes += piece_size;
27283 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27284 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27285 if (MEM_SIZE_KNOWN_P (orig_dst))
27286 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27288 if (!issetmem)
27290 int src_align_bytes = get_mem_align_offset (src, desired_align
27291 * BITS_PER_UNIT);
27292 if (src_align_bytes >= 0)
27293 src_align_bytes = desired_align - src_align_bytes;
27294 if (src_align_bytes >= 0)
27296 unsigned int src_align;
27297 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27299 if ((src_align_bytes & (src_align - 1))
27300 == (align_bytes & (src_align - 1)))
27301 break;
27303 if (src_align > (unsigned int) desired_align)
27304 src_align = desired_align;
27305 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27306 set_mem_align (src, src_align * BITS_PER_UNIT);
27308 if (MEM_SIZE_KNOWN_P (orig_src))
27309 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27310 *srcp = src;
27313 return dst;
27316 /* Return true if ALG can be used in current context.
27317 Assume we expand memset if MEMSET is true. */
27318 static bool
27319 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27321 if (alg == no_stringop)
27322 return false;
27323 if (alg == vector_loop)
27324 return TARGET_SSE || TARGET_AVX;
27325 /* Algorithms using the rep prefix want at least edi and ecx;
27326 additionally, memset wants eax and memcpy wants esi. Don't
27327 consider such algorithms if the user has appropriated those
27328 registers for their own purposes, or if we have a non-default
27329 address space, since some string insns cannot override the segment. */
27330 if (alg == rep_prefix_1_byte
27331 || alg == rep_prefix_4_byte
27332 || alg == rep_prefix_8_byte)
27334 if (have_as)
27335 return false;
27336 if (fixed_regs[CX_REG]
27337 || fixed_regs[DI_REG]
27338 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27339 return false;
27341 return true;
27344 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27345 static enum stringop_alg
27346 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27347 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27348 bool memset, bool zero_memset, bool have_as,
27349 int *dynamic_check, bool *noalign, bool recur)
27351 const struct stringop_algs *algs;
27352 bool optimize_for_speed;
27353 int max = 0;
27354 const struct processor_costs *cost;
27355 int i;
27356 bool any_alg_usable_p = false;
27358 *noalign = false;
27359 *dynamic_check = -1;
27361 /* Even if the string operation call is cold, we still might spend a lot
27362 of time processing large blocks. */
27363 if (optimize_function_for_size_p (cfun)
27364 || (optimize_insn_for_size_p ()
27365 && (max_size < 256
27366 || (expected_size != -1 && expected_size < 256))))
27367 optimize_for_speed = false;
27368 else
27369 optimize_for_speed = true;
27371 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27372 if (memset)
27373 algs = &cost->memset[TARGET_64BIT != 0];
27374 else
27375 algs = &cost->memcpy[TARGET_64BIT != 0];
27377 /* See maximal size for user defined algorithm. */
27378 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27380 enum stringop_alg candidate = algs->size[i].alg;
27381 bool usable = alg_usable_p (candidate, memset, have_as);
27382 any_alg_usable_p |= usable;
27384 if (candidate != libcall && candidate && usable)
27385 max = algs->size[i].max;
27388 /* If expected size is not known but max size is small enough
27389 that the inline version is a win, set expected size into
27390 the range. */
27391 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27392 && expected_size == -1)
27393 expected_size = min_size / 2 + max_size / 2;
27395 /* If user specified the algorithm, honor it if possible. */
27396 if (ix86_stringop_alg != no_stringop
27397 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27398 return ix86_stringop_alg;
27399 /* rep; movq or rep; movl is the smallest variant. */
27400 else if (!optimize_for_speed)
27402 *noalign = true;
27403 if (!count || (count & 3) || (memset && !zero_memset))
27404 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27405 ? rep_prefix_1_byte : loop_1_byte;
27406 else
27407 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27408 ? rep_prefix_4_byte : loop;
27410 /* Very tiny blocks are best handled via the loop; REP is expensive to
27411 set up. */
27412 else if (expected_size != -1 && expected_size < 4)
27413 return loop_1_byte;
27414 else if (expected_size != -1)
27416 enum stringop_alg alg = libcall;
27417 bool alg_noalign = false;
27418 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27420 /* We get here if the algorithms that were not libcall-based
27421 were rep-prefix based and we are unable to use rep prefixes
27422 based on global register usage. Break out of the loop and
27423 use the heuristic below. */
27424 if (algs->size[i].max == 0)
27425 break;
27426 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27428 enum stringop_alg candidate = algs->size[i].alg;
27430 if (candidate != libcall
27431 && alg_usable_p (candidate, memset, have_as))
27433 alg = candidate;
27434 alg_noalign = algs->size[i].noalign;
27436 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27437 last non-libcall inline algorithm. */
27438 if (TARGET_INLINE_ALL_STRINGOPS)
27440 /* When the current size is best to be copied by a libcall,
27441 but we are still forced to inline, run the heuristic below
27442 that will pick code for medium sized blocks. */
27443 if (alg != libcall)
27445 *noalign = alg_noalign;
27446 return alg;
27448 else if (!any_alg_usable_p)
27449 break;
27451 else if (alg_usable_p (candidate, memset, have_as))
27453 *noalign = algs->size[i].noalign;
27454 return candidate;
27459 /* When asked to inline the call anyway, try to pick a meaningful choice.
27460 We look for the maximal size of block that is faster to copy by hand and
27461 take blocks of at most that size, guessing that the average size will
27462 be roughly half of the block.
27464 If this turns out to be bad, we might simply specify the preferred
27465 choice in ix86_costs. */
27466 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27467 && (algs->unknown_size == libcall
27468 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27470 enum stringop_alg alg;
27471 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27473 /* If there aren't any usable algorithms or if recursing already,
27474 then recursing on smaller sizes or same size isn't going to
27475 find anything. Just return the simple byte-at-a-time copy loop. */
27476 if (!any_alg_usable_p || recur)
27478 /* Pick something reasonable. */
27479 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27480 *dynamic_check = 128;
27481 return loop_1_byte;
27483 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27484 zero_memset, have_as, dynamic_check, noalign, true);
27485 gcc_assert (*dynamic_check == -1);
27486 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27487 *dynamic_check = max;
27488 else
27489 gcc_assert (alg != libcall);
27490 return alg;
27492 return (alg_usable_p (algs->unknown_size, memset, have_as)
27493 ? algs->unknown_size : libcall);
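/* Illustrative sketch (example only, not part of the original file) of the
   size-table selection decide_alg performs above: each cost table carries a
   few (max, alg) entries and the first entry whose MAX covers the expected
   size wins; a MAX of -1 acts as a catch-all.  All names below are invented
   for the illustration.  */
#if 0 /* example only, never compiled */
enum sketch_alg { SK_LOOP_1_BYTE, SK_LOOP, SK_REP_PREFIX, SK_LIBCALL };

struct sketch_entry { long max; enum sketch_alg alg; };

static enum sketch_alg
sketch_pick_alg (const struct sketch_entry *table, int nentries,
                 long expected_size)
{
  for (int i = 0; i < nentries; i++)
    if (table[i].max == -1 || expected_size <= table[i].max)
      return table[i].alg;
  return SK_LIBCALL;
}

/* E.g. { {16, SK_LOOP_1_BYTE}, {256, SK_REP_PREFIX}, {-1, SK_LIBCALL} }
   picks a byte loop for tiny blocks, rep-prefixed moves for medium ones
   and a library call otherwise.  */
#endif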
27496 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27497 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27498 static int
27499 decide_alignment (int align,
27500 enum stringop_alg alg,
27501 int expected_size,
27502 machine_mode move_mode)
27504 int desired_align = 0;
27506 gcc_assert (alg != no_stringop);
27508 if (alg == libcall)
27509 return 0;
27510 if (move_mode == VOIDmode)
27511 return 0;
27513 desired_align = GET_MODE_SIZE (move_mode);
27514 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
27515 copying whole cacheline at once. */
27516 if (TARGET_PENTIUMPRO
27517 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27518 desired_align = 8;
27520 if (optimize_size)
27521 desired_align = 1;
27522 if (desired_align < align)
27523 desired_align = align;
27524 if (expected_size != -1 && expected_size < 4)
27525 desired_align = align;
27527 return desired_align;
27531 /* Helper function for memcpy. For QImode value 0xXY produce
27532 0xXYXYXYXY of the width specified by MODE. This is essentially
27533 a * 0x01010101, but we can do slightly better than
27534 synth_mult by unwinding the sequence by hand on CPUs with
27535 slow multiply. */
27536 static rtx
27537 promote_duplicated_reg (machine_mode mode, rtx val)
27539 machine_mode valmode = GET_MODE (val);
27540 rtx tmp;
27541 int nops = mode == DImode ? 3 : 2;
27543 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27544 if (val == const0_rtx)
27545 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27546 if (CONST_INT_P (val))
27548 HOST_WIDE_INT v = INTVAL (val) & 255;
27550 v |= v << 8;
27551 v |= v << 16;
27552 if (mode == DImode)
27553 v |= (v << 16) << 16;
27554 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27557 if (valmode == VOIDmode)
27558 valmode = QImode;
27559 if (valmode != QImode)
27560 val = gen_lowpart (QImode, val);
27561 if (mode == QImode)
27562 return val;
27563 if (!TARGET_PARTIAL_REG_STALL)
27564 nops--;
27565 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27566 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27567 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27568 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27570 rtx reg = convert_modes (mode, QImode, val, true);
27571 tmp = promote_duplicated_reg (mode, const1_rtx);
27572 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27573 OPTAB_DIRECT);
27575 else
27577 rtx reg = convert_modes (mode, QImode, val, true);
27579 if (!TARGET_PARTIAL_REG_STALL)
27580 if (mode == SImode)
27581 emit_insn (gen_insvsi_1 (reg, reg));
27582 else
27583 emit_insn (gen_insvdi_1 (reg, reg));
27584 else
27586 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27587 NULL, 1, OPTAB_DIRECT);
27588 reg =
27589 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27591 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27592 NULL, 1, OPTAB_DIRECT);
27593 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27594 if (mode == SImode)
27595 return reg;
27596 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27597 NULL, 1, OPTAB_DIRECT);
27598 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27599 return reg;
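/* Standalone illustration (example only, not part of the original file) of
   the byte-broadcast trick implemented above: replicating the low byte of a
   value across a 32-bit word, either by multiplying with 0x01010101 or, when
   multiplies are slow, by two shift-and-or steps.  */
#if 0 /* example only, never compiled */
static unsigned int
sketch_broadcast_byte (unsigned int v)
{
  v &= 0xff;            /* 0x000000XY */
  /* Equivalent to v * 0x01010101u at this width.  */
  v |= v << 8;          /* 0x0000XYXY */
  v |= v << 16;         /* 0xXYXYXYXY */
  return v;
}
#endif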
27603 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
27604 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
27605 alignment from ALIGN to DESIRED_ALIGN. */
27606 static rtx
27607 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27608 int align)
27610 rtx promoted_val;
27612 if (TARGET_64BIT
27613 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27614 promoted_val = promote_duplicated_reg (DImode, val);
27615 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27616 promoted_val = promote_duplicated_reg (SImode, val);
27617 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27618 promoted_val = promote_duplicated_reg (HImode, val);
27619 else
27620 promoted_val = val;
27622 return promoted_val;
27625 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27626 operations when profitable. The code depends upon architecture, block size
27627 and alignment, but always has one of the following overall structures:
27629 Aligned move sequence:
27631 1) Prologue guard: Conditional that jumps up to epilogues for small
27632 blocks that can be handled by epilogue alone. This is faster
27633 but also needed for correctness, since the prologue assumes the block
27634 is larger than the desired alignment.
27636 Optional dynamic check for size and libcall for large
27637 blocks is emitted here too, with -minline-stringops-dynamically.
27639 2) Prologue: copy first few bytes in order to get destination
27640 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27641 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27642 copied. We emit either a jump tree on power of two sized
27643 blocks, or a byte loop.
27645 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27646 with specified algorithm.
27648 4) Epilogue: code copying tail of the block that is too small to be
27649 handled by main body (or up to size guarded by prologue guard).
27651 Misaligned move sequence
27653 1) misaligned move prologue/epilogue containing:
27654 a) Prologue handling small memory blocks and jumping to done_label
27655 (skipped if blocks are known to be large enough)
27656 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27657 needed by single possibly misaligned move
27658 (skipped if alignment is not needed)
27659 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27661 2) Zero size guard dispatching to done_label, if needed
27663 3) dispatch to library call, if needed,
27665 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27666 with specified algorithm. */
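/* Illustrative, standalone sketch (example only, not part of the original
   file) of the aligned sequence described above, written as a plain C
   memset: 1) small-block guard, 2) alignment prologue, 3) main body in
   word-sized chunks, 4) byte epilogue.  The real expansion follows in
   ix86_expand_set_or_movmem below and emits RTL instead.  */
#if 0 /* example only, never compiled */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void
sketch_memset_shape (unsigned char *dst, unsigned char val, size_t count)
{
  uint32_t v = (uint32_t) val * 0x01010101u;    /* promoted value */

  if (count < sizeof (uint32_t))                /* 1) prologue guard */
    goto epilogue;
  while (((uintptr_t) dst & 3) != 0)            /* 2) alignment prologue */
    { *dst++ = val; count--; }
  for (; count >= 4; count -= 4, dst += 4)      /* 3) main body */
    memcpy (dst, &v, 4);
 epilogue:                                      /* 4) epilogue: tail bytes */
  while (count--)
    *dst++ = val;
}
#endif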
27667 bool
27668 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27669 rtx align_exp, rtx expected_align_exp,
27670 rtx expected_size_exp, rtx min_size_exp,
27671 rtx max_size_exp, rtx probable_max_size_exp,
27672 bool issetmem)
27674 rtx destreg;
27675 rtx srcreg = NULL;
27676 rtx_code_label *label = NULL;
27677 rtx tmp;
27678 rtx_code_label *jump_around_label = NULL;
27679 HOST_WIDE_INT align = 1;
27680 unsigned HOST_WIDE_INT count = 0;
27681 HOST_WIDE_INT expected_size = -1;
27682 int size_needed = 0, epilogue_size_needed;
27683 int desired_align = 0, align_bytes = 0;
27684 enum stringop_alg alg;
27685 rtx promoted_val = NULL;
27686 rtx vec_promoted_val = NULL;
27687 bool force_loopy_epilogue = false;
27688 int dynamic_check;
27689 bool need_zero_guard = false;
27690 bool noalign;
27691 machine_mode move_mode = VOIDmode;
27692 machine_mode wider_mode;
27693 int unroll_factor = 1;
27694 /* TODO: Once value ranges are available, fill in proper data. */
27695 unsigned HOST_WIDE_INT min_size = 0;
27696 unsigned HOST_WIDE_INT max_size = -1;
27697 unsigned HOST_WIDE_INT probable_max_size = -1;
27698 bool misaligned_prologue_used = false;
27699 bool have_as;
27701 if (CONST_INT_P (align_exp))
27702 align = INTVAL (align_exp);
27703 /* i386 can do misaligned access at a reasonably increased cost. */
27704 if (CONST_INT_P (expected_align_exp)
27705 && INTVAL (expected_align_exp) > align)
27706 align = INTVAL (expected_align_exp);
27707 /* ALIGN is the minimum of destination and source alignment, but we care here
27708 just about destination alignment. */
27709 else if (!issetmem
27710 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27711 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27713 if (CONST_INT_P (count_exp))
27715 min_size = max_size = probable_max_size = count = expected_size
27716 = INTVAL (count_exp);
27717 /* When COUNT is 0, there is nothing to do. */
27718 if (!count)
27719 return true;
27721 else
27723 if (min_size_exp)
27724 min_size = INTVAL (min_size_exp);
27725 if (max_size_exp)
27726 max_size = INTVAL (max_size_exp);
27727 if (probable_max_size_exp)
27728 probable_max_size = INTVAL (probable_max_size_exp);
27729 if (CONST_INT_P (expected_size_exp))
27730 expected_size = INTVAL (expected_size_exp);
27733 /* Make sure we don't need to care about overflow later on. */
27734 if (count > (HOST_WIDE_INT_1U << 30))
27735 return false;
27737 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27738 if (!issetmem)
27739 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27741 /* Step 0: Decide on preferred algorithm, desired alignment and
27742 size of chunks to be copied by main loop. */
27743 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27744 issetmem,
27745 issetmem && val_exp == const0_rtx, have_as,
27746 &dynamic_check, &noalign, false);
27747 if (alg == libcall)
27748 return false;
27749 gcc_assert (alg != no_stringop);
27751 /* For now vector-version of memset is generated only for memory zeroing, as
27752 creating a promoted vector value is very cheap in this case. */
27753 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27754 alg = unrolled_loop;
27756 if (!count)
27757 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27758 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27759 if (!issetmem)
27760 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27762 unroll_factor = 1;
27763 move_mode = word_mode;
27764 switch (alg)
27766 case libcall:
27767 case no_stringop:
27768 case last_alg:
27769 gcc_unreachable ();
27770 case loop_1_byte:
27771 need_zero_guard = true;
27772 move_mode = QImode;
27773 break;
27774 case loop:
27775 need_zero_guard = true;
27776 break;
27777 case unrolled_loop:
27778 need_zero_guard = true;
27779 unroll_factor = (TARGET_64BIT ? 4 : 2);
27780 break;
27781 case vector_loop:
27782 need_zero_guard = true;
27783 unroll_factor = 4;
27784 /* Find the widest supported mode. */
27785 move_mode = word_mode;
27786 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27787 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27788 move_mode = wider_mode;
27790 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27791 move_mode = TImode;
27793 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27794 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27795 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27797 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27798 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27799 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27800 move_mode = word_mode;
27802 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27803 break;
27804 case rep_prefix_8_byte:
27805 move_mode = DImode;
27806 break;
27807 case rep_prefix_4_byte:
27808 move_mode = SImode;
27809 break;
27810 case rep_prefix_1_byte:
27811 move_mode = QImode;
27812 break;
27814 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27815 epilogue_size_needed = size_needed;
27817 /* If we are going to call any library calls conditionally, make sure any
27818 pending stack adjustment happen before the first conditional branch,
27819 otherwise they will be emitted before the library call only and won't
27820 happen from the other branches. */
27821 if (dynamic_check != -1)
27822 do_pending_stack_adjust ();
27824 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27825 if (!TARGET_ALIGN_STRINGOPS || noalign)
27826 align = desired_align;
27828 /* Step 1: Prologue guard. */
27830 /* Alignment code needs count to be in register. */
27831 if (CONST_INT_P (count_exp) && desired_align > align)
27833 if (INTVAL (count_exp) > desired_align
27834 && INTVAL (count_exp) > size_needed)
27836 align_bytes
27837 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27838 if (align_bytes <= 0)
27839 align_bytes = 0;
27840 else
27841 align_bytes = desired_align - align_bytes;
27843 if (align_bytes == 0)
27844 count_exp = force_reg (counter_mode (count_exp), count_exp);
27846 gcc_assert (desired_align >= 1 && align >= 1);
27848 /* Misaligned move sequences handle both prologue and epilogue at once.
27849 Default code generation results in smaller code for large alignments
27850 and also avoids redundant work when sizes are known precisely. */
27851 misaligned_prologue_used
27852 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27853 && MAX (desired_align, epilogue_size_needed) <= 32
27854 && desired_align <= epilogue_size_needed
27855 && ((desired_align > align && !align_bytes)
27856 || (!count && epilogue_size_needed > 1)));
27858 /* Do the cheap promotion to allow better CSE across the
27859 main loop and epilogue (i.e. one load of the big constant in
27860 front of all code).
27861 For now the misaligned move sequences do not have a fast path
27862 without broadcasting. */
27863 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27865 if (alg == vector_loop)
27867 gcc_assert (val_exp == const0_rtx);
27868 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27869 promoted_val = promote_duplicated_reg_to_size (val_exp,
27870 GET_MODE_SIZE (word_mode),
27871 desired_align, align);
27873 else
27875 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27876 desired_align, align);
27879 /* Misaligned move sequences handle both prologues and epilogues at once.
27880 Default code generation results in smaller code for large alignments and
27881 also avoids redundant work when sizes are known precisely. */
27882 if (misaligned_prologue_used)
27884 /* Misaligned move prologue handles small blocks by itself. */
27885 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27886 (dst, src, &destreg, &srcreg,
27887 move_mode, promoted_val, vec_promoted_val,
27888 &count_exp,
27889 &jump_around_label,
27890 desired_align < align
27891 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27892 desired_align, align, &min_size, dynamic_check, issetmem);
27893 if (!issetmem)
27894 src = change_address (src, BLKmode, srcreg);
27895 dst = change_address (dst, BLKmode, destreg);
27896 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27897 epilogue_size_needed = 0;
27898 if (need_zero_guard
27899 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27901 /* It is possible that we copied enough so the main loop will not
27902 execute. */
27903 gcc_assert (size_needed > 1);
27904 if (jump_around_label == NULL_RTX)
27905 jump_around_label = gen_label_rtx ();
27906 emit_cmp_and_jump_insns (count_exp,
27907 GEN_INT (size_needed),
27908 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27909 if (expected_size == -1
27910 || expected_size < (desired_align - align) / 2 + size_needed)
27911 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27912 else
27913 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27916 /* Ensure that alignment prologue won't copy past end of block. */
27917 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27919 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27920 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27921 Make sure it is power of 2. */
27922 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27924 /* To improve performance of small blocks, we jump around the VAL
27925 promoting mode. This means that if the promoted VAL is not constant,
27926 we might not use it in the epilogue and have to use the byte
27927 loop variant. */
27928 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27929 force_loopy_epilogue = true;
27930 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27931 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27933 /* If main algorithm works on QImode, no epilogue is needed.
27934 For small sizes just don't align anything. */
27935 if (size_needed == 1)
27936 desired_align = align;
27937 else
27938 goto epilogue;
27940 else if (!count
27941 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27943 label = gen_label_rtx ();
27944 emit_cmp_and_jump_insns (count_exp,
27945 GEN_INT (epilogue_size_needed),
27946 LTU, 0, counter_mode (count_exp), 1, label);
27947 if (expected_size == -1 || expected_size < epilogue_size_needed)
27948 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27949 else
27950 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27954 /* Emit code to decide at runtime whether a library call or inline code
27955 should be used. */
27956 if (dynamic_check != -1)
27958 if (!issetmem && CONST_INT_P (count_exp))
27960 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27962 emit_block_copy_via_libcall (dst, src, count_exp);
27963 count_exp = const0_rtx;
27964 goto epilogue;
27967 else
27969 rtx_code_label *hot_label = gen_label_rtx ();
27970 if (jump_around_label == NULL_RTX)
27971 jump_around_label = gen_label_rtx ();
27972 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27973 LEU, 0, counter_mode (count_exp),
27974 1, hot_label);
27975 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27976 if (issetmem)
27977 set_storage_via_libcall (dst, count_exp, val_exp);
27978 else
27979 emit_block_copy_via_libcall (dst, src, count_exp);
27980 emit_jump (jump_around_label);
27981 emit_label (hot_label);
27985 /* Step 2: Alignment prologue. */
27986 /* Do the expensive promotion once we branched off the small blocks. */
27987 if (issetmem && !promoted_val)
27988 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27989 desired_align, align);
27991 if (desired_align > align && !misaligned_prologue_used)
27993 if (align_bytes == 0)
27995 /* Except for the first move in the prologue, we no longer know
27996 the constant offset in aliasing info. It doesn't seem worth
27997 the pain to maintain it for the first move, so throw away
27998 the info early. */
27999 dst = change_address (dst, BLKmode, destreg);
28000 if (!issetmem)
28001 src = change_address (src, BLKmode, srcreg);
28002 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28003 promoted_val, vec_promoted_val,
28004 count_exp, align, desired_align,
28005 issetmem);
28006 /* At most desired_align - align bytes are copied. */
28007 if (min_size < (unsigned)(desired_align - align))
28008 min_size = 0;
28009 else
28010 min_size -= desired_align - align;
28012 else
28014 /* If we know how many bytes need to be stored before dst is
28015 sufficiently aligned, maintain aliasing info accurately. */
28016 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28017 srcreg,
28018 promoted_val,
28019 vec_promoted_val,
28020 desired_align,
28021 align_bytes,
28022 issetmem);
28024 count_exp = plus_constant (counter_mode (count_exp),
28025 count_exp, -align_bytes);
28026 count -= align_bytes;
28027 min_size -= align_bytes;
28028 max_size -= align_bytes;
28030 if (need_zero_guard
28031 && min_size < (unsigned HOST_WIDE_INT) size_needed
28032 && (count < (unsigned HOST_WIDE_INT) size_needed
28033 || (align_bytes == 0
28034 && count < ((unsigned HOST_WIDE_INT) size_needed
28035 + desired_align - align))))
28037 /* It is possible that we copied enough so the main loop will not
28038 execute. */
28039 gcc_assert (size_needed > 1);
28040 if (label == NULL_RTX)
28041 label = gen_label_rtx ();
28042 emit_cmp_and_jump_insns (count_exp,
28043 GEN_INT (size_needed),
28044 LTU, 0, counter_mode (count_exp), 1, label);
28045 if (expected_size == -1
28046 || expected_size < (desired_align - align) / 2 + size_needed)
28047 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28048 else
28049 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28052 if (label && size_needed == 1)
28054 emit_label (label);
28055 LABEL_NUSES (label) = 1;
28056 label = NULL;
28057 epilogue_size_needed = 1;
28058 if (issetmem)
28059 promoted_val = val_exp;
28061 else if (label == NULL_RTX && !misaligned_prologue_used)
28062 epilogue_size_needed = size_needed;
28064 /* Step 3: Main loop. */
28066 switch (alg)
28068 case libcall:
28069 case no_stringop:
28070 case last_alg:
28071 gcc_unreachable ();
28072 case loop_1_byte:
28073 case loop:
28074 case unrolled_loop:
28075 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28076 count_exp, move_mode, unroll_factor,
28077 expected_size, issetmem);
28078 break;
28079 case vector_loop:
28080 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28081 vec_promoted_val, count_exp, move_mode,
28082 unroll_factor, expected_size, issetmem);
28083 break;
28084 case rep_prefix_8_byte:
28085 case rep_prefix_4_byte:
28086 case rep_prefix_1_byte:
28087 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28088 val_exp, count_exp, move_mode, issetmem);
28089 break;
28091 /* Properly adjust the offsets of the src and dest memory for aliasing. */
28092 if (CONST_INT_P (count_exp))
28094 if (!issetmem)
28095 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28096 (count / size_needed) * size_needed);
28097 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28098 (count / size_needed) * size_needed);
28100 else
28102 if (!issetmem)
28103 src = change_address (src, BLKmode, srcreg);
28104 dst = change_address (dst, BLKmode, destreg);
28107 /* Step 4: Epilogue to copy the remaining bytes. */
28108 epilogue:
28109 if (label)
28111 /* When the main loop is done, COUNT_EXP might hold original count,
28112 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28113 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28114 bytes. Compensate if needed. */
28116 if (size_needed < epilogue_size_needed)
28118 tmp =
28119 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28120 GEN_INT (size_needed - 1), count_exp, 1,
28121 OPTAB_DIRECT);
28122 if (tmp != count_exp)
28123 emit_move_insn (count_exp, tmp);
28125 emit_label (label);
28126 LABEL_NUSES (label) = 1;
28129 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28131 if (force_loopy_epilogue)
28132 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28133 epilogue_size_needed);
28134 else
28136 if (issetmem)
28137 expand_setmem_epilogue (dst, destreg, promoted_val,
28138 vec_promoted_val, count_exp,
28139 epilogue_size_needed);
28140 else
28141 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28142 epilogue_size_needed);
28145 if (jump_around_label)
28146 emit_label (jump_around_label);
28147 return true;
28151 /* Expand the appropriate insns for doing strlen if not just doing
28152 repnz; scasb
28154 out = result, initialized with the start address
28155 align_rtx = alignment of the address.
28156 scratch = scratch register, initialized with the start address when
28157 not aligned, otherwise undefined
28159 This is just the body. It needs the initializations mentioned above and
28160 some address computing at the end. These things are done in i386.md. */
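/* Illustrative, standalone sketch (example only, not part of the original
   file) of the expansion below: check up to three unaligned bytes, then scan
   four aligned bytes per iteration with the zero-byte formula, and finally
   pin down the terminating byte inside the last word.  Like the real
   expansion, the word read may run past the terminator; that is tolerable
   only because it stays within an aligned word.  */
#if 0 /* example only, never compiled */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static size_t
sketch_strlen_unrolled (const char *s)
{
  const char *p = s;
  while (((uintptr_t) p & 3) != 0)      /* 1..3 unaligned bytes */
    {
      if (*p == 0)
        return (size_t) (p - s);
      p++;
    }
  for (;;)                              /* 4 aligned bytes at a time */
    {
      uint32_t w;
      memcpy (&w, p, 4);
      if (((w - 0x01010101u) & ~w & 0x80808080u) != 0)
        break;
      p += 4;
    }
  while (*p != 0)                       /* locate the exact zero byte */
    p++;
  return (size_t) (p - s);
}
#endif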
28162 static void
28163 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28165 int align;
28166 rtx tmp;
28167 rtx_code_label *align_2_label = NULL;
28168 rtx_code_label *align_3_label = NULL;
28169 rtx_code_label *align_4_label = gen_label_rtx ();
28170 rtx_code_label *end_0_label = gen_label_rtx ();
28171 rtx mem;
28172 rtx tmpreg = gen_reg_rtx (SImode);
28173 rtx scratch = gen_reg_rtx (SImode);
28174 rtx cmp;
28176 align = 0;
28177 if (CONST_INT_P (align_rtx))
28178 align = INTVAL (align_rtx);
28180 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28182 /* Is there a known alignment and is it less than 4? */
28183 if (align < 4)
28185 rtx scratch1 = gen_reg_rtx (Pmode);
28186 emit_move_insn (scratch1, out);
28187 /* Is there a known alignment and is it not 2? */
28188 if (align != 2)
28190 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28191 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28193 /* Leave just the 3 lower bits. */
28194 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28195 NULL_RTX, 0, OPTAB_WIDEN);
28197 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28198 Pmode, 1, align_4_label);
28199 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28200 Pmode, 1, align_2_label);
28201 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28202 Pmode, 1, align_3_label);
28204 else
28206 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28207 check if it is aligned to a 4-byte boundary. */
28209 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28210 NULL_RTX, 0, OPTAB_WIDEN);
28212 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28213 Pmode, 1, align_4_label);
28216 mem = change_address (src, QImode, out);
28218 /* Now compare the bytes. */
28220 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28221 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28222 QImode, 1, end_0_label);
28224 /* Increment the address. */
28225 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28227 /* Not needed with an alignment of 2 */
28228 if (align != 2)
28230 emit_label (align_2_label);
28232 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28233 end_0_label);
28235 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28237 emit_label (align_3_label);
28240 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28241 end_0_label);
28243 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28246 /* Generate loop to check 4 bytes at a time. It is not a good idea to
28247 align this loop; it only makes the code larger and does not help to
28248 speed it up. */
28249 emit_label (align_4_label);
28251 mem = change_address (src, SImode, out);
28252 emit_move_insn (scratch, mem);
28253 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28255 /* This formula yields a nonzero result iff one of the bytes is zero.
28256 This saves three branches inside the loop and many cycles. */
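/* Worked example (added for illustration): for SCRATCH = 0x12003456,
   which contains a zero byte (0x12 00 34 56),
     SCRATCH + (-0x01010101) = 0x10ff3355
     ~SCRATCH                = 0xedffcba9
     AND of the two          = 0x00ff0301
     ... & 0x80808080        = 0x00800000  (nonzero: a zero byte exists),
   while for SCRATCH = 0x12345678 the final AND is 0 and the loop
   continues.  */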
28258 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28259 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28260 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28261 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28262 gen_int_mode (0x80808080, SImode)));
28263 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28264 align_4_label);
28266 if (TARGET_CMOVE)
28268 rtx reg = gen_reg_rtx (SImode);
28269 rtx reg2 = gen_reg_rtx (Pmode);
28270 emit_move_insn (reg, tmpreg);
28271 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28273 /* If zero is not in the first two bytes, move two bytes forward. */
28274 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28275 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28276 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28277 emit_insn (gen_rtx_SET (tmpreg,
28278 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28279 reg,
28280 tmpreg)));
28281 /* Emit lea manually to avoid clobbering of flags. */
28282 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28284 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28285 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28286 emit_insn (gen_rtx_SET (out,
28287 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28288 reg2,
28289 out)));
28291 else
28293 rtx_code_label *end_2_label = gen_label_rtx ();
28294 /* Is zero in the first two bytes? */
28296 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28297 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28298 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28299 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28300 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28301 pc_rtx);
28302 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28303 JUMP_LABEL (tmp) = end_2_label;
28305 /* Not in the first two. Move two bytes forward. */
28306 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28307 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28309 emit_label (end_2_label);
28313 /* Avoid branch in fixing the byte. */
28314 tmpreg = gen_lowpart (QImode, tmpreg);
28315 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28316 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28317 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28318 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28320 emit_label (end_0_label);
28323 /* Expand strlen. */
28325 bool
28326 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28328 rtx addr, scratch1, scratch2, scratch3, scratch4;
28330 /* The generic case of the strlen expander is long. Avoid its
28331 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
28333 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28334 && !TARGET_INLINE_ALL_STRINGOPS
28335 && !optimize_insn_for_size_p ()
28336 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28337 return false;
28339 addr = force_reg (Pmode, XEXP (src, 0));
28340 scratch1 = gen_reg_rtx (Pmode);
28342 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28343 && !optimize_insn_for_size_p ())
28345 /* Well it seems that some optimizer does not combine a call like
28346 foo(strlen(bar), strlen(bar));
28347 when the move and the subtraction are done here. It does calculate
28348 the length just once when these instructions are done inside of
28349 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28350 often used and I use one fewer register for the lifetime of
28351 output_strlen_unroll() this is better. */
28353 emit_move_insn (out, addr);
28355 ix86_expand_strlensi_unroll_1 (out, src, align);
28357 /* strlensi_unroll_1 returns the address of the zero at the end of
28358 the string, like memchr(), so compute the length by subtracting
28359 the start address. */
28360 emit_insn (ix86_gen_sub3 (out, out, addr));
28362 else
28364 rtx unspec;
28366 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28367 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28368 return false;
28369 /* Can't use this for non-default address spaces. */
28370 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28371 return false;
28373 scratch2 = gen_reg_rtx (Pmode);
28374 scratch3 = gen_reg_rtx (Pmode);
28375 scratch4 = force_reg (Pmode, constm1_rtx);
28377 emit_move_insn (scratch3, addr);
28378 eoschar = force_reg (QImode, eoschar);
28380 src = replace_equiv_address_nv (src, scratch3);
28382 /* If .md starts supporting :P, this can be done in .md. */
28383 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28384 scratch4), UNSPEC_SCAS);
28385 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28386 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28387 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28389 return true;
28392 /* For a given symbol (function) construct code to compute the address of its
28393 PLT entry in the large x86-64 PIC model. */
28394 static rtx
28395 construct_plt_address (rtx symbol)
28397 rtx tmp, unspec;
28399 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28400 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28401 gcc_assert (Pmode == DImode);
28403 tmp = gen_reg_rtx (Pmode);
28404 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28406 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28407 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28408 return tmp;
28412 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28413 rtx callarg2,
28414 rtx pop, bool sibcall)
28416 rtx vec[3];
28417 rtx use = NULL, call;
28418 unsigned int vec_len = 0;
28419 tree fndecl;
28421 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28423 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28424 if (fndecl
28425 && (lookup_attribute ("interrupt",
28426 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28427 error ("interrupt service routine can't be called directly");
28429 else
28430 fndecl = NULL_TREE;
28432 if (pop == const0_rtx)
28433 pop = NULL;
28434 gcc_assert (!TARGET_64BIT || !pop);
28436 if (TARGET_MACHO && !TARGET_64BIT)
28438 #if TARGET_MACHO
28439 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28440 fnaddr = machopic_indirect_call_target (fnaddr);
28441 #endif
28443 else
28445 /* Static functions and indirect calls don't need the pic register. Also,
28446 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28447 it an indirect call. */
28448 rtx addr = XEXP (fnaddr, 0);
28449 if (flag_pic
28450 && GET_CODE (addr) == SYMBOL_REF
28451 && !SYMBOL_REF_LOCAL_P (addr))
28453 if (flag_plt
28454 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28455 || !lookup_attribute ("noplt",
28456 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28458 if (!TARGET_64BIT
28459 || (ix86_cmodel == CM_LARGE_PIC
28460 && DEFAULT_ABI != MS_ABI))
28462 use_reg (&use, gen_rtx_REG (Pmode,
28463 REAL_PIC_OFFSET_TABLE_REGNUM));
28464 if (ix86_use_pseudo_pic_reg ())
28465 emit_move_insn (gen_rtx_REG (Pmode,
28466 REAL_PIC_OFFSET_TABLE_REGNUM),
28467 pic_offset_table_rtx);
28470 else if (!TARGET_PECOFF && !TARGET_MACHO)
28472 if (TARGET_64BIT)
28474 fnaddr = gen_rtx_UNSPEC (Pmode,
28475 gen_rtvec (1, addr),
28476 UNSPEC_GOTPCREL);
28477 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28479 else
28481 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28482 UNSPEC_GOT);
28483 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28484 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28485 fnaddr);
28487 fnaddr = gen_const_mem (Pmode, fnaddr);
28488 /* Pmode may not be the same as word_mode for x32, which
28489 doesn't support indirect branch via 32-bit memory slot.
28490 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28491 indirect branch via x32 GOT slot is OK. */
28492 if (GET_MODE (fnaddr) != word_mode)
28493 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28494 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28499 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28500 parameters passed in vector registers. */
28501 if (TARGET_64BIT
28502 && (INTVAL (callarg2) > 0
28503 || (INTVAL (callarg2) == 0
28504 && (TARGET_SSE || !flag_skip_rax_setup))))
28506 rtx al = gen_rtx_REG (QImode, AX_REG);
28507 emit_move_insn (al, callarg2);
28508 use_reg (&use, al);
28511 if (ix86_cmodel == CM_LARGE_PIC
28512 && !TARGET_PECOFF
28513 && MEM_P (fnaddr)
28514 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28515 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28516 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28517 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28518 branch via x32 GOT slot is OK. */
28519 else if (!(TARGET_X32
28520 && MEM_P (fnaddr)
28521 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28522 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28523 && (sibcall
28524 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28525 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28527 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28528 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28531 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28533 if (retval)
28535 /* We should add bounds as destination register in case
28536 pointer with bounds may be returned. */
28537 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28539 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28540 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28541 if (GET_CODE (retval) == PARALLEL)
28543 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28544 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28545 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28546 retval = chkp_join_splitted_slot (retval, par);
28548 else
28550 retval = gen_rtx_PARALLEL (VOIDmode,
28551 gen_rtvec (3, retval, b0, b1));
28552 chkp_put_regs_to_expr_list (retval);
28556 call = gen_rtx_SET (retval, call);
28558 vec[vec_len++] = call;
28560 if (pop)
28562 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28563 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28564 vec[vec_len++] = pop;
28567 if (cfun->machine->no_caller_saved_registers
28568 && (!fndecl
28569 || (!TREE_THIS_VOLATILE (fndecl)
28570 && !lookup_attribute ("no_caller_saved_registers",
28571 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28573 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28574 bool is_64bit_ms_abi = (TARGET_64BIT
28575 && ix86_function_abi (fndecl) == MS_ABI);
28576 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28578 /* If there are no caller-saved registers, add all registers
28579 that are clobbered by the call which returns. */
28580 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28581 if (!fixed_regs[i]
28582 && (ix86_call_used_regs[i] == 1
28583 || (ix86_call_used_regs[i] & c_mask))
28584 && !STACK_REGNO_P (i)
28585 && !MMX_REGNO_P (i))
28586 clobber_reg (&use,
28587 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28589 else if (TARGET_64BIT_MS_ABI
28590 && (!callarg2 || INTVAL (callarg2) != -2))
28592 unsigned i;
28594 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28596 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28597 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28599 clobber_reg (&use, gen_rtx_REG (mode, regno));
28602 /* Set here, but it may get cleared later. */
28603 if (TARGET_CALL_MS2SYSV_XLOGUES)
28605 if (!TARGET_SSE)
28608 /* Don't break hot-patched functions. */
28609 else if (ix86_function_ms_hook_prologue (current_function_decl))
28612 /* TODO: Cases not yet examined. */
28613 else if (flag_split_stack)
28614 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28616 else
28618 gcc_assert (!reload_completed);
28619 cfun->machine->call_ms2sysv = true;
28624 if (vec_len > 1)
28625 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28626 call = emit_call_insn (call);
28627 if (use)
28628 CALL_INSN_FUNCTION_USAGE (call) = use;
28630 return call;
28633 /* Return true if the function being called was marked with attribute
28634 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28635 to handle the non-PIC case in the backend because there is no easy
28636 interface for the front-end to force non-PLT calls to use the GOT.
28637 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28638 to call the function marked "noplt" indirectly. */
28640 static bool
28641 ix86_nopic_noplt_attribute_p (rtx call_op)
28643 if (flag_pic || ix86_cmodel == CM_LARGE
28644 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28645 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28646 || SYMBOL_REF_LOCAL_P (call_op))
28647 return false;
28649 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28651 if (!flag_plt
28652 || (symbol_decl != NULL_TREE
28653 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28654 return true;
28656 return false;
28659 /* Output indirect branch via a call and return thunk. CALL_OP is a
28660 register which contains the branch target. XASM is the assembly
28661 template for CALL_OP. Branch is a tail call if SIBCALL_P is true.
28662 A normal call is converted to:
28664 call __x86_indirect_thunk_reg
28666 and a tail call is converted to:
28668 jmp __x86_indirect_thunk_reg
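/* For reference (illustration, not a verbatim quote of the emitted
   assembly): the __x86_indirect_thunk_* helpers follow the usual
   retpoline pattern, roughly

	call	1f
   0:	pause
	lfence
	jmp	0b
   1:	mov	%reg, (%rsp)
	ret

   so the indirect transfer is performed by a RET whose mispredicted
   speculation is trapped in the pause/lfence loop.  */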
28671 static void
28672 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28674 char thunk_name_buf[32];
28675 char *thunk_name;
28676 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28677 int regno = REGNO (call_op);
28679 if (cfun->machine->indirect_branch_type
28680 != indirect_branch_thunk_inline)
28682 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28684 int i = regno;
28685 if (i >= FIRST_REX_INT_REG)
28686 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28687 if (need_bnd_p)
28688 indirect_thunks_bnd_used |= 1 << i;
28689 else
28690 indirect_thunks_used |= 1 << i;
28692 indirect_thunk_name (thunk_name_buf, regno, need_bnd_p, false);
28693 thunk_name = thunk_name_buf;
28695 else
28696 thunk_name = NULL;
28698 if (sibcall_p)
28700 if (thunk_name != NULL)
28702 if (need_bnd_p)
28703 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28704 else
28705 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28707 else
28708 output_indirect_thunk (need_bnd_p, regno);
28710 else
28712 if (thunk_name != NULL)
28714 if (need_bnd_p)
28715 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28716 else
28717 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28718 return;
28721 char indirectlabel1[32];
28722 char indirectlabel2[32];
28724 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28725 INDIRECT_LABEL,
28726 indirectlabelno++);
28727 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28728 INDIRECT_LABEL,
28729 indirectlabelno++);
28731 /* Jump. */
28732 if (need_bnd_p)
28733 fputs ("\tbnd jmp\t", asm_out_file);
28734 else
28735 fputs ("\tjmp\t", asm_out_file);
28736 assemble_name_raw (asm_out_file, indirectlabel2);
28737 fputc ('\n', asm_out_file);
28739 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28741 if (thunk_name != NULL)
28743 if (need_bnd_p)
28744 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28745 else
28746 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28748 else
28749 output_indirect_thunk (need_bnd_p, regno);
28751 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28753 /* Call. */
28754 if (need_bnd_p)
28755 fputs ("\tbnd call\t", asm_out_file);
28756 else
28757 fputs ("\tcall\t", asm_out_file);
28758 assemble_name_raw (asm_out_file, indirectlabel1);
28759 fputc ('\n', asm_out_file);
28763 /* Output indirect branch via a call and return thunk. CALL_OP is
28764 the branch target. XASM is the assembly template for CALL_OP.
28765 Branch is a tail call if SIBCALL_P is true. A normal call is
28766 converted to:
28768 jmp L2
28770 push CALL_OP
28771 jmp __x86_indirect_thunk
28773 call L1
28775 and a tail call is converted to:
28777 push CALL_OP
28778 jmp __x86_indirect_thunk
28781 static void
28782 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28783 bool sibcall_p)
28785 char thunk_name_buf[32];
28786 char *thunk_name;
28787 char push_buf[64];
28788 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28789 int regno = -1;
28791 if (cfun->machine->indirect_branch_type
28792 != indirect_branch_thunk_inline)
28794 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28796 if (need_bnd_p)
28797 indirect_thunk_bnd_needed = true;
28798 else
28799 indirect_thunk_needed = true;
28801 indirect_thunk_name (thunk_name_buf, regno, need_bnd_p, false);
28802 thunk_name = thunk_name_buf;
28804 else
28805 thunk_name = NULL;
28807 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28808 TARGET_64BIT ? 'q' : 'l', xasm);
28810 if (sibcall_p)
28812 output_asm_insn (push_buf, &call_op);
28813 if (thunk_name != NULL)
28815 if (need_bnd_p)
28816 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28817 else
28818 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28820 else
28821 output_indirect_thunk (need_bnd_p, regno);
28823 else
28825 char indirectlabel1[32];
28826 char indirectlabel2[32];
28828 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28829 INDIRECT_LABEL,
28830 indirectlabelno++);
28831 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28832 INDIRECT_LABEL,
28833 indirectlabelno++);
28835 /* Jump. */
28836 if (need_bnd_p)
28837 fputs ("\tbnd jmp\t", asm_out_file);
28838 else
28839 fputs ("\tjmp\t", asm_out_file);
28840 assemble_name_raw (asm_out_file, indirectlabel2);
28841 fputc ('\n', asm_out_file);
28843 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28845 /* An external function may be called via GOT, instead of PLT. */
28846 if (MEM_P (call_op))
28848 struct ix86_address parts;
28849 rtx addr = XEXP (call_op, 0);
28850 if (ix86_decompose_address (addr, &parts)
28851 && parts.base == stack_pointer_rtx)
28853 /* Since call will adjust stack by -UNITS_PER_WORD,
28854 we must convert "disp(stack, index, scale)" to
28855 "disp+UNITS_PER_WORD(stack, index, scale)". */
28856 if (parts.index)
28858 addr = gen_rtx_MULT (Pmode, parts.index,
28859 GEN_INT (parts.scale));
28860 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28861 addr);
28863 else
28864 addr = stack_pointer_rtx;
28866 rtx disp;
28867 if (parts.disp != NULL_RTX)
28868 disp = plus_constant (Pmode, parts.disp,
28869 UNITS_PER_WORD);
28870 else
28871 disp = GEN_INT (UNITS_PER_WORD);
28873 addr = gen_rtx_PLUS (Pmode, addr, disp);
28874 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28878 output_asm_insn (push_buf, &call_op);
28880 if (thunk_name != NULL)
28882 if (need_bnd_p)
28883 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28884 else
28885 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28887 else
28888 output_indirect_thunk (need_bnd_p, regno);
28890 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28892 /* Call. */
28893 if (need_bnd_p)
28894 fputs ("\tbnd call\t", asm_out_file);
28895 else
28896 fputs ("\tcall\t", asm_out_file);
28897 assemble_name_raw (asm_out_file, indirectlabel1);
28898 fputc ('\n', asm_out_file);
28902 /* Output indirect branch via a call and return thunk. CALL_OP is
28903 the branch target. XASM is the assembly template for CALL_OP.
28904 Branch is a tail call if SIBCALL_P is true. */
28906 static void
28907 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28908 bool sibcall_p)
28910 if (REG_P (call_op))
28911 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28912 else
28913 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
28915 /* Output indirect jump. CALL_OP is the jump target. Jump is a
28916 function return if RET_P is true. */
28918 const char *
28919 ix86_output_indirect_jmp (rtx call_op, bool ret_p)
28921 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
28923 /* We can't have a red zone if this isn't a function return, since
28924 the "call" in the indirect thunk pushes the return address onto the
28925 stack, destroying the red zone. */
28926 if (!ret_p && ix86_red_zone_size != 0)
28927 gcc_unreachable ();
28929 ix86_output_indirect_branch (call_op, "%0", true);
28930 return "";
28932 else
28933 return "%!jmp\t%A0";
28936 /* Output the function return. Add a REP
28937 prefix to RET if LONG_P is true and the function return is kept. */
28939 const char *
28940 ix86_output_function_return (bool long_p)
28942 if (cfun->machine->function_return_type != indirect_branch_keep)
28944 char thunk_name[32];
28945 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28947 if (cfun->machine->function_return_type
28948 != indirect_branch_thunk_inline)
28950 bool need_thunk = (cfun->machine->function_return_type
28951 == indirect_branch_thunk);
28952 indirect_thunk_name (thunk_name, INVALID_REGNUM, need_bnd_p,
28953 true);
28954 if (need_bnd_p)
28956 indirect_thunk_bnd_needed |= need_thunk;
28957 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28959 else
28961 indirect_thunk_needed |= need_thunk;
28962 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28965 else
28966 output_indirect_thunk (need_bnd_p, INVALID_REGNUM);
28968 return "";
28971 if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
28972 return "%!ret";
28974 return "rep%; ret";
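/* Illustrative example (thunk name assumed, per indirect_thunk_name):
   with -mfunction-return=thunk the epilogue's return is emitted as a
   jump to the shared return thunk, e.g. "jmp __x86_return_thunk",
   whereas -mfunction-return=thunk-inline emits the thunk body in place
   via output_indirect_thunk.  */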
28977 /* Output the assembly for a call instruction. */
28979 const char *
28980 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28982 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28983 bool output_indirect_p
28984 = (!TARGET_SEH
28985 && cfun->machine->indirect_branch_type != indirect_branch_keep);
28986 bool seh_nop_p = false;
28987 const char *xasm;
28989 if (SIBLING_CALL_P (insn))
28991 if (direct_p)
28993 if (ix86_nopic_noplt_attribute_p (call_op))
28995 direct_p = false;
28996 if (TARGET_64BIT)
28998 if (output_indirect_p)
28999 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29000 else
29001 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29003 else
29005 if (output_indirect_p)
29006 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29007 else
29008 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29011 else
29012 xasm = "%!jmp\t%P0";
29014 /* SEH epilogue detection requires the indirect branch case
29015 to include REX.W. */
29016 else if (TARGET_SEH)
29017 xasm = "%!rex.W jmp\t%A0";
29018 else
29020 if (output_indirect_p)
29021 xasm = "%0";
29022 else
29023 xasm = "%!jmp\t%A0";
29026 if (output_indirect_p && !direct_p)
29027 ix86_output_indirect_branch (call_op, xasm, true);
29028 else
29029 output_asm_insn (xasm, &call_op);
29030 return "";
29033 /* SEH unwinding can require an extra nop to be emitted in several
29034 circumstances. Determine if we have one of those. */
29035 if (TARGET_SEH)
29037 rtx_insn *i;
29039 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29041 /* If we get to another real insn, we don't need the nop. */
29042 if (INSN_P (i))
29043 break;
29045 /* If we get to the epilogue note, prevent a catch region from
29046 being adjacent to the standard epilogue sequence. If non-call
29047 exceptions are enabled, we'll have done this during epilogue emission. */
29048 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29049 && !flag_non_call_exceptions
29050 && !can_throw_internal (insn))
29052 seh_nop_p = true;
29053 break;
29057 /* If we didn't find a real insn following the call, prevent the
29058 unwinder from looking into the next function. */
29059 if (i == NULL)
29060 seh_nop_p = true;
29063 if (direct_p)
29065 if (ix86_nopic_noplt_attribute_p (call_op))
29067 direct_p = false;
29068 if (TARGET_64BIT)
29070 if (output_indirect_p)
29071 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29072 else
29073 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29075 else
29077 if (output_indirect_p)
29078 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29079 else
29080 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29083 else
29084 xasm = "%!call\t%P0";
29086 else
29088 if (output_indirect_p)
29089 xasm = "%0";
29090 else
29091 xasm = "%!call\t%A0";
29094 if (output_indirect_p && !direct_p)
29095 ix86_output_indirect_branch (call_op, xasm, false);
29096 else
29097 output_asm_insn (xasm, &call_op);
29099 if (seh_nop_p)
29100 return "nop";
29102 return "";
29105 /* Clear stack slot assignments remembered from previous functions.
29106 This is called from INIT_EXPANDERS once before RTL is emitted for each
29107 function. */
29109 static struct machine_function *
29110 ix86_init_machine_status (void)
29112 struct machine_function *f;
29114 f = ggc_cleared_alloc<machine_function> ();
29115 f->call_abi = ix86_abi;
29117 return f;
29120 /* Return a MEM corresponding to a stack slot with mode MODE.
29121 Allocate a new slot if necessary.
29123 The RTL for a function can have several slots available: N is
29124 which slot to use. */
29126 rtx
29127 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29129 struct stack_local_entry *s;
29131 gcc_assert (n < MAX_386_STACK_LOCALS);
29133 for (s = ix86_stack_locals; s; s = s->next)
29134 if (s->mode == mode && s->n == n)
29135 return validize_mem (copy_rtx (s->rtl));
29137 s = ggc_alloc<stack_local_entry> ();
29138 s->n = n;
29139 s->mode = mode;
29140 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29142 s->next = ix86_stack_locals;
29143 ix86_stack_locals = s;
29144 return validize_mem (copy_rtx (s->rtl));
29147 static void
29148 ix86_instantiate_decls (void)
29150 struct stack_local_entry *s;
29152 for (s = ix86_stack_locals; s; s = s->next)
29153 if (s->rtl != NULL_RTX)
29154 instantiate_decl_rtl (s->rtl);
29157 /* Return the number used for encoding REG, in the range 0..7. */
29159 static int
29160 reg_encoded_number (rtx reg)
29162 unsigned regno = REGNO (reg);
29163 switch (regno)
29165 case AX_REG:
29166 return 0;
29167 case CX_REG:
29168 return 1;
29169 case DX_REG:
29170 return 2;
29171 case BX_REG:
29172 return 3;
29173 case SP_REG:
29174 return 4;
29175 case BP_REG:
29176 return 5;
29177 case SI_REG:
29178 return 6;
29179 case DI_REG:
29180 return 7;
29181 default:
29182 break;
29184 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29185 return regno - FIRST_STACK_REG;
29186 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29187 return regno - FIRST_SSE_REG;
29188 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29189 return regno - FIRST_MMX_REG;
29190 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29191 return regno - FIRST_REX_SSE_REG;
29192 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29193 return regno - FIRST_REX_INT_REG;
29194 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29195 return regno - FIRST_MASK_REG;
29196 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29197 return regno - FIRST_BND_REG;
29198 return -1;
29201 /* Given an insn INSN with NOPERANDS operands in OPERANDS, return the modr/m
29202 byte used in its encoding if it could be relevant for ROP mitigation,
29203 otherwise return -1. If POPNO0 and POPNO1 are nonnull, store into them
29204 the operand numbers used to calculate it. */
29206 static int
29207 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29208 int *popno0 = 0, int *popno1 = 0)
29210 if (asm_noperands (PATTERN (insn)) >= 0)
29211 return -1;
29212 int has_modrm = get_attr_modrm (insn);
29213 if (!has_modrm)
29214 return -1;
29215 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29216 rtx op0, op1;
29217 switch (cls)
29219 case MODRM_CLASS_OP02:
29220 gcc_assert (noperands >= 3);
29221 if (popno0)
29223 *popno0 = 0;
29224 *popno1 = 2;
29226 op0 = operands[0];
29227 op1 = operands[2];
29228 break;
29229 case MODRM_CLASS_OP01:
29230 gcc_assert (noperands >= 2);
29231 if (popno0)
29233 *popno0 = 0;
29234 *popno1 = 1;
29236 op0 = operands[0];
29237 op1 = operands[1];
29238 break;
29239 default:
29240 return -1;
29242 if (REG_P (op0) && REG_P (op1))
29244 int enc0 = reg_encoded_number (op0);
29245 int enc1 = reg_encoded_number (op1);
29246 return 0xc0 + (enc1 << 3) + enc0;
29248 return -1;
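/* Illustrative example: for the register-register form built above the
   byte is 0xc0 | (reg << 3) | r/m, so with op0 = %ecx (encoding 1) and
   op1 = %edx (encoding 2) the result is 0xc0 + (2 << 3) + 1 = 0xd1.  */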
29251 /* Check whether x86 address PARTS is a pc-relative address. */
29253 bool
29254 ix86_rip_relative_addr_p (struct ix86_address *parts)
29256 rtx base, index, disp;
29258 base = parts->base;
29259 index = parts->index;
29260 disp = parts->disp;
29262 if (disp && !base && !index)
29264 if (TARGET_64BIT)
29266 rtx symbol = disp;
29268 if (GET_CODE (disp) == CONST)
29269 symbol = XEXP (disp, 0);
29270 if (GET_CODE (symbol) == PLUS
29271 && CONST_INT_P (XEXP (symbol, 1)))
29272 symbol = XEXP (symbol, 0);
29274 if (GET_CODE (symbol) == LABEL_REF
29275 || (GET_CODE (symbol) == SYMBOL_REF
29276 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29277 || (GET_CODE (symbol) == UNSPEC
29278 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29279 || XINT (symbol, 1) == UNSPEC_PCREL
29280 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29281 return true;
29284 return false;
29287 /* Calculate the length of the memory address in the instruction encoding.
29288 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29289 or other prefixes. We never generate addr32 prefix for LEA insn. */
29291 int
29292 memory_address_length (rtx addr, bool lea)
29294 struct ix86_address parts;
29295 rtx base, index, disp;
29296 int len;
29297 int ok;
29299 if (GET_CODE (addr) == PRE_DEC
29300 || GET_CODE (addr) == POST_INC
29301 || GET_CODE (addr) == PRE_MODIFY
29302 || GET_CODE (addr) == POST_MODIFY)
29303 return 0;
29305 ok = ix86_decompose_address (addr, &parts);
29306 gcc_assert (ok);
29308 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29310 /* If this is not LEA instruction, add the length of addr32 prefix. */
29311 if (TARGET_64BIT && !lea
29312 && (SImode_address_operand (addr, VOIDmode)
29313 || (parts.base && GET_MODE (parts.base) == SImode)
29314 || (parts.index && GET_MODE (parts.index) == SImode)))
29315 len++;
29317 base = parts.base;
29318 index = parts.index;
29319 disp = parts.disp;
29321 if (base && SUBREG_P (base))
29322 base = SUBREG_REG (base);
29323 if (index && SUBREG_P (index))
29324 index = SUBREG_REG (index);
29326 gcc_assert (base == NULL_RTX || REG_P (base));
29327 gcc_assert (index == NULL_RTX || REG_P (index));
29329 /* Rule of thumb:
29330 - esp as the base always wants an index,
29331 - ebp as the base always wants a displacement,
29332 - r12 as the base always wants an index,
29333 - r13 as the base always wants a displacement. */
29335 /* Register Indirect. */
29336 if (base && !index && !disp)
29338 /* esp (for its index) and ebp (for its displacement) need
29339 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29340 code. */
29341 if (base == arg_pointer_rtx
29342 || base == frame_pointer_rtx
29343 || REGNO (base) == SP_REG
29344 || REGNO (base) == BP_REG
29345 || REGNO (base) == R12_REG
29346 || REGNO (base) == R13_REG)
29347 len++;
29350 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29351 is not disp32, but disp32(%rip), so for disp32
29352 SIB byte is needed, unless print_operand_address
29353 optimizes it into disp32(%rip) or (%rip) is implied
29354 by UNSPEC. */
29355 else if (disp && !base && !index)
29357 len += 4;
29358 if (!ix86_rip_relative_addr_p (&parts))
29359 len++;
29361 else
29363 /* Find the length of the displacement constant. */
29364 if (disp)
29366 if (base && satisfies_constraint_K (disp))
29367 len += 1;
29368 else
29369 len += 4;
29371 /* ebp always wants a displacement. Similarly r13. */
29372 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29373 len++;
29375 /* An index requires the two-byte modrm form.... */
29376 if (index
29377 /* ...like esp (or r12), which always wants an index. */
29378 || base == arg_pointer_rtx
29379 || base == frame_pointer_rtx
29380 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29381 len++;
29384 return len;
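/* Illustrative examples of the value computed above (which excludes the
   one-byte modrm and the opcode):
     (%eax)      -> 0
     (%esp)      -> 1  (SIB byte)
     (%ebp)      -> 1  (disp8)
     16(%eax)    -> 1  (disp8)
     1024(%eax)  -> 4  (disp32)
   and a bare disp32 that is not %rip-relative in 64-bit mode costs
   4 + 1 for the extra SIB byte.  */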
29387 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29388 is set, expect that the insn has an 8-bit immediate alternative. */
29389 int
29390 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29392 int len = 0;
29393 int i;
29394 extract_insn_cached (insn);
29395 for (i = recog_data.n_operands - 1; i >= 0; --i)
29396 if (CONSTANT_P (recog_data.operand[i]))
29398 enum attr_mode mode = get_attr_mode (insn);
29400 gcc_assert (!len);
29401 if (shortform && CONST_INT_P (recog_data.operand[i]))
29403 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29404 switch (mode)
29406 case MODE_QI:
29407 len = 1;
29408 continue;
29409 case MODE_HI:
29410 ival = trunc_int_for_mode (ival, HImode);
29411 break;
29412 case MODE_SI:
29413 ival = trunc_int_for_mode (ival, SImode);
29414 break;
29415 default:
29416 break;
29418 if (IN_RANGE (ival, -128, 127))
29420 len = 1;
29421 continue;
29424 switch (mode)
29426 case MODE_QI:
29427 len = 1;
29428 break;
29429 case MODE_HI:
29430 len = 2;
29431 break;
29432 case MODE_SI:
29433 len = 4;
29434 break;
29435 /* Immediates for DImode instructions are encoded
29436 as 32-bit sign-extended values. */
29437 case MODE_DI:
29438 len = 4;
29439 break;
29440 default:
29441 fatal_insn ("unknown insn mode", insn);
29444 return len;
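/* Illustrative example: with SHORTFORM set, "add $100, %eax" (MODE_SI)
   gets length 1 because 100 fits in the sign-extended range -128..127,
   while "add $1000, %eax" gets length 4.  */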
29447 /* Compute default value for "length_address" attribute. */
29448 int
29449 ix86_attr_length_address_default (rtx_insn *insn)
29451 int i;
29453 if (get_attr_type (insn) == TYPE_LEA)
29455 rtx set = PATTERN (insn), addr;
29457 if (GET_CODE (set) == PARALLEL)
29458 set = XVECEXP (set, 0, 0);
29460 gcc_assert (GET_CODE (set) == SET);
29462 addr = SET_SRC (set);
29464 return memory_address_length (addr, true);
29467 extract_insn_cached (insn);
29468 for (i = recog_data.n_operands - 1; i >= 0; --i)
29470 rtx op = recog_data.operand[i];
29471 if (MEM_P (op))
29473 constrain_operands_cached (insn, reload_completed);
29474 if (which_alternative != -1)
29476 const char *constraints = recog_data.constraints[i];
29477 int alt = which_alternative;
29479 while (*constraints == '=' || *constraints == '+')
29480 constraints++;
29481 while (alt-- > 0)
29482 while (*constraints++ != ',')
29484 /* Skip ignored operands. */
29485 if (*constraints == 'X')
29486 continue;
29489 int len = memory_address_length (XEXP (op, 0), false);
29491 /* Account for segment prefix for non-default addr spaces. */
29492 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29493 len++;
29495 return len;
29498 return 0;
29501 /* Compute default value for "length_vex" attribute. It includes
29502 2 or 3 byte VEX prefix and 1 opcode byte. */
29504 int
29505 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29506 bool has_vex_w)
29508 int i;
29510 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
29511 requires the 3-byte VEX prefix. */
29512 if (!has_0f_opcode || has_vex_w)
29513 return 3 + 1;
29515 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
29516 if (!TARGET_64BIT)
29517 return 2 + 1;
29519 extract_insn_cached (insn);
29521 for (i = recog_data.n_operands - 1; i >= 0; --i)
29522 if (REG_P (recog_data.operand[i]))
29524 /* REX.W bit uses 3 byte VEX prefix. */
29525 if (GET_MODE (recog_data.operand[i]) == DImode
29526 && GENERAL_REG_P (recog_data.operand[i]))
29527 return 3 + 1;
29529 else
29531 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29532 if (MEM_P (recog_data.operand[i])
29533 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29534 return 3 + 1;
29537 return 2 + 1;
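/* Illustrative example: "vaddps %xmm1, %xmm2, %xmm3" (0f map, no VEX.W,
   no extended registers) can use the 2-byte VEX prefix, giving 2 + 1 = 3,
   while an insn that needs VEX.W, a non-0f opcode map, or an extended
   register in a memory operand needs the 3-byte prefix, giving 3 + 1 = 4.  */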
29541 static bool
29542 ix86_class_likely_spilled_p (reg_class_t);
29544 /* Return true if the lhs of INSN is a HW function argument register, and
29545 set *IS_SPILLED to true if it is a likely-spilled HW register. */
29546 static bool
29547 insn_is_function_arg (rtx insn, bool* is_spilled)
29549 rtx dst;
29551 if (!NONDEBUG_INSN_P (insn))
29552 return false;
29553 /* Call instructions are not movable; ignore them. */
29554 if (CALL_P (insn))
29555 return false;
29556 insn = PATTERN (insn);
29557 if (GET_CODE (insn) == PARALLEL)
29558 insn = XVECEXP (insn, 0, 0);
29559 if (GET_CODE (insn) != SET)
29560 return false;
29561 dst = SET_DEST (insn);
29562 if (REG_P (dst) && HARD_REGISTER_P (dst)
29563 && ix86_function_arg_regno_p (REGNO (dst)))
29565 /* Is it likely spilled HW register? */
29566 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29567 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29568 *is_spilled = true;
29569 return true;
29571 return false;
29574 /* Add output dependencies for a chain of adjacent function arguments, but
29575 only if there is a move to a likely-spilled HW register. Return the first
29576 argument if at least one dependence was added, or NULL otherwise. */
29577 static rtx_insn *
29578 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29580 rtx_insn *insn;
29581 rtx_insn *last = call;
29582 rtx_insn *first_arg = NULL;
29583 bool is_spilled = false;
29585 head = PREV_INSN (head);
29587 /* Find the argument-passing instruction nearest to the call. */
29588 while (true)
29590 last = PREV_INSN (last);
29591 if (last == head)
29592 return NULL;
29593 if (!NONDEBUG_INSN_P (last))
29594 continue;
29595 if (insn_is_function_arg (last, &is_spilled))
29596 break;
29597 return NULL;
29600 first_arg = last;
29601 while (true)
29603 insn = PREV_INSN (last);
29604 if (!INSN_P (insn))
29605 break;
29606 if (insn == head)
29607 break;
29608 if (!NONDEBUG_INSN_P (insn))
29610 last = insn;
29611 continue;
29613 if (insn_is_function_arg (insn, &is_spilled))
29615 /* Add an output dependence between two function arguments if the chain
29616 of output arguments contains likely-spilled HW registers. */
29617 if (is_spilled)
29618 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29619 first_arg = last = insn;
29621 else
29622 break;
29624 if (!is_spilled)
29625 return NULL;
29626 return first_arg;
29629 /* Add output or anti dependency from insn to first_arg to restrict its code
29630 motion. */
29631 static void
29632 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29634 rtx set;
29635 rtx tmp;
29637 /* Add anti dependencies for bounds stores. */
29638 if (INSN_P (insn)
29639 && GET_CODE (PATTERN (insn)) == PARALLEL
29640 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29641 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29643 add_dependence (first_arg, insn, REG_DEP_ANTI);
29644 return;
29647 set = single_set (insn);
29648 if (!set)
29649 return;
29650 tmp = SET_DEST (set);
29651 if (REG_P (tmp))
29653 /* Add output dependency to the first function argument. */
29654 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29655 return;
29657 /* Add anti dependency. */
29658 add_dependence (first_arg, insn, REG_DEP_ANTI);
29661 /* Avoid cross-block motion of a function argument by adding a dependency
29662 from the first non-jump instruction in bb. */
29663 static void
29664 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29666 rtx_insn *insn = BB_END (bb);
29668 while (insn)
29670 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29672 rtx set = single_set (insn);
29673 if (set)
29675 avoid_func_arg_motion (arg, insn);
29676 return;
29679 if (insn == BB_HEAD (bb))
29680 return;
29681 insn = PREV_INSN (insn);
29685 /* Hook for pre-reload schedule - avoid motion of function arguments
29686 passed in likely spilled HW registers. */
29687 static void
29688 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29690 rtx_insn *insn;
29691 rtx_insn *first_arg = NULL;
29692 if (reload_completed)
29693 return;
29694 while (head != tail && DEBUG_INSN_P (head))
29695 head = NEXT_INSN (head);
29696 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29697 if (INSN_P (insn) && CALL_P (insn))
29699 first_arg = add_parameter_dependencies (insn, head);
29700 if (first_arg)
29702 /* Add a dependee for the first argument to predecessors, but only
29703 if the region contains more than one block. */
29704 basic_block bb = BLOCK_FOR_INSN (insn);
29705 int rgn = CONTAINING_RGN (bb->index);
29706 int nr_blks = RGN_NR_BLOCKS (rgn);
29707 /* Skip trivial regions and region head blocks that can have
29708 predecessors outside of region. */
29709 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29711 edge e;
29712 edge_iterator ei;
29714 /* Regions are SCCs with the exception of selective
29715 scheduling with pipelining of outer blocks enabled.
29716 So also check that immediate predecessors of a non-head
29717 block are in the same region. */
29718 FOR_EACH_EDGE (e, ei, bb->preds)
29720 /* Avoid creating loop-carried dependencies by using
29721 the topological ordering in the region. */
29722 if (rgn == CONTAINING_RGN (e->src->index)
29723 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29724 add_dependee_for_func_arg (first_arg, e->src);
29727 insn = first_arg;
29728 if (insn == head)
29729 break;
29732 else if (first_arg)
29733 avoid_func_arg_motion (first_arg, insn);
29736 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29737 HW registers to maximum, to schedule them as soon as possible. These are
29738 moves from function argument registers at the top of the function entry
29739 and moves from function return value registers after a call. */
29740 static int
29741 ix86_adjust_priority (rtx_insn *insn, int priority)
29743 rtx set;
29745 if (reload_completed)
29746 return priority;
29748 if (!NONDEBUG_INSN_P (insn))
29749 return priority;
29751 set = single_set (insn);
29752 if (set)
29754 rtx tmp = SET_SRC (set);
29755 if (REG_P (tmp)
29756 && HARD_REGISTER_P (tmp)
29757 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29758 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29759 return current_sched_info->sched_max_insns_priority;
29762 return priority;
29765 /* Prepare for scheduling pass. */
29766 static void
29767 ix86_sched_init_global (FILE *, int, int)
29769 /* Install scheduling hooks for current CPU. Some of these hooks are used
29770 in time-critical parts of the scheduler, so we only set them up when
29771 they are actually used. */
29772 switch (ix86_tune)
29774 case PROCESSOR_CORE2:
29775 case PROCESSOR_NEHALEM:
29776 case PROCESSOR_SANDYBRIDGE:
29777 case PROCESSOR_HASWELL:
29778 case PROCESSOR_GENERIC:
29779 /* Do not perform multipass scheduling for pre-reload schedule
29780 to save compile time. */
29781 if (reload_completed)
29783 ix86_core2i7_init_hooks ();
29784 break;
29786 /* Fall through. */
29787 default:
29788 targetm.sched.dfa_post_advance_cycle = NULL;
29789 targetm.sched.first_cycle_multipass_init = NULL;
29790 targetm.sched.first_cycle_multipass_begin = NULL;
29791 targetm.sched.first_cycle_multipass_issue = NULL;
29792 targetm.sched.first_cycle_multipass_backtrack = NULL;
29793 targetm.sched.first_cycle_multipass_end = NULL;
29794 targetm.sched.first_cycle_multipass_fini = NULL;
29795 break;
29800 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29802 static HOST_WIDE_INT
29803 ix86_static_rtx_alignment (machine_mode mode)
29805 if (mode == DFmode)
29806 return 64;
29807 if (ALIGN_MODE_128 (mode))
29808 return MAX (128, GET_MODE_ALIGNMENT (mode));
29809 return GET_MODE_ALIGNMENT (mode);
29812 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29814 static HOST_WIDE_INT
29815 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29817 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29818 || TREE_CODE (exp) == INTEGER_CST)
29820 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29821 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29822 return MAX (mode_align, align);
29824 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29825 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29826 return BITS_PER_WORD;
29828 return align;
29831 /* Implement TARGET_EMPTY_RECORD_P. */
29833 static bool
29834 ix86_is_empty_record (const_tree type)
29836 if (!TARGET_64BIT)
29837 return false;
29838 return default_is_empty_record (type);
29841 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
29843 static void
29844 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
29846 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
29848 if (!cum->warn_empty)
29849 return;
29851 if (!TYPE_EMPTY_P (type))
29852 return;
29854 const_tree ctx = get_ultimate_context (cum->decl);
29855 if (ctx != NULL_TREE
29856 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
29857 return;
29859 /* If the actual size of the type is zero, then there is no change
29860 in how objects of this size are passed. */
29861 if (int_size_in_bytes (type) == 0)
29862 return;
29864 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
29865 "changes in -fabi-version=12 (GCC 8)", type);
29867 /* Only warn once. */
29868 cum->warn_empty = false;
29871 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
29872 the data type, and ALIGN is the alignment that the object would
29873 ordinarily have. */
29875 static int
29876 iamcu_alignment (tree type, int align)
29878 machine_mode mode;
29880 if (align < 32 || TYPE_USER_ALIGN (type))
29881 return align;
29883 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
29884 bytes. */
29885 mode = TYPE_MODE (strip_array_types (type));
29886 switch (GET_MODE_CLASS (mode))
29888 case MODE_INT:
29889 case MODE_COMPLEX_INT:
29890 case MODE_COMPLEX_FLOAT:
29891 case MODE_FLOAT:
29892 case MODE_DECIMAL_FLOAT:
29893 return 32;
29894 default:
29895 return align;
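/* Illustrative example: under the Intel MCU psABI a scalar such as
   double or long long, which would ordinarily get 64-bit alignment,
   is capped at 32 bits by the switch above.  */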
29899 /* Compute the alignment for a static variable.
29900 TYPE is the data type, and ALIGN is the alignment that
29901 the object would ordinarily have. The value of this function is used
29902 instead of that alignment to align the object. */
29904 int
29905 ix86_data_alignment (tree type, int align, bool opt)
29907 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
29908 for symbols from other compilation units or symbols that don't need
29909 to bind locally. In order to preserve some ABI compatibility with
29910 those compilers, ensure we don't decrease alignment from what we
29911 used to assume. */
29913 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
29915 /* A data structure equal to or greater than the size of a cache line
29916 (64 bytes in the Pentium 4 and other recent Intel processors, including
29917 processors based on Intel Core microarchitecture) should be aligned
29918 so that its base address is a multiple of the cache line size. */
29920 int max_align
29921 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29923 if (max_align < BITS_PER_WORD)
29924 max_align = BITS_PER_WORD;
29926 switch (ix86_align_data_type)
29928 case ix86_align_data_type_abi: opt = false; break;
29929 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29930 case ix86_align_data_type_cacheline: break;
29933 if (TARGET_IAMCU)
29934 align = iamcu_alignment (type, align);
29936 if (opt
29937 && AGGREGATE_TYPE_P (type)
29938 && TYPE_SIZE (type)
29939 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29941 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29942 && align < max_align_compat)
29943 align = max_align_compat;
29944 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29945 && align < max_align)
29946 align = max_align;
29949 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29950 to a 16-byte boundary. */
29951 if (TARGET_64BIT)
29953 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29954 && TYPE_SIZE (type)
29955 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29956 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29957 && align < 128)
29958 return 128;
29961 if (!opt)
29962 return align;
29964 if (TREE_CODE (type) == ARRAY_TYPE)
29966 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29967 return 64;
29968 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29969 return 128;
29971 else if (TREE_CODE (type) == COMPLEX_TYPE)
29974 if (TYPE_MODE (type) == DCmode && align < 64)
29975 return 64;
29976 if ((TYPE_MODE (type) == XCmode
29977 || TYPE_MODE (type) == TCmode) && align < 128)
29978 return 128;
29980 else if ((TREE_CODE (type) == RECORD_TYPE
29981 || TREE_CODE (type) == UNION_TYPE
29982 || TREE_CODE (type) == QUAL_UNION_TYPE)
29983 && TYPE_FIELDS (type))
29985 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29986 return 64;
29987 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29988 return 128;
29990 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29991 || TREE_CODE (type) == INTEGER_TYPE)
29993 if (TYPE_MODE (type) == DFmode && align < 64)
29994 return 64;
29995 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29996 return 128;
29999 return align;
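/* Illustrative example: for a static "char buf[20]" on x86-64,
   TYPE_SIZE is 160 bits >= 128, so the array rule above raises the
   alignment to 128 bits (16 bytes) if it was smaller.  */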
30002 /* Compute the alignment for a local variable or a stack slot. EXP is
30003 the data type or decl itself, MODE is the widest mode available and
30004 ALIGN is the alignment that the object would ordinarily have. The
30005 value of this macro is used instead of that alignment to align the
30006 object. */
30008 unsigned int
30009 ix86_local_alignment (tree exp, machine_mode mode,
30010 unsigned int align)
30012 tree type, decl;
30014 if (exp && DECL_P (exp))
30016 type = TREE_TYPE (exp);
30017 decl = exp;
30019 else
30021 type = exp;
30022 decl = NULL;
30025 /* Don't do dynamic stack realignment for long long objects with
30026 -mpreferred-stack-boundary=2. */
30027 if (!TARGET_64BIT
30028 && align == 64
30029 && ix86_preferred_stack_boundary < 64
30030 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30031 && (!type || !TYPE_USER_ALIGN (type))
30032 && (!decl || !DECL_USER_ALIGN (decl)))
30033 align = 32;
30035 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30036 register in MODE. We will return the largest alignment of XF
30037 and DF. */
30038 if (!type)
30040 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30041 align = GET_MODE_ALIGNMENT (DFmode);
30042 return align;
30045 /* Don't increase alignment for Intel MCU psABI. */
30046 if (TARGET_IAMCU)
30047 return align;
30049 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
30050 to a 16-byte boundary. Exact wording is:
30052 An array uses the same alignment as its elements, except that a local or
30053 global array variable of length at least 16 bytes or
30054 a C99 variable-length array variable always has alignment of at least 16 bytes.
30056 This was added to allow use of aligned SSE instructions on arrays. This
30057 rule is meant for static storage (where the compiler cannot do the analysis
30058 by itself). We follow it for automatic variables only when convenient.
30059 We fully control everything in the function being compiled, and functions from
30060 other units cannot rely on the alignment.
30062 Exclude the va_list type. It is the common case of a local array where
30063 we cannot benefit from the alignment.
30065 TODO: Probably one should optimize for size only when the variable does not escape. */
30066 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30067 && TARGET_SSE)
30069 if (AGGREGATE_TYPE_P (type)
30070 && (va_list_type_node == NULL_TREE
30071 || (TYPE_MAIN_VARIANT (type)
30072 != TYPE_MAIN_VARIANT (va_list_type_node)))
30073 && TYPE_SIZE (type)
30074 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30075 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30076 && align < 128)
30077 return 128;
30079 if (TREE_CODE (type) == ARRAY_TYPE)
30081 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30082 return 64;
30083 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30084 return 128;
30086 else if (TREE_CODE (type) == COMPLEX_TYPE)
30088 if (TYPE_MODE (type) == DCmode && align < 64)
30089 return 64;
30090 if ((TYPE_MODE (type) == XCmode
30091 || TYPE_MODE (type) == TCmode) && align < 128)
30092 return 128;
30094 else if ((TREE_CODE (type) == RECORD_TYPE
30095 || TREE_CODE (type) == UNION_TYPE
30096 || TREE_CODE (type) == QUAL_UNION_TYPE)
30097 && TYPE_FIELDS (type))
30099 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30100 return 64;
30101 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30102 return 128;
30104 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30105 || TREE_CODE (type) == INTEGER_TYPE)
30108 if (TYPE_MODE (type) == DFmode && align < 64)
30109 return 64;
30110 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30111 return 128;
30113 return align;
30116 /* Compute the minimum required alignment for dynamic stack realignment
30117 purposes for a local variable, parameter or a stack slot. EXP is
30118 the data type or decl itself, MODE is its mode and ALIGN is the
30119 alignment that the object would ordinarily have. */
30121 unsigned int
30122 ix86_minimum_alignment (tree exp, machine_mode mode,
30123 unsigned int align)
30125 tree type, decl;
30127 if (exp && DECL_P (exp))
30129 type = TREE_TYPE (exp);
30130 decl = exp;
30132 else
30134 type = exp;
30135 decl = NULL;
30138 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30139 return align;
30141 /* Don't do dynamic stack realignment for long long objects with
30142 -mpreferred-stack-boundary=2. */
30143 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30144 && (!type || !TYPE_USER_ALIGN (type))
30145 && (!decl || !DECL_USER_ALIGN (decl)))
30147 gcc_checking_assert (!TARGET_STV);
30148 return 32;
30151 return align;
30154 /* Find a location for the static chain incoming to a nested function.
30155 This is a register, unless all free registers are used by arguments. */
30157 static rtx
30158 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30160 unsigned regno;
30162 if (TARGET_64BIT)
30164 /* We always use R10 in 64-bit mode. */
30165 regno = R10_REG;
30167 else
30169 const_tree fntype, fndecl;
30170 unsigned int ccvt;
30172 /* By default in 32-bit mode we use ECX to pass the static chain. */
30173 regno = CX_REG;
30175 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30177 fntype = TREE_TYPE (fndecl_or_type);
30178 fndecl = fndecl_or_type;
30180 else
30182 fntype = fndecl_or_type;
30183 fndecl = NULL;
30186 ccvt = ix86_get_callcvt (fntype);
30187 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30189 /* Fastcall functions use ecx/edx for arguments, which leaves
30190 us with EAX for the static chain.
30191 Thiscall functions use ecx for arguments, which also
30192 leaves us with EAX for the static chain. */
30193 regno = AX_REG;
30195 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30197 /* Thiscall functions use ecx for arguments, which leaves
30198 us with EAX and EDX for the static chain.
30199 We use EAX for ABI compatibility. */
30200 regno = AX_REG;
30202 else if (ix86_function_regparm (fntype, fndecl) == 3)
30204 /* For regparm 3, we have no free call-clobbered registers in
30205 which to store the static chain. In order to implement this,
30206 we have the trampoline push the static chain to the stack.
30207 However, we can't push a value below the return address when
30208 we call the nested function directly, so we have to use an
30209 alternate entry point. For this we use ESI, and have the
30210 alternate entry point push ESI, so that things appear the
30211 same once we're executing the nested function. */
30212 if (incoming_p)
30214 if (fndecl == current_function_decl
30215 && !ix86_static_chain_on_stack)
30217 gcc_assert (!reload_completed);
30218 ix86_static_chain_on_stack = true;
30220 return gen_frame_mem (SImode,
30221 plus_constant (Pmode,
30222 arg_pointer_rtx, -8));
30224 regno = SI_REG;
30228 return gen_rtx_REG (Pmode, regno);
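/* Summary of the choices made above (editorial restatement): 64-bit
   targets always use R10; 32-bit code defaults to ECX; fastcall and
   thiscall functions use EAX; for regparm(3) functions the trampoline
   pushes the chain on the stack, while direct calls pass it in ESI and
   enter through an alternate entry point that pushes ESI.  */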
30231 /* Emit RTL insns to initialize the variable parts of a trampoline.
30232 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30233 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30234 to be passed to the target function. */
30236 static void
30237 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30239 rtx mem, fnaddr;
30240 int opcode;
30241 int offset = 0;
30243 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30245 if (TARGET_64BIT)
30247 int size;
30249 /* Load the function address into r11. Try to load the address using
30250 the shorter movl instead of movabs. We may want to support
30251 movq for kernel mode, but the kernel does not use trampolines at
30252 the moment. FNADDR is a 32-bit address and may not be in
30253 DImode when ptr_mode == SImode. Always use movl in this
30254 case. */
30255 if (ptr_mode == SImode
30256 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30258 fnaddr = copy_addr_to_reg (fnaddr);
30260 mem = adjust_address (m_tramp, HImode, offset);
30261 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30263 mem = adjust_address (m_tramp, SImode, offset + 2);
30264 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30265 offset += 6;
30267 else
30269 mem = adjust_address (m_tramp, HImode, offset);
30270 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30272 mem = adjust_address (m_tramp, DImode, offset + 2);
30273 emit_move_insn (mem, fnaddr);
30274 offset += 10;
30277 /* Load static chain using movabs to r10. Use the shorter movl
30278 instead of movabs when ptr_mode == SImode. */
30279 if (ptr_mode == SImode)
30281 opcode = 0xba41;
30282 size = 6;
30284 else
30286 opcode = 0xba49;
30287 size = 10;
30290 mem = adjust_address (m_tramp, HImode, offset);
30291 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30293 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30294 emit_move_insn (mem, chain_value);
30295 offset += size;
30297 /* Jump to r11; the last (unused) byte is a nop, only there to
30298 pad the write out to a single 32-bit store. */
30299 mem = adjust_address (m_tramp, SImode, offset);
30300 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30301 offset += 4;
30303 else
30305 rtx disp, chain;
30307 /* Depending on the static chain location, either load a register
30308 with a constant, or push the constant to the stack. All of the
30309 instructions are the same size. */
30310 chain = ix86_static_chain (fndecl, true);
30311 if (REG_P (chain))
30313 switch (REGNO (chain))
30315 case AX_REG:
30316 opcode = 0xb8; break;
30317 case CX_REG:
30318 opcode = 0xb9; break;
30319 default:
30320 gcc_unreachable ();
30323 else
30324 opcode = 0x68;
30326 mem = adjust_address (m_tramp, QImode, offset);
30327 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30329 mem = adjust_address (m_tramp, SImode, offset + 1);
30330 emit_move_insn (mem, chain_value);
30331 offset += 5;
30333 mem = adjust_address (m_tramp, QImode, offset);
30334 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30336 mem = adjust_address (m_tramp, SImode, offset + 1);
30338 /* Compute offset from the end of the jmp to the target function.
30339 In the case in which the trampoline stores the static chain on
30340 the stack, we need to skip the first insn which pushes the
30341 (call-saved) register static chain; this push is 1 byte. */
30342 offset += 5;
30343 disp = expand_binop (SImode, sub_optab, fnaddr,
30344 plus_constant (Pmode, XEXP (m_tramp, 0),
30345 offset - (MEM_P (chain) ? 1 : 0)),
30346 NULL_RTX, 1, OPTAB_DIRECT);
30347 emit_move_insn (mem, disp);
30350 gcc_assert (offset <= TRAMPOLINE_SIZE);
30352 #ifdef HAVE_ENABLE_EXECUTE_STACK
30353 #ifdef CHECK_EXECUTE_STACK_ENABLED
30354 if (CHECK_EXECUTE_STACK_ENABLED)
30355 #endif
30356 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30357 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30358 #endif
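/* Illustrative byte layout of the 64-bit movabs variant written above
   (the little-endian HImode/SImode stores decode as):
	49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
	49 ba <8-byte chain>     movabs $chain,  %r10
	49 ff e3                 jmp *%r11
	90                       nop  (pads the final 32-bit store)
   The 32-bit trampoline is "mov $chain, %eax/%ecx" (0xb8/0xb9) or
   "push $chain" (0x68), followed by "jmp rel32" (0xe9) to the target.  */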
30361 static bool
30362 ix86_allocate_stack_slots_for_args (void)
30364 /* Naked functions should not allocate stack slots for arguments. */
30365 return !ix86_function_naked (current_function_decl);
30368 static bool
30369 ix86_warn_func_return (tree decl)
30371 /* Naked functions are implemented entirely in assembly, including the
30372 return sequence, so suppress warnings about this. */
30373 return !ix86_function_naked (decl);
30376 /* The following file contains several enumerations and data structures
30377 built from the definitions in i386-builtin-types.def. */
30379 #include "i386-builtin-types.inc"
30381 /* Table for the ix86 builtin non-function types. */
30382 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30384 /* Retrieve an element from the above table, building some of
30385 the types lazily. */
30387 static tree
30388 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30390 unsigned int index;
30391 tree type, itype;
30393 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30395 type = ix86_builtin_type_tab[(int) tcode];
30396 if (type != NULL)
30397 return type;
30399 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30400 if (tcode <= IX86_BT_LAST_VECT)
30402 machine_mode mode;
30404 index = tcode - IX86_BT_LAST_PRIM - 1;
30405 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30406 mode = ix86_builtin_type_vect_mode[index];
30408 type = build_vector_type_for_mode (itype, mode);
30410 else
30412 int quals;
30414 index = tcode - IX86_BT_LAST_VECT - 1;
30415 if (tcode <= IX86_BT_LAST_PTR)
30416 quals = TYPE_UNQUALIFIED;
30417 else
30418 quals = TYPE_QUAL_CONST;
30420 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30421 if (quals != TYPE_UNQUALIFIED)
30422 itype = build_qualified_type (itype, quals);
30424 type = build_pointer_type (itype);
30427 ix86_builtin_type_tab[(int) tcode] = type;
30428 return type;
30431 /* Table for the ix86 builtin function types. */
30432 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30434 /* Retrieve an element from the above table, building some of
30435 the types lazily. */
30437 static tree
30438 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30440 tree type;
30442 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30444 type = ix86_builtin_func_type_tab[(int) tcode];
30445 if (type != NULL)
30446 return type;
30448 if (tcode <= IX86_BT_LAST_FUNC)
30450 unsigned start = ix86_builtin_func_start[(int) tcode];
30451 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30452 tree rtype, atype, args = void_list_node;
30453 unsigned i;
30455 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30456 for (i = after - 1; i > start; --i)
30458 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30459 args = tree_cons (NULL, atype, args);
30462 type = build_function_type (rtype, args);
30464 else
30466 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30467 enum ix86_builtin_func_type icode;
30469 icode = ix86_builtin_func_alias_base[index];
30470 type = ix86_get_builtin_func_type (icode);
30473 ix86_builtin_func_type_tab[(int) tcode] = type;
30474 return type;
30478 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30479 bdesc_* arrays below should come first, then builtins for each bdesc_*
30480 array in ascending order, so that we can use direct array accesses. */
30481 enum ix86_builtins
30483 IX86_BUILTIN_MASKMOVQ,
30484 IX86_BUILTIN_LDMXCSR,
30485 IX86_BUILTIN_STMXCSR,
30486 IX86_BUILTIN_MASKMOVDQU,
30487 IX86_BUILTIN_PSLLDQ128,
30488 IX86_BUILTIN_CLFLUSH,
30489 IX86_BUILTIN_MONITOR,
30490 IX86_BUILTIN_MWAIT,
30491 IX86_BUILTIN_CLZERO,
30492 IX86_BUILTIN_VEC_INIT_V2SI,
30493 IX86_BUILTIN_VEC_INIT_V4HI,
30494 IX86_BUILTIN_VEC_INIT_V8QI,
30495 IX86_BUILTIN_VEC_EXT_V2DF,
30496 IX86_BUILTIN_VEC_EXT_V2DI,
30497 IX86_BUILTIN_VEC_EXT_V4SF,
30498 IX86_BUILTIN_VEC_EXT_V4SI,
30499 IX86_BUILTIN_VEC_EXT_V8HI,
30500 IX86_BUILTIN_VEC_EXT_V2SI,
30501 IX86_BUILTIN_VEC_EXT_V4HI,
30502 IX86_BUILTIN_VEC_EXT_V16QI,
30503 IX86_BUILTIN_VEC_SET_V2DI,
30504 IX86_BUILTIN_VEC_SET_V4SF,
30505 IX86_BUILTIN_VEC_SET_V4SI,
30506 IX86_BUILTIN_VEC_SET_V8HI,
30507 IX86_BUILTIN_VEC_SET_V4HI,
30508 IX86_BUILTIN_VEC_SET_V16QI,
30509 IX86_BUILTIN_GATHERSIV2DF,
30510 IX86_BUILTIN_GATHERSIV4DF,
30511 IX86_BUILTIN_GATHERDIV2DF,
30512 IX86_BUILTIN_GATHERDIV4DF,
30513 IX86_BUILTIN_GATHERSIV4SF,
30514 IX86_BUILTIN_GATHERSIV8SF,
30515 IX86_BUILTIN_GATHERDIV4SF,
30516 IX86_BUILTIN_GATHERDIV8SF,
30517 IX86_BUILTIN_GATHERSIV2DI,
30518 IX86_BUILTIN_GATHERSIV4DI,
30519 IX86_BUILTIN_GATHERDIV2DI,
30520 IX86_BUILTIN_GATHERDIV4DI,
30521 IX86_BUILTIN_GATHERSIV4SI,
30522 IX86_BUILTIN_GATHERSIV8SI,
30523 IX86_BUILTIN_GATHERDIV4SI,
30524 IX86_BUILTIN_GATHERDIV8SI,
30525 IX86_BUILTIN_VFMSUBSD3_MASK3,
30526 IX86_BUILTIN_VFMSUBSS3_MASK3,
30527 IX86_BUILTIN_GATHER3SIV8SF,
30528 IX86_BUILTIN_GATHER3SIV4SF,
30529 IX86_BUILTIN_GATHER3SIV4DF,
30530 IX86_BUILTIN_GATHER3SIV2DF,
30531 IX86_BUILTIN_GATHER3DIV8SF,
30532 IX86_BUILTIN_GATHER3DIV4SF,
30533 IX86_BUILTIN_GATHER3DIV4DF,
30534 IX86_BUILTIN_GATHER3DIV2DF,
30535 IX86_BUILTIN_GATHER3SIV8SI,
30536 IX86_BUILTIN_GATHER3SIV4SI,
30537 IX86_BUILTIN_GATHER3SIV4DI,
30538 IX86_BUILTIN_GATHER3SIV2DI,
30539 IX86_BUILTIN_GATHER3DIV8SI,
30540 IX86_BUILTIN_GATHER3DIV4SI,
30541 IX86_BUILTIN_GATHER3DIV4DI,
30542 IX86_BUILTIN_GATHER3DIV2DI,
30543 IX86_BUILTIN_SCATTERSIV8SF,
30544 IX86_BUILTIN_SCATTERSIV4SF,
30545 IX86_BUILTIN_SCATTERSIV4DF,
30546 IX86_BUILTIN_SCATTERSIV2DF,
30547 IX86_BUILTIN_SCATTERDIV8SF,
30548 IX86_BUILTIN_SCATTERDIV4SF,
30549 IX86_BUILTIN_SCATTERDIV4DF,
30550 IX86_BUILTIN_SCATTERDIV2DF,
30551 IX86_BUILTIN_SCATTERSIV8SI,
30552 IX86_BUILTIN_SCATTERSIV4SI,
30553 IX86_BUILTIN_SCATTERSIV4DI,
30554 IX86_BUILTIN_SCATTERSIV2DI,
30555 IX86_BUILTIN_SCATTERDIV8SI,
30556 IX86_BUILTIN_SCATTERDIV4SI,
30557 IX86_BUILTIN_SCATTERDIV4DI,
30558 IX86_BUILTIN_SCATTERDIV2DI,
30559 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30560 where all operands are 32-byte or 64-byte wide respectively. */
30561 IX86_BUILTIN_GATHERALTSIV4DF,
30562 IX86_BUILTIN_GATHERALTDIV8SF,
30563 IX86_BUILTIN_GATHERALTSIV4DI,
30564 IX86_BUILTIN_GATHERALTDIV8SI,
30565 IX86_BUILTIN_GATHER3ALTDIV16SF,
30566 IX86_BUILTIN_GATHER3ALTDIV16SI,
30567 IX86_BUILTIN_GATHER3ALTSIV4DF,
30568 IX86_BUILTIN_GATHER3ALTDIV8SF,
30569 IX86_BUILTIN_GATHER3ALTSIV4DI,
30570 IX86_BUILTIN_GATHER3ALTDIV8SI,
30571 IX86_BUILTIN_GATHER3ALTSIV8DF,
30572 IX86_BUILTIN_GATHER3ALTSIV8DI,
30573 IX86_BUILTIN_GATHER3DIV16SF,
30574 IX86_BUILTIN_GATHER3DIV16SI,
30575 IX86_BUILTIN_GATHER3DIV8DF,
30576 IX86_BUILTIN_GATHER3DIV8DI,
30577 IX86_BUILTIN_GATHER3SIV16SF,
30578 IX86_BUILTIN_GATHER3SIV16SI,
30579 IX86_BUILTIN_GATHER3SIV8DF,
30580 IX86_BUILTIN_GATHER3SIV8DI,
30581 IX86_BUILTIN_SCATTERALTSIV8DF,
30582 IX86_BUILTIN_SCATTERALTDIV16SF,
30583 IX86_BUILTIN_SCATTERALTSIV8DI,
30584 IX86_BUILTIN_SCATTERALTDIV16SI,
30585 IX86_BUILTIN_SCATTERDIV16SF,
30586 IX86_BUILTIN_SCATTERDIV16SI,
30587 IX86_BUILTIN_SCATTERDIV8DF,
30588 IX86_BUILTIN_SCATTERDIV8DI,
30589 IX86_BUILTIN_SCATTERSIV16SF,
30590 IX86_BUILTIN_SCATTERSIV16SI,
30591 IX86_BUILTIN_SCATTERSIV8DF,
30592 IX86_BUILTIN_SCATTERSIV8DI,
30593 IX86_BUILTIN_GATHERPFQPD,
30594 IX86_BUILTIN_GATHERPFDPS,
30595 IX86_BUILTIN_GATHERPFDPD,
30596 IX86_BUILTIN_GATHERPFQPS,
30597 IX86_BUILTIN_SCATTERPFDPD,
30598 IX86_BUILTIN_SCATTERPFDPS,
30599 IX86_BUILTIN_SCATTERPFQPD,
30600 IX86_BUILTIN_SCATTERPFQPS,
30601 IX86_BUILTIN_CLWB,
30602 IX86_BUILTIN_CLFLUSHOPT,
30603 IX86_BUILTIN_INFQ,
30604 IX86_BUILTIN_HUGE_VALQ,
30605 IX86_BUILTIN_NANQ,
30606 IX86_BUILTIN_NANSQ,
30607 IX86_BUILTIN_XABORT,
30608 IX86_BUILTIN_ADDCARRYX32,
30609 IX86_BUILTIN_ADDCARRYX64,
30610 IX86_BUILTIN_SBB32,
30611 IX86_BUILTIN_SBB64,
30612 IX86_BUILTIN_RDRAND16_STEP,
30613 IX86_BUILTIN_RDRAND32_STEP,
30614 IX86_BUILTIN_RDRAND64_STEP,
30615 IX86_BUILTIN_RDSEED16_STEP,
30616 IX86_BUILTIN_RDSEED32_STEP,
30617 IX86_BUILTIN_RDSEED64_STEP,
30618 IX86_BUILTIN_MONITORX,
30619 IX86_BUILTIN_MWAITX,
30620 IX86_BUILTIN_CFSTRING,
30621 IX86_BUILTIN_CPU_INIT,
30622 IX86_BUILTIN_CPU_IS,
30623 IX86_BUILTIN_CPU_SUPPORTS,
30624 IX86_BUILTIN_READ_FLAGS,
30625 IX86_BUILTIN_WRITE_FLAGS,
30627 /* All the remaining builtins are tracked in bdesc_* arrays in
30628 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30629 this point. */
30630 #define BDESC(mask, icode, name, code, comparison, flag) \
30631 code,
30632 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30633 code, \
30634 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30635 #define BDESC_END(kind, next_kind)
30637 #include "i386-builtin.def"
30639 #undef BDESC
30640 #undef BDESC_FIRST
30641 #undef BDESC_END
30643 IX86_BUILTIN_MAX,
30645 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30647 /* Now just the aliases for bdesc_* start/end. */
30648 #define BDESC(mask, icode, name, code, comparison, flag)
30649 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30650 #define BDESC_END(kind, next_kind) \
30651 IX86_BUILTIN__BDESC_##kind##_LAST \
30652 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30654 #include "i386-builtin.def"
30656 #undef BDESC
30657 #undef BDESC_FIRST
30658 #undef BDESC_END
30660 /* Just to make sure there is no comma after the last enumerator. */
30661 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30664 /* Table for the ix86 builtin decls. */
30665 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30667 /* Table of all of the builtin functions that are possible with different ISAs
30668 but are waiting to be built until a function is declared to use that
30669 ISA. */
30670 struct builtin_isa {
30671 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30672 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30673 const char *name; /* function name */
30674 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30675 unsigned char const_p:1; /* true if the declaration is constant */
30676 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30677 bool leaf_p; /* true if the declaration has leaf attribute */
30678 bool nothrow_p; /* true if the declaration has nothrow attribute */
30679 bool set_and_not_built_p;
30682 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30684 /* Bits that can still enable any inclusion of a builtin. */
30685 static HOST_WIDE_INT deferred_isa_values = 0;
30686 static HOST_WIDE_INT deferred_isa_values2 = 0;
30688 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30689 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30690 function decl in the ix86_builtins array. Returns the function decl or
30691 NULL_TREE, if the builtin was not added.
30693 If the front end has a special hook for builtin functions, delay adding
30694 builtin functions that aren't in the current ISA until the ISA is changed
30695 with function-specific optimization. Doing so can save about 300K for the
30696 default compiler. When the builtin is expanded, check at that time whether
30697 it is valid.
30699 If the front end doesn't have a special hook, record all builtins, even if
30700 they aren't in an instruction set in the current ISA, in case the user uses
30701 function-specific options for a different ISA, so that we don't get scope
30702 errors if a builtin is added in the middle of a function scope. */
30704 static inline tree
30705 def_builtin (HOST_WIDE_INT mask, const char *name,
30706 enum ix86_builtin_func_type tcode,
30707 enum ix86_builtins code)
30709 tree decl = NULL_TREE;
30711 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30713 ix86_builtins_isa[(int) code].isa = mask;
30715 mask &= ~OPTION_MASK_ISA_64BIT;
30717 /* Filter out the masks most often ored together with others. */
30718 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30719 && mask != OPTION_MASK_ISA_AVX512VL)
30720 mask &= ~OPTION_MASK_ISA_AVX512VL;
30721 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30722 && mask != OPTION_MASK_ISA_AVX512BW)
30723 mask &= ~OPTION_MASK_ISA_AVX512BW;
30725 if (mask == 0
30726 || (mask & ix86_isa_flags) != 0
30727 || (lang_hooks.builtin_function
30728 == lang_hooks.builtin_function_ext_scope))
30730 tree type = ix86_get_builtin_func_type (tcode);
30731 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30732 NULL, NULL_TREE);
30733 ix86_builtins[(int) code] = decl;
30734 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30736 else
30738 /* Just a MASK where set_and_not_built_p == true can potentially
30739 include a builtin. */
30740 deferred_isa_values |= mask;
30741 ix86_builtins[(int) code] = NULL_TREE;
30742 ix86_builtins_isa[(int) code].tcode = tcode;
30743 ix86_builtins_isa[(int) code].name = name;
30744 ix86_builtins_isa[(int) code].leaf_p = false;
30745 ix86_builtins_isa[(int) code].nothrow_p = false;
30746 ix86_builtins_isa[(int) code].const_p = false;
30747 ix86_builtins_isa[(int) code].pure_p = false;
30748 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30752 return decl;
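/* Illustrative flow: a builtin guarded by, say, OPTION_MASK_ISA_AVX512F
   in a compilation using only -msse2 is not built here; its mask is
   recorded in deferred_isa_values and the decl is created later by
   ix86_add_new_builtins once a function selects that ISA via the
   target attribute or pragma.  */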
30755 /* Like def_builtin, but also marks the function decl "const". */
30757 static inline tree
30758 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30759 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30761 tree decl = def_builtin (mask, name, tcode, code);
30762 if (decl)
30763 TREE_READONLY (decl) = 1;
30764 else
30765 ix86_builtins_isa[(int) code].const_p = true;
30767 return decl;
30770 /* Like def_builtin, but also marks the function decl "pure". */
30772 static inline tree
30773 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30774 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30776 tree decl = def_builtin (mask, name, tcode, code);
30777 if (decl)
30778 DECL_PURE_P (decl) = 1;
30779 else
30780 ix86_builtins_isa[(int) code].pure_p = true;
30782 return decl;
30785 /* Like def_builtin, but for additional isa2 flags. */
30787 static inline tree
30788 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30789 enum ix86_builtin_func_type tcode,
30790 enum ix86_builtins code)
30792 tree decl = NULL_TREE;
30794 ix86_builtins_isa[(int) code].isa2 = mask;
30796 if (mask == 0
30797 || (mask & ix86_isa_flags2) != 0
30798 || (lang_hooks.builtin_function
30799 == lang_hooks.builtin_function_ext_scope))
30802 tree type = ix86_get_builtin_func_type (tcode);
30803 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30804 NULL, NULL_TREE);
30805 ix86_builtins[(int) code] = decl;
30806 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30808 else
30810 /* Just a MASK where set_and_not_built_p == true can potentially
30811 include a builtin. */
30812 deferred_isa_values2 |= mask;
30813 ix86_builtins[(int) code] = NULL_TREE;
30814 ix86_builtins_isa[(int) code].tcode = tcode;
30815 ix86_builtins_isa[(int) code].name = name;
30816 ix86_builtins_isa[(int) code].leaf_p = false;
30817 ix86_builtins_isa[(int) code].nothrow_p = false;
30818 ix86_builtins_isa[(int) code].const_p = false;
30819 ix86_builtins_isa[(int) code].pure_p = false;
30820 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30823 return decl;
30826 /* Like def_builtin, but also marks the function decl "const". */
30828 static inline tree
30829 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
30830 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30832 tree decl = def_builtin2 (mask, name, tcode, code);
30833 if (decl)
30834 TREE_READONLY (decl) = 1;
30835 else
30836 ix86_builtins_isa[(int) code].const_p = true;
30838 return decl;
30841 /* Like def_builtin, but also marks the function decl "pure". */
30843 static inline tree
30844 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
30845 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30847 tree decl = def_builtin2 (mask, name, tcode, code);
30848 if (decl)
30849 DECL_PURE_P (decl) = 1;
30850 else
30851 ix86_builtins_isa[(int) code].pure_p = true;
30853 return decl;
30856 /* Add any new builtin functions for a given ISA that may not have been
30857 declared. This saves a bit of space compared to adding all of the
30858 declarations to the tree, even if we didn't use them. */
30860 static void
30861 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
30863 isa &= ~OPTION_MASK_ISA_64BIT;
30865 if ((isa & deferred_isa_values) == 0
30866 && (isa2 & deferred_isa_values2) == 0)
30867 return;
30869 /* Bits in ISA value can be removed from potential isa values. */
30870 deferred_isa_values &= ~isa;
30871 deferred_isa_values2 &= ~isa2;
30873 int i;
30874 tree saved_current_target_pragma = current_target_pragma;
30875 current_target_pragma = NULL_TREE;
30877 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30879 if (((ix86_builtins_isa[i].isa & isa) != 0
30880 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
30881 && ix86_builtins_isa[i].set_and_not_built_p)
30883 tree decl, type;
30885 /* Don't define the builtin again. */
30886 ix86_builtins_isa[i].set_and_not_built_p = false;
30888 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30889 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30890 type, i, BUILT_IN_MD, NULL,
30891 NULL_TREE);
30893 ix86_builtins[i] = decl;
30894 if (ix86_builtins_isa[i].const_p)
30895 TREE_READONLY (decl) = 1;
30896 if (ix86_builtins_isa[i].pure_p)
30897 DECL_PURE_P (decl) = 1;
30898 if (ix86_builtins_isa[i].leaf_p)
30899 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30900 NULL_TREE);
30901 if (ix86_builtins_isa[i].nothrow_p)
30902 TREE_NOTHROW (decl) = 1;
30906 current_target_pragma = saved_current_target_pragma;
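
/* An illustrative sketch (not part of this file): the deferral handled above
   is what lets a translation unit compiled without -mavx2 still use AVX2
   intrinsics inside a function that opts in via the target attribute; the
   corresponding builtins are added lazily once the ISA bits become
   available.  The function name below is arbitrary.

     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256i add_epi32_avx2 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);   // expands to an AVX2 builtin
     }
*/
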
30909 /* Bits for builtin_description.flag. */
30911 /* Set when we don't support the comparison natively, and should
30912 swap the comparison operands in order to support it. */
30913 #define BUILTIN_DESC_SWAP_OPERANDS 1
30915 struct builtin_description
30917 const HOST_WIDE_INT mask;
30918 const enum insn_code icode;
30919 const char *const name;
30920 const enum ix86_builtins code;
30921 const enum rtx_code comparison;
30922 const int flag;
30925 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30926 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30927 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30928 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30929 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30930 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30931 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30932 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30933 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30934 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30935 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30936 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30937 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30938 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30939 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30940 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30941 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30942 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30943 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30944 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30945 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30946 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30947 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30948 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30949 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30950 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30951 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30952 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30953 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30954 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30955 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30956 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30957 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30958 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30959 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30960 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30961 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30962 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30963 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30964 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30965 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30966 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30967 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30968 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30969 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30970 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30971 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30972 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30973 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30974 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30975 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30976 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30978 #define BDESC(mask, icode, name, code, comparison, flag) \
30979 { mask, icode, name, code, comparison, flag },
30980 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30981 static const struct builtin_description bdesc_##kind[] = \
30983 BDESC (mask, icode, name, code, comparison, flag)
30984 #define BDESC_END(kind, next_kind) \
30987 #include "i386-builtin.def"
30989 #undef BDESC
30990 #undef BDESC_FIRST
30991 #undef BDESC_END
30993 /* TM vector builtins. */
30995 /* Reuse the existing x86-specific `struct builtin_description' because
30996 we're lazy. Casts are added to make the entries fit. */
30997 static const struct builtin_description bdesc_tm[] =
30999 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31000 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31001 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31002 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31003 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31004 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31005 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31007 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31008 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31009 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31010 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31011 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31012 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31013 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31015 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31016 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31017 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31018 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31019 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31020 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31021 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31023 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31024 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31025 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31028 /* Initialize the transactional memory vector load/store builtins. */
31030 static void
31031 ix86_init_tm_builtins (void)
31033 enum ix86_builtin_func_type ftype;
31034 const struct builtin_description *d;
31035 size_t i;
31036 tree decl;
31037 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31038 tree attrs_log, attrs_type_log;
31040 if (!flag_tm)
31041 return;
31043 /* If there are no builtins defined, we must be compiling in a
31044 language without trans-mem support. */
31045 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31046 return;
31048 /* Use whatever attributes a normal TM load has. */
31049 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31050 attrs_load = DECL_ATTRIBUTES (decl);
31051 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31052 /* Use whatever attributes a normal TM store has. */
31053 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31054 attrs_store = DECL_ATTRIBUTES (decl);
31055 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31056 /* Use whatever attributes a normal TM log has. */
31057 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31058 attrs_log = DECL_ATTRIBUTES (decl);
31059 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31061 for (i = 0, d = bdesc_tm;
31062 i < ARRAY_SIZE (bdesc_tm);
31063 i++, d++)
31065 if ((d->mask & ix86_isa_flags) != 0
31066 || (lang_hooks.builtin_function
31067 == lang_hooks.builtin_function_ext_scope))
31069 tree type, attrs, attrs_type;
31070 enum built_in_function code = (enum built_in_function) d->code;
31072 ftype = (enum ix86_builtin_func_type) d->flag;
31073 type = ix86_get_builtin_func_type (ftype);
31075 if (BUILTIN_TM_LOAD_P (code))
31077 attrs = attrs_load;
31078 attrs_type = attrs_type_load;
31080 else if (BUILTIN_TM_STORE_P (code))
31082 attrs = attrs_store;
31083 attrs_type = attrs_type_store;
31085 else
31087 attrs = attrs_log;
31088 attrs_type = attrs_type_log;
31090 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31091 /* The builtin without the prefix for
31092 calling it directly. */
31093 d->name + strlen ("__builtin_"),
31094 attrs);
31095 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31096 set the TYPE_ATTRIBUTES. */
31097 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31099 set_builtin_decl (code, decl, false);
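
/* Illustrative sketch only: with -fgnu-tm, a vector-sized access inside a
   transaction may be instrumented through the _ITM_* entry points declared
   above (the exact instrumentation chosen is an assumption here, not a
   guarantee; names follow libitm).

     #include <xmmintrin.h>

     void tm_store (float *dst, __m128 v)
     {
       __transaction_atomic { *(__m128 *) dst = v; }   // may call _ITM_WM128
     }
*/
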
31104 /* Macros for verification of enum ix86_builtins order. */
31105 #define BDESC_VERIFY(x, y, z) \
31106 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31107 #define BDESC_VERIFYS(x, y, z) \
31108 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31110 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31111 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31112 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31113 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31114 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31115 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31116 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31117 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31118 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31119 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31120 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31121 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31122 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31123 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31124 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31125 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31126 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31127 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31128 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31129 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31130 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31131 IX86_BUILTIN__BDESC_CET_LAST, 1);
31132 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31133 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31135 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
31136 not in the current target ISA, to allow the user to compile particular
31137 modules with target-specific options that differ from the command-line
31138 options. */
31139 static void
31140 ix86_init_mmx_sse_builtins (void)
31142 const struct builtin_description * d;
31143 enum ix86_builtin_func_type ftype;
31144 size_t i;
31146 /* Add all special builtins with variable number of operands. */
31147 for (i = 0, d = bdesc_special_args;
31148 i < ARRAY_SIZE (bdesc_special_args);
31149 i++, d++)
31151 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31152 if (d->name == 0)
31153 continue;
31155 ftype = (enum ix86_builtin_func_type) d->flag;
31156 def_builtin (d->mask, d->name, ftype, d->code);
31158 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31159 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31160 ARRAY_SIZE (bdesc_special_args) - 1);
31162 /* Add all builtins with variable number of operands. */
31163 for (i = 0, d = bdesc_args;
31164 i < ARRAY_SIZE (bdesc_args);
31165 i++, d++)
31167 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31168 if (d->name == 0)
31169 continue;
31171 ftype = (enum ix86_builtin_func_type) d->flag;
31172 def_builtin_const (d->mask, d->name, ftype, d->code);
31174 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31175 IX86_BUILTIN__BDESC_ARGS_FIRST,
31176 ARRAY_SIZE (bdesc_args) - 1);
31178 /* Add all builtins with variable number of operands. */
31179 for (i = 0, d = bdesc_args2;
31180 i < ARRAY_SIZE (bdesc_args2);
31181 i++, d++)
31183 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31184 if (d->name == 0)
31185 continue;
31187 ftype = (enum ix86_builtin_func_type) d->flag;
31188 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31190 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31191 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31192 ARRAY_SIZE (bdesc_args2) - 1);
31194 /* Add all builtins with rounding. */
31195 for (i = 0, d = bdesc_round_args;
31196 i < ARRAY_SIZE (bdesc_round_args);
31197 i++, d++)
31199 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31200 if (d->name == 0)
31201 continue;
31203 ftype = (enum ix86_builtin_func_type) d->flag;
31204 def_builtin_const (d->mask, d->name, ftype, d->code);
31206 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31207 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31208 ARRAY_SIZE (bdesc_round_args) - 1);
31210 /* pcmpestr[im] insns. */
31211 for (i = 0, d = bdesc_pcmpestr;
31212 i < ARRAY_SIZE (bdesc_pcmpestr);
31213 i++, d++)
31215 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31216 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31217 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31218 else
31219 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31220 def_builtin_const (d->mask, d->name, ftype, d->code);
31222 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31223 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31224 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31226 /* pcmpistr[im] insns. */
31227 for (i = 0, d = bdesc_pcmpistr;
31228 i < ARRAY_SIZE (bdesc_pcmpistr);
31229 i++, d++)
31231 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31232 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31233 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31234 else
31235 ftype = INT_FTYPE_V16QI_V16QI_INT;
31236 def_builtin_const (d->mask, d->name, ftype, d->code);
31238 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31239 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31240 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31242 /* comi/ucomi insns. */
31243 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31245 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31246 if (d->mask == OPTION_MASK_ISA_SSE2)
31247 ftype = INT_FTYPE_V2DF_V2DF;
31248 else
31249 ftype = INT_FTYPE_V4SF_V4SF;
31250 def_builtin_const (d->mask, d->name, ftype, d->code);
31252 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31253 IX86_BUILTIN__BDESC_COMI_FIRST,
31254 ARRAY_SIZE (bdesc_comi) - 1);
31256 /* SSE */
31257 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31258 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31259 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31260 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
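
/* Usage sketch (illustrative): the two builtins above give read/write access
   to the SSE control/status register MXCSR.

     unsigned int csr = __builtin_ia32_stmxcsr ();
     __builtin_ia32_ldmxcsr (csr | 0x8040);   // set FTZ (bit 15) and DAZ (bit 6)
*/
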
31262 /* SSE or 3DNow!A */
31263 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31264 /* As it uses V4HImode, we have to require -mmmx too. */
31265 | OPTION_MASK_ISA_MMX,
31266 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31267 IX86_BUILTIN_MASKMOVQ);
31269 /* SSE2 */
31270 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31271 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31273 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31274 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31275 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31276 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31278 /* SSE3. */
31279 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31280 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31281 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31282 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31284 /* AES */
31285 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31286 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31287 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31288 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31289 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31290 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31291 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31292 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31293 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31294 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31295 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31296 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31298 /* PCLMUL */
31299 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31300 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31302 /* RDRND */
31303 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31304 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31305 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31306 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31307 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31308 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31309 IX86_BUILTIN_RDRAND64_STEP);
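
/* Usage sketch (illustrative, compile with -mrdrnd): the *_step builtins
   return nonzero only when the hardware actually delivered a random value,
   so callers are expected to retry on failure.

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;   // retry until RDRAND succeeds
*/
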
31311 /* AVX2 */
31312 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31313 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31314 IX86_BUILTIN_GATHERSIV2DF);
31316 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31317 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31318 IX86_BUILTIN_GATHERSIV4DF);
31320 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31321 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31322 IX86_BUILTIN_GATHERDIV2DF);
31324 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31325 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31326 IX86_BUILTIN_GATHERDIV4DF);
31328 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31329 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31330 IX86_BUILTIN_GATHERSIV4SF);
31332 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31333 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31334 IX86_BUILTIN_GATHERSIV8SF);
31336 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31337 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31338 IX86_BUILTIN_GATHERDIV4SF);
31340 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31341 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31342 IX86_BUILTIN_GATHERDIV8SF);
31344 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31345 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31346 IX86_BUILTIN_GATHERSIV2DI);
31348 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31349 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31350 IX86_BUILTIN_GATHERSIV4DI);
31352 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31353 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31354 IX86_BUILTIN_GATHERDIV2DI);
31356 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31357 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31358 IX86_BUILTIN_GATHERDIV4DI);
31360 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31361 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31362 IX86_BUILTIN_GATHERSIV4SI);
31364 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31365 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31366 IX86_BUILTIN_GATHERSIV8SI);
31368 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31369 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31370 IX86_BUILTIN_GATHERDIV4SI);
31372 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31373 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31374 IX86_BUILTIN_GATHERDIV8SI);
31376 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31377 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31378 IX86_BUILTIN_GATHERALTSIV4DF);
31380 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31381 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31382 IX86_BUILTIN_GATHERALTDIV8SF);
31384 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31385 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31386 IX86_BUILTIN_GATHERALTSIV4DI);
31388 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31389 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31390 IX86_BUILTIN_GATHERALTDIV8SI);
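
/* Illustrative sketch: the gather builtins above back the AVX2 gather
   intrinsics from <immintrin.h>; e.g. loading eight floats through a vector
   of 32-bit indices (base and idx are placeholder operands):

     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256 gather8 (const float *base, __m256i idx)
     {
       return _mm256_i32gather_ps (base, idx, 4);   // scale of 4 bytes per index
     }
*/
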
31392 /* AVX512F */
31393 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31394 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31395 IX86_BUILTIN_GATHER3SIV16SF);
31397 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31398 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31399 IX86_BUILTIN_GATHER3SIV8DF);
31401 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31402 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31403 IX86_BUILTIN_GATHER3DIV16SF);
31405 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31406 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31407 IX86_BUILTIN_GATHER3DIV8DF);
31409 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31410 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31411 IX86_BUILTIN_GATHER3SIV16SI);
31413 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31414 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31415 IX86_BUILTIN_GATHER3SIV8DI);
31417 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31418 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31419 IX86_BUILTIN_GATHER3DIV16SI);
31421 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31422 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31423 IX86_BUILTIN_GATHER3DIV8DI);
31425 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31426 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31427 IX86_BUILTIN_GATHER3ALTSIV8DF);
31429 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31430 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31431 IX86_BUILTIN_GATHER3ALTDIV16SF);
31433 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31434 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31435 IX86_BUILTIN_GATHER3ALTSIV8DI);
31437 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31438 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31439 IX86_BUILTIN_GATHER3ALTDIV16SI);
31441 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31442 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31443 IX86_BUILTIN_SCATTERSIV16SF);
31445 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31446 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31447 IX86_BUILTIN_SCATTERSIV8DF);
31449 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31450 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31451 IX86_BUILTIN_SCATTERDIV16SF);
31453 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31454 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31455 IX86_BUILTIN_SCATTERDIV8DF);
31457 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31458 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31459 IX86_BUILTIN_SCATTERSIV16SI);
31461 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31462 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31463 IX86_BUILTIN_SCATTERSIV8DI);
31465 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31466 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31467 IX86_BUILTIN_SCATTERDIV16SI);
31469 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31470 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31471 IX86_BUILTIN_SCATTERDIV8DI);
31473 /* AVX512VL */
31474 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31475 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31476 IX86_BUILTIN_GATHER3SIV2DF);
31478 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31479 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31480 IX86_BUILTIN_GATHER3SIV4DF);
31482 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31483 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31484 IX86_BUILTIN_GATHER3DIV2DF);
31486 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31487 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31488 IX86_BUILTIN_GATHER3DIV4DF);
31490 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31491 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31492 IX86_BUILTIN_GATHER3SIV4SF);
31494 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31495 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31496 IX86_BUILTIN_GATHER3SIV8SF);
31498 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31499 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31500 IX86_BUILTIN_GATHER3DIV4SF);
31502 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31503 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31504 IX86_BUILTIN_GATHER3DIV8SF);
31506 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31507 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31508 IX86_BUILTIN_GATHER3SIV2DI);
31510 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31511 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31512 IX86_BUILTIN_GATHER3SIV4DI);
31514 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31515 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31516 IX86_BUILTIN_GATHER3DIV2DI);
31518 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31519 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31520 IX86_BUILTIN_GATHER3DIV4DI);
31522 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31523 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31524 IX86_BUILTIN_GATHER3SIV4SI);
31526 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31527 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31528 IX86_BUILTIN_GATHER3SIV8SI);
31530 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31531 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31532 IX86_BUILTIN_GATHER3DIV4SI);
31534 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31535 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31536 IX86_BUILTIN_GATHER3DIV8SI);
31538 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31539 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31540 IX86_BUILTIN_GATHER3ALTSIV4DF);
31542 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31543 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31544 IX86_BUILTIN_GATHER3ALTDIV8SF);
31546 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31547 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31548 IX86_BUILTIN_GATHER3ALTSIV4DI);
31550 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31551 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31552 IX86_BUILTIN_GATHER3ALTDIV8SI);
31554 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31555 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31556 IX86_BUILTIN_SCATTERSIV8SF);
31558 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31559 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31560 IX86_BUILTIN_SCATTERSIV4SF);
31562 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31563 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31564 IX86_BUILTIN_SCATTERSIV4DF);
31566 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31567 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31568 IX86_BUILTIN_SCATTERSIV2DF);
31570 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31571 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31572 IX86_BUILTIN_SCATTERDIV8SF);
31574 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31575 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31576 IX86_BUILTIN_SCATTERDIV4SF);
31578 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31579 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31580 IX86_BUILTIN_SCATTERDIV4DF);
31582 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31583 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31584 IX86_BUILTIN_SCATTERDIV2DF);
31586 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31587 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31588 IX86_BUILTIN_SCATTERSIV8SI);
31590 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31591 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31592 IX86_BUILTIN_SCATTERSIV4SI);
31594 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31595 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31596 IX86_BUILTIN_SCATTERSIV4DI);
31598 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31599 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31600 IX86_BUILTIN_SCATTERSIV2DI);
31602 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31603 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31604 IX86_BUILTIN_SCATTERDIV8SI);
31606 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31607 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31608 IX86_BUILTIN_SCATTERDIV4SI);
31610 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31611 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31612 IX86_BUILTIN_SCATTERDIV4DI);
31614 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31615 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31616 IX86_BUILTIN_SCATTERDIV2DI);
31617 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31618 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31619 IX86_BUILTIN_SCATTERALTSIV8DF);
31621 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31622 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31623 IX86_BUILTIN_SCATTERALTDIV16SF);
31625 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31626 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31627 IX86_BUILTIN_SCATTERALTSIV8DI);
31629 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31630 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31631 IX86_BUILTIN_SCATTERALTDIV16SI);
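
/* Illustrative sketch: the scatter builtins above back the AVX-512F scatter
   intrinsics; e.g. storing sixteen floats through 32-bit indices (base and
   idx are placeholder operands):

     #include <immintrin.h>

     __attribute__ ((target ("avx512f")))
     void scatter16 (float *base, __m512i idx, __m512 v)
     {
       _mm512_i32scatter_ps (base, idx, v, 4);   // scale of 4 bytes per index
     }
*/
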
31633 /* AVX512PF */
31634 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31635 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31636 IX86_BUILTIN_GATHERPFDPD);
31637 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31638 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31639 IX86_BUILTIN_GATHERPFDPS);
31640 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31641 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31642 IX86_BUILTIN_GATHERPFQPD);
31643 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31644 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31645 IX86_BUILTIN_GATHERPFQPS);
31646 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31647 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31648 IX86_BUILTIN_SCATTERPFDPD);
31649 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31650 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31651 IX86_BUILTIN_SCATTERPFDPS);
31652 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31653 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31654 IX86_BUILTIN_SCATTERPFQPD);
31655 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31656 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31657 IX86_BUILTIN_SCATTERPFQPS);
31659 /* SHA */
31660 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31661 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31662 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31663 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31664 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31665 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31666 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31667 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31668 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31669 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31670 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31671 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31672 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31673 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31675 /* RTM. */
31676 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31677 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31679 /* MMX access to the vec_init patterns. */
31680 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31681 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31683 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31684 V4HI_FTYPE_HI_HI_HI_HI,
31685 IX86_BUILTIN_VEC_INIT_V4HI);
31687 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31688 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31689 IX86_BUILTIN_VEC_INIT_V8QI);
31691 /* Access to the vec_extract patterns. */
31692 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31693 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31694 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31695 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31696 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31697 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31698 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31699 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31700 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31701 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31703 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31704 /* As it uses V4HImode, we have to require -mmmx too. */
31705 | OPTION_MASK_ISA_MMX,
31706 "__builtin_ia32_vec_ext_v4hi",
31707 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31709 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31710 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31712 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31713 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31715 /* Access to the vec_set patterns. */
31716 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31717 "__builtin_ia32_vec_set_v2di",
31718 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31720 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31721 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31723 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31724 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31726 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31727 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31729 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31730 /* As it uses V4HImode, we have to require -mmmx too. */
31731 | OPTION_MASK_ISA_MMX,
31732 "__builtin_ia32_vec_set_v4hi",
31733 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31735 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31736 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31738 /* RDSEED */
31739 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31740 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31741 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31742 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31743 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31744 "__builtin_ia32_rdseed_di_step",
31745 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31747 /* ADCX */
31748 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31749 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31750 def_builtin (OPTION_MASK_ISA_64BIT,
31751 "__builtin_ia32_addcarryx_u64",
31752 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31753 IX86_BUILTIN_ADDCARRYX64);
31755 /* SBB */
31756 def_builtin (0, "__builtin_ia32_sbb_u32",
31757 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31758 def_builtin (OPTION_MASK_ISA_64BIT,
31759 "__builtin_ia32_sbb_u64",
31760 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31761 IX86_BUILTIN_SBB64);
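
/* Usage sketch (illustrative): the add-with-carry builtins chain the carry
   byte through multi-word arithmetic, e.g. adding two 64-bit values held as
   32-bit limbs (a_lo/a_hi/b_lo/b_hi are placeholder operands):

     unsigned int lo, hi;
     unsigned char c = __builtin_ia32_addcarryx_u32 (0, a_lo, b_lo, &lo);
     (void) __builtin_ia32_addcarryx_u32 (c, a_hi, b_hi, &hi);
*/
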
31763 /* Read/write FLAGS. */
31764 def_builtin (0, "__builtin_ia32_readeflags_u32",
31765 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31766 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31767 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31768 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31769 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31770 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31771 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31773 /* CLFLUSHOPT. */
31774 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31775 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31777 /* CLWB. */
31778 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31779 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31781 /* MONITORX and MWAITX. */
31782 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31783 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31784 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31785 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31787 /* CLZERO. */
31788 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31789 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31791 /* Add FMA4 multi-arg instructions. */
31792 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31794 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31795 if (d->name == 0)
31796 continue;
31798 ftype = (enum ix86_builtin_func_type) d->flag;
31799 def_builtin_const (d->mask, d->name, ftype, d->code);
31801 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31802 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31803 ARRAY_SIZE (bdesc_multi_arg) - 1);
31805 /* Add CET intrinsics. */
31806 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
31808 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
31809 if (d->name == 0)
31810 continue;
31812 ftype = (enum ix86_builtin_func_type) d->flag;
31813 def_builtin (d->mask, d->name, ftype, d->code);
31815 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
31816 IX86_BUILTIN__BDESC_CET_FIRST,
31817 ARRAY_SIZE (bdesc_cet) - 1);
31819 for (i = 0, d = bdesc_cet_rdssp;
31820 i < ARRAY_SIZE (bdesc_cet_rdssp);
31821 i++, d++)
31823 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
31824 if (d->name == 0)
31825 continue;
31827 ftype = (enum ix86_builtin_func_type) d->flag;
31828 def_builtin (d->mask, d->name, ftype, d->code);
31830 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
31831 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31832 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
31835 static void
31836 ix86_init_mpx_builtins ()
31838 const struct builtin_description * d;
31839 enum ix86_builtin_func_type ftype;
31840 tree decl;
31841 size_t i;
31843 for (i = 0, d = bdesc_mpx;
31844 i < ARRAY_SIZE (bdesc_mpx);
31845 i++, d++)
31847 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31848 if (d->name == 0)
31849 continue;
31851 ftype = (enum ix86_builtin_func_type) d->flag;
31852 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
31854 /* Without the leaf and nothrow flags, calls to MPX builtins
31855 may get abnormal edges when setjmp is present in the
31856 function. Since there may be a lot of MPX builtin
31857 calls, this creates lots of useless edges and enormous
31858 PHI nodes. To avoid this we mark MPX builtins as leaf
31859 and nothrow. */
31860 if (decl)
31862 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31863 NULL_TREE);
31864 TREE_NOTHROW (decl) = 1;
31866 else
31868 ix86_builtins_isa[(int)d->code].leaf_p = true;
31869 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31872 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31873 IX86_BUILTIN__BDESC_MPX_FIRST,
31874 ARRAY_SIZE (bdesc_mpx) - 1);
31876 for (i = 0, d = bdesc_mpx_const;
31877 i < ARRAY_SIZE (bdesc_mpx_const);
31878 i++, d++)
31880 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31881 if (d->name == 0)
31882 continue;
31884 ftype = (enum ix86_builtin_func_type) d->flag;
31885 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
31887 if (decl)
31889 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31890 NULL_TREE);
31891 TREE_NOTHROW (decl) = 1;
31893 else
31895 ix86_builtins_isa[(int)d->code].leaf_p = true;
31896 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31899 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31900 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31901 ARRAY_SIZE (bdesc_mpx_const) - 1);
31903 #undef BDESC_VERIFY
31904 #undef BDESC_VERIFYS
31906 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31907 to return a pointer to VERSION_DECL if the outcome of the expression
31908 formed by PREDICATE_CHAIN is true. This function will be called during
31909 version dispatch to decide which function version to execute. It returns
31910 the basic block at the end, to which more conditions can be added. */
31912 static basic_block
31913 add_condition_to_bb (tree function_decl, tree version_decl,
31914 tree predicate_chain, basic_block new_bb)
31916 gimple *return_stmt;
31917 tree convert_expr, result_var;
31918 gimple *convert_stmt;
31919 gimple *call_cond_stmt;
31920 gimple *if_else_stmt;
31922 basic_block bb1, bb2, bb3;
31923 edge e12, e23;
31925 tree cond_var, and_expr_var = NULL_TREE;
31926 gimple_seq gseq;
31928 tree predicate_decl, predicate_arg;
31930 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31932 gcc_assert (new_bb != NULL);
31933 gseq = bb_seq (new_bb);
31936 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31937 build_fold_addr_expr (version_decl));
31938 result_var = create_tmp_var (ptr_type_node);
31939 convert_stmt = gimple_build_assign (result_var, convert_expr);
31940 return_stmt = gimple_build_return (result_var);
31942 if (predicate_chain == NULL_TREE)
31944 gimple_seq_add_stmt (&gseq, convert_stmt);
31945 gimple_seq_add_stmt (&gseq, return_stmt);
31946 set_bb_seq (new_bb, gseq);
31947 gimple_set_bb (convert_stmt, new_bb);
31948 gimple_set_bb (return_stmt, new_bb);
31949 pop_cfun ();
31950 return new_bb;
31953 while (predicate_chain != NULL)
31955 cond_var = create_tmp_var (integer_type_node);
31956 predicate_decl = TREE_PURPOSE (predicate_chain);
31957 predicate_arg = TREE_VALUE (predicate_chain);
31958 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31959 gimple_call_set_lhs (call_cond_stmt, cond_var);
31961 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31962 gimple_set_bb (call_cond_stmt, new_bb);
31963 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31965 predicate_chain = TREE_CHAIN (predicate_chain);
31967 if (and_expr_var == NULL)
31968 and_expr_var = cond_var;
31969 else
31971 gimple *assign_stmt;
31972 /* Use MIN_EXPR to check whether any of the integers is zero:
31973 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
31974 assign_stmt = gimple_build_assign (and_expr_var,
31975 build2 (MIN_EXPR, integer_type_node,
31976 cond_var, and_expr_var));
31978 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31979 gimple_set_bb (assign_stmt, new_bb);
31980 gimple_seq_add_stmt (&gseq, assign_stmt);
31984 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31985 integer_zero_node,
31986 NULL_TREE, NULL_TREE);
31987 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31988 gimple_set_bb (if_else_stmt, new_bb);
31989 gimple_seq_add_stmt (&gseq, if_else_stmt);
31991 gimple_seq_add_stmt (&gseq, convert_stmt);
31992 gimple_seq_add_stmt (&gseq, return_stmt);
31993 set_bb_seq (new_bb, gseq);
31995 bb1 = new_bb;
31996 e12 = split_block (bb1, if_else_stmt);
31997 bb2 = e12->dest;
31998 e12->flags &= ~EDGE_FALLTHRU;
31999 e12->flags |= EDGE_TRUE_VALUE;
32001 e23 = split_block (bb2, return_stmt);
32003 gimple_set_bb (convert_stmt, bb2);
32004 gimple_set_bb (return_stmt, bb2);
32006 bb3 = e23->dest;
32007 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32009 remove_edge (e23);
32010 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32012 pop_cfun ();
32014 return bb3;
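
/* Conceptually (sketch only, with placeholder names), each call to
   add_condition_to_bb appends one guarded return to the generated resolver,
   so the final dispatcher body behaves roughly like:

     if (__builtin_cpu_is ("haswell")) return &foo_arch_haswell;
     if (__builtin_cpu_supports ("avx2")) return &foo_avx2;
     ...
     return &foo_default;   // default version is dispatched last
*/
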
32017 /* This parses the attribute arguments to target in DECL and determines
32018 the right builtin to use to match the platform specification.
32019 It returns the priority value for this version decl. If PREDICATE_LIST
32020 is not NULL, it stores the list of cpu features that need to be checked
32021 before dispatching this function. */
32023 static unsigned int
32024 get_builtin_code_for_version (tree decl, tree *predicate_list)
32026 tree attrs;
32027 struct cl_target_option cur_target;
32028 tree target_node;
32029 struct cl_target_option *new_target;
32030 const char *arg_str = NULL;
32031 const char *attrs_str = NULL;
32032 char *tok_str = NULL;
32033 char *token;
32035 /* Priority of i386 features; a greater value means a higher priority. This is
32036 used to decide the order in which function dispatch must happen. For
32037 instance, a version specialized for SSE4.2 should be checked for dispatch
32038 before a version for SSE3, as SSE4.2 implies SSE3. */
32039 enum feature_priority
32041 P_ZERO = 0,
32042 P_MMX,
32043 P_SSE,
32044 P_SSE2,
32045 P_SSE3,
32046 P_SSSE3,
32047 P_PROC_SSSE3,
32048 P_SSE4_A,
32049 P_PROC_SSE4_A,
32050 P_SSE4_1,
32051 P_SSE4_2,
32052 P_PROC_SSE4_2,
32053 P_POPCNT,
32054 P_AES,
32055 P_PCLMUL,
32056 P_AVX,
32057 P_PROC_AVX,
32058 P_BMI,
32059 P_PROC_BMI,
32060 P_FMA4,
32061 P_XOP,
32062 P_PROC_XOP,
32063 P_FMA,
32064 P_PROC_FMA,
32065 P_BMI2,
32066 P_AVX2,
32067 P_PROC_AVX2,
32068 P_AVX512F,
32069 P_PROC_AVX512F
32072 enum feature_priority priority = P_ZERO;
32074 /* These are the target attribute strings for which a dispatcher is
32075 available, from fold_builtin_cpu. */
32077 static struct _feature_list
32079 const char *const name;
32080 const enum feature_priority priority;
32082 const feature_list[] =
32084 {"mmx", P_MMX},
32085 {"sse", P_SSE},
32086 {"sse2", P_SSE2},
32087 {"sse3", P_SSE3},
32088 {"sse4a", P_SSE4_A},
32089 {"ssse3", P_SSSE3},
32090 {"sse4.1", P_SSE4_1},
32091 {"sse4.2", P_SSE4_2},
32092 {"popcnt", P_POPCNT},
32093 {"aes", P_AES},
32094 {"pclmul", P_PCLMUL},
32095 {"avx", P_AVX},
32096 {"bmi", P_BMI},
32097 {"fma4", P_FMA4},
32098 {"xop", P_XOP},
32099 {"fma", P_FMA},
32100 {"bmi2", P_BMI2},
32101 {"avx2", P_AVX2},
32102 {"avx512f", P_AVX512F}
32106 static unsigned int NUM_FEATURES
32107 = sizeof (feature_list) / sizeof (struct _feature_list);
32109 unsigned int i;
32111 tree predicate_chain = NULL_TREE;
32112 tree predicate_decl, predicate_arg;
32114 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32115 gcc_assert (attrs != NULL);
32117 attrs = TREE_VALUE (TREE_VALUE (attrs));
32119 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32120 attrs_str = TREE_STRING_POINTER (attrs);
32122 /* Return priority zero for default function. */
32123 if (strcmp (attrs_str, "default") == 0)
32124 return 0;
32126 /* Handle arch= if specified. For priority, set it to be 1 more than
32127 the best instruction set the processor can handle. For instance, if
32128 there is a version for atom and a version for ssse3 (the highest ISA
32129 priority for atom), the atom version must be checked for dispatch
32130 before the ssse3 version. */
32131 if (strstr (attrs_str, "arch=") != NULL)
32133 cl_target_option_save (&cur_target, &global_options);
32134 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32135 &global_options_set);
32137 gcc_assert (target_node);
32138 new_target = TREE_TARGET_OPTION (target_node);
32139 gcc_assert (new_target);
32141 if (new_target->arch_specified && new_target->arch > 0)
32143 switch (new_target->arch)
32145 case PROCESSOR_CORE2:
32146 arg_str = "core2";
32147 priority = P_PROC_SSSE3;
32148 break;
32149 case PROCESSOR_NEHALEM:
32150 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32152 arg_str = "westmere";
32153 priority = P_AES;
32155 else
32157 /* We translate "arch=corei7" and "arch=nehalem" to
32158 "corei7" so that it will be mapped to M_INTEL_COREI7
32159 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32160 arg_str = "corei7";
32161 priority = P_PROC_SSE4_2;
32163 break;
32164 case PROCESSOR_SANDYBRIDGE:
32165 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32166 arg_str = "ivybridge";
32167 else
32168 arg_str = "sandybridge";
32169 priority = P_PROC_AVX;
32170 break;
32171 case PROCESSOR_HASWELL:
32172 case PROCESSOR_SKYLAKE_AVX512:
32173 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_GFNI)
32174 arg_str = "icelake";
32175 else if (new_target->x_ix86_isa_flags
32176 & OPTION_MASK_ISA_AVX512VBMI)
32177 arg_str = "cannonlake";
32178 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32179 arg_str = "skylake-avx512";
32180 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32181 arg_str = "skylake";
32182 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32183 arg_str = "broadwell";
32184 else
32185 arg_str = "haswell";
32186 priority = P_PROC_AVX2;
32187 break;
32188 case PROCESSOR_BONNELL:
32189 arg_str = "bonnell";
32190 priority = P_PROC_SSSE3;
32191 break;
32192 case PROCESSOR_KNL:
32193 arg_str = "knl";
32194 priority = P_PROC_AVX512F;
32195 break;
32196 case PROCESSOR_KNM:
32197 arg_str = "knm";
32198 priority = P_PROC_AVX512F;
32199 break;
32200 case PROCESSOR_SILVERMONT:
32201 arg_str = "silvermont";
32202 priority = P_PROC_SSE4_2;
32203 break;
32204 case PROCESSOR_AMDFAM10:
32205 arg_str = "amdfam10h";
32206 priority = P_PROC_SSE4_A;
32207 break;
32208 case PROCESSOR_BTVER1:
32209 arg_str = "btver1";
32210 priority = P_PROC_SSE4_A;
32211 break;
32212 case PROCESSOR_BTVER2:
32213 arg_str = "btver2";
32214 priority = P_PROC_BMI;
32215 break;
32216 case PROCESSOR_BDVER1:
32217 arg_str = "bdver1";
32218 priority = P_PROC_XOP;
32219 break;
32220 case PROCESSOR_BDVER2:
32221 arg_str = "bdver2";
32222 priority = P_PROC_FMA;
32223 break;
32224 case PROCESSOR_BDVER3:
32225 arg_str = "bdver3";
32226 priority = P_PROC_FMA;
32227 break;
32228 case PROCESSOR_BDVER4:
32229 arg_str = "bdver4";
32230 priority = P_PROC_AVX2;
32231 break;
32232 case PROCESSOR_ZNVER1:
32233 arg_str = "znver1";
32234 priority = P_PROC_AVX2;
32235 break;
32239 cl_target_option_restore (&global_options, &cur_target);
32241 if (predicate_list && arg_str == NULL)
32243 error_at (DECL_SOURCE_LOCATION (decl),
32244 "No dispatcher found for the versioning attributes");
32245 return 0;
32248 if (predicate_list)
32250 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32251 /* For a C string literal the length includes the trailing NULL. */
32252 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32253 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32254 predicate_chain);
32258 /* Process feature name. */
32259 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32260 strcpy (tok_str, attrs_str);
32261 token = strtok (tok_str, ",");
32262 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32264 while (token != NULL)
32266 /* Do not process "arch=". */
32267 if (strncmp (token, "arch=", 5) == 0)
32269 token = strtok (NULL, ",");
32270 continue;
32272 for (i = 0; i < NUM_FEATURES; ++i)
32274 if (strcmp (token, feature_list[i].name) == 0)
32276 if (predicate_list)
32278 predicate_arg = build_string_literal (
32279 strlen (feature_list[i].name) + 1,
32280 feature_list[i].name);
32281 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32282 predicate_chain);
32284 /* Find the maximum priority feature. */
32285 if (feature_list[i].priority > priority)
32286 priority = feature_list[i].priority;
32288 break;
32291 if (predicate_list && i == NUM_FEATURES)
32293 error_at (DECL_SOURCE_LOCATION (decl),
32294 "No dispatcher found for %s", token);
32295 return 0;
32297 token = strtok (NULL, ",");
32299 free (tok_str);
32301 if (predicate_list && predicate_chain == NULL_TREE)
32303 error_at (DECL_SOURCE_LOCATION (decl),
32304 "No dispatcher found for the versioning attributes : %s",
32305 attrs_str);
32306 return 0;
32308 else if (predicate_list)
32310 predicate_chain = nreverse (predicate_chain);
32311 *predicate_list = predicate_chain;
32314 return priority;
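
/* For reference (illustrative): the attribute strings parsed above come from
   C++ function multiversioning, e.g.

     __attribute__ ((target ("default")))      int f (void) { return 0; }
     __attribute__ ((target ("sse4.2")))       int f (void) { return 1; }
     __attribute__ ((target ("arch=haswell"))) int f (void) { return 2; }

   Here "arch=haswell" resolves through the switch above to the "haswell"
   dispatcher predicate, while "sse4.2" maps to a __builtin_cpu_supports
   check with priority P_SSE4_2. */
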
32317 /* This compares the priority of target features in function DECL1
32318 and DECL2. It returns positive value if DECL1 is higher priority,
32319 negative value if DECL2 is higher priority and 0 if they are the
32320 same. */
32322 static int
32323 ix86_compare_version_priority (tree decl1, tree decl2)
32325 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32326 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32328 return (int)priority1 - (int)priority2;
32331 /* V1 and V2 point to function versions with different priorities
32332 based on the target ISA. This function compares their priorities. */
32334 static int
32335 feature_compare (const void *v1, const void *v2)
32337 typedef struct _function_version_info
32339 tree version_decl;
32340 tree predicate_chain;
32341 unsigned int dispatch_priority;
32342 } function_version_info;
32344 const function_version_info c1 = *(const function_version_info *)v1;
32345 const function_version_info c2 = *(const function_version_info *)v2;
32346 return (c2.dispatch_priority - c1.dispatch_priority);
32349 /* This function generates the dispatch function for
32350 multi-versioned functions. DISPATCH_DECL is the function which will
32351 contain the dispatch logic. FNDECLS are the function choices for
32352 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32353 in DISPATCH_DECL in which the dispatch code is generated. */
32355 static int
32356 dispatch_function_versions (tree dispatch_decl,
32357 void *fndecls_p,
32358 basic_block *empty_bb)
32360 tree default_decl;
32361 gimple *ifunc_cpu_init_stmt;
32362 gimple_seq gseq;
32363 int ix;
32364 tree ele;
32365 vec<tree> *fndecls;
32366 unsigned int num_versions = 0;
32367 unsigned int actual_versions = 0;
32368 unsigned int i;
32370 struct _function_version_info
32372 tree version_decl;
32373 tree predicate_chain;
32374 unsigned int dispatch_priority;
32375 }*function_version_info;
32377 gcc_assert (dispatch_decl != NULL
32378 && fndecls_p != NULL
32379 && empty_bb != NULL);
32381 /* fndecls_p is actually a vector. */
32382 fndecls = static_cast<vec<tree> *> (fndecls_p);
32384 /* At least one more version other than the default. */
32385 num_versions = fndecls->length ();
32386 gcc_assert (num_versions >= 2);
32388 function_version_info = (struct _function_version_info *)
32389 XNEWVEC (struct _function_version_info, (num_versions - 1));
32391 /* The first version in the vector is the default decl. */
32392 default_decl = (*fndecls)[0];
32394 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32396 gseq = bb_seq (*empty_bb);
32397 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32398 constructors, so explicitly call __builtin_cpu_init here. */
32399 ifunc_cpu_init_stmt = gimple_build_call_vec (
32400 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32401 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32402 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32403 set_bb_seq (*empty_bb, gseq);
32405 pop_cfun ();
32408 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32410 tree version_decl = ele;
32411 tree predicate_chain = NULL_TREE;
32412 unsigned int priority;
32413 /* Get attribute string, parse it and find the right predicate decl.
32414 The predicate function could be a lengthy combination of many
32415 features, like arch-type and various isa-variants. */
32416 priority = get_builtin_code_for_version (version_decl,
32417 &predicate_chain);
32419 if (predicate_chain == NULL_TREE)
32420 continue;
32422 function_version_info [actual_versions].version_decl = version_decl;
32423 function_version_info [actual_versions].predicate_chain
32424 = predicate_chain;
32425 function_version_info [actual_versions].dispatch_priority = priority;
32426 actual_versions++;
32429 /* Sort the versions according to descending order of dispatch priority. The
32430 priority is based on the ISA. This is not a perfect solution. There
32431 could still be ambiguity. If more than one function version is suitable
32432 to execute, which one should be dispatched? In the future, allow the user
32433 to specify a dispatch priority next to the version. */
32434 qsort (function_version_info, actual_versions,
32435 sizeof (struct _function_version_info), feature_compare);
32437 for (i = 0; i < actual_versions; ++i)
32438 *empty_bb = add_condition_to_bb (dispatch_decl,
32439 function_version_info[i].version_decl,
32440 function_version_info[i].predicate_chain,
32441 *empty_bb);
32443 /* Dispatch the default version at the end. */
32444 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32445 NULL, *empty_bb);
32447 free (function_version_info);
32448 return 0;
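/* Rough shape of the dispatching code built above (function names
   hypothetical):

     __builtin_cpu_init ();
     if (<predicate chain for foo.arch_haswell>)
       return foo.arch_haswell;
     if (<predicate chain for foo.avx>)
       return foo.avx;
     return foo;    (the default version, dispatched last)  */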
32451 /* This function changes the assembler name for functions that are
32452 versions. If DECL is a function version and has a "target"
32453 attribute, it appends the attribute string to its assembler name. */
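/* For example (illustrative), a version of "foo" declared with
   __attribute__ ((target ("avx2"))) gets the assembler name "foo.avx2";
   when several features are given, the suffix is the sorted attribute
   string produced by sorted_attr_string.  */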
32455 static tree
32456 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32458 tree version_attr;
32459 const char *orig_name, *version_string;
32460 char *attr_str, *assembler_name;
32462 if (DECL_DECLARED_INLINE_P (decl)
32463 && lookup_attribute ("gnu_inline",
32464 DECL_ATTRIBUTES (decl)))
32465 error_at (DECL_SOURCE_LOCATION (decl),
32466 "Function versions cannot be marked as gnu_inline,"
32467 " bodies have to be generated");
32469 if (DECL_VIRTUAL_P (decl)
32470 || DECL_VINDEX (decl))
32471 sorry ("Virtual function multiversioning not supported");
32473 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32475 /* The target attribute string cannot be NULL. */
32476 gcc_assert (version_attr != NULL_TREE);
32478 orig_name = IDENTIFIER_POINTER (id);
32479 version_string
32480 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32482 if (strcmp (version_string, "default") == 0)
32483 return id;
32485 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32486 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32488 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32490 /* Allow assembler name to be modified if already set. */
32491 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32492 SET_DECL_RTL (decl, NULL);
32494 tree ret = get_identifier (assembler_name);
32495 XDELETEVEC (attr_str);
32496 XDELETEVEC (assembler_name);
32497 return ret;
32501 static tree
32502 ix86_mangle_decl_assembler_name (tree decl, tree id)
32504 /* For function version, add the target suffix to the assembler name. */
32505 if (TREE_CODE (decl) == FUNCTION_DECL
32506 && DECL_FUNCTION_VERSIONED (decl))
32507 id = ix86_mangle_function_version_assembler_name (decl, id);
32508 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32509 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32510 #endif
32512 return id;
32515 /* Make a dispatcher declaration for the multi-versioned function DECL.
32516 Calls to DECL function will be replaced with calls to the dispatcher
32517 by the front-end. Returns the decl of the dispatcher function. */
32519 static tree
32520 ix86_get_function_versions_dispatcher (void *decl)
32522 tree fn = (tree) decl;
32523 struct cgraph_node *node = NULL;
32524 struct cgraph_node *default_node = NULL;
32525 struct cgraph_function_version_info *node_v = NULL;
32526 struct cgraph_function_version_info *first_v = NULL;
32528 tree dispatch_decl = NULL;
32530 struct cgraph_function_version_info *default_version_info = NULL;
32532 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32534 node = cgraph_node::get (fn);
32535 gcc_assert (node != NULL);
32537 node_v = node->function_version ();
32538 gcc_assert (node_v != NULL);
32540 if (node_v->dispatcher_resolver != NULL)
32541 return node_v->dispatcher_resolver;
32543 /* Find the default version and make it the first node. */
32544 first_v = node_v;
32545 /* Go to the beginning of the chain. */
32546 while (first_v->prev != NULL)
32547 first_v = first_v->prev;
32548 default_version_info = first_v;
32549 while (default_version_info != NULL)
32551 if (is_function_default_version
32552 (default_version_info->this_node->decl))
32553 break;
32554 default_version_info = default_version_info->next;
32557 /* If there is no default node, just return NULL. */
32558 if (default_version_info == NULL)
32559 return NULL;
32561 /* Make default info the first node. */
32562 if (first_v != default_version_info)
32564 default_version_info->prev->next = default_version_info->next;
32565 if (default_version_info->next)
32566 default_version_info->next->prev = default_version_info->prev;
32567 first_v->prev = default_version_info;
32568 default_version_info->next = first_v;
32569 default_version_info->prev = NULL;
32572 default_node = default_version_info->this_node;
32574 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32575 if (targetm.has_ifunc_p ())
32577 struct cgraph_function_version_info *it_v = NULL;
32578 struct cgraph_node *dispatcher_node = NULL;
32579 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32581 /* Right now, the dispatching is done via ifunc. */
32582 dispatch_decl = make_dispatcher_decl (default_node->decl);
32584 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32585 gcc_assert (dispatcher_node != NULL);
32586 dispatcher_node->dispatcher_function = 1;
32587 dispatcher_version_info
32588 = dispatcher_node->insert_new_function_version ();
32589 dispatcher_version_info->next = default_version_info;
32590 dispatcher_node->definition = 1;
32592 /* Set the dispatcher for all the versions. */
32593 it_v = default_version_info;
32594 while (it_v != NULL)
32596 it_v->dispatcher_resolver = dispatch_decl;
32597 it_v = it_v->next;
32600 else
32601 #endif
32603 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32604 "multiversioning needs ifunc which is not supported "
32605 "on this target");
32608 return dispatch_decl;
32611 /* Make the resolver function decl to dispatch the versions of
32612 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32613 ifunc alias that will point to the created resolver. Create an
32614 empty basic block in the resolver and store the pointer in
32615 EMPTY_BB. Return the decl of the resolver function. */
32617 static tree
32618 make_resolver_func (const tree default_decl,
32619 const tree ifunc_alias_decl,
32620 basic_block *empty_bb)
32622 char *resolver_name;
32623 tree decl, type, decl_name, t;
32625 /* IFUNCs have to be globally visible. So, if the default_decl is
32626 not, then the name of the IFUNC should be made unique. */
32627 if (TREE_PUBLIC (default_decl) == 0)
32629 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32630 symtab->change_decl_assembler_name (ifunc_alias_decl,
32631 get_identifier (ifunc_name));
32632 XDELETEVEC (ifunc_name);
32635 resolver_name = make_unique_name (default_decl, "resolver", false);
32637 /* The resolver function should return a (void *). */
32638 type = build_function_type_list (ptr_type_node, NULL_TREE);
32640 decl = build_fn_decl (resolver_name, type);
32641 decl_name = get_identifier (resolver_name);
32642 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32644 DECL_NAME (decl) = decl_name;
32645 TREE_USED (decl) = 1;
32646 DECL_ARTIFICIAL (decl) = 1;
32647 DECL_IGNORED_P (decl) = 1;
32648 TREE_PUBLIC (decl) = 0;
32649 DECL_UNINLINABLE (decl) = 1;
32651 /* Resolver is not external, body is generated. */
32652 DECL_EXTERNAL (decl) = 0;
32653 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32655 DECL_CONTEXT (decl) = NULL_TREE;
32656 DECL_INITIAL (decl) = make_node (BLOCK);
32657 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32659 if (DECL_COMDAT_GROUP (default_decl)
32660 || TREE_PUBLIC (default_decl))
32662 /* In this case, each translation unit with a call to this
32663 versioned function will put out a resolver. Ensure it
32664 is comdat to keep just one copy. */
32665 DECL_COMDAT (decl) = 1;
32666 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32668 /* Build result decl and add to function_decl. */
32669 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32670 DECL_ARTIFICIAL (t) = 1;
32671 DECL_IGNORED_P (t) = 1;
32672 DECL_RESULT (decl) = t;
32674 gimplify_function_tree (decl);
32675 push_cfun (DECL_STRUCT_FUNCTION (decl));
32676 *empty_bb = init_lowered_empty_function (decl, false,
32677 profile_count::uninitialized ());
32679 cgraph_node::add_new_function (decl, true);
32680 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32682 pop_cfun ();
32684 gcc_assert (ifunc_alias_decl != NULL);
32685 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32686 DECL_ATTRIBUTES (ifunc_alias_decl)
32687 = make_attribute ("ifunc", resolver_name,
32688 DECL_ATTRIBUTES (ifunc_alias_decl));
32690 /* Create the alias for dispatch to resolver here. */
32691 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32692 XDELETEVEC (resolver_name);
32693 return decl;
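/* Illustrative net effect for a public default version "foo": a comdat
   resolver returning (void *) and named roughly "foo.resolver" is
   created, and the ifunc alias decl gets an "ifunc" attribute pointing
   at that resolver, much as if the user had written
   __attribute__ ((ifunc ("foo.resolver"))) by hand.  The resolver body
   is filled in later by dispatch_function_versions.  */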
32696 /* Generate the dispatching code body to dispatch multi-versioned function
32697 DECL. The target hook is called to process the "target" attributes and
32698 provide the code to dispatch the right function at run-time. NODE points
32699 to the dispatcher decl whose body will be created. */
32701 static tree
32702 ix86_generate_version_dispatcher_body (void *node_p)
32704 tree resolver_decl;
32705 basic_block empty_bb;
32706 tree default_ver_decl;
32707 struct cgraph_node *versn;
32708 struct cgraph_node *node;
32710 struct cgraph_function_version_info *node_version_info = NULL;
32711 struct cgraph_function_version_info *versn_info = NULL;
32713 node = (cgraph_node *)node_p;
32715 node_version_info = node->function_version ();
32716 gcc_assert (node->dispatcher_function
32717 && node_version_info != NULL);
32719 if (node_version_info->dispatcher_resolver)
32720 return node_version_info->dispatcher_resolver;
32722 /* The first version in the chain corresponds to the default version. */
32723 default_ver_decl = node_version_info->next->this_node->decl;
32725 /* node is going to be an alias, so remove the finalized bit. */
32726 node->definition = false;
32728 resolver_decl = make_resolver_func (default_ver_decl,
32729 node->decl, &empty_bb);
32731 node_version_info->dispatcher_resolver = resolver_decl;
32733 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32735 auto_vec<tree, 2> fn_ver_vec;
32737 for (versn_info = node_version_info->next; versn_info;
32738 versn_info = versn_info->next)
32740 versn = versn_info->this_node;
32741 /* Check for virtual functions here again, as by this time it should
32742 have been determined if this function needs a vtable index or
32743 not. This happens for methods in derived classes that override
32744 virtual methods in base classes but are not explicitly marked as
32745 virtual. */
32746 if (DECL_VINDEX (versn->decl))
32747 sorry ("Virtual function multiversioning not supported");
32749 fn_ver_vec.safe_push (versn->decl);
32752 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32753 cgraph_edge::rebuild_edges ();
32754 pop_cfun ();
32755 return resolver_decl;
32757 /* This builds the processor_model struct type defined in
32758 libgcc/config/i386/cpuinfo.c */
32760 static tree
32761 build_processor_model_struct (void)
32763 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32764 "__cpu_features"};
32765 tree field = NULL_TREE, field_chain = NULL_TREE;
32766 int i;
32767 tree type = make_node (RECORD_TYPE);
32769 /* The first 3 fields are unsigned int. */
32770 for (i = 0; i < 3; ++i)
32772 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32773 get_identifier (field_name[i]), unsigned_type_node);
32774 if (field_chain != NULL_TREE)
32775 DECL_CHAIN (field) = field_chain;
32776 field_chain = field;
32779 /* The last field is an array of unsigned integers of size one. */
32780 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32781 get_identifier (field_name[3]),
32782 build_array_type (unsigned_type_node,
32783 build_index_type (size_one_node)));
32784 if (field_chain != NULL_TREE)
32785 DECL_CHAIN (field) = field_chain;
32786 field_chain = field;
32788 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32789 return type;
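/* The layout built above is meant to mirror (approximately) the struct
   used by libgcc:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/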
32792 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32794 static tree
32795 make_var_decl (tree type, const char *name)
32797 tree new_decl;
32799 new_decl = build_decl (UNKNOWN_LOCATION,
32800 VAR_DECL,
32801 get_identifier(name),
32802 type);
32804 DECL_EXTERNAL (new_decl) = 1;
32805 TREE_STATIC (new_decl) = 1;
32806 TREE_PUBLIC (new_decl) = 1;
32807 DECL_INITIAL (new_decl) = 0;
32808 DECL_ARTIFICIAL (new_decl) = 0;
32809 DECL_PRESERVE_P (new_decl) = 1;
32811 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
32812 assemble_variable (new_decl, 0, 0, 0);
32814 return new_decl;
32817 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
32818 into a check of the __cpu_model data defined in libgcc/config/i386/cpuinfo.c. */
32820 static tree
32821 fold_builtin_cpu (tree fndecl, tree *args)
32823 unsigned int i;
32824 enum ix86_builtins fn_code = (enum ix86_builtins)
32825 DECL_FUNCTION_CODE (fndecl);
32826 tree param_string_cst = NULL;
32828 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
32829 enum processor_features
32831 F_CMOV = 0,
32832 F_MMX,
32833 F_POPCNT,
32834 F_SSE,
32835 F_SSE2,
32836 F_SSE3,
32837 F_SSSE3,
32838 F_SSE4_1,
32839 F_SSE4_2,
32840 F_AVX,
32841 F_AVX2,
32842 F_SSE4_A,
32843 F_FMA4,
32844 F_XOP,
32845 F_FMA,
32846 F_AVX512F,
32847 F_BMI,
32848 F_BMI2,
32849 F_AES,
32850 F_PCLMUL,
32851 F_AVX512VL,
32852 F_AVX512BW,
32853 F_AVX512DQ,
32854 F_AVX512CD,
32855 F_AVX512ER,
32856 F_AVX512PF,
32857 F_AVX512VBMI,
32858 F_AVX512IFMA,
32859 F_AVX5124VNNIW,
32860 F_AVX5124FMAPS,
32861 F_AVX512VPOPCNTDQ,
32862 F_MAX
32865 /* These are the values for vendor types and cpu types and subtypes
32866 in cpuinfo.c. Cpu type and subtype values have to have the
32867 corresponding start value subtracted from them. */
32868 enum processor_model
32870 M_INTEL = 1,
32871 M_AMD,
32872 M_CPU_TYPE_START,
32873 M_INTEL_BONNELL,
32874 M_INTEL_CORE2,
32875 M_INTEL_COREI7,
32876 M_AMDFAM10H,
32877 M_AMDFAM15H,
32878 M_INTEL_SILVERMONT,
32879 M_INTEL_KNL,
32880 M_AMD_BTVER1,
32881 M_AMD_BTVER2,
32882 M_AMDFAM17H,
32883 M_INTEL_KNM,
32884 M_CPU_SUBTYPE_START,
32885 M_INTEL_COREI7_NEHALEM,
32886 M_INTEL_COREI7_WESTMERE,
32887 M_INTEL_COREI7_SANDYBRIDGE,
32888 M_AMDFAM10H_BARCELONA,
32889 M_AMDFAM10H_SHANGHAI,
32890 M_AMDFAM10H_ISTANBUL,
32891 M_AMDFAM15H_BDVER1,
32892 M_AMDFAM15H_BDVER2,
32893 M_AMDFAM15H_BDVER3,
32894 M_AMDFAM15H_BDVER4,
32895 M_AMDFAM17H_ZNVER1,
32896 M_INTEL_COREI7_IVYBRIDGE,
32897 M_INTEL_COREI7_HASWELL,
32898 M_INTEL_COREI7_BROADWELL,
32899 M_INTEL_COREI7_SKYLAKE,
32900 M_INTEL_COREI7_SKYLAKE_AVX512,
32901 M_INTEL_COREI7_CANNONLAKE,
32902 M_INTEL_COREI7_ICELAKE
32905 static struct _arch_names_table
32907 const char *const name;
32908 const enum processor_model model;
32910 const arch_names_table[] =
32912 {"amd", M_AMD},
32913 {"intel", M_INTEL},
32914 {"atom", M_INTEL_BONNELL},
32915 {"slm", M_INTEL_SILVERMONT},
32916 {"core2", M_INTEL_CORE2},
32917 {"corei7", M_INTEL_COREI7},
32918 {"nehalem", M_INTEL_COREI7_NEHALEM},
32919 {"westmere", M_INTEL_COREI7_WESTMERE},
32920 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32921 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32922 {"haswell", M_INTEL_COREI7_HASWELL},
32923 {"broadwell", M_INTEL_COREI7_BROADWELL},
32924 {"skylake", M_INTEL_COREI7_SKYLAKE},
32925 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32926 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32927 {"icelake", M_INTEL_COREI7_ICELAKE},
32928 {"bonnell", M_INTEL_BONNELL},
32929 {"silvermont", M_INTEL_SILVERMONT},
32930 {"knl", M_INTEL_KNL},
32931 {"knm", M_INTEL_KNM},
32932 {"amdfam10h", M_AMDFAM10H},
32933 {"barcelona", M_AMDFAM10H_BARCELONA},
32934 {"shanghai", M_AMDFAM10H_SHANGHAI},
32935 {"istanbul", M_AMDFAM10H_ISTANBUL},
32936 {"btver1", M_AMD_BTVER1},
32937 {"amdfam15h", M_AMDFAM15H},
32938 {"bdver1", M_AMDFAM15H_BDVER1},
32939 {"bdver2", M_AMDFAM15H_BDVER2},
32940 {"bdver3", M_AMDFAM15H_BDVER3},
32941 {"bdver4", M_AMDFAM15H_BDVER4},
32942 {"btver2", M_AMD_BTVER2},
32943 {"amdfam17h", M_AMDFAM17H},
32944 {"znver1", M_AMDFAM17H_ZNVER1},
32947 static struct _isa_names_table
32949 const char *const name;
32950 const enum processor_features feature;
32952 const isa_names_table[] =
32954 {"cmov", F_CMOV},
32955 {"mmx", F_MMX},
32956 {"popcnt", F_POPCNT},
32957 {"sse", F_SSE},
32958 {"sse2", F_SSE2},
32959 {"sse3", F_SSE3},
32960 {"ssse3", F_SSSE3},
32961 {"sse4a", F_SSE4_A},
32962 {"sse4.1", F_SSE4_1},
32963 {"sse4.2", F_SSE4_2},
32964 {"avx", F_AVX},
32965 {"fma4", F_FMA4},
32966 {"xop", F_XOP},
32967 {"fma", F_FMA},
32968 {"avx2", F_AVX2},
32969 {"avx512f", F_AVX512F},
32970 {"bmi", F_BMI},
32971 {"bmi2", F_BMI2},
32972 {"aes", F_AES},
32973 {"pclmul", F_PCLMUL},
32974 {"avx512vl",F_AVX512VL},
32975 {"avx512bw",F_AVX512BW},
32976 {"avx512dq",F_AVX512DQ},
32977 {"avx512cd",F_AVX512CD},
32978 {"avx512er",F_AVX512ER},
32979 {"avx512pf",F_AVX512PF},
32980 {"avx512vbmi",F_AVX512VBMI},
32981 {"avx512ifma",F_AVX512IFMA},
32982 {"avx5124vnniw",F_AVX5124VNNIW},
32983 {"avx5124fmaps",F_AVX5124FMAPS},
32984 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32987 tree __processor_model_type = build_processor_model_struct ();
32988 tree __cpu_model_var = make_var_decl (__processor_model_type,
32989 "__cpu_model");
32992 varpool_node::add (__cpu_model_var);
32994 gcc_assert ((args != NULL) && (*args != NULL));
32996 param_string_cst = *args;
32997 while (param_string_cst
32998 && TREE_CODE (param_string_cst) != STRING_CST)
33000 /* *args must be an expr that can contain other EXPRs leading to a
33001 STRING_CST. */
33002 if (!EXPR_P (param_string_cst))
33004 error ("Parameter to builtin must be a string constant or literal");
33005 return integer_zero_node;
33007 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33010 gcc_assert (param_string_cst);
33012 if (fn_code == IX86_BUILTIN_CPU_IS)
33014 tree ref;
33015 tree field;
33016 tree final;
33018 unsigned int field_val = 0;
33019 unsigned int NUM_ARCH_NAMES
33020 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33022 for (i = 0; i < NUM_ARCH_NAMES; i++)
33023 if (strcmp (arch_names_table[i].name,
33024 TREE_STRING_POINTER (param_string_cst)) == 0)
33025 break;
33027 if (i == NUM_ARCH_NAMES)
33029 error ("Parameter to builtin not valid: %s",
33030 TREE_STRING_POINTER (param_string_cst));
33031 return integer_zero_node;
33034 field = TYPE_FIELDS (__processor_model_type);
33035 field_val = arch_names_table[i].model;
33037 /* CPU types are stored in the next field. */
33038 if (field_val > M_CPU_TYPE_START
33039 && field_val < M_CPU_SUBTYPE_START)
33041 field = DECL_CHAIN (field);
33042 field_val -= M_CPU_TYPE_START;
33045 /* CPU subtypes are stored in the next field. */
33046 if (field_val > M_CPU_SUBTYPE_START)
33048 field = DECL_CHAIN ( DECL_CHAIN (field));
33049 field_val -= M_CPU_SUBTYPE_START;
33052 /* Get the appropriate field in __cpu_model. */
33053 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33054 field, NULL_TREE);
33056 /* Check the value. */
33057 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33058 build_int_cstu (unsigned_type_node, field_val));
33059 return build1 (CONVERT_EXPR, integer_type_node, final);
33061 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33063 tree ref;
33064 tree array_elt;
33065 tree field;
33066 tree final;
33068 unsigned int field_val = 0;
33069 unsigned int NUM_ISA_NAMES
33070 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33072 for (i = 0; i < NUM_ISA_NAMES; i++)
33073 if (strcmp (isa_names_table[i].name,
33074 TREE_STRING_POINTER (param_string_cst)) == 0)
33075 break;
33077 if (i == NUM_ISA_NAMES)
33079 error ("Parameter to builtin not valid: %s",
33080 TREE_STRING_POINTER (param_string_cst));
33081 return integer_zero_node;
33084 field = TYPE_FIELDS (__processor_model_type);
33085 /* Get the last field, which is __cpu_features. */
33086 while (DECL_CHAIN (field))
33087 field = DECL_CHAIN (field);
33089 /* Get the appropriate field: __cpu_model.__cpu_features */
33090 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33091 field, NULL_TREE);
33093 /* Access the 0th element of __cpu_features array. */
33094 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33095 integer_zero_node, NULL_TREE, NULL_TREE);
33097 field_val = (1 << isa_names_table[i].feature);
33098 /* Return __cpu_model.__cpu_features[0] & field_val */
33099 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33100 build_int_cstu (unsigned_type_node, field_val));
33101 return build1 (CONVERT_EXPR, integer_type_node, final);
33103 gcc_unreachable ();
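/* Examples of the folding above (illustrative, modulo exact tree shape):

     __builtin_cpu_supports ("avx2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

     __builtin_cpu_is ("haswell")
       -> (int) (__cpu_model.__cpu_subtype
                 == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)  */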
33106 static tree
33107 ix86_fold_builtin (tree fndecl, int n_args,
33108 tree *args, bool ignore ATTRIBUTE_UNUSED)
33110 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33112 enum ix86_builtins fn_code = (enum ix86_builtins)
33113 DECL_FUNCTION_CODE (fndecl);
33114 switch (fn_code)
33116 case IX86_BUILTIN_CPU_IS:
33117 case IX86_BUILTIN_CPU_SUPPORTS:
33118 gcc_assert (n_args == 1);
33119 return fold_builtin_cpu (fndecl, args);
33121 case IX86_BUILTIN_NANQ:
33122 case IX86_BUILTIN_NANSQ:
33124 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33125 const char *str = c_getstr (*args);
33126 int quiet = fn_code == IX86_BUILTIN_NANQ;
33127 REAL_VALUE_TYPE real;
33129 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33130 return build_real (type, real);
33131 return NULL_TREE;
33134 case IX86_BUILTIN_INFQ:
33135 case IX86_BUILTIN_HUGE_VALQ:
33137 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33138 REAL_VALUE_TYPE inf;
33139 real_inf (&inf);
33140 return build_real (type, inf);
33143 case IX86_BUILTIN_TZCNT16:
33144 case IX86_BUILTIN_CTZS:
33145 case IX86_BUILTIN_TZCNT32:
33146 case IX86_BUILTIN_TZCNT64:
33147 gcc_assert (n_args == 1);
33148 if (TREE_CODE (args[0]) == INTEGER_CST)
33150 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33151 tree arg = args[0];
33152 if (fn_code == IX86_BUILTIN_TZCNT16
33153 || fn_code == IX86_BUILTIN_CTZS)
33154 arg = fold_convert (short_unsigned_type_node, arg);
33155 if (integer_zerop (arg))
33156 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33157 else
33158 return fold_const_call (CFN_CTZ, type, arg);
33160 break;
33162 case IX86_BUILTIN_LZCNT16:
33163 case IX86_BUILTIN_CLZS:
33164 case IX86_BUILTIN_LZCNT32:
33165 case IX86_BUILTIN_LZCNT64:
33166 gcc_assert (n_args == 1);
33167 if (TREE_CODE (args[0]) == INTEGER_CST)
33169 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33170 tree arg = args[0];
33171 if (fn_code == IX86_BUILTIN_LZCNT16
33172 || fn_code == IX86_BUILTIN_CLZS)
33173 arg = fold_convert (short_unsigned_type_node, arg);
33174 if (integer_zerop (arg))
33175 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33176 else
33177 return fold_const_call (CFN_CLZ, type, arg);
33179 break;
33181 case IX86_BUILTIN_BEXTR32:
33182 case IX86_BUILTIN_BEXTR64:
33183 case IX86_BUILTIN_BEXTRI32:
33184 case IX86_BUILTIN_BEXTRI64:
33185 gcc_assert (n_args == 2);
33186 if (tree_fits_uhwi_p (args[1]))
33188 unsigned HOST_WIDE_INT res = 0;
33189 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33190 unsigned int start = tree_to_uhwi (args[1]);
33191 unsigned int len = (start & 0xff00) >> 8;
33192 start &= 0xff;
33193 if (start >= prec || len == 0)
33194 res = 0;
33195 else if (!tree_fits_uhwi_p (args[0]))
33196 break;
33197 else
33198 res = tree_to_uhwi (args[0]) >> start;
33199 if (len > prec)
33200 len = prec;
33201 if (len < HOST_BITS_PER_WIDE_INT)
33202 res &= (HOST_WIDE_INT_1U << len) - 1;
33203 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33205 break;
33207 case IX86_BUILTIN_BZHI32:
33208 case IX86_BUILTIN_BZHI64:
33209 gcc_assert (n_args == 2);
33210 if (tree_fits_uhwi_p (args[1]))
33212 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33213 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33214 return args[0];
33215 if (!tree_fits_uhwi_p (args[0]))
33216 break;
33217 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33218 res &= ~(HOST_WIDE_INT_M1U << idx);
33219 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33221 break;
33223 case IX86_BUILTIN_PDEP32:
33224 case IX86_BUILTIN_PDEP64:
33225 gcc_assert (n_args == 2);
33226 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33228 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33229 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33230 unsigned HOST_WIDE_INT res = 0;
33231 unsigned HOST_WIDE_INT m, k = 1;
33232 for (m = 1; m; m <<= 1)
33233 if ((mask & m) != 0)
33235 if ((src & k) != 0)
33236 res |= m;
33237 k <<= 1;
33239 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33241 break;
33243 case IX86_BUILTIN_PEXT32:
33244 case IX86_BUILTIN_PEXT64:
33245 gcc_assert (n_args == 2);
33246 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33248 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33249 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33250 unsigned HOST_WIDE_INT res = 0;
33251 unsigned HOST_WIDE_INT m, k = 1;
33252 for (m = 1; m; m <<= 1)
33253 if ((mask & m) != 0)
33255 if ((src & m) != 0)
33256 res |= k;
33257 k <<= 1;
33259 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33261 break;
33263 default:
33264 break;
33268 #ifdef SUBTARGET_FOLD_BUILTIN
33269 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33270 #endif
33272 return NULL_TREE;
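/* Constant-folding examples for the cases above (illustrative; these are
   the BMI2 builtins behind _bzhi_u32, _pdep_u32 and _pext_u32):

     __builtin_ia32_bzhi_si (0xff, 4)    -> 0xf
     __builtin_ia32_pdep_si (0x5, 0xf0)  -> 0x50
     __builtin_ia32_pext_si (0x53, 0xf0) -> 0x5  */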
33275 /* Fold an MD builtin (use ix86_fold_builtin for folding into
33276 constant) in GIMPLE. */
33278 bool
33279 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33281 gimple *stmt = gsi_stmt (*gsi);
33282 tree fndecl = gimple_call_fndecl (stmt);
33283 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33284 int n_args = gimple_call_num_args (stmt);
33285 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33286 tree decl = NULL_TREE;
33287 tree arg0, arg1;
33289 switch (fn_code)
33291 case IX86_BUILTIN_TZCNT32:
33292 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33293 goto fold_tzcnt_lzcnt;
33295 case IX86_BUILTIN_TZCNT64:
33296 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33297 goto fold_tzcnt_lzcnt;
33299 case IX86_BUILTIN_LZCNT32:
33300 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33301 goto fold_tzcnt_lzcnt;
33303 case IX86_BUILTIN_LZCNT64:
33304 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33305 goto fold_tzcnt_lzcnt;
33307 fold_tzcnt_lzcnt:
33308 gcc_assert (n_args == 1);
33309 arg0 = gimple_call_arg (stmt, 0);
33310 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33312 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33313 /* If arg0 is provably non-zero, optimize into generic
33314 __builtin_c[tl]z{,ll} functions, which the middle-end handles
33315 better. */
33316 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33317 return false;
33319 location_t loc = gimple_location (stmt);
33320 gimple *g = gimple_build_call (decl, 1, arg0);
33321 gimple_set_location (g, loc);
33322 tree lhs = make_ssa_name (integer_type_node);
33323 gimple_call_set_lhs (g, lhs);
33324 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33325 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33326 gimple_set_location (g, loc);
33327 gsi_replace (gsi, g, false);
33328 return true;
33330 break;
33332 case IX86_BUILTIN_BZHI32:
33333 case IX86_BUILTIN_BZHI64:
33334 gcc_assert (n_args == 2);
33335 arg1 = gimple_call_arg (stmt, 1);
33336 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33338 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33339 arg0 = gimple_call_arg (stmt, 0);
33340 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33341 break;
33342 location_t loc = gimple_location (stmt);
33343 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33344 gimple_set_location (g, loc);
33345 gsi_replace (gsi, g, false);
33346 return true;
33348 break;
33350 case IX86_BUILTIN_PDEP32:
33351 case IX86_BUILTIN_PDEP64:
33352 case IX86_BUILTIN_PEXT32:
33353 case IX86_BUILTIN_PEXT64:
33354 gcc_assert (n_args == 2);
33355 arg1 = gimple_call_arg (stmt, 1);
33356 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33358 location_t loc = gimple_location (stmt);
33359 arg0 = gimple_call_arg (stmt, 0);
33360 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33361 gimple_set_location (g, loc);
33362 gsi_replace (gsi, g, false);
33363 return true;
33365 break;
33367 default:
33368 break;
33371 return false;
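/* Illustrative GIMPLE transform from the tzcnt/lzcnt case above, assuming
   the operand is known to be non-zero:

     _1 = __builtin_ia32_tzcnt_u32 (x_2);

   becomes roughly

     _3 = __builtin_ctz (x_2);
     _1 = (<type of _1>) _3;  */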
33374 /* Make builtins to detect cpu type and features supported. NAME is
33375 the builtin name, CODE is the builtin code, and FTYPE is the function
33376 type of the builtin. */
33378 static void
33379 make_cpu_type_builtin (const char* name, int code,
33380 enum ix86_builtin_func_type ftype, bool is_const)
33382 tree decl;
33383 tree type;
33385 type = ix86_get_builtin_func_type (ftype);
33386 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33387 NULL, NULL_TREE);
33388 gcc_assert (decl != NULL_TREE);
33389 ix86_builtins[(int) code] = decl;
33390 TREE_READONLY (decl) = is_const;
33393 /* Make builtins to get CPU type and features supported. The created
33394 builtins are:
33396 __builtin_cpu_init (), to detect cpu type and features,
33397 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33398 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>.  */
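/* Illustrative use in user code:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
       ... use the AVX2 code path ...
*/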
33401 static void
33402 ix86_init_platform_type_builtins (void)
33404 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33405 INT_FTYPE_VOID, false);
33406 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33407 INT_FTYPE_PCCHAR, true);
33408 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33409 INT_FTYPE_PCCHAR, true);
33412 /* Internal method for ix86_init_builtins. */
33414 static void
33415 ix86_init_builtins_va_builtins_abi (void)
33417 tree ms_va_ref, sysv_va_ref;
33418 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33419 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33420 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33421 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33423 if (!TARGET_64BIT)
33424 return;
33425 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33426 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33427 ms_va_ref = build_reference_type (ms_va_list_type_node);
33428 sysv_va_ref =
33429 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33431 fnvoid_va_end_ms =
33432 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33433 fnvoid_va_start_ms =
33434 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33435 fnvoid_va_end_sysv =
33436 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33437 fnvoid_va_start_sysv =
33438 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33439 NULL_TREE);
33440 fnvoid_va_copy_ms =
33441 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33442 NULL_TREE);
33443 fnvoid_va_copy_sysv =
33444 build_function_type_list (void_type_node, sysv_va_ref,
33445 sysv_va_ref, NULL_TREE);
33447 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33448 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33449 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33450 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33451 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33452 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33453 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33454 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33455 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33456 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33457 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33458 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33461 static void
33462 ix86_init_builtin_types (void)
33464 tree float80_type_node, const_string_type_node;
33466 /* The __float80 type. */
33467 float80_type_node = long_double_type_node;
33468 if (TYPE_MODE (float80_type_node) != XFmode)
33470 if (float64x_type_node != NULL_TREE
33471 && TYPE_MODE (float64x_type_node) == XFmode)
33472 float80_type_node = float64x_type_node;
33473 else
33475 /* The __float80 type. */
33476 float80_type_node = make_node (REAL_TYPE);
33478 TYPE_PRECISION (float80_type_node) = 80;
33479 layout_type (float80_type_node);
33482 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33484 /* The __float128 type. The node has already been created as
33485 _Float128, so we only need to register the __float128 name for
33486 it. */
33487 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33489 const_string_type_node
33490 = build_pointer_type (build_qualified_type
33491 (char_type_node, TYPE_QUAL_CONST));
33493 /* This macro is built by i386-builtin-types.awk. */
33494 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33497 static void
33498 ix86_init_builtins (void)
33500 tree ftype, decl;
33502 ix86_init_builtin_types ();
33504 /* Builtins to get CPU type and features. */
33505 ix86_init_platform_type_builtins ();
33507 /* TFmode support builtins. */
33508 def_builtin_const (0, "__builtin_infq",
33509 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33510 def_builtin_const (0, "__builtin_huge_valq",
33511 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33513 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33514 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33515 BUILT_IN_MD, "nanq", NULL_TREE);
33516 TREE_READONLY (decl) = 1;
33517 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33519 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33520 BUILT_IN_MD, "nansq", NULL_TREE);
33521 TREE_READONLY (decl) = 1;
33522 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33524 /* We will expand them to normal calls if SSE isn't available, since
33525 they are used by libgcc. */
33526 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33527 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33528 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33529 TREE_READONLY (decl) = 1;
33530 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33532 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33533 decl = add_builtin_function ("__builtin_copysignq", ftype,
33534 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33535 "__copysigntf3", NULL_TREE);
33536 TREE_READONLY (decl) = 1;
33537 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33539 ix86_init_tm_builtins ();
33540 ix86_init_mmx_sse_builtins ();
33541 ix86_init_mpx_builtins ();
33543 if (TARGET_LP64)
33544 ix86_init_builtins_va_builtins_abi ();
33546 #ifdef SUBTARGET_INIT_BUILTINS
33547 SUBTARGET_INIT_BUILTINS;
33548 #endif
33551 /* Return the ix86 builtin for CODE. */
33553 static tree
33554 ix86_builtin_decl (unsigned code, bool)
33556 if (code >= IX86_BUILTIN_MAX)
33557 return error_mark_node;
33559 return ix86_builtins[code];
33562 /* Errors in the source file can cause expand_expr to return const0_rtx
33563 where we expect a vector. To avoid crashing, use one of the vector
33564 clear instructions. */
33565 static rtx
33566 safe_vector_operand (rtx x, machine_mode mode)
33568 if (x == const0_rtx)
33569 x = CONST0_RTX (mode);
33570 return x;
33573 /* Fixup modeless constants to fit required mode. */
33574 static rtx
33575 fixup_modeless_constant (rtx x, machine_mode mode)
33577 if (GET_MODE (x) == VOIDmode)
33578 x = convert_to_mode (mode, x, 1);
33579 return x;
33582 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33584 static rtx
33585 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33587 rtx pat;
33588 tree arg0 = CALL_EXPR_ARG (exp, 0);
33589 tree arg1 = CALL_EXPR_ARG (exp, 1);
33590 rtx op0 = expand_normal (arg0);
33591 rtx op1 = expand_normal (arg1);
33592 machine_mode tmode = insn_data[icode].operand[0].mode;
33593 machine_mode mode0 = insn_data[icode].operand[1].mode;
33594 machine_mode mode1 = insn_data[icode].operand[2].mode;
33596 if (VECTOR_MODE_P (mode0))
33597 op0 = safe_vector_operand (op0, mode0);
33598 if (VECTOR_MODE_P (mode1))
33599 op1 = safe_vector_operand (op1, mode1);
33601 if (optimize || !target
33602 || GET_MODE (target) != tmode
33603 || !insn_data[icode].operand[0].predicate (target, tmode))
33604 target = gen_reg_rtx (tmode);
33606 if (GET_MODE (op1) == SImode && mode1 == TImode)
33608 rtx x = gen_reg_rtx (V4SImode);
33609 emit_insn (gen_sse2_loadd (x, op1));
33610 op1 = gen_lowpart (TImode, x);
33613 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33614 op0 = copy_to_mode_reg (mode0, op0);
33615 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33616 op1 = copy_to_mode_reg (mode1, op1);
33618 pat = GEN_FCN (icode) (target, op0, op1);
33619 if (! pat)
33620 return 0;
33622 emit_insn (pat);
33624 return target;
33627 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33629 static rtx
33630 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33631 enum ix86_builtin_func_type m_type,
33632 enum rtx_code sub_code)
33634 rtx pat;
33635 int i;
33636 int nargs;
33637 bool comparison_p = false;
33638 bool tf_p = false;
33639 bool last_arg_constant = false;
33640 int num_memory = 0;
33641 struct {
33642 rtx op;
33643 machine_mode mode;
33644 } args[4];
33646 machine_mode tmode = insn_data[icode].operand[0].mode;
33648 switch (m_type)
33650 case MULTI_ARG_4_DF2_DI_I:
33651 case MULTI_ARG_4_DF2_DI_I1:
33652 case MULTI_ARG_4_SF2_SI_I:
33653 case MULTI_ARG_4_SF2_SI_I1:
33654 nargs = 4;
33655 last_arg_constant = true;
33656 break;
33658 case MULTI_ARG_3_SF:
33659 case MULTI_ARG_3_DF:
33660 case MULTI_ARG_3_SF2:
33661 case MULTI_ARG_3_DF2:
33662 case MULTI_ARG_3_DI:
33663 case MULTI_ARG_3_SI:
33664 case MULTI_ARG_3_SI_DI:
33665 case MULTI_ARG_3_HI:
33666 case MULTI_ARG_3_HI_SI:
33667 case MULTI_ARG_3_QI:
33668 case MULTI_ARG_3_DI2:
33669 case MULTI_ARG_3_SI2:
33670 case MULTI_ARG_3_HI2:
33671 case MULTI_ARG_3_QI2:
33672 nargs = 3;
33673 break;
33675 case MULTI_ARG_2_SF:
33676 case MULTI_ARG_2_DF:
33677 case MULTI_ARG_2_DI:
33678 case MULTI_ARG_2_SI:
33679 case MULTI_ARG_2_HI:
33680 case MULTI_ARG_2_QI:
33681 nargs = 2;
33682 break;
33684 case MULTI_ARG_2_DI_IMM:
33685 case MULTI_ARG_2_SI_IMM:
33686 case MULTI_ARG_2_HI_IMM:
33687 case MULTI_ARG_2_QI_IMM:
33688 nargs = 2;
33689 last_arg_constant = true;
33690 break;
33692 case MULTI_ARG_1_SF:
33693 case MULTI_ARG_1_DF:
33694 case MULTI_ARG_1_SF2:
33695 case MULTI_ARG_1_DF2:
33696 case MULTI_ARG_1_DI:
33697 case MULTI_ARG_1_SI:
33698 case MULTI_ARG_1_HI:
33699 case MULTI_ARG_1_QI:
33700 case MULTI_ARG_1_SI_DI:
33701 case MULTI_ARG_1_HI_DI:
33702 case MULTI_ARG_1_HI_SI:
33703 case MULTI_ARG_1_QI_DI:
33704 case MULTI_ARG_1_QI_SI:
33705 case MULTI_ARG_1_QI_HI:
33706 nargs = 1;
33707 break;
33709 case MULTI_ARG_2_DI_CMP:
33710 case MULTI_ARG_2_SI_CMP:
33711 case MULTI_ARG_2_HI_CMP:
33712 case MULTI_ARG_2_QI_CMP:
33713 nargs = 2;
33714 comparison_p = true;
33715 break;
33717 case MULTI_ARG_2_SF_TF:
33718 case MULTI_ARG_2_DF_TF:
33719 case MULTI_ARG_2_DI_TF:
33720 case MULTI_ARG_2_SI_TF:
33721 case MULTI_ARG_2_HI_TF:
33722 case MULTI_ARG_2_QI_TF:
33723 nargs = 2;
33724 tf_p = true;
33725 break;
33727 default:
33728 gcc_unreachable ();
33731 if (optimize || !target
33732 || GET_MODE (target) != tmode
33733 || !insn_data[icode].operand[0].predicate (target, tmode))
33734 target = gen_reg_rtx (tmode);
33735 else if (memory_operand (target, tmode))
33736 num_memory++;
33738 gcc_assert (nargs <= 4);
33740 for (i = 0; i < nargs; i++)
33742 tree arg = CALL_EXPR_ARG (exp, i);
33743 rtx op = expand_normal (arg);
33744 int adjust = (comparison_p) ? 1 : 0;
33745 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33747 if (last_arg_constant && i == nargs - 1)
33749 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33751 enum insn_code new_icode = icode;
33752 switch (icode)
33754 case CODE_FOR_xop_vpermil2v2df3:
33755 case CODE_FOR_xop_vpermil2v4sf3:
33756 case CODE_FOR_xop_vpermil2v4df3:
33757 case CODE_FOR_xop_vpermil2v8sf3:
33758 error ("the last argument must be a 2-bit immediate");
33759 return gen_reg_rtx (tmode);
33760 case CODE_FOR_xop_rotlv2di3:
33761 new_icode = CODE_FOR_rotlv2di3;
33762 goto xop_rotl;
33763 case CODE_FOR_xop_rotlv4si3:
33764 new_icode = CODE_FOR_rotlv4si3;
33765 goto xop_rotl;
33766 case CODE_FOR_xop_rotlv8hi3:
33767 new_icode = CODE_FOR_rotlv8hi3;
33768 goto xop_rotl;
33769 case CODE_FOR_xop_rotlv16qi3:
33770 new_icode = CODE_FOR_rotlv16qi3;
33771 xop_rotl:
33772 if (CONST_INT_P (op))
33774 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33775 op = GEN_INT (INTVAL (op) & mask);
33776 gcc_checking_assert
33777 (insn_data[icode].operand[i + 1].predicate (op, mode));
33779 else
33781 gcc_checking_assert
33782 (nargs == 2
33783 && insn_data[new_icode].operand[0].mode == tmode
33784 && insn_data[new_icode].operand[1].mode == tmode
33785 && insn_data[new_icode].operand[2].mode == mode
33786 && insn_data[new_icode].operand[0].predicate
33787 == insn_data[icode].operand[0].predicate
33788 && insn_data[new_icode].operand[1].predicate
33789 == insn_data[icode].operand[1].predicate);
33790 icode = new_icode;
33791 goto non_constant;
33793 break;
33794 default:
33795 gcc_unreachable ();
33799 else
33801 non_constant:
33802 if (VECTOR_MODE_P (mode))
33803 op = safe_vector_operand (op, mode);
33805 /* If we aren't optimizing, only allow one memory operand to be
33806 generated. */
33807 if (memory_operand (op, mode))
33808 num_memory++;
33810 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
33812 if (optimize
33813 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
33814 || num_memory > 1)
33815 op = force_reg (mode, op);
33818 args[i].op = op;
33819 args[i].mode = mode;
33822 switch (nargs)
33824 case 1:
33825 pat = GEN_FCN (icode) (target, args[0].op);
33826 break;
33828 case 2:
33829 if (tf_p)
33830 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
33831 GEN_INT ((int)sub_code));
33832 else if (! comparison_p)
33833 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
33834 else
33836 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
33837 args[0].op,
33838 args[1].op);
33840 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
33842 break;
33844 case 3:
33845 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
33846 break;
33848 case 4:
33849 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
33850 break;
33852 default:
33853 gcc_unreachable ();
33856 if (! pat)
33857 return 0;
33859 emit_insn (pat);
33860 return target;
33863 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
33864 insns with vec_merge. */
33866 static rtx
33867 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
33868 rtx target)
33870 rtx pat;
33871 tree arg0 = CALL_EXPR_ARG (exp, 0);
33872 rtx op1, op0 = expand_normal (arg0);
33873 machine_mode tmode = insn_data[icode].operand[0].mode;
33874 machine_mode mode0 = insn_data[icode].operand[1].mode;
33876 if (optimize || !target
33877 || GET_MODE (target) != tmode
33878 || !insn_data[icode].operand[0].predicate (target, tmode))
33879 target = gen_reg_rtx (tmode);
33881 if (VECTOR_MODE_P (mode0))
33882 op0 = safe_vector_operand (op0, mode0);
33884 if ((optimize && !register_operand (op0, mode0))
33885 || !insn_data[icode].operand[1].predicate (op0, mode0))
33886 op0 = copy_to_mode_reg (mode0, op0);
33888 op1 = op0;
33889 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33890 op1 = copy_to_mode_reg (mode0, op1);
33892 pat = GEN_FCN (icode) (target, op0, op1);
33893 if (! pat)
33894 return 0;
33895 emit_insn (pat);
33896 return target;
33899 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
33901 static rtx
33902 ix86_expand_sse_compare (const struct builtin_description *d,
33903 tree exp, rtx target, bool swap)
33905 rtx pat;
33906 tree arg0 = CALL_EXPR_ARG (exp, 0);
33907 tree arg1 = CALL_EXPR_ARG (exp, 1);
33908 rtx op0 = expand_normal (arg0);
33909 rtx op1 = expand_normal (arg1);
33910 rtx op2;
33911 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33912 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33913 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33914 enum rtx_code comparison = d->comparison;
33916 if (VECTOR_MODE_P (mode0))
33917 op0 = safe_vector_operand (op0, mode0);
33918 if (VECTOR_MODE_P (mode1))
33919 op1 = safe_vector_operand (op1, mode1);
33921 /* Swap operands if we have a comparison that isn't available in
33922 hardware. */
33923 if (swap)
33924 std::swap (op0, op1);
33926 if (optimize || !target
33927 || GET_MODE (target) != tmode
33928 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33929 target = gen_reg_rtx (tmode);
33931 if ((optimize && !register_operand (op0, mode0))
33932 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33933 op0 = copy_to_mode_reg (mode0, op0);
33934 if ((optimize && !register_operand (op1, mode1))
33935 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33936 op1 = copy_to_mode_reg (mode1, op1);
33938 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33939 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33940 if (! pat)
33941 return 0;
33942 emit_insn (pat);
33943 return target;
33946 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33948 static rtx
33949 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33950 rtx target)
33952 rtx pat;
33953 tree arg0 = CALL_EXPR_ARG (exp, 0);
33954 tree arg1 = CALL_EXPR_ARG (exp, 1);
33955 rtx op0 = expand_normal (arg0);
33956 rtx op1 = expand_normal (arg1);
33957 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33958 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33959 enum rtx_code comparison = d->comparison;
33961 if (VECTOR_MODE_P (mode0))
33962 op0 = safe_vector_operand (op0, mode0);
33963 if (VECTOR_MODE_P (mode1))
33964 op1 = safe_vector_operand (op1, mode1);
33966 /* Swap operands if we have a comparison that isn't available in
33967 hardware. */
33968 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33969 std::swap (op0, op1);
33971 target = gen_reg_rtx (SImode);
33972 emit_move_insn (target, const0_rtx);
33973 target = gen_rtx_SUBREG (QImode, target, 0);
33975 if ((optimize && !register_operand (op0, mode0))
33976 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33977 op0 = copy_to_mode_reg (mode0, op0);
33978 if ((optimize && !register_operand (op1, mode1))
33979 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33980 op1 = copy_to_mode_reg (mode1, op1);
33982 pat = GEN_FCN (d->icode) (op0, op1);
33983 if (! pat)
33984 return 0;
33985 emit_insn (pat);
33986 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33987 gen_rtx_fmt_ee (comparison, QImode,
33988 SET_DEST (pat),
33989 const0_rtx)));
33991 return SUBREG_REG (target);
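/* Note: for the comi builtins the pattern emitted above only sets the
   flags register; the final SET then materializes the comparison result
   into the low QImode part of a fresh SImode pseudo, whose SImode parent
   register is what gets returned.  */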
33994 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33996 static rtx
33997 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33998 rtx target)
34000 rtx pat;
34001 tree arg0 = CALL_EXPR_ARG (exp, 0);
34002 rtx op1, op0 = expand_normal (arg0);
34003 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34004 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34006 if (optimize || target == 0
34007 || GET_MODE (target) != tmode
34008 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34009 target = gen_reg_rtx (tmode);
34011 if (VECTOR_MODE_P (mode0))
34012 op0 = safe_vector_operand (op0, mode0);
34014 if ((optimize && !register_operand (op0, mode0))
34015 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34016 op0 = copy_to_mode_reg (mode0, op0);
34018 op1 = GEN_INT (d->comparison);
34020 pat = GEN_FCN (d->icode) (target, op0, op1);
34021 if (! pat)
34022 return 0;
34023 emit_insn (pat);
34024 return target;
34027 static rtx
34028 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34029 tree exp, rtx target)
34031 rtx pat;
34032 tree arg0 = CALL_EXPR_ARG (exp, 0);
34033 tree arg1 = CALL_EXPR_ARG (exp, 1);
34034 rtx op0 = expand_normal (arg0);
34035 rtx op1 = expand_normal (arg1);
34036 rtx op2;
34037 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34038 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34039 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34041 if (optimize || target == 0
34042 || GET_MODE (target) != tmode
34043 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34044 target = gen_reg_rtx (tmode);
34046 op0 = safe_vector_operand (op0, mode0);
34047 op1 = safe_vector_operand (op1, mode1);
34049 if ((optimize && !register_operand (op0, mode0))
34050 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34051 op0 = copy_to_mode_reg (mode0, op0);
34052 if ((optimize && !register_operand (op1, mode1))
34053 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34054 op1 = copy_to_mode_reg (mode1, op1);
34056 op2 = GEN_INT (d->comparison);
34058 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34059 if (! pat)
34060 return 0;
34061 emit_insn (pat);
34062 return target;
34065 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34067 static rtx
34068 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34069 rtx target)
34071 rtx pat;
34072 tree arg0 = CALL_EXPR_ARG (exp, 0);
34073 tree arg1 = CALL_EXPR_ARG (exp, 1);
34074 rtx op0 = expand_normal (arg0);
34075 rtx op1 = expand_normal (arg1);
34076 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34077 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34078 enum rtx_code comparison = d->comparison;
34080 if (VECTOR_MODE_P (mode0))
34081 op0 = safe_vector_operand (op0, mode0);
34082 if (VECTOR_MODE_P (mode1))
34083 op1 = safe_vector_operand (op1, mode1);
34085 target = gen_reg_rtx (SImode);
34086 emit_move_insn (target, const0_rtx);
34087 target = gen_rtx_SUBREG (QImode, target, 0);
34089 if ((optimize && !register_operand (op0, mode0))
34090 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34091 op0 = copy_to_mode_reg (mode0, op0);
34092 if ((optimize && !register_operand (op1, mode1))
34093 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34094 op1 = copy_to_mode_reg (mode1, op1);
34096 pat = GEN_FCN (d->icode) (op0, op1);
34097 if (! pat)
34098 return 0;
34099 emit_insn (pat);
34100 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34101 gen_rtx_fmt_ee (comparison, QImode,
34102 SET_DEST (pat),
34103 const0_rtx)));
34105 return SUBREG_REG (target);
34108 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34110 static rtx
34111 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34112 tree exp, rtx target)
34114 rtx pat;
34115 tree arg0 = CALL_EXPR_ARG (exp, 0);
34116 tree arg1 = CALL_EXPR_ARG (exp, 1);
34117 tree arg2 = CALL_EXPR_ARG (exp, 2);
34118 tree arg3 = CALL_EXPR_ARG (exp, 3);
34119 tree arg4 = CALL_EXPR_ARG (exp, 4);
34120 rtx scratch0, scratch1;
34121 rtx op0 = expand_normal (arg0);
34122 rtx op1 = expand_normal (arg1);
34123 rtx op2 = expand_normal (arg2);
34124 rtx op3 = expand_normal (arg3);
34125 rtx op4 = expand_normal (arg4);
34126 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34128 tmode0 = insn_data[d->icode].operand[0].mode;
34129 tmode1 = insn_data[d->icode].operand[1].mode;
34130 modev2 = insn_data[d->icode].operand[2].mode;
34131 modei3 = insn_data[d->icode].operand[3].mode;
34132 modev4 = insn_data[d->icode].operand[4].mode;
34133 modei5 = insn_data[d->icode].operand[5].mode;
34134 modeimm = insn_data[d->icode].operand[6].mode;
34136 if (VECTOR_MODE_P (modev2))
34137 op0 = safe_vector_operand (op0, modev2);
34138 if (VECTOR_MODE_P (modev4))
34139 op2 = safe_vector_operand (op2, modev4);
34141 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34142 op0 = copy_to_mode_reg (modev2, op0);
34143 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34144 op1 = copy_to_mode_reg (modei3, op1);
34145 if ((optimize && !register_operand (op2, modev4))
34146 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34147 op2 = copy_to_mode_reg (modev4, op2);
34148 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34149 op3 = copy_to_mode_reg (modei5, op3);
34151 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34153 error ("the fifth argument must be an 8-bit immediate");
34154 return const0_rtx;
34157 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34159 if (optimize || !target
34160 || GET_MODE (target) != tmode0
34161 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34162 target = gen_reg_rtx (tmode0);
34164 scratch1 = gen_reg_rtx (tmode1);
34166 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34168 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34170 if (optimize || !target
34171 || GET_MODE (target) != tmode1
34172 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34173 target = gen_reg_rtx (tmode1);
34175 scratch0 = gen_reg_rtx (tmode0);
34177 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34179 else
34181 gcc_assert (d->flag);
34183 scratch0 = gen_reg_rtx (tmode0);
34184 scratch1 = gen_reg_rtx (tmode1);
34186 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34189 if (! pat)
34190 return 0;
34192 emit_insn (pat);
34194 if (d->flag)
34196 target = gen_reg_rtx (SImode);
34197 emit_move_insn (target, const0_rtx);
34198 target = gen_rtx_SUBREG (QImode, target, 0);
34200 emit_insn
34201 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34202 gen_rtx_fmt_ee (EQ, QImode,
34203 gen_rtx_REG ((machine_mode) d->flag,
34204 FLAGS_REG),
34205 const0_rtx)));
34206 return SUBREG_REG (target);
34208 else
34209 return target;
34213 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34215 static rtx
34216 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34217 tree exp, rtx target)
34219 rtx pat;
34220 tree arg0 = CALL_EXPR_ARG (exp, 0);
34221 tree arg1 = CALL_EXPR_ARG (exp, 1);
34222 tree arg2 = CALL_EXPR_ARG (exp, 2);
34223 rtx scratch0, scratch1;
34224 rtx op0 = expand_normal (arg0);
34225 rtx op1 = expand_normal (arg1);
34226 rtx op2 = expand_normal (arg2);
34227 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34229 tmode0 = insn_data[d->icode].operand[0].mode;
34230 tmode1 = insn_data[d->icode].operand[1].mode;
34231 modev2 = insn_data[d->icode].operand[2].mode;
34232 modev3 = insn_data[d->icode].operand[3].mode;
34233 modeimm = insn_data[d->icode].operand[4].mode;
34235 if (VECTOR_MODE_P (modev2))
34236 op0 = safe_vector_operand (op0, modev2);
34237 if (VECTOR_MODE_P (modev3))
34238 op1 = safe_vector_operand (op1, modev3);
34240 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34241 op0 = copy_to_mode_reg (modev2, op0);
34242 if ((optimize && !register_operand (op1, modev3))
34243 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34244 op1 = copy_to_mode_reg (modev3, op1);
34246 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34248 error ("the third argument must be an 8-bit immediate");
34249 return const0_rtx;
34252 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34254 if (optimize || !target
34255 || GET_MODE (target) != tmode0
34256 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34257 target = gen_reg_rtx (tmode0);
34259 scratch1 = gen_reg_rtx (tmode1);
34261 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34263 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34265 if (optimize || !target
34266 || GET_MODE (target) != tmode1
34267 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34268 target = gen_reg_rtx (tmode1);
34270 scratch0 = gen_reg_rtx (tmode0);
34272 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34274 else
34276 gcc_assert (d->flag);
34278 scratch0 = gen_reg_rtx (tmode0);
34279 scratch1 = gen_reg_rtx (tmode1);
34281 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34284 if (! pat)
34285 return 0;
34287 emit_insn (pat);
34289 if (d->flag)
34291 target = gen_reg_rtx (SImode);
34292 emit_move_insn (target, const0_rtx);
34293 target = gen_rtx_SUBREG (QImode, target, 0);
34295 emit_insn
34296 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34297 gen_rtx_fmt_ee (EQ, QImode,
34298 gen_rtx_REG ((machine_mode) d->flag,
34299 FLAGS_REG),
34300 const0_rtx)));
34301 return SUBREG_REG (target);
34303 else
34304 return target;
34307 /* Subroutine of ix86_expand_builtin to take care of insns with
34308 variable number of operands. */
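/* The switch below classifies each function type by:
   nargs - number of call arguments to expand;
   nargs_constant - how many of the trailing arguments must be
   immediates;
   mask_pos - number of trailing operands (typically the merge
   source and the mask) that come after the immediate argument;
   rmode - mode in which the result is handed back when the insn
   computes it in a different mode tmode (a lowpart subreg of the
   tmode pseudo is returned);
   second_arg_count - the second argument is a shift count and may
   need to be converted to the insn's count mode;
   swap - the comparison operands must be swapped.  */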
34310 static rtx
34311 ix86_expand_args_builtin (const struct builtin_description *d,
34312 tree exp, rtx target)
34314 rtx pat, real_target;
34315 unsigned int i, nargs;
34316 unsigned int nargs_constant = 0;
34317 unsigned int mask_pos = 0;
34318 int num_memory = 0;
34319 struct
34321 rtx op;
34322 machine_mode mode;
34323 } args[6];
34324 bool second_arg_count = false;
34325 enum insn_code icode = d->icode;
34326 const struct insn_data_d *insn_p = &insn_data[icode];
34327 machine_mode tmode = insn_p->operand[0].mode;
34328 machine_mode rmode = VOIDmode;
34329 bool swap = false;
34330 enum rtx_code comparison = d->comparison;
34332 switch ((enum ix86_builtin_func_type) d->flag)
34334 case V2DF_FTYPE_V2DF_ROUND:
34335 case V4DF_FTYPE_V4DF_ROUND:
34336 case V8DF_FTYPE_V8DF_ROUND:
34337 case V4SF_FTYPE_V4SF_ROUND:
34338 case V8SF_FTYPE_V8SF_ROUND:
34339 case V16SF_FTYPE_V16SF_ROUND:
34340 case V4SI_FTYPE_V4SF_ROUND:
34341 case V8SI_FTYPE_V8SF_ROUND:
34342 case V16SI_FTYPE_V16SF_ROUND:
34343 return ix86_expand_sse_round (d, exp, target);
34344 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34345 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34346 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34347 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34348 case INT_FTYPE_V8SF_V8SF_PTEST:
34349 case INT_FTYPE_V4DI_V4DI_PTEST:
34350 case INT_FTYPE_V4DF_V4DF_PTEST:
34351 case INT_FTYPE_V4SF_V4SF_PTEST:
34352 case INT_FTYPE_V2DI_V2DI_PTEST:
34353 case INT_FTYPE_V2DF_V2DF_PTEST:
34354 return ix86_expand_sse_ptest (d, exp, target);
34355 case FLOAT128_FTYPE_FLOAT128:
34356 case FLOAT_FTYPE_FLOAT:
34357 case INT_FTYPE_INT:
34358 case UINT_FTYPE_UINT:
34359 case UINT16_FTYPE_UINT16:
34360 case UINT64_FTYPE_INT:
34361 case UINT64_FTYPE_UINT64:
34362 case INT64_FTYPE_INT64:
34363 case INT64_FTYPE_V4SF:
34364 case INT64_FTYPE_V2DF:
34365 case INT_FTYPE_V16QI:
34366 case INT_FTYPE_V8QI:
34367 case INT_FTYPE_V8SF:
34368 case INT_FTYPE_V4DF:
34369 case INT_FTYPE_V4SF:
34370 case INT_FTYPE_V2DF:
34371 case INT_FTYPE_V32QI:
34372 case V16QI_FTYPE_V16QI:
34373 case V8SI_FTYPE_V8SF:
34374 case V8SI_FTYPE_V4SI:
34375 case V8HI_FTYPE_V8HI:
34376 case V8HI_FTYPE_V16QI:
34377 case V8QI_FTYPE_V8QI:
34378 case V8SF_FTYPE_V8SF:
34379 case V8SF_FTYPE_V8SI:
34380 case V8SF_FTYPE_V4SF:
34381 case V8SF_FTYPE_V8HI:
34382 case V4SI_FTYPE_V4SI:
34383 case V4SI_FTYPE_V16QI:
34384 case V4SI_FTYPE_V4SF:
34385 case V4SI_FTYPE_V8SI:
34386 case V4SI_FTYPE_V8HI:
34387 case V4SI_FTYPE_V4DF:
34388 case V4SI_FTYPE_V2DF:
34389 case V4HI_FTYPE_V4HI:
34390 case V4DF_FTYPE_V4DF:
34391 case V4DF_FTYPE_V4SI:
34392 case V4DF_FTYPE_V4SF:
34393 case V4DF_FTYPE_V2DF:
34394 case V4SF_FTYPE_V4SF:
34395 case V4SF_FTYPE_V4SI:
34396 case V4SF_FTYPE_V8SF:
34397 case V4SF_FTYPE_V4DF:
34398 case V4SF_FTYPE_V8HI:
34399 case V4SF_FTYPE_V2DF:
34400 case V2DI_FTYPE_V2DI:
34401 case V2DI_FTYPE_V16QI:
34402 case V2DI_FTYPE_V8HI:
34403 case V2DI_FTYPE_V4SI:
34404 case V2DF_FTYPE_V2DF:
34405 case V2DF_FTYPE_V4SI:
34406 case V2DF_FTYPE_V4DF:
34407 case V2DF_FTYPE_V4SF:
34408 case V2DF_FTYPE_V2SI:
34409 case V2SI_FTYPE_V2SI:
34410 case V2SI_FTYPE_V4SF:
34411 case V2SI_FTYPE_V2SF:
34412 case V2SI_FTYPE_V2DF:
34413 case V2SF_FTYPE_V2SF:
34414 case V2SF_FTYPE_V2SI:
34415 case V32QI_FTYPE_V32QI:
34416 case V32QI_FTYPE_V16QI:
34417 case V16HI_FTYPE_V16HI:
34418 case V16HI_FTYPE_V8HI:
34419 case V8SI_FTYPE_V8SI:
34420 case V16HI_FTYPE_V16QI:
34421 case V8SI_FTYPE_V16QI:
34422 case V4DI_FTYPE_V16QI:
34423 case V8SI_FTYPE_V8HI:
34424 case V4DI_FTYPE_V8HI:
34425 case V4DI_FTYPE_V4SI:
34426 case V4DI_FTYPE_V2DI:
34427 case UQI_FTYPE_UQI:
34428 case UHI_FTYPE_UHI:
34429 case USI_FTYPE_USI:
34430 case USI_FTYPE_UQI:
34431 case USI_FTYPE_UHI:
34432 case UDI_FTYPE_UDI:
34433 case UHI_FTYPE_V16QI:
34434 case USI_FTYPE_V32QI:
34435 case UDI_FTYPE_V64QI:
34436 case V16QI_FTYPE_UHI:
34437 case V32QI_FTYPE_USI:
34438 case V64QI_FTYPE_UDI:
34439 case V8HI_FTYPE_UQI:
34440 case V16HI_FTYPE_UHI:
34441 case V32HI_FTYPE_USI:
34442 case V4SI_FTYPE_UQI:
34443 case V8SI_FTYPE_UQI:
34444 case V4SI_FTYPE_UHI:
34445 case V8SI_FTYPE_UHI:
34446 case UQI_FTYPE_V8HI:
34447 case UHI_FTYPE_V16HI:
34448 case USI_FTYPE_V32HI:
34449 case UQI_FTYPE_V4SI:
34450 case UQI_FTYPE_V8SI:
34451 case UHI_FTYPE_V16SI:
34452 case UQI_FTYPE_V2DI:
34453 case UQI_FTYPE_V4DI:
34454 case UQI_FTYPE_V8DI:
34455 case V16SI_FTYPE_UHI:
34456 case V2DI_FTYPE_UQI:
34457 case V4DI_FTYPE_UQI:
34458 case V16SI_FTYPE_INT:
34459 case V16SF_FTYPE_V8SF:
34460 case V16SI_FTYPE_V8SI:
34461 case V16SF_FTYPE_V4SF:
34462 case V16SI_FTYPE_V4SI:
34463 case V16SI_FTYPE_V16SF:
34464 case V16SI_FTYPE_V16SI:
34465 case V64QI_FTYPE_V64QI:
34466 case V32HI_FTYPE_V32HI:
34467 case V16SF_FTYPE_V16SF:
34468 case V8DI_FTYPE_UQI:
34469 case V8DI_FTYPE_V8DI:
34470 case V8DF_FTYPE_V4DF:
34471 case V8DF_FTYPE_V2DF:
34472 case V8DF_FTYPE_V8DF:
34473 case V4DI_FTYPE_V4DI:
34474 nargs = 1;
34475 break;
34476 case V4SF_FTYPE_V4SF_VEC_MERGE:
34477 case V2DF_FTYPE_V2DF_VEC_MERGE:
34478 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34479 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34480 case V16QI_FTYPE_V16QI_V16QI:
34481 case V16QI_FTYPE_V8HI_V8HI:
34482 case V16SF_FTYPE_V16SF_V16SF:
34483 case V8QI_FTYPE_V8QI_V8QI:
34484 case V8QI_FTYPE_V4HI_V4HI:
34485 case V8HI_FTYPE_V8HI_V8HI:
34486 case V8HI_FTYPE_V16QI_V16QI:
34487 case V8HI_FTYPE_V4SI_V4SI:
34488 case V8SF_FTYPE_V8SF_V8SF:
34489 case V8SF_FTYPE_V8SF_V8SI:
34490 case V8DF_FTYPE_V8DF_V8DF:
34491 case V4SI_FTYPE_V4SI_V4SI:
34492 case V4SI_FTYPE_V8HI_V8HI:
34493 case V4SI_FTYPE_V2DF_V2DF:
34494 case V4HI_FTYPE_V4HI_V4HI:
34495 case V4HI_FTYPE_V8QI_V8QI:
34496 case V4HI_FTYPE_V2SI_V2SI:
34497 case V4DF_FTYPE_V4DF_V4DF:
34498 case V4DF_FTYPE_V4DF_V4DI:
34499 case V4SF_FTYPE_V4SF_V4SF:
34500 case V4SF_FTYPE_V4SF_V4SI:
34501 case V4SF_FTYPE_V4SF_V2SI:
34502 case V4SF_FTYPE_V4SF_V2DF:
34503 case V4SF_FTYPE_V4SF_UINT:
34504 case V4SF_FTYPE_V4SF_DI:
34505 case V4SF_FTYPE_V4SF_SI:
34506 case V2DI_FTYPE_V2DI_V2DI:
34507 case V2DI_FTYPE_V16QI_V16QI:
34508 case V2DI_FTYPE_V4SI_V4SI:
34509 case V2DI_FTYPE_V2DI_V16QI:
34510 case V2SI_FTYPE_V2SI_V2SI:
34511 case V2SI_FTYPE_V4HI_V4HI:
34512 case V2SI_FTYPE_V2SF_V2SF:
34513 case V2DF_FTYPE_V2DF_V2DF:
34514 case V2DF_FTYPE_V2DF_V4SF:
34515 case V2DF_FTYPE_V2DF_V2DI:
34516 case V2DF_FTYPE_V2DF_DI:
34517 case V2DF_FTYPE_V2DF_SI:
34518 case V2DF_FTYPE_V2DF_UINT:
34519 case V2SF_FTYPE_V2SF_V2SF:
34520 case V1DI_FTYPE_V1DI_V1DI:
34521 case V1DI_FTYPE_V8QI_V8QI:
34522 case V1DI_FTYPE_V2SI_V2SI:
34523 case V32QI_FTYPE_V16HI_V16HI:
34524 case V16HI_FTYPE_V8SI_V8SI:
34525 case V64QI_FTYPE_V64QI_V64QI:
34526 case V32QI_FTYPE_V32QI_V32QI:
34527 case V16HI_FTYPE_V32QI_V32QI:
34528 case V16HI_FTYPE_V16HI_V16HI:
34529 case V8SI_FTYPE_V4DF_V4DF:
34530 case V8SI_FTYPE_V8SI_V8SI:
34531 case V8SI_FTYPE_V16HI_V16HI:
34532 case V4DI_FTYPE_V4DI_V4DI:
34533 case V4DI_FTYPE_V8SI_V8SI:
34534 case V8DI_FTYPE_V64QI_V64QI:
34535 if (comparison == UNKNOWN)
34536 return ix86_expand_binop_builtin (icode, exp, target);
34537 nargs = 2;
34538 break;
34539 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34540 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34541 gcc_assert (comparison != UNKNOWN);
34542 nargs = 2;
34543 swap = true;
34544 break;
34545 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34546 case V16HI_FTYPE_V16HI_SI_COUNT:
34547 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34548 case V8SI_FTYPE_V8SI_SI_COUNT:
34549 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34550 case V4DI_FTYPE_V4DI_INT_COUNT:
34551 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34552 case V8HI_FTYPE_V8HI_SI_COUNT:
34553 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34554 case V4SI_FTYPE_V4SI_SI_COUNT:
34555 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34556 case V4HI_FTYPE_V4HI_SI_COUNT:
34557 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34558 case V2DI_FTYPE_V2DI_SI_COUNT:
34559 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34560 case V2SI_FTYPE_V2SI_SI_COUNT:
34561 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34562 case V1DI_FTYPE_V1DI_SI_COUNT:
34563 nargs = 2;
34564 second_arg_count = true;
34565 break;
34566 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34567 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34568 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34569 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34570 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34571 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34572 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34573 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34574 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34575 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34576 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34577 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34578 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34579 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34580 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34581 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34582 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34583 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34584 nargs = 4;
34585 second_arg_count = true;
34586 break;
34587 case UINT64_FTYPE_UINT64_UINT64:
34588 case UINT_FTYPE_UINT_UINT:
34589 case UINT_FTYPE_UINT_USHORT:
34590 case UINT_FTYPE_UINT_UCHAR:
34591 case UINT16_FTYPE_UINT16_INT:
34592 case UINT8_FTYPE_UINT8_INT:
34593 case UQI_FTYPE_UQI_UQI:
34594 case UHI_FTYPE_UHI_UHI:
34595 case USI_FTYPE_USI_USI:
34596 case UDI_FTYPE_UDI_UDI:
34597 case V16SI_FTYPE_V8DF_V8DF:
34598 nargs = 2;
34599 break;
34600 case V2DI_FTYPE_V2DI_INT_CONVERT:
34601 nargs = 2;
34602 rmode = V1TImode;
34603 nargs_constant = 1;
34604 break;
34605 case V4DI_FTYPE_V4DI_INT_CONVERT:
34606 nargs = 2;
34607 rmode = V2TImode;
34608 nargs_constant = 1;
34609 break;
34610 case V8DI_FTYPE_V8DI_INT_CONVERT:
34611 nargs = 2;
34612 rmode = V4TImode;
34613 nargs_constant = 1;
34614 break;
34615 case V8HI_FTYPE_V8HI_INT:
34616 case V8HI_FTYPE_V8SF_INT:
34617 case V16HI_FTYPE_V16SF_INT:
34618 case V8HI_FTYPE_V4SF_INT:
34619 case V8SF_FTYPE_V8SF_INT:
34620 case V4SF_FTYPE_V16SF_INT:
34621 case V16SF_FTYPE_V16SF_INT:
34622 case V4SI_FTYPE_V4SI_INT:
34623 case V4SI_FTYPE_V8SI_INT:
34624 case V4HI_FTYPE_V4HI_INT:
34625 case V4DF_FTYPE_V4DF_INT:
34626 case V4DF_FTYPE_V8DF_INT:
34627 case V4SF_FTYPE_V4SF_INT:
34628 case V4SF_FTYPE_V8SF_INT:
34629 case V2DI_FTYPE_V2DI_INT:
34630 case V2DF_FTYPE_V2DF_INT:
34631 case V2DF_FTYPE_V4DF_INT:
34632 case V16HI_FTYPE_V16HI_INT:
34633 case V8SI_FTYPE_V8SI_INT:
34634 case V16SI_FTYPE_V16SI_INT:
34635 case V4SI_FTYPE_V16SI_INT:
34636 case V4DI_FTYPE_V4DI_INT:
34637 case V2DI_FTYPE_V4DI_INT:
34638 case V4DI_FTYPE_V8DI_INT:
34639 case QI_FTYPE_V4SF_INT:
34640 case QI_FTYPE_V2DF_INT:
34641 case UQI_FTYPE_UQI_UQI_CONST:
34642 case UHI_FTYPE_UHI_UQI:
34643 case USI_FTYPE_USI_UQI:
34644 case UDI_FTYPE_UDI_UQI:
34645 nargs = 2;
34646 nargs_constant = 1;
34647 break;
34648 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34649 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34650 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34651 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34652 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34653 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34654 case UHI_FTYPE_V16SI_V16SI_UHI:
34655 case UQI_FTYPE_V8DI_V8DI_UQI:
34656 case V16HI_FTYPE_V16SI_V16HI_UHI:
34657 case V16QI_FTYPE_V16SI_V16QI_UHI:
34658 case V16QI_FTYPE_V8DI_V16QI_UQI:
34659 case V16SF_FTYPE_V16SF_V16SF_UHI:
34660 case V16SF_FTYPE_V4SF_V16SF_UHI:
34661 case V16SI_FTYPE_SI_V16SI_UHI:
34662 case V16SI_FTYPE_V16HI_V16SI_UHI:
34663 case V16SI_FTYPE_V16QI_V16SI_UHI:
34664 case V8SF_FTYPE_V4SF_V8SF_UQI:
34665 case V4DF_FTYPE_V2DF_V4DF_UQI:
34666 case V8SI_FTYPE_V4SI_V8SI_UQI:
34667 case V8SI_FTYPE_SI_V8SI_UQI:
34668 case V4SI_FTYPE_V4SI_V4SI_UQI:
34669 case V4SI_FTYPE_SI_V4SI_UQI:
34670 case V4DI_FTYPE_V2DI_V4DI_UQI:
34671 case V4DI_FTYPE_DI_V4DI_UQI:
34672 case V2DI_FTYPE_V2DI_V2DI_UQI:
34673 case V2DI_FTYPE_DI_V2DI_UQI:
34674 case V64QI_FTYPE_V64QI_V64QI_UDI:
34675 case V64QI_FTYPE_V16QI_V64QI_UDI:
34676 case V64QI_FTYPE_QI_V64QI_UDI:
34677 case V32QI_FTYPE_V32QI_V32QI_USI:
34678 case V32QI_FTYPE_V16QI_V32QI_USI:
34679 case V32QI_FTYPE_QI_V32QI_USI:
34680 case V16QI_FTYPE_V16QI_V16QI_UHI:
34681 case V16QI_FTYPE_QI_V16QI_UHI:
34682 case V32HI_FTYPE_V8HI_V32HI_USI:
34683 case V32HI_FTYPE_HI_V32HI_USI:
34684 case V16HI_FTYPE_V8HI_V16HI_UHI:
34685 case V16HI_FTYPE_HI_V16HI_UHI:
34686 case V8HI_FTYPE_V8HI_V8HI_UQI:
34687 case V8HI_FTYPE_HI_V8HI_UQI:
34688 case V8SF_FTYPE_V8HI_V8SF_UQI:
34689 case V4SF_FTYPE_V8HI_V4SF_UQI:
34690 case V8SI_FTYPE_V8SF_V8SI_UQI:
34691 case V4SI_FTYPE_V4SF_V4SI_UQI:
34692 case V4DI_FTYPE_V4SF_V4DI_UQI:
34693 case V2DI_FTYPE_V4SF_V2DI_UQI:
34694 case V4SF_FTYPE_V4DI_V4SF_UQI:
34695 case V4SF_FTYPE_V2DI_V4SF_UQI:
34696 case V4DF_FTYPE_V4DI_V4DF_UQI:
34697 case V2DF_FTYPE_V2DI_V2DF_UQI:
34698 case V16QI_FTYPE_V8HI_V16QI_UQI:
34699 case V16QI_FTYPE_V16HI_V16QI_UHI:
34700 case V16QI_FTYPE_V4SI_V16QI_UQI:
34701 case V16QI_FTYPE_V8SI_V16QI_UQI:
34702 case V8HI_FTYPE_V4SI_V8HI_UQI:
34703 case V8HI_FTYPE_V8SI_V8HI_UQI:
34704 case V16QI_FTYPE_V2DI_V16QI_UQI:
34705 case V16QI_FTYPE_V4DI_V16QI_UQI:
34706 case V8HI_FTYPE_V2DI_V8HI_UQI:
34707 case V8HI_FTYPE_V4DI_V8HI_UQI:
34708 case V4SI_FTYPE_V2DI_V4SI_UQI:
34709 case V4SI_FTYPE_V4DI_V4SI_UQI:
34710 case V32QI_FTYPE_V32HI_V32QI_USI:
34711 case UHI_FTYPE_V16QI_V16QI_UHI:
34712 case USI_FTYPE_V32QI_V32QI_USI:
34713 case UDI_FTYPE_V64QI_V64QI_UDI:
34714 case UQI_FTYPE_V8HI_V8HI_UQI:
34715 case UHI_FTYPE_V16HI_V16HI_UHI:
34716 case USI_FTYPE_V32HI_V32HI_USI:
34717 case UQI_FTYPE_V4SI_V4SI_UQI:
34718 case UQI_FTYPE_V8SI_V8SI_UQI:
34719 case UQI_FTYPE_V2DI_V2DI_UQI:
34720 case UQI_FTYPE_V4DI_V4DI_UQI:
34721 case V4SF_FTYPE_V2DF_V4SF_UQI:
34722 case V4SF_FTYPE_V4DF_V4SF_UQI:
34723 case V16SI_FTYPE_V16SI_V16SI_UHI:
34724 case V16SI_FTYPE_V4SI_V16SI_UHI:
34725 case V2DI_FTYPE_V4SI_V2DI_UQI:
34726 case V2DI_FTYPE_V8HI_V2DI_UQI:
34727 case V2DI_FTYPE_V16QI_V2DI_UQI:
34728 case V4DI_FTYPE_V4DI_V4DI_UQI:
34729 case V4DI_FTYPE_V4SI_V4DI_UQI:
34730 case V4DI_FTYPE_V8HI_V4DI_UQI:
34731 case V4DI_FTYPE_V16QI_V4DI_UQI:
34732 case V4DI_FTYPE_V4DF_V4DI_UQI:
34733 case V2DI_FTYPE_V2DF_V2DI_UQI:
34734 case V4SI_FTYPE_V4DF_V4SI_UQI:
34735 case V4SI_FTYPE_V2DF_V4SI_UQI:
34736 case V4SI_FTYPE_V8HI_V4SI_UQI:
34737 case V4SI_FTYPE_V16QI_V4SI_UQI:
34738 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34739 case V8DF_FTYPE_V2DF_V8DF_UQI:
34740 case V8DF_FTYPE_V4DF_V8DF_UQI:
34741 case V8DF_FTYPE_V8DF_V8DF_UQI:
34742 case V8SF_FTYPE_V8SF_V8SF_UQI:
34743 case V8SF_FTYPE_V8SI_V8SF_UQI:
34744 case V4DF_FTYPE_V4DF_V4DF_UQI:
34745 case V4SF_FTYPE_V4SF_V4SF_UQI:
34746 case V2DF_FTYPE_V2DF_V2DF_UQI:
34747 case V2DF_FTYPE_V4SF_V2DF_UQI:
34748 case V2DF_FTYPE_V4SI_V2DF_UQI:
34749 case V4SF_FTYPE_V4SI_V4SF_UQI:
34750 case V4DF_FTYPE_V4SF_V4DF_UQI:
34751 case V4DF_FTYPE_V4SI_V4DF_UQI:
34752 case V8SI_FTYPE_V8SI_V8SI_UQI:
34753 case V8SI_FTYPE_V8HI_V8SI_UQI:
34754 case V8SI_FTYPE_V16QI_V8SI_UQI:
34755 case V8DF_FTYPE_V8SI_V8DF_UQI:
34756 case V8DI_FTYPE_DI_V8DI_UQI:
34757 case V16SF_FTYPE_V8SF_V16SF_UHI:
34758 case V16SI_FTYPE_V8SI_V16SI_UHI:
34759 case V16HI_FTYPE_V16HI_V16HI_UHI:
34760 case V8HI_FTYPE_V16QI_V8HI_UQI:
34761 case V16HI_FTYPE_V16QI_V16HI_UHI:
34762 case V32HI_FTYPE_V32HI_V32HI_USI:
34763 case V32HI_FTYPE_V32QI_V32HI_USI:
34764 case V8DI_FTYPE_V16QI_V8DI_UQI:
34765 case V8DI_FTYPE_V2DI_V8DI_UQI:
34766 case V8DI_FTYPE_V4DI_V8DI_UQI:
34767 case V8DI_FTYPE_V8DI_V8DI_UQI:
34768 case V8DI_FTYPE_V8HI_V8DI_UQI:
34769 case V8DI_FTYPE_V8SI_V8DI_UQI:
34770 case V8HI_FTYPE_V8DI_V8HI_UQI:
34771 case V8SI_FTYPE_V8DI_V8SI_UQI:
34772 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34773 case V16SI_FTYPE_V16SI_V16SI_V16SI:
34774 case V8DI_FTYPE_V8DI_V8DI_V8DI:
34775 case V32HI_FTYPE_V32HI_V32HI_V32HI:
34776 case V2DI_FTYPE_V2DI_V2DI_V2DI:
34777 case V16HI_FTYPE_V16HI_V16HI_V16HI:
34778 case V8SI_FTYPE_V8SI_V8SI_V8SI:
34779 case V8HI_FTYPE_V8HI_V8HI_V8HI:
34780 nargs = 3;
34781 break;
34782 case V32QI_FTYPE_V32QI_V32QI_INT:
34783 case V16HI_FTYPE_V16HI_V16HI_INT:
34784 case V16QI_FTYPE_V16QI_V16QI_INT:
34785 case V4DI_FTYPE_V4DI_V4DI_INT:
34786 case V8HI_FTYPE_V8HI_V8HI_INT:
34787 case V8SI_FTYPE_V8SI_V8SI_INT:
34788 case V8SI_FTYPE_V8SI_V4SI_INT:
34789 case V8SF_FTYPE_V8SF_V8SF_INT:
34790 case V8SF_FTYPE_V8SF_V4SF_INT:
34791 case V4SI_FTYPE_V4SI_V4SI_INT:
34792 case V4DF_FTYPE_V4DF_V4DF_INT:
34793 case V16SF_FTYPE_V16SF_V16SF_INT:
34794 case V16SF_FTYPE_V16SF_V4SF_INT:
34795 case V16SI_FTYPE_V16SI_V4SI_INT:
34796 case V4DF_FTYPE_V4DF_V2DF_INT:
34797 case V4SF_FTYPE_V4SF_V4SF_INT:
34798 case V2DI_FTYPE_V2DI_V2DI_INT:
34799 case V4DI_FTYPE_V4DI_V2DI_INT:
34800 case V2DF_FTYPE_V2DF_V2DF_INT:
34801 case UQI_FTYPE_V8DI_V8UDI_INT:
34802 case UQI_FTYPE_V8DF_V8DF_INT:
34803 case UQI_FTYPE_V2DF_V2DF_INT:
34804 case UQI_FTYPE_V4SF_V4SF_INT:
34805 case UHI_FTYPE_V16SI_V16SI_INT:
34806 case UHI_FTYPE_V16SF_V16SF_INT:
34807 case V64QI_FTYPE_V64QI_V64QI_INT:
34808 case V32HI_FTYPE_V32HI_V32HI_INT:
34809 case V16SI_FTYPE_V16SI_V16SI_INT:
34810 case V8DI_FTYPE_V8DI_V8DI_INT:
34811 nargs = 3;
34812 nargs_constant = 1;
34813 break;
34814 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34815 nargs = 3;
34816 rmode = V4DImode;
34817 nargs_constant = 1;
34818 break;
34819 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34820 nargs = 3;
34821 rmode = V2DImode;
34822 nargs_constant = 1;
34823 break;
34824 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34825 nargs = 3;
34826 rmode = DImode;
34827 nargs_constant = 1;
34828 break;
34829 case V2DI_FTYPE_V2DI_UINT_UINT:
34830 nargs = 3;
34831 nargs_constant = 2;
34832 break;
34833 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
34834 nargs = 3;
34835 rmode = V8DImode;
34836 nargs_constant = 1;
34837 break;
34838 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
34839 nargs = 5;
34840 rmode = V8DImode;
34841 mask_pos = 2;
34842 nargs_constant = 1;
34843 break;
34844 case QI_FTYPE_V8DF_INT_UQI:
34845 case QI_FTYPE_V4DF_INT_UQI:
34846 case QI_FTYPE_V2DF_INT_UQI:
34847 case HI_FTYPE_V16SF_INT_UHI:
34848 case QI_FTYPE_V8SF_INT_UQI:
34849 case QI_FTYPE_V4SF_INT_UQI:
34850 case V4SI_FTYPE_V4SI_V4SI_UHI:
34851 case V8SI_FTYPE_V8SI_V8SI_UHI:
34852 nargs = 3;
34853 mask_pos = 1;
34854 nargs_constant = 1;
34855 break;
34856 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
34857 nargs = 5;
34858 rmode = V4DImode;
34859 mask_pos = 2;
34860 nargs_constant = 1;
34861 break;
34862 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
34863 nargs = 5;
34864 rmode = V2DImode;
34865 mask_pos = 2;
34866 nargs_constant = 1;
34867 break;
34868 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34869 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34870 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34871 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34872 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34873 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34874 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34875 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34876 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34877 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34878 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34879 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34880 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34881 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34882 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34883 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34884 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34885 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34886 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34887 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34888 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34889 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34890 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34891 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34892 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34893 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34894 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34895 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34896 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34897 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34898 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34899 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34900 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34901 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34902 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34903 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34904 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34905 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34906 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34907 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34908 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34909 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34910 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34911 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34912 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34913 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34914 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34915 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34916 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34917 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34918 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34919 nargs = 4;
34920 break;
34921 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34922 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34923 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34924 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34925 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34926 nargs = 4;
34927 nargs_constant = 1;
34928 break;
34929 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34930 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34931 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34932 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34933 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34934 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34935 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34936 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34937 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34938 case USI_FTYPE_V32QI_V32QI_INT_USI:
34939 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34940 case USI_FTYPE_V32HI_V32HI_INT_USI:
34941 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34942 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34943 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34944 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34945 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34946 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34947 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34948 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34949 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34950 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34951 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34952 nargs = 4;
34953 mask_pos = 1;
34954 nargs_constant = 1;
34955 break;
34956 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34957 nargs = 4;
34958 nargs_constant = 2;
34959 break;
34960 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34961 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34962 nargs = 4;
34963 break;
34964 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34965 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34966 mask_pos = 1;
34967 nargs = 4;
34968 nargs_constant = 1;
34969 break;
34970 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34971 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34972 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34973 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34974 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34975 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34976 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34977 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34978 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34979 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34980 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34981 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34982 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34983 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34984 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34985 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34986 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34987 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34988 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34989 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34990 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34991 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34992 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34993 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34994 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34995 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34996 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34997 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34998 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34999 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35000 nargs = 4;
35001 mask_pos = 2;
35002 nargs_constant = 1;
35003 break;
35004 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35005 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35006 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35007 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35008 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35009 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35010 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35011 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35012 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35013 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35014 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35015 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35016 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35017 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35018 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35019 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35020 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35021 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35022 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35023 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35024 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35025 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35026 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35027 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35028 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35029 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35030 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35031 nargs = 5;
35032 mask_pos = 2;
35033 nargs_constant = 1;
35034 break;
35035 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35036 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35037 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35038 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35039 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35040 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35041 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35042 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35043 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35044 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35045 nargs = 5;
35046 mask_pos = 1;
35047 nargs_constant = 1;
35048 break;
35049 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35050 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35051 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35052 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35053 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35054 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35055 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35056 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35057 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35058 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35059 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35060 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35061 nargs = 5;
35062 mask_pos = 1;
35063 nargs_constant = 2;
35064 break;
35066 default:
35067 gcc_unreachable ();
35070 gcc_assert (nargs <= ARRAY_SIZE (args));
35072 if (comparison != UNKNOWN)
35074 gcc_assert (nargs == 2);
35075 return ix86_expand_sse_compare (d, exp, target, swap);
35078 if (rmode == VOIDmode || rmode == tmode)
35080 if (optimize
35081 || target == 0
35082 || GET_MODE (target) != tmode
35083 || !insn_p->operand[0].predicate (target, tmode))
35084 target = gen_reg_rtx (tmode);
35085 else if (memory_operand (target, tmode))
35086 num_memory++;
35087 real_target = target;
35089 else
35091 real_target = gen_reg_rtx (tmode);
35092 target = lowpart_subreg (rmode, real_target, tmode);
35095 for (i = 0; i < nargs; i++)
35097 tree arg = CALL_EXPR_ARG (exp, i);
35098 rtx op = expand_normal (arg);
35099 machine_mode mode = insn_p->operand[i + 1].mode;
35100 bool match = insn_p->operand[i + 1].predicate (op, mode);
35102 if (second_arg_count && i == 1)
35104 /* SIMD shift insns take either an 8-bit immediate or a
35105 register as the count, but the builtin functions take an
35106 int as the count.  If the count doesn't match, put it in
35107 a register.  The instructions use a 64-bit count; if op is
35108 only 32-bit, zero-extend it, since negative shift counts
35109 are undefined behavior and zero extension is more
35110 efficient. */
35111 if (!match)
35113 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35114 op = convert_modes (mode, GET_MODE (op), op, 1);
35115 else
35116 op = lowpart_subreg (mode, op, GET_MODE (op));
35117 if (!insn_p->operand[i + 1].predicate (op, mode))
35118 op = copy_to_reg (op);
35121 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35122 || (!mask_pos && (nargs - i) <= nargs_constant))
35124 if (!match)
35125 switch (icode)
35127 case CODE_FOR_avx_vinsertf128v4di:
35128 case CODE_FOR_avx_vextractf128v4di:
35129 error ("the last argument must be a 1-bit immediate");
35130 return const0_rtx;
35132 case CODE_FOR_avx512f_cmpv8di3_mask:
35133 case CODE_FOR_avx512f_cmpv16si3_mask:
35134 case CODE_FOR_avx512f_ucmpv8di3_mask:
35135 case CODE_FOR_avx512f_ucmpv16si3_mask:
35136 case CODE_FOR_avx512vl_cmpv4di3_mask:
35137 case CODE_FOR_avx512vl_cmpv8si3_mask:
35138 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35139 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35140 case CODE_FOR_avx512vl_cmpv2di3_mask:
35141 case CODE_FOR_avx512vl_cmpv4si3_mask:
35142 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35143 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35144 error ("the last argument must be a 3-bit immediate");
35145 return const0_rtx;
35147 case CODE_FOR_sse4_1_roundsd:
35148 case CODE_FOR_sse4_1_roundss:
35150 case CODE_FOR_sse4_1_roundpd:
35151 case CODE_FOR_sse4_1_roundps:
35152 case CODE_FOR_avx_roundpd256:
35153 case CODE_FOR_avx_roundps256:
35155 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35156 case CODE_FOR_sse4_1_roundps_sfix:
35157 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35158 case CODE_FOR_avx_roundps_sfix256:
35160 case CODE_FOR_sse4_1_blendps:
35161 case CODE_FOR_avx_blendpd256:
35162 case CODE_FOR_avx_vpermilv4df:
35163 case CODE_FOR_avx_vpermilv4df_mask:
35164 case CODE_FOR_avx512f_getmantv8df_mask:
35165 case CODE_FOR_avx512f_getmantv16sf_mask:
35166 case CODE_FOR_avx512vl_getmantv8sf_mask:
35167 case CODE_FOR_avx512vl_getmantv4df_mask:
35168 case CODE_FOR_avx512vl_getmantv4sf_mask:
35169 case CODE_FOR_avx512vl_getmantv2df_mask:
35170 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35171 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35172 case CODE_FOR_avx512dq_rangepv4df_mask:
35173 case CODE_FOR_avx512dq_rangepv8sf_mask:
35174 case CODE_FOR_avx512dq_rangepv2df_mask:
35175 case CODE_FOR_avx512dq_rangepv4sf_mask:
35176 case CODE_FOR_avx_shufpd256_mask:
35177 error ("the last argument must be a 4-bit immediate");
35178 return const0_rtx;
35180 case CODE_FOR_sha1rnds4:
35181 case CODE_FOR_sse4_1_blendpd:
35182 case CODE_FOR_avx_vpermilv2df:
35183 case CODE_FOR_avx_vpermilv2df_mask:
35184 case CODE_FOR_xop_vpermil2v2df3:
35185 case CODE_FOR_xop_vpermil2v4sf3:
35186 case CODE_FOR_xop_vpermil2v4df3:
35187 case CODE_FOR_xop_vpermil2v8sf3:
35188 case CODE_FOR_avx512f_vinsertf32x4_mask:
35189 case CODE_FOR_avx512f_vinserti32x4_mask:
35190 case CODE_FOR_avx512f_vextractf32x4_mask:
35191 case CODE_FOR_avx512f_vextracti32x4_mask:
35192 case CODE_FOR_sse2_shufpd:
35193 case CODE_FOR_sse2_shufpd_mask:
35194 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35195 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35196 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35197 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35198 error ("the last argument must be a 2-bit immediate");
35199 return const0_rtx;
35201 case CODE_FOR_avx_vextractf128v4df:
35202 case CODE_FOR_avx_vextractf128v8sf:
35203 case CODE_FOR_avx_vextractf128v8si:
35204 case CODE_FOR_avx_vinsertf128v4df:
35205 case CODE_FOR_avx_vinsertf128v8sf:
35206 case CODE_FOR_avx_vinsertf128v8si:
35207 case CODE_FOR_avx512f_vinsertf64x4_mask:
35208 case CODE_FOR_avx512f_vinserti64x4_mask:
35209 case CODE_FOR_avx512f_vextractf64x4_mask:
35210 case CODE_FOR_avx512f_vextracti64x4_mask:
35211 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35212 case CODE_FOR_avx512dq_vinserti32x8_mask:
35213 case CODE_FOR_avx512vl_vinsertv4df:
35214 case CODE_FOR_avx512vl_vinsertv4di:
35215 case CODE_FOR_avx512vl_vinsertv8sf:
35216 case CODE_FOR_avx512vl_vinsertv8si:
35217 error ("the last argument must be a 1-bit immediate");
35218 return const0_rtx;
35220 case CODE_FOR_avx_vmcmpv2df3:
35221 case CODE_FOR_avx_vmcmpv4sf3:
35222 case CODE_FOR_avx_cmpv2df3:
35223 case CODE_FOR_avx_cmpv4sf3:
35224 case CODE_FOR_avx_cmpv4df3:
35225 case CODE_FOR_avx_cmpv8sf3:
35226 case CODE_FOR_avx512f_cmpv8df3_mask:
35227 case CODE_FOR_avx512f_cmpv16sf3_mask:
35228 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35229 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35230 error ("the last argument must be a 5-bit immediate");
35231 return const0_rtx;
35233 default:
35234 switch (nargs_constant)
35236 case 2:
35237 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
35238 || (!mask_pos && (nargs - i) == nargs_constant))
35240 error ("the next to last argument must be an 8-bit immediate");
35241 break;
35243 /* FALLTHRU */
35244 case 1:
35245 error ("the last argument must be an 8-bit immediate");
35246 break;
35247 default:
35248 gcc_unreachable ();
35250 return const0_rtx;
35253 else
35255 if (VECTOR_MODE_P (mode))
35256 op = safe_vector_operand (op, mode);
35258 /* If we aren't optimizing, only allow one memory operand to
35259 be generated. */
35260 if (memory_operand (op, mode))
35261 num_memory++;
35263 op = fixup_modeless_constant (op, mode);
35265 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35267 if (optimize || !match || num_memory > 1)
35268 op = copy_to_mode_reg (mode, op);
35270 else
35272 op = copy_to_reg (op);
35273 op = lowpart_subreg (mode, op, GET_MODE (op));
35277 args[i].op = op;
35278 args[i].mode = mode;
35281 switch (nargs)
35283 case 1:
35284 pat = GEN_FCN (icode) (real_target, args[0].op);
35285 break;
35286 case 2:
35287 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35288 break;
35289 case 3:
35290 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35291 args[2].op);
35292 break;
35293 case 4:
35294 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35295 args[2].op, args[3].op);
35296 break;
35297 case 5:
35298 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35299 args[2].op, args[3].op, args[4].op);
35300 break;
35301 case 6:
35302 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35303 args[2].op, args[3].op, args[4].op,
35304 args[5].op);
35305 break;
35306 default:
35307 gcc_unreachable ();
35310 if (! pat)
35311 return 0;
35313 emit_insn (pat);
35314 return target;
35317 /* Transform a pattern of the following layout:
35318 (set A
35319 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35321 into:
35322 (set A B)  */
35324 static rtx
35325 ix86_erase_embedded_rounding (rtx pat)
35327 if (GET_CODE (pat) == INSN)
35328 pat = PATTERN (pat);
35330 gcc_assert (GET_CODE (pat) == SET);
35331 rtx src = SET_SRC (pat);
35332 gcc_assert (XVECLEN (src, 0) == 2);
35333 rtx p0 = XVECEXP (src, 0, 0);
35334 gcc_assert (GET_CODE (src) == UNSPEC
35335 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35336 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35337 return res;
35340 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35341 with rounding. */
35342 static rtx
35343 ix86_expand_sse_comi_round (const struct builtin_description *d,
35344 tree exp, rtx target)
35346 rtx pat, set_dst;
35347 tree arg0 = CALL_EXPR_ARG (exp, 0);
35348 tree arg1 = CALL_EXPR_ARG (exp, 1);
35349 tree arg2 = CALL_EXPR_ARG (exp, 2);
35350 tree arg3 = CALL_EXPR_ARG (exp, 3);
35351 rtx op0 = expand_normal (arg0);
35352 rtx op1 = expand_normal (arg1);
35353 rtx op2 = expand_normal (arg2);
35354 rtx op3 = expand_normal (arg3);
35355 enum insn_code icode = d->icode;
35356 const struct insn_data_d *insn_p = &insn_data[icode];
35357 machine_mode mode0 = insn_p->operand[0].mode;
35358 machine_mode mode1 = insn_p->operand[1].mode;
35359 enum rtx_code comparison = UNEQ;
35360 bool need_ucomi = false;
35362 /* See avxintrin.h for values. */
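/* Both tables are indexed by the _CMP_* predicate constant (0..31)
   passed as the third argument: comi_comparisons gives the RTL
   comparison code to test after the compare, and need_ucomi_values
   selects the non-signaling (ucomi) form of the instruction.  */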
35363 enum rtx_code comi_comparisons[32] =
35365 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35366 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35367 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35369 bool need_ucomi_values[32] =
35371 true, false, false, true, true, false, false, true,
35372 true, false, false, true, true, false, false, true,
35373 false, true, true, false, false, true, true, false,
35374 false, true, true, false, false, true, true, false
35377 if (!CONST_INT_P (op2))
35379 error ("the third argument must be a comparison constant");
35380 return const0_rtx;
35382 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35384 error ("incorrect comparison mode");
35385 return const0_rtx;
35388 if (!insn_p->operand[2].predicate (op3, SImode))
35390 error ("incorrect rounding operand");
35391 return const0_rtx;
35394 comparison = comi_comparisons[INTVAL (op2)];
35395 need_ucomi = need_ucomi_values[INTVAL (op2)];
35397 if (VECTOR_MODE_P (mode0))
35398 op0 = safe_vector_operand (op0, mode0);
35399 if (VECTOR_MODE_P (mode1))
35400 op1 = safe_vector_operand (op1, mode1);
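/* The comparison result is built by zeroing an SImode pseudo,
   setting its low byte from the flags register via STRICT_LOW_PART
   below, and returning the whole SImode register at the end.  */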
35402 target = gen_reg_rtx (SImode);
35403 emit_move_insn (target, const0_rtx);
35404 target = gen_rtx_SUBREG (QImode, target, 0);
35406 if ((optimize && !register_operand (op0, mode0))
35407 || !insn_p->operand[0].predicate (op0, mode0))
35408 op0 = copy_to_mode_reg (mode0, op0);
35409 if ((optimize && !register_operand (op1, mode1))
35410 || !insn_p->operand[1].predicate (op1, mode1))
35411 op1 = copy_to_mode_reg (mode1, op1);
35413 if (need_ucomi)
35414 icode = icode == CODE_FOR_sse_comi_round
35415 ? CODE_FOR_sse_ucomi_round
35416 : CODE_FOR_sse2_ucomi_round;
35418 pat = GEN_FCN (icode) (op0, op1, op3);
35419 if (! pat)
35420 return 0;
35422 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35423 if (INTVAL (op3) == NO_ROUND)
35425 pat = ix86_erase_embedded_rounding (pat);
35426 if (! pat)
35427 return 0;
35429 set_dst = SET_DEST (pat);
35431 else
35433 gcc_assert (GET_CODE (pat) == SET);
35434 set_dst = SET_DEST (pat);
35437 emit_insn (pat);
35438 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35439 gen_rtx_fmt_ee (comparison, QImode,
35440 set_dst,
35441 const0_rtx)));
35443 return SUBREG_REG (target);
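/* Subroutine of ix86_expand_builtin to take care of insns with an
   embedded rounding / SAE operand as the last argument.  */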
35446 static rtx
35447 ix86_expand_round_builtin (const struct builtin_description *d,
35448 tree exp, rtx target)
35450 rtx pat;
35451 unsigned int i, nargs;
35452 struct
35454 rtx op;
35455 machine_mode mode;
35456 } args[6];
35457 enum insn_code icode = d->icode;
35458 const struct insn_data_d *insn_p = &insn_data[icode];
35459 machine_mode tmode = insn_p->operand[0].mode;
35460 unsigned int nargs_constant = 0;
35461 unsigned int redundant_embed_rnd = 0;
35463 switch ((enum ix86_builtin_func_type) d->flag)
35465 case UINT64_FTYPE_V2DF_INT:
35466 case UINT64_FTYPE_V4SF_INT:
35467 case UINT_FTYPE_V2DF_INT:
35468 case UINT_FTYPE_V4SF_INT:
35469 case INT64_FTYPE_V2DF_INT:
35470 case INT64_FTYPE_V4SF_INT:
35471 case INT_FTYPE_V2DF_INT:
35472 case INT_FTYPE_V4SF_INT:
35473 nargs = 2;
35474 break;
35475 case V4SF_FTYPE_V4SF_UINT_INT:
35476 case V4SF_FTYPE_V4SF_UINT64_INT:
35477 case V2DF_FTYPE_V2DF_UINT64_INT:
35478 case V4SF_FTYPE_V4SF_INT_INT:
35479 case V4SF_FTYPE_V4SF_INT64_INT:
35480 case V2DF_FTYPE_V2DF_INT64_INT:
35481 case V4SF_FTYPE_V4SF_V4SF_INT:
35482 case V2DF_FTYPE_V2DF_V2DF_INT:
35483 case V4SF_FTYPE_V4SF_V2DF_INT:
35484 case V2DF_FTYPE_V2DF_V4SF_INT:
35485 nargs = 3;
35486 break;
35487 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35488 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35489 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35490 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35491 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35492 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35493 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35494 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35495 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35496 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35497 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35498 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35499 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35500 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35501 nargs = 4;
35502 break;
35503 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35504 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35505 nargs_constant = 2;
35506 nargs = 4;
35507 break;
35508 case INT_FTYPE_V4SF_V4SF_INT_INT:
35509 case INT_FTYPE_V2DF_V2DF_INT_INT:
35510 return ix86_expand_sse_comi_round (d, exp, target);
35511 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35512 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35513 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35514 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35515 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35516 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35517 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35518 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35519 nargs = 5;
35520 break;
35521 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35522 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35523 nargs_constant = 4;
35524 nargs = 5;
35525 break;
35526 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35527 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35528 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35529 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35530 nargs_constant = 3;
35531 nargs = 5;
35532 break;
35533 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35534 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35535 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35536 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35537 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35538 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35539 nargs = 6;
35540 nargs_constant = 4;
35541 break;
35542 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35543 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35544 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35545 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35546 nargs = 6;
35547 nargs_constant = 3;
35548 break;
35549 default:
35550 gcc_unreachable ();
35552 gcc_assert (nargs <= ARRAY_SIZE (args));
35554 if (optimize
35555 || target == 0
35556 || GET_MODE (target) != tmode
35557 || !insn_p->operand[0].predicate (target, tmode))
35558 target = gen_reg_rtx (tmode);
35560 for (i = 0; i < nargs; i++)
35562 tree arg = CALL_EXPR_ARG (exp, i);
35563 rtx op = expand_normal (arg);
35564 machine_mode mode = insn_p->operand[i + 1].mode;
35565 bool match = insn_p->operand[i + 1].predicate (op, mode);
35567 if (i == nargs - nargs_constant)
35569 if (!match)
35571 switch (icode)
35573 case CODE_FOR_avx512f_getmantv8df_mask_round:
35574 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35575 case CODE_FOR_avx512f_vgetmantv2df_round:
35576 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35577 case CODE_FOR_avx512f_vgetmantv4sf_round:
35578 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35579 error ("the immediate argument must be a 4-bit immediate");
35580 return const0_rtx;
35581 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35582 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35583 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35584 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35585 error ("the immediate argument must be a 5-bit immediate");
35586 return const0_rtx;
35587 default:
35588 error ("the immediate argument must be an 8-bit immediate");
35589 return const0_rtx;
35593 else if (i == nargs - 1)
35595 if (!insn_p->operand[nargs].predicate (op, SImode))
35597 error ("incorrect rounding operand");
35598 return const0_rtx;
35601 /* If there is no rounding, use the normal version of the pattern. */
35602 if (INTVAL (op) == NO_ROUND)
35603 redundant_embed_rnd = 1;
35605 else
35607 if (VECTOR_MODE_P (mode))
35608 op = safe_vector_operand (op, mode);
35610 op = fixup_modeless_constant (op, mode);
35612 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35614 if (optimize || !match)
35615 op = copy_to_mode_reg (mode, op);
35617 else
35619 op = copy_to_reg (op);
35620 op = lowpart_subreg (mode, op, GET_MODE (op));
35624 args[i].op = op;
35625 args[i].mode = mode;
35628 switch (nargs)
35630 case 1:
35631 pat = GEN_FCN (icode) (target, args[0].op);
35632 break;
35633 case 2:
35634 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35635 break;
35636 case 3:
35637 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35638 args[2].op);
35639 break;
35640 case 4:
35641 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35642 args[2].op, args[3].op);
35643 break;
35644 case 5:
35645 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35646 args[2].op, args[3].op, args[4].op);
35647 break;
35648 case 6:
35649 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35650 args[2].op, args[3].op, args[4].op,
35651 args[5].op);
35652 break;
35653 default:
35654 gcc_unreachable ();
35657 if (!pat)
35658 return 0;
35660 if (redundant_embed_rnd)
35661 pat = ix86_erase_embedded_rounding (pat);
35663 emit_insn (pat);
35664 return target;
35667 /* Subroutine of ix86_expand_builtin to take care of special insns
35668 with variable number of operands. */
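/* Each function type below sets:
   klass - load builtins return a value, store builtins take the
   destination as their first call argument and return 0;
   memory - index of the operand that is expanded as a memory
   reference (ARRAY_SIZE (args) when only the store target is a
   memory reference);
   aligned_mem - the pattern requires the memory operand to be at
   least GET_MODE_ALIGNMENT aligned (non-temporal moves and the
   masked full-vector loads and stores).  */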
35670 static rtx
35671 ix86_expand_special_args_builtin (const struct builtin_description *d,
35672 tree exp, rtx target)
35674 tree arg;
35675 rtx pat, op;
35676 unsigned int i, nargs, arg_adjust, memory;
35677 bool aligned_mem = false;
35678 struct
35680 rtx op;
35681 machine_mode mode;
35682 } args[3];
35683 enum insn_code icode = d->icode;
35684 bool last_arg_constant = false;
35685 const struct insn_data_d *insn_p = &insn_data[icode];
35686 machine_mode tmode = insn_p->operand[0].mode;
35687 enum { load, store } klass;
35689 switch ((enum ix86_builtin_func_type) d->flag)
35691 case VOID_FTYPE_VOID:
35692 emit_insn (GEN_FCN (icode) (target));
35693 return 0;
35694 case VOID_FTYPE_UINT64:
35695 case VOID_FTYPE_UNSIGNED:
35696 nargs = 0;
35697 klass = store;
35698 memory = 0;
35699 break;
35701 case INT_FTYPE_VOID:
35702 case USHORT_FTYPE_VOID:
35703 case UINT64_FTYPE_VOID:
35704 case UNSIGNED_FTYPE_VOID:
35705 nargs = 0;
35706 klass = load;
35707 memory = 0;
35708 break;
35709 case UINT64_FTYPE_PUNSIGNED:
35710 case V2DI_FTYPE_PV2DI:
35711 case V4DI_FTYPE_PV4DI:
35712 case V32QI_FTYPE_PCCHAR:
35713 case V16QI_FTYPE_PCCHAR:
35714 case V8SF_FTYPE_PCV4SF:
35715 case V8SF_FTYPE_PCFLOAT:
35716 case V4SF_FTYPE_PCFLOAT:
35717 case V4DF_FTYPE_PCV2DF:
35718 case V4DF_FTYPE_PCDOUBLE:
35719 case V2DF_FTYPE_PCDOUBLE:
35720 case VOID_FTYPE_PVOID:
35721 case V8DI_FTYPE_PV8DI:
35722 nargs = 1;
35723 klass = load;
35724 memory = 0;
35725 switch (icode)
35727 case CODE_FOR_sse4_1_movntdqa:
35728 case CODE_FOR_avx2_movntdqa:
35729 case CODE_FOR_avx512f_movntdqa:
35730 aligned_mem = true;
35731 break;
35732 default:
35733 break;
35735 break;
35736 case VOID_FTYPE_PV2SF_V4SF:
35737 case VOID_FTYPE_PV8DI_V8DI:
35738 case VOID_FTYPE_PV4DI_V4DI:
35739 case VOID_FTYPE_PV2DI_V2DI:
35740 case VOID_FTYPE_PCHAR_V32QI:
35741 case VOID_FTYPE_PCHAR_V16QI:
35742 case VOID_FTYPE_PFLOAT_V16SF:
35743 case VOID_FTYPE_PFLOAT_V8SF:
35744 case VOID_FTYPE_PFLOAT_V4SF:
35745 case VOID_FTYPE_PDOUBLE_V8DF:
35746 case VOID_FTYPE_PDOUBLE_V4DF:
35747 case VOID_FTYPE_PDOUBLE_V2DF:
35748 case VOID_FTYPE_PLONGLONG_LONGLONG:
35749 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35750 case VOID_FTYPE_PINT_INT:
35751 nargs = 1;
35752 klass = store;
35753 /* Reserve memory operand for target. */
35754 memory = ARRAY_SIZE (args);
35755 switch (icode)
35757 /* These builtins and instructions require the memory
35758 to be properly aligned. */
35759 case CODE_FOR_avx_movntv4di:
35760 case CODE_FOR_sse2_movntv2di:
35761 case CODE_FOR_avx_movntv8sf:
35762 case CODE_FOR_sse_movntv4sf:
35763 case CODE_FOR_sse4a_vmmovntv4sf:
35764 case CODE_FOR_avx_movntv4df:
35765 case CODE_FOR_sse2_movntv2df:
35766 case CODE_FOR_sse4a_vmmovntv2df:
35767 case CODE_FOR_sse2_movntidi:
35768 case CODE_FOR_sse_movntq:
35769 case CODE_FOR_sse2_movntisi:
35770 case CODE_FOR_avx512f_movntv16sf:
35771 case CODE_FOR_avx512f_movntv8df:
35772 case CODE_FOR_avx512f_movntv8di:
35773 aligned_mem = true;
35774 break;
35775 default:
35776 break;
35778 break;
35779 case V4SF_FTYPE_V4SF_PCV2SF:
35780 case V2DF_FTYPE_V2DF_PCDOUBLE:
35781 nargs = 2;
35782 klass = load;
35783 memory = 1;
35784 break;
35785 case V8SF_FTYPE_PCV8SF_V8SI:
35786 case V4DF_FTYPE_PCV4DF_V4DI:
35787 case V4SF_FTYPE_PCV4SF_V4SI:
35788 case V2DF_FTYPE_PCV2DF_V2DI:
35789 case V8SI_FTYPE_PCV8SI_V8SI:
35790 case V4DI_FTYPE_PCV4DI_V4DI:
35791 case V4SI_FTYPE_PCV4SI_V4SI:
35792 case V2DI_FTYPE_PCV2DI_V2DI:
35793 case VOID_FTYPE_INT_INT64:
35794 nargs = 2;
35795 klass = load;
35796 memory = 0;
35797 break;
35798 case VOID_FTYPE_PV8DF_V8DF_UQI:
35799 case VOID_FTYPE_PV4DF_V4DF_UQI:
35800 case VOID_FTYPE_PV2DF_V2DF_UQI:
35801 case VOID_FTYPE_PV16SF_V16SF_UHI:
35802 case VOID_FTYPE_PV8SF_V8SF_UQI:
35803 case VOID_FTYPE_PV4SF_V4SF_UQI:
35804 case VOID_FTYPE_PV8DI_V8DI_UQI:
35805 case VOID_FTYPE_PV4DI_V4DI_UQI:
35806 case VOID_FTYPE_PV2DI_V2DI_UQI:
35807 case VOID_FTYPE_PV16SI_V16SI_UHI:
35808 case VOID_FTYPE_PV8SI_V8SI_UQI:
35809 case VOID_FTYPE_PV4SI_V4SI_UQI:
35810 case VOID_FTYPE_PV64QI_V64QI_UDI:
35811 case VOID_FTYPE_PV32HI_V32HI_USI:
35812 case VOID_FTYPE_PV32QI_V32QI_USI:
35813 case VOID_FTYPE_PV16QI_V16QI_UHI:
35814 case VOID_FTYPE_PV16HI_V16HI_UHI:
35815 case VOID_FTYPE_PV8HI_V8HI_UQI:
35816 switch (icode)
35818 /* These builtins and instructions require the memory
35819 to be properly aligned. */
35820 case CODE_FOR_avx512f_storev16sf_mask:
35821 case CODE_FOR_avx512f_storev16si_mask:
35822 case CODE_FOR_avx512f_storev8df_mask:
35823 case CODE_FOR_avx512f_storev8di_mask:
35824 case CODE_FOR_avx512vl_storev8sf_mask:
35825 case CODE_FOR_avx512vl_storev8si_mask:
35826 case CODE_FOR_avx512vl_storev4df_mask:
35827 case CODE_FOR_avx512vl_storev4di_mask:
35828 case CODE_FOR_avx512vl_storev4sf_mask:
35829 case CODE_FOR_avx512vl_storev4si_mask:
35830 case CODE_FOR_avx512vl_storev2df_mask:
35831 case CODE_FOR_avx512vl_storev2di_mask:
35832 aligned_mem = true;
35833 break;
35834 default:
35835 break;
35837 /* FALLTHRU */
35838 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35839 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35840 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35841 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35842 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35843 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35844 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35845 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35846 case VOID_FTYPE_PV8SI_V8DI_UQI:
35847 case VOID_FTYPE_PV8HI_V8DI_UQI:
35848 case VOID_FTYPE_PV16HI_V16SI_UHI:
35849 case VOID_FTYPE_PV16QI_V8DI_UQI:
35850 case VOID_FTYPE_PV16QI_V16SI_UHI:
35851 case VOID_FTYPE_PV4SI_V4DI_UQI:
35852 case VOID_FTYPE_PV4SI_V2DI_UQI:
35853 case VOID_FTYPE_PV8HI_V4DI_UQI:
35854 case VOID_FTYPE_PV8HI_V2DI_UQI:
35855 case VOID_FTYPE_PV8HI_V8SI_UQI:
35856 case VOID_FTYPE_PV8HI_V4SI_UQI:
35857 case VOID_FTYPE_PV16QI_V4DI_UQI:
35858 case VOID_FTYPE_PV16QI_V2DI_UQI:
35859 case VOID_FTYPE_PV16QI_V8SI_UQI:
35860 case VOID_FTYPE_PV16QI_V4SI_UQI:
35861 case VOID_FTYPE_PCHAR_V64QI_UDI:
35862 case VOID_FTYPE_PCHAR_V32QI_USI:
35863 case VOID_FTYPE_PCHAR_V16QI_UHI:
35864 case VOID_FTYPE_PSHORT_V32HI_USI:
35865 case VOID_FTYPE_PSHORT_V16HI_UHI:
35866 case VOID_FTYPE_PSHORT_V8HI_UQI:
35867 case VOID_FTYPE_PINT_V16SI_UHI:
35868 case VOID_FTYPE_PINT_V8SI_UQI:
35869 case VOID_FTYPE_PINT_V4SI_UQI:
35870 case VOID_FTYPE_PINT64_V8DI_UQI:
35871 case VOID_FTYPE_PINT64_V4DI_UQI:
35872 case VOID_FTYPE_PINT64_V2DI_UQI:
35873 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35874 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35875 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35876 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35877 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35878 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35879 case VOID_FTYPE_PV32QI_V32HI_USI:
35880 case VOID_FTYPE_PV16QI_V16HI_UHI:
35881 case VOID_FTYPE_PV8QI_V8HI_UQI:
35882 nargs = 2;
35883 klass = store;
35884 /* Reserve memory operand for target. */
35885 memory = ARRAY_SIZE (args);
35886 break;
35887 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35888 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35889 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35890 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35891 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35892 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35893 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35894 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35895 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35896 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35897 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35898 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35899 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
35900 case V32HI_FTYPE_PCV32HI_V32HI_USI:
35901 case V32QI_FTYPE_PCV32QI_V32QI_USI:
35902 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
35903 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
35904 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
35905 switch (icode)
35907 /* These builtins and instructions require the memory
35908 to be properly aligned. */
35909 case CODE_FOR_avx512f_loadv16sf_mask:
35910 case CODE_FOR_avx512f_loadv16si_mask:
35911 case CODE_FOR_avx512f_loadv8df_mask:
35912 case CODE_FOR_avx512f_loadv8di_mask:
35913 case CODE_FOR_avx512vl_loadv8sf_mask:
35914 case CODE_FOR_avx512vl_loadv8si_mask:
35915 case CODE_FOR_avx512vl_loadv4df_mask:
35916 case CODE_FOR_avx512vl_loadv4di_mask:
35917 case CODE_FOR_avx512vl_loadv4sf_mask:
35918 case CODE_FOR_avx512vl_loadv4si_mask:
35919 case CODE_FOR_avx512vl_loadv2df_mask:
35920 case CODE_FOR_avx512vl_loadv2di_mask:
35921 case CODE_FOR_avx512bw_loadv64qi_mask:
35922 case CODE_FOR_avx512vl_loadv32qi_mask:
35923 case CODE_FOR_avx512vl_loadv16qi_mask:
35924 case CODE_FOR_avx512bw_loadv32hi_mask:
35925 case CODE_FOR_avx512vl_loadv16hi_mask:
35926 case CODE_FOR_avx512vl_loadv8hi_mask:
35927 aligned_mem = true;
35928 break;
35929 default:
35930 break;
35932 /* FALLTHRU */
35933 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35934 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35935 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35936 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35937 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35938 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35939 case V16SI_FTYPE_PCINT_V16SI_UHI:
35940 case V8SI_FTYPE_PCINT_V8SI_UQI:
35941 case V4SI_FTYPE_PCINT_V4SI_UQI:
35942 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35943 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35944 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35945 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35946 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35947 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35948 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35949 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35950 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35951 nargs = 3;
35952 klass = load;
35953 memory = 0;
35954 break;
35955 case VOID_FTYPE_UINT_UINT_UINT:
35956 case VOID_FTYPE_UINT64_UINT_UINT:
35957 case UCHAR_FTYPE_UINT_UINT_UINT:
35958 case UCHAR_FTYPE_UINT64_UINT_UINT:
35959 nargs = 3;
35960 klass = load;
35961 memory = ARRAY_SIZE (args);
35962 last_arg_constant = true;
35963 break;
35964 default:
35965 gcc_unreachable ();
35968 gcc_assert (nargs <= ARRAY_SIZE (args));
35970 if (klass == store)
35972 arg = CALL_EXPR_ARG (exp, 0);
35973 op = expand_normal (arg);
35974 gcc_assert (target == 0);
35975 if (memory)
35977 op = ix86_zero_extend_to_Pmode (op);
35978 target = gen_rtx_MEM (tmode, op);
35979 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35980 on it. Try to improve it using get_pointer_alignment,
35981 and if the special builtin is one that requires strict
35982 mode alignment, also from its GET_MODE_ALIGNMENT.
35983 Failure to do so could lead to ix86_legitimate_combined_insn
35984 rejecting all changes to such insns. */
35985 unsigned int align = get_pointer_alignment (arg);
35986 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35987 align = GET_MODE_ALIGNMENT (tmode);
35988 if (MEM_ALIGN (target) < align)
35989 set_mem_align (target, align);
35991 else
35992 target = force_reg (tmode, op);
35993 arg_adjust = 1;
35995 else
35997 arg_adjust = 0;
35998 if (optimize
35999 || target == 0
36000 || !register_operand (target, tmode)
36001 || GET_MODE (target) != tmode)
36002 target = gen_reg_rtx (tmode);
36005 for (i = 0; i < nargs; i++)
36007 machine_mode mode = insn_p->operand[i + 1].mode;
36008 bool match;
36010 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36011 op = expand_normal (arg);
36012 match = insn_p->operand[i + 1].predicate (op, mode);
36014 if (last_arg_constant && (i + 1) == nargs)
36016 if (!match)
36018 if (icode == CODE_FOR_lwp_lwpvalsi3
36019 || icode == CODE_FOR_lwp_lwpinssi3
36020 || icode == CODE_FOR_lwp_lwpvaldi3
36021 || icode == CODE_FOR_lwp_lwpinsdi3)
36022 error ("the last argument must be a 32-bit immediate");
36023 else
36024 error ("the last argument must be an 8-bit immediate");
36025 return const0_rtx;
36028 else
36030 if (i == memory)
36032 /* This must be the memory operand. */
36033 op = ix86_zero_extend_to_Pmode (op);
36034 op = gen_rtx_MEM (mode, op);
36035 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36036 on it. Try to improve it using get_pointer_alignment,
36037 and if the special builtin is one that requires strict
36038 mode alignment, also from its GET_MODE_ALIGNMENT.
36039 Failure to do so could lead to ix86_legitimate_combined_insn
36040 rejecting all changes to such insns. */
36041 unsigned int align = get_pointer_alignment (arg);
36042 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36043 align = GET_MODE_ALIGNMENT (mode);
36044 if (MEM_ALIGN (op) < align)
36045 set_mem_align (op, align);
36047 else
36049 /* This must be a register. */
36050 if (VECTOR_MODE_P (mode))
36051 op = safe_vector_operand (op, mode);
36053 op = fixup_modeless_constant (op, mode);
36055 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36056 op = copy_to_mode_reg (mode, op);
36057 else
36059 op = copy_to_reg (op);
36060 op = lowpart_subreg (mode, op, GET_MODE (op));
36065 args[i].op = op;
36066 args[i].mode = mode;
36069 switch (nargs)
36071 case 0:
36072 pat = GEN_FCN (icode) (target);
36073 break;
36074 case 1:
36075 pat = GEN_FCN (icode) (target, args[0].op);
36076 break;
36077 case 2:
36078 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36079 break;
36080 case 3:
36081 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36082 break;
36083 default:
36084 gcc_unreachable ();
36087 if (! pat)
36088 return 0;
36089 emit_insn (pat);
36090 return klass == store ? 0 : target;
36093 /* Return the integer constant in ARG. Constrain it to be in the range
36094 of the subparts of VEC_TYPE; issue an error if not. */
36096 static int
36097 get_element_number (tree vec_type, tree arg)
36099 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36101 if (!tree_fits_uhwi_p (arg)
36102 || (elt = tree_to_uhwi (arg), elt > max))
36104 error ("selector must be an integer constant in the range 0..%wi", max);
36105 return 0;
36108 return elt;
36111 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36112 ix86_expand_vector_init. We DO have language-level syntax for this, in
36113 the form of (type){ init-list }. Except that since we can't place emms
36114 instructions from inside the compiler, we can't allow the use of MMX
36115 registers unless the user explicitly asks for it. So we do *not* define
36116 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36117 we have builtins invoked by mmintrin.h that give us license to emit
36118 these sorts of instructions. */
36120 static rtx
36121 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36123 machine_mode tmode = TYPE_MODE (type);
36124 machine_mode inner_mode = GET_MODE_INNER (tmode);
36125 int i, n_elt = GET_MODE_NUNITS (tmode);
36126 rtvec v = rtvec_alloc (n_elt);
36128 gcc_assert (VECTOR_MODE_P (tmode));
36129 gcc_assert (call_expr_nargs (exp) == n_elt);
36131 for (i = 0; i < n_elt; ++i)
36133 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36134 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36137 if (!target || !register_operand (target, tmode))
36138 target = gen_reg_rtx (tmode);
36140 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36141 return target;
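/* Illustrative note (not from the original source): assuming the standard
   GCC intrinsic headers, an MMX wrapper such as

     __m64 v = _mm_set_pi16 (w3, w2, w1, w0);

   expands to __builtin_ia32_vec_init_v4hi and is routed through the
   expander above, which collects the lowparts of the four arguments into
   a PARALLEL and hands it to ix86_expand_vector_init.  */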
36144 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36145 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36146 had a language-level syntax for referencing vector elements. */
36148 static rtx
36149 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36151 machine_mode tmode, mode0;
36152 tree arg0, arg1;
36153 int elt;
36154 rtx op0;
36156 arg0 = CALL_EXPR_ARG (exp, 0);
36157 arg1 = CALL_EXPR_ARG (exp, 1);
36159 op0 = expand_normal (arg0);
36160 elt = get_element_number (TREE_TYPE (arg0), arg1);
36162 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36163 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36164 gcc_assert (VECTOR_MODE_P (mode0));
36166 op0 = force_reg (mode0, op0);
36168 if (optimize || !target || !register_operand (target, tmode))
36169 target = gen_reg_rtx (tmode);
36171 ix86_expand_vector_extract (true, target, op0, elt);
36173 return target;
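/* Illustrative note (not from the original source): assuming the usual
   emmintrin.h wrapper, a call such as

     int e = _mm_extract_epi16 (x, 3);

   reaches this expander via __builtin_ia32_vec_ext_v8hi; get_element_number
   checks the selector 3 against the number of vector subparts and
   ix86_expand_vector_extract emits the actual extraction.  */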
36176 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36177 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36178 a language-level syntax for referencing vector elements. */
36180 static rtx
36181 ix86_expand_vec_set_builtin (tree exp)
36183 machine_mode tmode, mode1;
36184 tree arg0, arg1, arg2;
36185 int elt;
36186 rtx op0, op1, target;
36188 arg0 = CALL_EXPR_ARG (exp, 0);
36189 arg1 = CALL_EXPR_ARG (exp, 1);
36190 arg2 = CALL_EXPR_ARG (exp, 2);
36192 tmode = TYPE_MODE (TREE_TYPE (arg0));
36193 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36194 gcc_assert (VECTOR_MODE_P (tmode));
36196 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36197 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36198 elt = get_element_number (TREE_TYPE (arg0), arg2);
36200 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36201 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36203 op0 = force_reg (tmode, op0);
36204 op1 = force_reg (mode1, op1);
36206 /* OP0 is the source of these builtin functions and shouldn't be
36207 modified. Create a copy, use it and return it as target. */
36208 target = gen_reg_rtx (tmode);
36209 emit_move_insn (target, op0);
36210 ix86_expand_vector_set (true, target, op1, elt);
36212 return target;
36215 /* Emit conditional move of SRC to DST with condition
36216 OP1 CODE OP2. */
36217 static void
36218 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36220 rtx t;
36222 if (TARGET_CMOVE)
36224 t = ix86_expand_compare (code, op1, op2);
36225 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36226 src, dst)));
36228 else
36230 rtx_code_label *nomove = gen_label_rtx ();
36231 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36232 const0_rtx, GET_MODE (op1), 1, nomove);
36233 emit_move_insn (dst, src);
36234 emit_label (nomove);
36238 /* Choose max of DST and SRC and put it to DST. */
36239 static void
36240 ix86_emit_move_max (rtx dst, rtx src)
36242 ix86_emit_cmove (dst, src, LTU, dst, src);
36245 /* Expand an expression EXP that calls a built-in function,
36246 with result going to TARGET if that's convenient
36247 (and in mode MODE if that's convenient).
36248 SUBTARGET may be used as the target for computing one of EXP's operands.
36249 IGNORE is nonzero if the value is to be ignored. */
36251 static rtx
36252 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36253 machine_mode mode, int ignore)
36255 size_t i;
36256 enum insn_code icode, icode2;
36257 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36258 tree arg0, arg1, arg2, arg3, arg4;
36259 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36260 machine_mode mode0, mode1, mode2, mode3, mode4;
36261 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36263 /* For CPU builtins that can be folded, fold first and expand the fold. */
36264 switch (fcode)
36266 case IX86_BUILTIN_CPU_INIT:
36268 /* Make it call __cpu_indicator_init in libgcc. */
36269 tree call_expr, fndecl, type;
36270 type = build_function_type_list (integer_type_node, NULL_TREE);
36271 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36272 call_expr = build_call_expr (fndecl, 0);
36273 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36275 case IX86_BUILTIN_CPU_IS:
36276 case IX86_BUILTIN_CPU_SUPPORTS:
36278 tree arg0 = CALL_EXPR_ARG (exp, 0);
36279 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36280 gcc_assert (fold_expr != NULL_TREE);
36281 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36285 HOST_WIDE_INT isa = ix86_isa_flags;
36286 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36287 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36288 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36289 /* The general case is we require all the ISAs specified in bisa{,2}
36290 to be enabled.
36291 The exceptions are:
36292 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36293 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36294 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36295 where for each such pair it is sufficient if either of the ISAs is
36296 enabled; and if the pair is ored with other options, those others must be enabled too. */
36297 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36298 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36299 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36300 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36301 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36302 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36303 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36304 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36305 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36306 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36307 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36308 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36309 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36311 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36312 (enum fpmath_unit) 0, false);
36313 if (!opts)
36314 error ("%qE needs unknown isa option", fndecl);
36315 else
36317 gcc_assert (opts != NULL);
36318 error ("%qE needs isa option %s", fndecl, opts);
36319 free (opts);
36321 return expand_call (exp, target, ignore);
36324 switch (fcode)
36326 case IX86_BUILTIN_BNDMK:
36327 if (!target
36328 || GET_MODE (target) != BNDmode
36329 || !register_operand (target, BNDmode))
36330 target = gen_reg_rtx (BNDmode);
36332 arg0 = CALL_EXPR_ARG (exp, 0);
36333 arg1 = CALL_EXPR_ARG (exp, 1);
36335 op0 = expand_normal (arg0);
36336 op1 = expand_normal (arg1);
36338 if (!register_operand (op0, Pmode))
36339 op0 = ix86_zero_extend_to_Pmode (op0);
36340 if (!register_operand (op1, Pmode))
36341 op1 = ix86_zero_extend_to_Pmode (op1);
36343 /* Builtin arg1 is the size of the block, but instruction op1 should
36344 be (size - 1). */
36345 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36346 NULL_RTX, 1, OPTAB_DIRECT);
36348 emit_insn (BNDmode == BND64mode
36349 ? gen_bnd64_mk (target, op0, op1)
36350 : gen_bnd32_mk (target, op0, op1));
36351 return target;
36353 case IX86_BUILTIN_BNDSTX:
36354 arg0 = CALL_EXPR_ARG (exp, 0);
36355 arg1 = CALL_EXPR_ARG (exp, 1);
36356 arg2 = CALL_EXPR_ARG (exp, 2);
36358 op0 = expand_normal (arg0);
36359 op1 = expand_normal (arg1);
36360 op2 = expand_normal (arg2);
36362 if (!register_operand (op0, Pmode))
36363 op0 = ix86_zero_extend_to_Pmode (op0);
36364 if (!register_operand (op1, BNDmode))
36365 op1 = copy_to_mode_reg (BNDmode, op1);
36366 if (!register_operand (op2, Pmode))
36367 op2 = ix86_zero_extend_to_Pmode (op2);
36369 emit_insn (BNDmode == BND64mode
36370 ? gen_bnd64_stx (op2, op0, op1)
36371 : gen_bnd32_stx (op2, op0, op1));
36372 return 0;
36374 case IX86_BUILTIN_BNDLDX:
36375 if (!target
36376 || GET_MODE (target) != BNDmode
36377 || !register_operand (target, BNDmode))
36378 target = gen_reg_rtx (BNDmode);
36380 arg0 = CALL_EXPR_ARG (exp, 0);
36381 arg1 = CALL_EXPR_ARG (exp, 1);
36383 op0 = expand_normal (arg0);
36384 op1 = expand_normal (arg1);
36386 if (!register_operand (op0, Pmode))
36387 op0 = ix86_zero_extend_to_Pmode (op0);
36388 if (!register_operand (op1, Pmode))
36389 op1 = ix86_zero_extend_to_Pmode (op1);
36391 emit_insn (BNDmode == BND64mode
36392 ? gen_bnd64_ldx (target, op0, op1)
36393 : gen_bnd32_ldx (target, op0, op1));
36394 return target;
36396 case IX86_BUILTIN_BNDCL:
36397 arg0 = CALL_EXPR_ARG (exp, 0);
36398 arg1 = CALL_EXPR_ARG (exp, 1);
36400 op0 = expand_normal (arg0);
36401 op1 = expand_normal (arg1);
36403 if (!register_operand (op0, Pmode))
36404 op0 = ix86_zero_extend_to_Pmode (op0);
36405 if (!register_operand (op1, BNDmode))
36406 op1 = copy_to_mode_reg (BNDmode, op1);
36408 emit_insn (BNDmode == BND64mode
36409 ? gen_bnd64_cl (op1, op0)
36410 : gen_bnd32_cl (op1, op0));
36411 return 0;
36413 case IX86_BUILTIN_BNDCU:
36414 arg0 = CALL_EXPR_ARG (exp, 0);
36415 arg1 = CALL_EXPR_ARG (exp, 1);
36417 op0 = expand_normal (arg0);
36418 op1 = expand_normal (arg1);
36420 if (!register_operand (op0, Pmode))
36421 op0 = ix86_zero_extend_to_Pmode (op0);
36422 if (!register_operand (op1, BNDmode))
36423 op1 = copy_to_mode_reg (BNDmode, op1);
36425 emit_insn (BNDmode == BND64mode
36426 ? gen_bnd64_cu (op1, op0)
36427 : gen_bnd32_cu (op1, op0));
36428 return 0;
36430 case IX86_BUILTIN_BNDRET:
36431 arg0 = CALL_EXPR_ARG (exp, 0);
36432 target = chkp_get_rtl_bounds (arg0);
36434 /* If no bounds were specified for the returned value,
36435 then use INIT bounds. This usually happens when
36436 some built-in function is expanded. */
36437 if (!target)
36439 rtx t1 = gen_reg_rtx (Pmode);
36440 rtx t2 = gen_reg_rtx (Pmode);
36441 target = gen_reg_rtx (BNDmode);
36442 emit_move_insn (t1, const0_rtx);
36443 emit_move_insn (t2, constm1_rtx);
36444 emit_insn (BNDmode == BND64mode
36445 ? gen_bnd64_mk (target, t1, t2)
36446 : gen_bnd32_mk (target, t1, t2));
36449 gcc_assert (target && REG_P (target));
36450 return target;
36452 case IX86_BUILTIN_BNDNARROW:
36454 rtx m1, m1h1, m1h2, lb, ub, t1;
36456 /* Return value and lb. */
36457 arg0 = CALL_EXPR_ARG (exp, 0);
36458 /* Bounds. */
36459 arg1 = CALL_EXPR_ARG (exp, 1);
36460 /* Size. */
36461 arg2 = CALL_EXPR_ARG (exp, 2);
36463 lb = expand_normal (arg0);
36464 op1 = expand_normal (arg1);
36465 op2 = expand_normal (arg2);
36467 /* Size was passed but we need to use (size - 1) as for bndmk. */
36468 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36469 NULL_RTX, 1, OPTAB_DIRECT);
36471 /* Add LB to size and invert to get UB. */
36472 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36473 op2, 1, OPTAB_DIRECT);
36474 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36476 if (!register_operand (lb, Pmode))
36477 lb = ix86_zero_extend_to_Pmode (lb);
36478 if (!register_operand (ub, Pmode))
36479 ub = ix86_zero_extend_to_Pmode (ub);
36481 /* We need to move bounds to memory before any computations. */
36482 if (MEM_P (op1))
36483 m1 = op1;
36484 else
36486 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36487 emit_move_insn (m1, op1);
36490 /* Generate mem expression to be used for access to LB and UB. */
36491 m1h1 = adjust_address (m1, Pmode, 0);
36492 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36494 t1 = gen_reg_rtx (Pmode);
36496 /* Compute LB. */
36497 emit_move_insn (t1, m1h1);
36498 ix86_emit_move_max (t1, lb);
36499 emit_move_insn (m1h1, t1);
36501 /* Compute UB. UB is stored in 1's complement form. Therefore
36502 we also use max here. */
36503 emit_move_insn (t1, m1h2);
36504 ix86_emit_move_max (t1, ub);
36505 emit_move_insn (m1h2, t1);
36507 op2 = gen_reg_rtx (BNDmode);
36508 emit_move_insn (op2, m1);
36510 return chkp_join_splitted_slot (lb, op2);
36513 case IX86_BUILTIN_BNDINT:
36515 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36517 if (!target
36518 || GET_MODE (target) != BNDmode
36519 || !register_operand (target, BNDmode))
36520 target = gen_reg_rtx (BNDmode);
36522 arg0 = CALL_EXPR_ARG (exp, 0);
36523 arg1 = CALL_EXPR_ARG (exp, 1);
36525 op0 = expand_normal (arg0);
36526 op1 = expand_normal (arg1);
36528 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36529 rh1 = adjust_address (res, Pmode, 0);
36530 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36532 /* Put first bounds to temporaries. */
36533 lb1 = gen_reg_rtx (Pmode);
36534 ub1 = gen_reg_rtx (Pmode);
36535 if (MEM_P (op0))
36537 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36538 emit_move_insn (ub1, adjust_address (op0, Pmode,
36539 GET_MODE_SIZE (Pmode)));
36541 else
36543 emit_move_insn (res, op0);
36544 emit_move_insn (lb1, rh1);
36545 emit_move_insn (ub1, rh2);
36548 /* Put second bounds to temporaries. */
36549 lb2 = gen_reg_rtx (Pmode);
36550 ub2 = gen_reg_rtx (Pmode);
36551 if (MEM_P (op1))
36553 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36554 emit_move_insn (ub2, adjust_address (op1, Pmode,
36555 GET_MODE_SIZE (Pmode)));
36557 else
36559 emit_move_insn (res, op1);
36560 emit_move_insn (lb2, rh1);
36561 emit_move_insn (ub2, rh2);
36564 /* Compute LB. */
36565 ix86_emit_move_max (lb1, lb2);
36566 emit_move_insn (rh1, lb1);
36568 /* Compute UB. UB is stored in 1's complement form. Therefore
36569 we also use max here. */
36570 ix86_emit_move_max (ub1, ub2);
36571 emit_move_insn (rh2, ub1);
36573 emit_move_insn (target, res);
36575 return target;
36578 case IX86_BUILTIN_SIZEOF:
36580 tree name;
36581 rtx symbol;
36583 if (!target
36584 || GET_MODE (target) != Pmode
36585 || !register_operand (target, Pmode))
36586 target = gen_reg_rtx (Pmode);
36588 arg0 = CALL_EXPR_ARG (exp, 0);
36589 gcc_assert (VAR_P (arg0));
36591 name = DECL_ASSEMBLER_NAME (arg0);
36592 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36594 emit_insn (Pmode == SImode
36595 ? gen_move_size_reloc_si (target, symbol)
36596 : gen_move_size_reloc_di (target, symbol));
36598 return target;
36601 case IX86_BUILTIN_BNDLOWER:
36603 rtx mem, hmem;
36605 if (!target
36606 || GET_MODE (target) != Pmode
36607 || !register_operand (target, Pmode))
36608 target = gen_reg_rtx (Pmode);
36610 arg0 = CALL_EXPR_ARG (exp, 0);
36611 op0 = expand_normal (arg0);
36613 /* We need to move bounds to memory first. */
36614 if (MEM_P (op0))
36615 mem = op0;
36616 else
36618 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36619 emit_move_insn (mem, op0);
36622 /* Generate mem expression to access LB and load it. */
36623 hmem = adjust_address (mem, Pmode, 0);
36624 emit_move_insn (target, hmem);
36626 return target;
36629 case IX86_BUILTIN_BNDUPPER:
36631 rtx mem, hmem, res;
36633 if (!target
36634 || GET_MODE (target) != Pmode
36635 || !register_operand (target, Pmode))
36636 target = gen_reg_rtx (Pmode);
36638 arg0 = CALL_EXPR_ARG (exp, 0);
36639 op0 = expand_normal (arg0);
36641 /* We need to move bounds to memory first. */
36642 if (MEM_P (op0))
36643 mem = op0;
36644 else
36646 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36647 emit_move_insn (mem, op0);
36650 /* Generate mem expression to access UB. */
36651 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36653 /* We need to invert all bits of UB. */
36654 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36656 if (res != target)
36657 emit_move_insn (target, res);
36659 return target;
36662 case IX86_BUILTIN_MASKMOVQ:
36663 case IX86_BUILTIN_MASKMOVDQU:
36664 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36665 ? CODE_FOR_mmx_maskmovq
36666 : CODE_FOR_sse2_maskmovdqu);
36667 /* Note the arg order is different from the operand order. */
36668 arg1 = CALL_EXPR_ARG (exp, 0);
36669 arg2 = CALL_EXPR_ARG (exp, 1);
36670 arg0 = CALL_EXPR_ARG (exp, 2);
36671 op0 = expand_normal (arg0);
36672 op1 = expand_normal (arg1);
36673 op2 = expand_normal (arg2);
36674 mode0 = insn_data[icode].operand[0].mode;
36675 mode1 = insn_data[icode].operand[1].mode;
36676 mode2 = insn_data[icode].operand[2].mode;
36678 op0 = ix86_zero_extend_to_Pmode (op0);
36679 op0 = gen_rtx_MEM (mode1, op0);
36681 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36682 op0 = copy_to_mode_reg (mode0, op0);
36683 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36684 op1 = copy_to_mode_reg (mode1, op1);
36685 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36686 op2 = copy_to_mode_reg (mode2, op2);
36687 pat = GEN_FCN (icode) (op0, op1, op2);
36688 if (! pat)
36689 return 0;
36690 emit_insn (pat);
36691 return 0;
36693 case IX86_BUILTIN_LDMXCSR:
36694 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36695 target = assign_386_stack_local (SImode, SLOT_TEMP);
36696 emit_move_insn (target, op0);
36697 emit_insn (gen_sse_ldmxcsr (target));
36698 return 0;
36700 case IX86_BUILTIN_STMXCSR:
36701 target = assign_386_stack_local (SImode, SLOT_TEMP);
36702 emit_insn (gen_sse_stmxcsr (target));
36703 return copy_to_mode_reg (SImode, target);
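/* Illustrative note (not from the original source): assuming the xmmintrin.h
   wrappers, the two cases above back the MXCSR accessors:

     unsigned int csr = _mm_getcsr ();            (__builtin_ia32_stmxcsr)
     _mm_setcsr (csr | 0x8040);                   (__builtin_ia32_ldmxcsr)

   Both go through a SImode stack slot because ldmxcsr/stmxcsr take only
   memory operands; 0x8040 would set the FTZ and DAZ bits, for example.  */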
36705 case IX86_BUILTIN_CLFLUSH:
36706 arg0 = CALL_EXPR_ARG (exp, 0);
36707 op0 = expand_normal (arg0);
36708 icode = CODE_FOR_sse2_clflush;
36709 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36710 op0 = ix86_zero_extend_to_Pmode (op0);
36712 emit_insn (gen_sse2_clflush (op0));
36713 return 0;
36715 case IX86_BUILTIN_CLWB:
36716 arg0 = CALL_EXPR_ARG (exp, 0);
36717 op0 = expand_normal (arg0);
36718 icode = CODE_FOR_clwb;
36719 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36720 op0 = ix86_zero_extend_to_Pmode (op0);
36722 emit_insn (gen_clwb (op0));
36723 return 0;
36725 case IX86_BUILTIN_CLFLUSHOPT:
36726 arg0 = CALL_EXPR_ARG (exp, 0);
36727 op0 = expand_normal (arg0);
36728 icode = CODE_FOR_clflushopt;
36729 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36730 op0 = ix86_zero_extend_to_Pmode (op0);
36732 emit_insn (gen_clflushopt (op0));
36733 return 0;
36735 case IX86_BUILTIN_MONITOR:
36736 case IX86_BUILTIN_MONITORX:
36737 arg0 = CALL_EXPR_ARG (exp, 0);
36738 arg1 = CALL_EXPR_ARG (exp, 1);
36739 arg2 = CALL_EXPR_ARG (exp, 2);
36740 op0 = expand_normal (arg0);
36741 op1 = expand_normal (arg1);
36742 op2 = expand_normal (arg2);
36743 if (!REG_P (op0))
36744 op0 = ix86_zero_extend_to_Pmode (op0);
36745 if (!REG_P (op1))
36746 op1 = copy_to_mode_reg (SImode, op1);
36747 if (!REG_P (op2))
36748 op2 = copy_to_mode_reg (SImode, op2);
36750 emit_insn (fcode == IX86_BUILTIN_MONITOR
36751 ? ix86_gen_monitor (op0, op1, op2)
36752 : ix86_gen_monitorx (op0, op1, op2));
36753 return 0;
36755 case IX86_BUILTIN_MWAIT:
36756 arg0 = CALL_EXPR_ARG (exp, 0);
36757 arg1 = CALL_EXPR_ARG (exp, 1);
36758 op0 = expand_normal (arg0);
36759 op1 = expand_normal (arg1);
36760 if (!REG_P (op0))
36761 op0 = copy_to_mode_reg (SImode, op0);
36762 if (!REG_P (op1))
36763 op1 = copy_to_mode_reg (SImode, op1);
36764 emit_insn (gen_sse3_mwait (op0, op1));
36765 return 0;
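/* Illustrative note (not from the original source): assuming the pmmintrin.h
   wrappers, the MONITOR/MWAIT cases above correspond to

     _mm_monitor (addr, ext, hints);              (__builtin_ia32_monitor)
     _mm_mwait (ext, hints);                      (__builtin_ia32_mwait)

   The address operand is zero-extended to Pmode while the extension and
   hint arguments are forced into SImode registers, matching the operands
   of the insn patterns used above.  */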
36767 case IX86_BUILTIN_MWAITX:
36768 arg0 = CALL_EXPR_ARG (exp, 0);
36769 arg1 = CALL_EXPR_ARG (exp, 1);
36770 arg2 = CALL_EXPR_ARG (exp, 2);
36771 op0 = expand_normal (arg0);
36772 op1 = expand_normal (arg1);
36773 op2 = expand_normal (arg2);
36774 if (!REG_P (op0))
36775 op0 = copy_to_mode_reg (SImode, op0);
36776 if (!REG_P (op1))
36777 op1 = copy_to_mode_reg (SImode, op1);
36778 if (!REG_P (op2))
36779 op2 = copy_to_mode_reg (SImode, op2);
36780 emit_insn (gen_mwaitx (op0, op1, op2));
36781 return 0;
36783 case IX86_BUILTIN_CLZERO:
36784 arg0 = CALL_EXPR_ARG (exp, 0);
36785 op0 = expand_normal (arg0);
36786 if (!REG_P (op0))
36787 op0 = ix86_zero_extend_to_Pmode (op0);
36788 emit_insn (ix86_gen_clzero (op0));
36789 return 0;
36791 case IX86_BUILTIN_VEC_INIT_V2SI:
36792 case IX86_BUILTIN_VEC_INIT_V4HI:
36793 case IX86_BUILTIN_VEC_INIT_V8QI:
36794 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36796 case IX86_BUILTIN_VEC_EXT_V2DF:
36797 case IX86_BUILTIN_VEC_EXT_V2DI:
36798 case IX86_BUILTIN_VEC_EXT_V4SF:
36799 case IX86_BUILTIN_VEC_EXT_V4SI:
36800 case IX86_BUILTIN_VEC_EXT_V8HI:
36801 case IX86_BUILTIN_VEC_EXT_V2SI:
36802 case IX86_BUILTIN_VEC_EXT_V4HI:
36803 case IX86_BUILTIN_VEC_EXT_V16QI:
36804 return ix86_expand_vec_ext_builtin (exp, target);
36806 case IX86_BUILTIN_VEC_SET_V2DI:
36807 case IX86_BUILTIN_VEC_SET_V4SF:
36808 case IX86_BUILTIN_VEC_SET_V4SI:
36809 case IX86_BUILTIN_VEC_SET_V8HI:
36810 case IX86_BUILTIN_VEC_SET_V4HI:
36811 case IX86_BUILTIN_VEC_SET_V16QI:
36812 return ix86_expand_vec_set_builtin (exp);
36814 case IX86_BUILTIN_NANQ:
36815 case IX86_BUILTIN_NANSQ:
36816 return expand_call (exp, target, ignore);
36818 case IX86_BUILTIN_RDPID:
36820 op0 = gen_reg_rtx (TARGET_64BIT ? DImode : SImode);
36822 if (TARGET_64BIT)
36824 insn = gen_rdpid_rex64 (op0);
36825 op0 = convert_to_mode (SImode, op0, 1);
36827 else
36828 insn = gen_rdpid (op0);
36829 emit_insn (insn);
36831 if (target == 0)
36833 /* mode is VOIDmode if __builtin_rdpid has been called
36834 without lhs. */
36835 if (mode == VOIDmode)
36836 return target;
36837 target = gen_reg_rtx (mode);
36839 emit_move_insn (target, op0);
36840 return target;
36841 case IX86_BUILTIN_RDPMC:
36842 case IX86_BUILTIN_RDTSC:
36843 case IX86_BUILTIN_RDTSCP:
36844 case IX86_BUILTIN_XGETBV:
36846 op0 = gen_reg_rtx (DImode);
36847 op1 = gen_reg_rtx (DImode);
36849 if (fcode == IX86_BUILTIN_RDPMC)
36851 arg0 = CALL_EXPR_ARG (exp, 0);
36852 op2 = expand_normal (arg0);
36853 if (!register_operand (op2, SImode))
36854 op2 = copy_to_mode_reg (SImode, op2);
36856 insn = (TARGET_64BIT
36857 ? gen_rdpmc_rex64 (op0, op1, op2)
36858 : gen_rdpmc (op0, op2));
36859 emit_insn (insn);
36861 else if (fcode == IX86_BUILTIN_XGETBV)
36863 arg0 = CALL_EXPR_ARG (exp, 0);
36864 op2 = expand_normal (arg0);
36865 if (!register_operand (op2, SImode))
36866 op2 = copy_to_mode_reg (SImode, op2);
36868 insn = (TARGET_64BIT
36869 ? gen_xgetbv_rex64 (op0, op1, op2)
36870 : gen_xgetbv (op0, op2));
36871 emit_insn (insn);
36873 else if (fcode == IX86_BUILTIN_RDTSC)
36875 insn = (TARGET_64BIT
36876 ? gen_rdtsc_rex64 (op0, op1)
36877 : gen_rdtsc (op0));
36878 emit_insn (insn);
36880 else
36882 op2 = gen_reg_rtx (SImode);
36884 insn = (TARGET_64BIT
36885 ? gen_rdtscp_rex64 (op0, op1, op2)
36886 : gen_rdtscp (op0, op2));
36887 emit_insn (insn);
36889 arg0 = CALL_EXPR_ARG (exp, 0);
36890 op4 = expand_normal (arg0);
36891 if (!address_operand (op4, VOIDmode))
36893 op4 = convert_memory_address (Pmode, op4);
36894 op4 = copy_addr_to_reg (op4);
36896 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36899 if (target == 0)
36901 /* mode is VOIDmode if __builtin_rd* has been called
36902 without lhs. */
36903 if (mode == VOIDmode)
36904 return target;
36905 target = gen_reg_rtx (mode);
36908 if (TARGET_64BIT)
36910 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36911 op1, 1, OPTAB_DIRECT);
36912 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36913 op0, 1, OPTAB_DIRECT);
36916 emit_move_insn (target, op0);
36917 return target;
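/* Illustrative note (not from the original source): assuming the ia32intrin.h
   wrappers, typical users of the cases above are

     unsigned long long t0 = __rdtsc ();          (__builtin_ia32_rdtsc)
     unsigned int aux;
     unsigned long long t1 = __rdtscp (&aux);     (__builtin_ia32_rdtscp)

   On 64-bit targets the EDX:EAX halves come back in two DImode registers,
   which the code above recombines with a shift by 32 and an IOR; RDTSCP
   additionally stores the IA32_TSC_AUX value through the pointer.  */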
36919 case IX86_BUILTIN_FXSAVE:
36920 case IX86_BUILTIN_FXRSTOR:
36921 case IX86_BUILTIN_FXSAVE64:
36922 case IX86_BUILTIN_FXRSTOR64:
36923 case IX86_BUILTIN_FNSTENV:
36924 case IX86_BUILTIN_FLDENV:
36925 mode0 = BLKmode;
36926 switch (fcode)
36928 case IX86_BUILTIN_FXSAVE:
36929 icode = CODE_FOR_fxsave;
36930 break;
36931 case IX86_BUILTIN_FXRSTOR:
36932 icode = CODE_FOR_fxrstor;
36933 break;
36934 case IX86_BUILTIN_FXSAVE64:
36935 icode = CODE_FOR_fxsave64;
36936 break;
36937 case IX86_BUILTIN_FXRSTOR64:
36938 icode = CODE_FOR_fxrstor64;
36939 break;
36940 case IX86_BUILTIN_FNSTENV:
36941 icode = CODE_FOR_fnstenv;
36942 break;
36943 case IX86_BUILTIN_FLDENV:
36944 icode = CODE_FOR_fldenv;
36945 break;
36946 default:
36947 gcc_unreachable ();
36950 arg0 = CALL_EXPR_ARG (exp, 0);
36951 op0 = expand_normal (arg0);
36953 if (!address_operand (op0, VOIDmode))
36955 op0 = convert_memory_address (Pmode, op0);
36956 op0 = copy_addr_to_reg (op0);
36958 op0 = gen_rtx_MEM (mode0, op0);
36960 pat = GEN_FCN (icode) (op0);
36961 if (pat)
36962 emit_insn (pat);
36963 return 0;
36965 case IX86_BUILTIN_XSETBV:
36966 arg0 = CALL_EXPR_ARG (exp, 0);
36967 arg1 = CALL_EXPR_ARG (exp, 1);
36968 op0 = expand_normal (arg0);
36969 op1 = expand_normal (arg1);
36971 if (!REG_P (op0))
36972 op0 = copy_to_mode_reg (SImode, op0);
36974 if (TARGET_64BIT)
36976 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36977 NULL, 1, OPTAB_DIRECT);
36979 op2 = gen_lowpart (SImode, op2);
36980 op1 = gen_lowpart (SImode, op1);
36981 if (!REG_P (op1))
36982 op1 = copy_to_mode_reg (SImode, op1);
36983 if (!REG_P (op2))
36984 op2 = copy_to_mode_reg (SImode, op2);
36985 icode = CODE_FOR_xsetbv_rex64;
36986 pat = GEN_FCN (icode) (op0, op1, op2);
36988 else
36990 if (!REG_P (op1))
36991 op1 = copy_to_mode_reg (DImode, op1);
36992 icode = CODE_FOR_xsetbv;
36993 pat = GEN_FCN (icode) (op0, op1);
36995 if (pat)
36996 emit_insn (pat);
36997 return 0;
36999 case IX86_BUILTIN_XSAVE:
37000 case IX86_BUILTIN_XRSTOR:
37001 case IX86_BUILTIN_XSAVE64:
37002 case IX86_BUILTIN_XRSTOR64:
37003 case IX86_BUILTIN_XSAVEOPT:
37004 case IX86_BUILTIN_XSAVEOPT64:
37005 case IX86_BUILTIN_XSAVES:
37006 case IX86_BUILTIN_XRSTORS:
37007 case IX86_BUILTIN_XSAVES64:
37008 case IX86_BUILTIN_XRSTORS64:
37009 case IX86_BUILTIN_XSAVEC:
37010 case IX86_BUILTIN_XSAVEC64:
37011 arg0 = CALL_EXPR_ARG (exp, 0);
37012 arg1 = CALL_EXPR_ARG (exp, 1);
37013 op0 = expand_normal (arg0);
37014 op1 = expand_normal (arg1);
37016 if (!address_operand (op0, VOIDmode))
37018 op0 = convert_memory_address (Pmode, op0);
37019 op0 = copy_addr_to_reg (op0);
37021 op0 = gen_rtx_MEM (BLKmode, op0);
37023 op1 = force_reg (DImode, op1);
37025 if (TARGET_64BIT)
37027 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37028 NULL, 1, OPTAB_DIRECT);
37029 switch (fcode)
37031 case IX86_BUILTIN_XSAVE:
37032 icode = CODE_FOR_xsave_rex64;
37033 break;
37034 case IX86_BUILTIN_XRSTOR:
37035 icode = CODE_FOR_xrstor_rex64;
37036 break;
37037 case IX86_BUILTIN_XSAVE64:
37038 icode = CODE_FOR_xsave64;
37039 break;
37040 case IX86_BUILTIN_XRSTOR64:
37041 icode = CODE_FOR_xrstor64;
37042 break;
37043 case IX86_BUILTIN_XSAVEOPT:
37044 icode = CODE_FOR_xsaveopt_rex64;
37045 break;
37046 case IX86_BUILTIN_XSAVEOPT64:
37047 icode = CODE_FOR_xsaveopt64;
37048 break;
37049 case IX86_BUILTIN_XSAVES:
37050 icode = CODE_FOR_xsaves_rex64;
37051 break;
37052 case IX86_BUILTIN_XRSTORS:
37053 icode = CODE_FOR_xrstors_rex64;
37054 break;
37055 case IX86_BUILTIN_XSAVES64:
37056 icode = CODE_FOR_xsaves64;
37057 break;
37058 case IX86_BUILTIN_XRSTORS64:
37059 icode = CODE_FOR_xrstors64;
37060 break;
37061 case IX86_BUILTIN_XSAVEC:
37062 icode = CODE_FOR_xsavec_rex64;
37063 break;
37064 case IX86_BUILTIN_XSAVEC64:
37065 icode = CODE_FOR_xsavec64;
37066 break;
37067 default:
37068 gcc_unreachable ();
37071 op2 = gen_lowpart (SImode, op2);
37072 op1 = gen_lowpart (SImode, op1);
37073 pat = GEN_FCN (icode) (op0, op1, op2);
37075 else
37077 switch (fcode)
37079 case IX86_BUILTIN_XSAVE:
37080 icode = CODE_FOR_xsave;
37081 break;
37082 case IX86_BUILTIN_XRSTOR:
37083 icode = CODE_FOR_xrstor;
37084 break;
37085 case IX86_BUILTIN_XSAVEOPT:
37086 icode = CODE_FOR_xsaveopt;
37087 break;
37088 case IX86_BUILTIN_XSAVES:
37089 icode = CODE_FOR_xsaves;
37090 break;
37091 case IX86_BUILTIN_XRSTORS:
37092 icode = CODE_FOR_xrstors;
37093 break;
37094 case IX86_BUILTIN_XSAVEC:
37095 icode = CODE_FOR_xsavec;
37096 break;
37097 default:
37098 gcc_unreachable ();
37100 pat = GEN_FCN (icode) (op0, op1);
37103 if (pat)
37104 emit_insn (pat);
37105 return 0;
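/* Illustrative note (not from the original source): assuming the
   xsaveintrin.h wrapper, a call such as

     _xsave (buf, mask);                          (__builtin_ia32_xsave)

   with a 64-byte-aligned save area and a 64-bit feature mask reaches the
   code above; on 64-bit targets the DImode mask is split into its low and
   high SImode halves before the xsave/xrstor pattern is emitted, mirroring
   the EDX:EAX encoding used by the instruction.  */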
37107 case IX86_BUILTIN_LLWPCB:
37108 arg0 = CALL_EXPR_ARG (exp, 0);
37109 op0 = expand_normal (arg0);
37110 icode = CODE_FOR_lwp_llwpcb;
37111 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37112 op0 = ix86_zero_extend_to_Pmode (op0);
37113 emit_insn (gen_lwp_llwpcb (op0));
37114 return 0;
37116 case IX86_BUILTIN_SLWPCB:
37117 icode = CODE_FOR_lwp_slwpcb;
37118 if (!target
37119 || !insn_data[icode].operand[0].predicate (target, Pmode))
37120 target = gen_reg_rtx (Pmode);
37121 emit_insn (gen_lwp_slwpcb (target));
37122 return target;
37124 case IX86_BUILTIN_BEXTRI32:
37125 case IX86_BUILTIN_BEXTRI64:
37126 arg0 = CALL_EXPR_ARG (exp, 0);
37127 arg1 = CALL_EXPR_ARG (exp, 1);
37128 op0 = expand_normal (arg0);
37129 op1 = expand_normal (arg1);
37130 icode = (fcode == IX86_BUILTIN_BEXTRI32
37131 ? CODE_FOR_tbm_bextri_si
37132 : CODE_FOR_tbm_bextri_di);
37133 if (!CONST_INT_P (op1))
37135 error ("last argument must be an immediate");
37136 return const0_rtx;
37138 else
37140 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37141 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37142 op1 = GEN_INT (length);
37143 op2 = GEN_INT (lsb_index);
37144 pat = GEN_FCN (icode) (target, op0, op1, op2);
37145 if (pat)
37146 emit_insn (pat);
37147 return target;
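/* Illustrative note (not from the original source): as the decoding above
   shows, the BEXTRI control immediate packs the field length in bits 8-15
   and the starting (lsb) position in bits 0-7, so, assuming the tbmintrin.h
   wrapper,

     unsigned int f = __bextri_u32 (x, (8 << 8) | 4);

   would extract an 8-bit field of x starting at bit 4.  */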
37150 case IX86_BUILTIN_RDRAND16_STEP:
37151 icode = CODE_FOR_rdrandhi_1;
37152 mode0 = HImode;
37153 goto rdrand_step;
37155 case IX86_BUILTIN_RDRAND32_STEP:
37156 icode = CODE_FOR_rdrandsi_1;
37157 mode0 = SImode;
37158 goto rdrand_step;
37160 case IX86_BUILTIN_RDRAND64_STEP:
37161 icode = CODE_FOR_rdranddi_1;
37162 mode0 = DImode;
37164 rdrand_step:
37165 arg0 = CALL_EXPR_ARG (exp, 0);
37166 op1 = expand_normal (arg0);
37167 if (!address_operand (op1, VOIDmode))
37169 op1 = convert_memory_address (Pmode, op1);
37170 op1 = copy_addr_to_reg (op1);
37173 op0 = gen_reg_rtx (mode0);
37174 emit_insn (GEN_FCN (icode) (op0));
37176 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37178 op1 = gen_reg_rtx (SImode);
37179 emit_move_insn (op1, CONST1_RTX (SImode));
37181 /* Emit SImode conditional move. */
37182 if (mode0 == HImode)
37184 if (TARGET_ZERO_EXTEND_WITH_AND
37185 && optimize_function_for_speed_p (cfun))
37187 op2 = force_reg (SImode, const0_rtx);
37189 emit_insn (gen_movstricthi
37190 (gen_lowpart (HImode, op2), op0));
37192 else
37194 op2 = gen_reg_rtx (SImode);
37196 emit_insn (gen_zero_extendhisi2 (op2, op0));
37199 else if (mode0 == SImode)
37200 op2 = op0;
37201 else
37202 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37204 if (target == 0
37205 || !register_operand (target, SImode))
37206 target = gen_reg_rtx (SImode);
37208 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37209 const0_rtx);
37210 emit_insn (gen_rtx_SET (target,
37211 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37212 return target;
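/* Illustrative note (not from the original source): assuming the immintrin.h
   wrapper, a typical user of the rdrand_step cases above is

     unsigned int r;
     if (_rdrand32_step (&r))                     (__builtin_ia32_rdrand32_step)
       consume (r);

   The hardware value is stored through the pointer, and the conditional
   move above yields 1 when RDRAND succeeded (CF set) and 0 otherwise,
   since a failing RDRAND leaves 0 in its destination.  */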
37214 case IX86_BUILTIN_RDSEED16_STEP:
37215 icode = CODE_FOR_rdseedhi_1;
37216 mode0 = HImode;
37217 goto rdseed_step;
37219 case IX86_BUILTIN_RDSEED32_STEP:
37220 icode = CODE_FOR_rdseedsi_1;
37221 mode0 = SImode;
37222 goto rdseed_step;
37224 case IX86_BUILTIN_RDSEED64_STEP:
37225 icode = CODE_FOR_rdseeddi_1;
37226 mode0 = DImode;
37228 rdseed_step:
37229 arg0 = CALL_EXPR_ARG (exp, 0);
37230 op1 = expand_normal (arg0);
37231 if (!address_operand (op1, VOIDmode))
37233 op1 = convert_memory_address (Pmode, op1);
37234 op1 = copy_addr_to_reg (op1);
37237 op0 = gen_reg_rtx (mode0);
37238 emit_insn (GEN_FCN (icode) (op0));
37240 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37242 op2 = gen_reg_rtx (QImode);
37244 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37245 const0_rtx);
37246 emit_insn (gen_rtx_SET (op2, pat));
37248 if (target == 0
37249 || !register_operand (target, SImode))
37250 target = gen_reg_rtx (SImode);
37252 emit_insn (gen_zero_extendqisi2 (target, op2));
37253 return target;
37255 case IX86_BUILTIN_SBB32:
37256 icode = CODE_FOR_subborrowsi;
37257 icode2 = CODE_FOR_subborrowsi_0;
37258 mode0 = SImode;
37259 mode1 = DImode;
37260 mode2 = CCmode;
37261 goto handlecarry;
37263 case IX86_BUILTIN_SBB64:
37264 icode = CODE_FOR_subborrowdi;
37265 icode2 = CODE_FOR_subborrowdi_0;
37266 mode0 = DImode;
37267 mode1 = TImode;
37268 mode2 = CCmode;
37269 goto handlecarry;
37271 case IX86_BUILTIN_ADDCARRYX32:
37272 icode = CODE_FOR_addcarrysi;
37273 icode2 = CODE_FOR_addcarrysi_0;
37274 mode0 = SImode;
37275 mode1 = DImode;
37276 mode2 = CCCmode;
37277 goto handlecarry;
37279 case IX86_BUILTIN_ADDCARRYX64:
37280 icode = CODE_FOR_addcarrydi;
37281 icode2 = CODE_FOR_addcarrydi_0;
37282 mode0 = DImode;
37283 mode1 = TImode;
37284 mode2 = CCCmode;
37286 handlecarry:
37287 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37288 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37289 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37290 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37292 op1 = expand_normal (arg0);
37293 if (!integer_zerop (arg0))
37294 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37296 op2 = expand_normal (arg1);
37297 if (!register_operand (op2, mode0))
37298 op2 = copy_to_mode_reg (mode0, op2);
37300 op3 = expand_normal (arg2);
37301 if (!register_operand (op3, mode0))
37302 op3 = copy_to_mode_reg (mode0, op3);
37304 op4 = expand_normal (arg3);
37305 if (!address_operand (op4, VOIDmode))
37307 op4 = convert_memory_address (Pmode, op4);
37308 op4 = copy_addr_to_reg (op4);
37311 op0 = gen_reg_rtx (mode0);
37312 if (integer_zerop (arg0))
37314 /* If arg0 is 0, optimize right away into an add or sub
37315 instruction that sets CCCmode flags. */
37316 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37317 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37319 else
37321 /* Generate CF from input operand. */
37322 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37324 /* Generate instruction that consumes CF. */
37325 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37326 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37327 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37328 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37331 /* Return current CF value. */
37332 if (target == 0)
37333 target = gen_reg_rtx (QImode);
37335 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37336 emit_insn (gen_rtx_SET (target, pat));
37338 /* Store the result. */
37339 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37341 return target;
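/* Illustrative note (not from the original source): assuming the adxintrin.h
   wrappers, the handlecarry cases above implement carry-chained arithmetic
   such as a 64-bit addition built from two 32-bit limbs:

     unsigned int lo, hi;
     unsigned char c = _addcarry_u32 (0, a0, b0, &lo);
     _addcarry_u32 (c, a1, b1, &hi);

   When the incoming carry is the literal 0, the code above uses the plain
   add/sub pattern that merely sets the flags (icode2); otherwise it first
   regenerates CF from the carry byte and then emits the flag-consuming
   adc/sbb pattern (icode).  */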
37343 case IX86_BUILTIN_READ_FLAGS:
37344 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37346 if (optimize
37347 || target == NULL_RTX
37348 || !nonimmediate_operand (target, word_mode)
37349 || GET_MODE (target) != word_mode)
37350 target = gen_reg_rtx (word_mode);
37352 emit_insn (gen_pop (target));
37353 return target;
37355 case IX86_BUILTIN_WRITE_FLAGS:
37357 arg0 = CALL_EXPR_ARG (exp, 0);
37358 op0 = expand_normal (arg0);
37359 if (!general_no_elim_operand (op0, word_mode))
37360 op0 = copy_to_mode_reg (word_mode, op0);
37362 emit_insn (gen_push (op0));
37363 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37364 return 0;
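/* Illustrative note (not from the original source): assuming the ia32intrin.h
   wrappers, the two EFLAGS cases above back

     unsigned long long f = __readeflags ();      (pushf; pop into TARGET)
     __writeeflags (f | 0x400);                   (push OP0; popf)

   i.e. the flags word is transferred through the stack in word_mode; 0x400
   would set the DF bit, for example.  */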
37366 case IX86_BUILTIN_KTESTC8:
37367 icode = CODE_FOR_ktestqi;
37368 mode3 = CCCmode;
37369 goto kortest;
37371 case IX86_BUILTIN_KTESTZ8:
37372 icode = CODE_FOR_ktestqi;
37373 mode3 = CCZmode;
37374 goto kortest;
37376 case IX86_BUILTIN_KTESTC16:
37377 icode = CODE_FOR_ktesthi;
37378 mode3 = CCCmode;
37379 goto kortest;
37381 case IX86_BUILTIN_KTESTZ16:
37382 icode = CODE_FOR_ktesthi;
37383 mode3 = CCZmode;
37384 goto kortest;
37386 case IX86_BUILTIN_KTESTC32:
37387 icode = CODE_FOR_ktestsi;
37388 mode3 = CCCmode;
37389 goto kortest;
37391 case IX86_BUILTIN_KTESTZ32:
37392 icode = CODE_FOR_ktestsi;
37393 mode3 = CCZmode;
37394 goto kortest;
37396 case IX86_BUILTIN_KTESTC64:
37397 icode = CODE_FOR_ktestdi;
37398 mode3 = CCCmode;
37399 goto kortest;
37401 case IX86_BUILTIN_KTESTZ64:
37402 icode = CODE_FOR_ktestdi;
37403 mode3 = CCZmode;
37404 goto kortest;
37406 case IX86_BUILTIN_KORTESTC8:
37407 icode = CODE_FOR_kortestqi;
37408 mode3 = CCCmode;
37409 goto kortest;
37411 case IX86_BUILTIN_KORTESTZ8:
37412 icode = CODE_FOR_kortestqi;
37413 mode3 = CCZmode;
37414 goto kortest;
37416 case IX86_BUILTIN_KORTESTC16:
37417 icode = CODE_FOR_kortesthi;
37418 mode3 = CCCmode;
37419 goto kortest;
37421 case IX86_BUILTIN_KORTESTZ16:
37422 icode = CODE_FOR_kortesthi;
37423 mode3 = CCZmode;
37424 goto kortest;
37426 case IX86_BUILTIN_KORTESTC32:
37427 icode = CODE_FOR_kortestsi;
37428 mode3 = CCCmode;
37429 goto kortest;
37431 case IX86_BUILTIN_KORTESTZ32:
37432 icode = CODE_FOR_kortestsi;
37433 mode3 = CCZmode;
37434 goto kortest;
37436 case IX86_BUILTIN_KORTESTC64:
37437 icode = CODE_FOR_kortestdi;
37438 mode3 = CCCmode;
37439 goto kortest;
37441 case IX86_BUILTIN_KORTESTZ64:
37442 icode = CODE_FOR_kortestdi;
37443 mode3 = CCZmode;
37445 kortest:
37446 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37447 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37448 op0 = expand_normal (arg0);
37449 op1 = expand_normal (arg1);
37451 mode0 = insn_data[icode].operand[0].mode;
37452 mode1 = insn_data[icode].operand[1].mode;
37454 if (GET_MODE (op0) != VOIDmode)
37455 op0 = force_reg (GET_MODE (op0), op0);
37457 op0 = gen_lowpart (mode0, op0);
37459 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37460 op0 = copy_to_mode_reg (mode0, op0);
37462 if (GET_MODE (op1) != VOIDmode)
37463 op1 = force_reg (GET_MODE (op1), op1);
37465 op1 = gen_lowpart (mode1, op1);
37467 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37468 op1 = copy_to_mode_reg (mode1, op1);
37470 target = gen_reg_rtx (QImode);
37472 /* Emit kortest. */
37473 emit_insn (GEN_FCN (icode) (op0, op1));
37474 /* And use setcc to return result from flags. */
37475 ix86_expand_setcc (target, EQ,
37476 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37477 return target;
37479 case IX86_BUILTIN_GATHERSIV2DF:
37480 icode = CODE_FOR_avx2_gathersiv2df;
37481 goto gather_gen;
37482 case IX86_BUILTIN_GATHERSIV4DF:
37483 icode = CODE_FOR_avx2_gathersiv4df;
37484 goto gather_gen;
37485 case IX86_BUILTIN_GATHERDIV2DF:
37486 icode = CODE_FOR_avx2_gatherdiv2df;
37487 goto gather_gen;
37488 case IX86_BUILTIN_GATHERDIV4DF:
37489 icode = CODE_FOR_avx2_gatherdiv4df;
37490 goto gather_gen;
37491 case IX86_BUILTIN_GATHERSIV4SF:
37492 icode = CODE_FOR_avx2_gathersiv4sf;
37493 goto gather_gen;
37494 case IX86_BUILTIN_GATHERSIV8SF:
37495 icode = CODE_FOR_avx2_gathersiv8sf;
37496 goto gather_gen;
37497 case IX86_BUILTIN_GATHERDIV4SF:
37498 icode = CODE_FOR_avx2_gatherdiv4sf;
37499 goto gather_gen;
37500 case IX86_BUILTIN_GATHERDIV8SF:
37501 icode = CODE_FOR_avx2_gatherdiv8sf;
37502 goto gather_gen;
37503 case IX86_BUILTIN_GATHERSIV2DI:
37504 icode = CODE_FOR_avx2_gathersiv2di;
37505 goto gather_gen;
37506 case IX86_BUILTIN_GATHERSIV4DI:
37507 icode = CODE_FOR_avx2_gathersiv4di;
37508 goto gather_gen;
37509 case IX86_BUILTIN_GATHERDIV2DI:
37510 icode = CODE_FOR_avx2_gatherdiv2di;
37511 goto gather_gen;
37512 case IX86_BUILTIN_GATHERDIV4DI:
37513 icode = CODE_FOR_avx2_gatherdiv4di;
37514 goto gather_gen;
37515 case IX86_BUILTIN_GATHERSIV4SI:
37516 icode = CODE_FOR_avx2_gathersiv4si;
37517 goto gather_gen;
37518 case IX86_BUILTIN_GATHERSIV8SI:
37519 icode = CODE_FOR_avx2_gathersiv8si;
37520 goto gather_gen;
37521 case IX86_BUILTIN_GATHERDIV4SI:
37522 icode = CODE_FOR_avx2_gatherdiv4si;
37523 goto gather_gen;
37524 case IX86_BUILTIN_GATHERDIV8SI:
37525 icode = CODE_FOR_avx2_gatherdiv8si;
37526 goto gather_gen;
37527 case IX86_BUILTIN_GATHERALTSIV4DF:
37528 icode = CODE_FOR_avx2_gathersiv4df;
37529 goto gather_gen;
37530 case IX86_BUILTIN_GATHERALTDIV8SF:
37531 icode = CODE_FOR_avx2_gatherdiv8sf;
37532 goto gather_gen;
37533 case IX86_BUILTIN_GATHERALTSIV4DI:
37534 icode = CODE_FOR_avx2_gathersiv4di;
37535 goto gather_gen;
37536 case IX86_BUILTIN_GATHERALTDIV8SI:
37537 icode = CODE_FOR_avx2_gatherdiv8si;
37538 goto gather_gen;
37539 case IX86_BUILTIN_GATHER3SIV16SF:
37540 icode = CODE_FOR_avx512f_gathersiv16sf;
37541 goto gather_gen;
37542 case IX86_BUILTIN_GATHER3SIV8DF:
37543 icode = CODE_FOR_avx512f_gathersiv8df;
37544 goto gather_gen;
37545 case IX86_BUILTIN_GATHER3DIV16SF:
37546 icode = CODE_FOR_avx512f_gatherdiv16sf;
37547 goto gather_gen;
37548 case IX86_BUILTIN_GATHER3DIV8DF:
37549 icode = CODE_FOR_avx512f_gatherdiv8df;
37550 goto gather_gen;
37551 case IX86_BUILTIN_GATHER3SIV16SI:
37552 icode = CODE_FOR_avx512f_gathersiv16si;
37553 goto gather_gen;
37554 case IX86_BUILTIN_GATHER3SIV8DI:
37555 icode = CODE_FOR_avx512f_gathersiv8di;
37556 goto gather_gen;
37557 case IX86_BUILTIN_GATHER3DIV16SI:
37558 icode = CODE_FOR_avx512f_gatherdiv16si;
37559 goto gather_gen;
37560 case IX86_BUILTIN_GATHER3DIV8DI:
37561 icode = CODE_FOR_avx512f_gatherdiv8di;
37562 goto gather_gen;
37563 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37564 icode = CODE_FOR_avx512f_gathersiv8df;
37565 goto gather_gen;
37566 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37567 icode = CODE_FOR_avx512f_gatherdiv16sf;
37568 goto gather_gen;
37569 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37570 icode = CODE_FOR_avx512f_gathersiv8di;
37571 goto gather_gen;
37572 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37573 icode = CODE_FOR_avx512f_gatherdiv16si;
37574 goto gather_gen;
37575 case IX86_BUILTIN_GATHER3SIV2DF:
37576 icode = CODE_FOR_avx512vl_gathersiv2df;
37577 goto gather_gen;
37578 case IX86_BUILTIN_GATHER3SIV4DF:
37579 icode = CODE_FOR_avx512vl_gathersiv4df;
37580 goto gather_gen;
37581 case IX86_BUILTIN_GATHER3DIV2DF:
37582 icode = CODE_FOR_avx512vl_gatherdiv2df;
37583 goto gather_gen;
37584 case IX86_BUILTIN_GATHER3DIV4DF:
37585 icode = CODE_FOR_avx512vl_gatherdiv4df;
37586 goto gather_gen;
37587 case IX86_BUILTIN_GATHER3SIV4SF:
37588 icode = CODE_FOR_avx512vl_gathersiv4sf;
37589 goto gather_gen;
37590 case IX86_BUILTIN_GATHER3SIV8SF:
37591 icode = CODE_FOR_avx512vl_gathersiv8sf;
37592 goto gather_gen;
37593 case IX86_BUILTIN_GATHER3DIV4SF:
37594 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37595 goto gather_gen;
37596 case IX86_BUILTIN_GATHER3DIV8SF:
37597 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37598 goto gather_gen;
37599 case IX86_BUILTIN_GATHER3SIV2DI:
37600 icode = CODE_FOR_avx512vl_gathersiv2di;
37601 goto gather_gen;
37602 case IX86_BUILTIN_GATHER3SIV4DI:
37603 icode = CODE_FOR_avx512vl_gathersiv4di;
37604 goto gather_gen;
37605 case IX86_BUILTIN_GATHER3DIV2DI:
37606 icode = CODE_FOR_avx512vl_gatherdiv2di;
37607 goto gather_gen;
37608 case IX86_BUILTIN_GATHER3DIV4DI:
37609 icode = CODE_FOR_avx512vl_gatherdiv4di;
37610 goto gather_gen;
37611 case IX86_BUILTIN_GATHER3SIV4SI:
37612 icode = CODE_FOR_avx512vl_gathersiv4si;
37613 goto gather_gen;
37614 case IX86_BUILTIN_GATHER3SIV8SI:
37615 icode = CODE_FOR_avx512vl_gathersiv8si;
37616 goto gather_gen;
37617 case IX86_BUILTIN_GATHER3DIV4SI:
37618 icode = CODE_FOR_avx512vl_gatherdiv4si;
37619 goto gather_gen;
37620 case IX86_BUILTIN_GATHER3DIV8SI:
37621 icode = CODE_FOR_avx512vl_gatherdiv8si;
37622 goto gather_gen;
37623 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37624 icode = CODE_FOR_avx512vl_gathersiv4df;
37625 goto gather_gen;
37626 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37627 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37628 goto gather_gen;
37629 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37630 icode = CODE_FOR_avx512vl_gathersiv4di;
37631 goto gather_gen;
37632 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37633 icode = CODE_FOR_avx512vl_gatherdiv8si;
37634 goto gather_gen;
37635 case IX86_BUILTIN_SCATTERSIV16SF:
37636 icode = CODE_FOR_avx512f_scattersiv16sf;
37637 goto scatter_gen;
37638 case IX86_BUILTIN_SCATTERSIV8DF:
37639 icode = CODE_FOR_avx512f_scattersiv8df;
37640 goto scatter_gen;
37641 case IX86_BUILTIN_SCATTERDIV16SF:
37642 icode = CODE_FOR_avx512f_scatterdiv16sf;
37643 goto scatter_gen;
37644 case IX86_BUILTIN_SCATTERDIV8DF:
37645 icode = CODE_FOR_avx512f_scatterdiv8df;
37646 goto scatter_gen;
37647 case IX86_BUILTIN_SCATTERSIV16SI:
37648 icode = CODE_FOR_avx512f_scattersiv16si;
37649 goto scatter_gen;
37650 case IX86_BUILTIN_SCATTERSIV8DI:
37651 icode = CODE_FOR_avx512f_scattersiv8di;
37652 goto scatter_gen;
37653 case IX86_BUILTIN_SCATTERDIV16SI:
37654 icode = CODE_FOR_avx512f_scatterdiv16si;
37655 goto scatter_gen;
37656 case IX86_BUILTIN_SCATTERDIV8DI:
37657 icode = CODE_FOR_avx512f_scatterdiv8di;
37658 goto scatter_gen;
37659 case IX86_BUILTIN_SCATTERSIV8SF:
37660 icode = CODE_FOR_avx512vl_scattersiv8sf;
37661 goto scatter_gen;
37662 case IX86_BUILTIN_SCATTERSIV4SF:
37663 icode = CODE_FOR_avx512vl_scattersiv4sf;
37664 goto scatter_gen;
37665 case IX86_BUILTIN_SCATTERSIV4DF:
37666 icode = CODE_FOR_avx512vl_scattersiv4df;
37667 goto scatter_gen;
37668 case IX86_BUILTIN_SCATTERSIV2DF:
37669 icode = CODE_FOR_avx512vl_scattersiv2df;
37670 goto scatter_gen;
37671 case IX86_BUILTIN_SCATTERDIV8SF:
37672 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37673 goto scatter_gen;
37674 case IX86_BUILTIN_SCATTERDIV4SF:
37675 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37676 goto scatter_gen;
37677 case IX86_BUILTIN_SCATTERDIV4DF:
37678 icode = CODE_FOR_avx512vl_scatterdiv4df;
37679 goto scatter_gen;
37680 case IX86_BUILTIN_SCATTERDIV2DF:
37681 icode = CODE_FOR_avx512vl_scatterdiv2df;
37682 goto scatter_gen;
37683 case IX86_BUILTIN_SCATTERSIV8SI:
37684 icode = CODE_FOR_avx512vl_scattersiv8si;
37685 goto scatter_gen;
37686 case IX86_BUILTIN_SCATTERSIV4SI:
37687 icode = CODE_FOR_avx512vl_scattersiv4si;
37688 goto scatter_gen;
37689 case IX86_BUILTIN_SCATTERSIV4DI:
37690 icode = CODE_FOR_avx512vl_scattersiv4di;
37691 goto scatter_gen;
37692 case IX86_BUILTIN_SCATTERSIV2DI:
37693 icode = CODE_FOR_avx512vl_scattersiv2di;
37694 goto scatter_gen;
37695 case IX86_BUILTIN_SCATTERDIV8SI:
37696 icode = CODE_FOR_avx512vl_scatterdiv8si;
37697 goto scatter_gen;
37698 case IX86_BUILTIN_SCATTERDIV4SI:
37699 icode = CODE_FOR_avx512vl_scatterdiv4si;
37700 goto scatter_gen;
37701 case IX86_BUILTIN_SCATTERDIV4DI:
37702 icode = CODE_FOR_avx512vl_scatterdiv4di;
37703 goto scatter_gen;
37704 case IX86_BUILTIN_SCATTERDIV2DI:
37705 icode = CODE_FOR_avx512vl_scatterdiv2di;
37706 goto scatter_gen;
37707 case IX86_BUILTIN_GATHERPFDPD:
37708 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37709 goto vec_prefetch_gen;
37710 case IX86_BUILTIN_SCATTERALTSIV8DF:
37711 icode = CODE_FOR_avx512f_scattersiv8df;
37712 goto scatter_gen;
37713 case IX86_BUILTIN_SCATTERALTDIV16SF:
37714 icode = CODE_FOR_avx512f_scatterdiv16sf;
37715 goto scatter_gen;
37716 case IX86_BUILTIN_SCATTERALTSIV8DI:
37717 icode = CODE_FOR_avx512f_scattersiv8di;
37718 goto scatter_gen;
37719 case IX86_BUILTIN_SCATTERALTDIV16SI:
37720 icode = CODE_FOR_avx512f_scatterdiv16si;
37721 goto scatter_gen;
37722 case IX86_BUILTIN_GATHERPFDPS:
37723 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37724 goto vec_prefetch_gen;
37725 case IX86_BUILTIN_GATHERPFQPD:
37726 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37727 goto vec_prefetch_gen;
37728 case IX86_BUILTIN_GATHERPFQPS:
37729 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37730 goto vec_prefetch_gen;
37731 case IX86_BUILTIN_SCATTERPFDPD:
37732 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37733 goto vec_prefetch_gen;
37734 case IX86_BUILTIN_SCATTERPFDPS:
37735 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37736 goto vec_prefetch_gen;
37737 case IX86_BUILTIN_SCATTERPFQPD:
37738 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37739 goto vec_prefetch_gen;
37740 case IX86_BUILTIN_SCATTERPFQPS:
37741 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37742 goto vec_prefetch_gen;
37744 gather_gen:
37745 rtx half;
37746 rtx (*gen) (rtx, rtx);
37748 arg0 = CALL_EXPR_ARG (exp, 0);
37749 arg1 = CALL_EXPR_ARG (exp, 1);
37750 arg2 = CALL_EXPR_ARG (exp, 2);
37751 arg3 = CALL_EXPR_ARG (exp, 3);
37752 arg4 = CALL_EXPR_ARG (exp, 4);
37753 op0 = expand_normal (arg0);
37754 op1 = expand_normal (arg1);
37755 op2 = expand_normal (arg2);
37756 op3 = expand_normal (arg3);
37757 op4 = expand_normal (arg4);
37758 /* Note the arg order is different from the operand order. */
37759 mode0 = insn_data[icode].operand[1].mode;
37760 mode2 = insn_data[icode].operand[3].mode;
37761 mode3 = insn_data[icode].operand[4].mode;
37762 mode4 = insn_data[icode].operand[5].mode;
37764 if (target == NULL_RTX
37765 || GET_MODE (target) != insn_data[icode].operand[0].mode
37766 || !insn_data[icode].operand[0].predicate (target,
37767 GET_MODE (target)))
37768 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37769 else
37770 subtarget = target;
37772 switch (fcode)
37774 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37775 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37776 half = gen_reg_rtx (V8SImode);
37777 if (!nonimmediate_operand (op2, V16SImode))
37778 op2 = copy_to_mode_reg (V16SImode, op2);
37779 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37780 op2 = half;
37781 break;
37782 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37783 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37784 case IX86_BUILTIN_GATHERALTSIV4DF:
37785 case IX86_BUILTIN_GATHERALTSIV4DI:
37786 half = gen_reg_rtx (V4SImode);
37787 if (!nonimmediate_operand (op2, V8SImode))
37788 op2 = copy_to_mode_reg (V8SImode, op2);
37789 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37790 op2 = half;
37791 break;
37792 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37793 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37794 half = gen_reg_rtx (mode0);
37795 if (mode0 == V8SFmode)
37796 gen = gen_vec_extract_lo_v16sf;
37797 else
37798 gen = gen_vec_extract_lo_v16si;
37799 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37800 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37801 emit_insn (gen (half, op0));
37802 op0 = half;
37803 if (GET_MODE (op3) != VOIDmode)
37805 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37806 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37807 emit_insn (gen (half, op3));
37808 op3 = half;
37810 break;
37811 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37812 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37813 case IX86_BUILTIN_GATHERALTDIV8SF:
37814 case IX86_BUILTIN_GATHERALTDIV8SI:
37815 half = gen_reg_rtx (mode0);
37816 if (mode0 == V4SFmode)
37817 gen = gen_vec_extract_lo_v8sf;
37818 else
37819 gen = gen_vec_extract_lo_v8si;
37820 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37821 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37822 emit_insn (gen (half, op0));
37823 op0 = half;
37824 if (GET_MODE (op3) != VOIDmode)
37826 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37827 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37828 emit_insn (gen (half, op3));
37829 op3 = half;
37831 break;
37832 default:
37833 break;
37836 /* Force the memory operand to use only a base register here. But we
37837 don't want to do this on the memory operands of other builtin
37838 functions. */
37839 op1 = ix86_zero_extend_to_Pmode (op1);
37841 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37842 op0 = copy_to_mode_reg (mode0, op0);
37843 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37844 op1 = copy_to_mode_reg (Pmode, op1);
37845 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37846 op2 = copy_to_mode_reg (mode2, op2);
37848 op3 = fixup_modeless_constant (op3, mode3);
37850 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37852 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37853 op3 = copy_to_mode_reg (mode3, op3);
37855 else
37857 op3 = copy_to_reg (op3);
37858 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37860 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37862 error ("the last argument must be scale 1, 2, 4, 8");
37863 return const0_rtx;
37866 /* Optimize. If mask is known to have all high bits set,
37867 replace op0 with pc_rtx to signal that the instruction
37868 overwrites the whole destination and doesn't use its
37869 previous contents. */
37870 if (optimize)
37872 if (TREE_CODE (arg3) == INTEGER_CST)
37874 if (integer_all_onesp (arg3))
37875 op0 = pc_rtx;
37877 else if (TREE_CODE (arg3) == VECTOR_CST)
37879 unsigned int negative = 0;
37880 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37882 tree cst = VECTOR_CST_ELT (arg3, i);
37883 if (TREE_CODE (cst) == INTEGER_CST
37884 && tree_int_cst_sign_bit (cst))
37885 negative++;
37886 else if (TREE_CODE (cst) == REAL_CST
37887 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37888 negative++;
37890 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37891 op0 = pc_rtx;
37893 else if (TREE_CODE (arg3) == SSA_NAME
37894 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37896 /* Recognize also when mask is like:
37897 __v2df src = _mm_setzero_pd ();
37898 __v2df mask = _mm_cmpeq_pd (src, src);
37900 __v8sf src = _mm256_setzero_ps ();
37901 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37902 as that is a cheaper way to load all ones into
37903 a register than having to load a constant from
37904 memory. */
37905 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37906 if (is_gimple_call (def_stmt))
37908 tree fndecl = gimple_call_fndecl (def_stmt);
37909 if (fndecl
37910 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37911 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37913 case IX86_BUILTIN_CMPPD:
37914 case IX86_BUILTIN_CMPPS:
37915 case IX86_BUILTIN_CMPPD256:
37916 case IX86_BUILTIN_CMPPS256:
37917 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37918 break;
37919 /* FALLTHRU */
37920 case IX86_BUILTIN_CMPEQPD:
37921 case IX86_BUILTIN_CMPEQPS:
37922 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37923 && initializer_zerop (gimple_call_arg (def_stmt,
37924 1)))
37925 op0 = pc_rtx;
37926 break;
37927 default:
37928 break;
37934 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37935 if (! pat)
37936 return const0_rtx;
37937 emit_insn (pat);
37939 switch (fcode)
37941 case IX86_BUILTIN_GATHER3DIV16SF:
37942 if (target == NULL_RTX)
37943 target = gen_reg_rtx (V8SFmode);
37944 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37945 break;
37946 case IX86_BUILTIN_GATHER3DIV16SI:
37947 if (target == NULL_RTX)
37948 target = gen_reg_rtx (V8SImode);
37949 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37950 break;
37951 case IX86_BUILTIN_GATHER3DIV8SF:
37952 case IX86_BUILTIN_GATHERDIV8SF:
37953 if (target == NULL_RTX)
37954 target = gen_reg_rtx (V4SFmode);
37955 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37956 break;
37957 case IX86_BUILTIN_GATHER3DIV8SI:
37958 case IX86_BUILTIN_GATHERDIV8SI:
37959 if (target == NULL_RTX)
37960 target = gen_reg_rtx (V4SImode);
37961 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37962 break;
37963 default:
37964 target = subtarget;
37965 break;
37967 return target;
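A minimal user-level sketch of the all-ones-mask case handled above (illustrative only, not part of this file; assumes AVX2 and <immintrin.h>):

#include <immintrin.h>

__m256 gather8 (const float *base, __m256i idx)
{
  __m256 src  = _mm256_setzero_ps ();
  /* Compare of a zero vector with itself: a cheap all-ones mask, which
     the expander recognizes and treats as an unmasked gather (op0
     becomes pc_rtx, so the whole destination is overwritten).  */
  __m256 mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
  return _mm256_mask_i32gather_ps (src, base, idx, mask, 4);
}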
37969 scatter_gen:
37970 arg0 = CALL_EXPR_ARG (exp, 0);
37971 arg1 = CALL_EXPR_ARG (exp, 1);
37972 arg2 = CALL_EXPR_ARG (exp, 2);
37973 arg3 = CALL_EXPR_ARG (exp, 3);
37974 arg4 = CALL_EXPR_ARG (exp, 4);
37975 op0 = expand_normal (arg0);
37976 op1 = expand_normal (arg1);
37977 op2 = expand_normal (arg2);
37978 op3 = expand_normal (arg3);
37979 op4 = expand_normal (arg4);
37980 mode1 = insn_data[icode].operand[1].mode;
37981 mode2 = insn_data[icode].operand[2].mode;
37982 mode3 = insn_data[icode].operand[3].mode;
37983 mode4 = insn_data[icode].operand[4].mode;
37985 /* Scatter instruction stores operand op3 to memory with
37986 indices from op2 and scale from op4 under writemask op1.
37987 If index operand op2 has more elements than source operand
37988 op3, one needs to use only its low half. And vice versa. */
37989 switch (fcode)
37991 case IX86_BUILTIN_SCATTERALTSIV8DF:
37992 case IX86_BUILTIN_SCATTERALTSIV8DI:
37993 half = gen_reg_rtx (V8SImode);
37994 if (!nonimmediate_operand (op2, V16SImode))
37995 op2 = copy_to_mode_reg (V16SImode, op2);
37996 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37997 op2 = half;
37998 break;
37999 case IX86_BUILTIN_SCATTERALTDIV16SF:
38000 case IX86_BUILTIN_SCATTERALTDIV16SI:
38001 half = gen_reg_rtx (mode3);
38002 if (mode3 == V8SFmode)
38003 gen = gen_vec_extract_lo_v16sf;
38004 else
38005 gen = gen_vec_extract_lo_v16si;
38006 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38007 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38008 emit_insn (gen (half, op3));
38009 op3 = half;
38010 break;
38011 default:
38012 break;
38015 /* Force the memory operand to use only a base register here. But
38016 we don't want to do this for the memory operands of other builtin
38017 functions. */
38018 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38020 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38021 op0 = copy_to_mode_reg (Pmode, op0);
38023 op1 = fixup_modeless_constant (op1, mode1);
38025 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38027 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38028 op1 = copy_to_mode_reg (mode1, op1);
38030 else
38032 op1 = copy_to_reg (op1);
38033 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38036 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38037 op2 = copy_to_mode_reg (mode2, op2);
38039 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38040 op3 = copy_to_mode_reg (mode3, op3);
38042 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38044 error ("the last argument must be scale 1, 2, 4, 8");
38045 return const0_rtx;
38048 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38049 if (! pat)
38050 return const0_rtx;
38052 emit_insn (pat);
38053 return 0;
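For reference, the kind of call the scatter_gen path expands, as a minimal sketch (illustrative only; assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>

void scatter16 (float *base, __m512i idx, __m512 vals)
{
  /* base -> op0, idx -> op2, vals -> op3; the trailing scale (op4)
     must be 1, 2, 4 or 8.  */
  _mm512_i32scatter_ps (base, idx, vals, 4);
}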
38055 vec_prefetch_gen:
38056 arg0 = CALL_EXPR_ARG (exp, 0);
38057 arg1 = CALL_EXPR_ARG (exp, 1);
38058 arg2 = CALL_EXPR_ARG (exp, 2);
38059 arg3 = CALL_EXPR_ARG (exp, 3);
38060 arg4 = CALL_EXPR_ARG (exp, 4);
38061 op0 = expand_normal (arg0);
38062 op1 = expand_normal (arg1);
38063 op2 = expand_normal (arg2);
38064 op3 = expand_normal (arg3);
38065 op4 = expand_normal (arg4);
38066 mode0 = insn_data[icode].operand[0].mode;
38067 mode1 = insn_data[icode].operand[1].mode;
38068 mode3 = insn_data[icode].operand[3].mode;
38069 mode4 = insn_data[icode].operand[4].mode;
38071 op0 = fixup_modeless_constant (op0, mode0);
38073 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38075 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38076 op0 = copy_to_mode_reg (mode0, op0);
38078 else
38080 op0 = copy_to_reg (op0);
38081 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38084 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38085 op1 = copy_to_mode_reg (mode1, op1);
38087 /* Force the memory operand to use only a base register here. But
38088 we don't want to do this for the memory operands of other builtin
38089 functions. */
38090 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38092 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38093 op2 = copy_to_mode_reg (Pmode, op2);
38095 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38097 error ("the fourth argument must be scale 1, 2, 4, 8");
38098 return const0_rtx;
38101 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38103 error ("incorrect hint operand");
38104 return const0_rtx;
38107 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38108 if (! pat)
38109 return const0_rtx;
38111 emit_insn (pat);
38113 return 0;
38115 case IX86_BUILTIN_XABORT:
38116 icode = CODE_FOR_xabort;
38117 arg0 = CALL_EXPR_ARG (exp, 0);
38118 op0 = expand_normal (arg0);
38119 mode0 = insn_data[icode].operand[0].mode;
38120 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38122 error ("the xabort's argument must be an 8-bit immediate");
38123 return const0_rtx;
38125 emit_insn (gen_xabort (op0));
38126 return 0;
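What the operand-predicate check above enforces, as a user-level sketch (illustrative only; assumes RTM support, e.g. -mrtm, and <immintrin.h>):

#include <immintrin.h>

unsigned int try_transaction (void)
{
  unsigned int status = _xbegin ();
  if (status == _XBEGIN_STARTED)
    _xabort (0x42);           /* OK: compile-time 8-bit immediate */
  /* _xabort (status) would be rejected with the error above, since the
     abort code must be an immediate.  */
  return status;
}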
38128 case IX86_BUILTIN_RSTORSSP:
38129 case IX86_BUILTIN_CLRSSBSY:
38130 arg0 = CALL_EXPR_ARG (exp, 0);
38131 op0 = expand_normal (arg0);
38132 icode = (fcode == IX86_BUILTIN_RSTORSSP
38133 ? CODE_FOR_rstorssp
38134 : CODE_FOR_clrssbsy);
38135 if (!address_operand (op0, VOIDmode))
38137 op1 = convert_memory_address (Pmode, op0);
38138 op0 = copy_addr_to_reg (op1);
38140 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38141 return 0;
38143 case IX86_BUILTIN_WRSSD:
38144 case IX86_BUILTIN_WRSSQ:
38145 case IX86_BUILTIN_WRUSSD:
38146 case IX86_BUILTIN_WRUSSQ:
38147 arg0 = CALL_EXPR_ARG (exp, 0);
38148 op0 = expand_normal (arg0);
38149 arg1 = CALL_EXPR_ARG (exp, 1);
38150 op1 = expand_normal (arg1);
38151 switch (fcode)
38153 case IX86_BUILTIN_WRSSD:
38154 icode = CODE_FOR_wrsssi;
38155 mode = SImode;
38156 break;
38157 case IX86_BUILTIN_WRSSQ:
38158 icode = CODE_FOR_wrssdi;
38159 mode = DImode;
38160 break;
38161 case IX86_BUILTIN_WRUSSD:
38162 icode = CODE_FOR_wrusssi;
38163 mode = SImode;
38164 break;
38165 case IX86_BUILTIN_WRUSSQ:
38166 icode = CODE_FOR_wrussdi;
38167 mode = DImode;
38168 break;
38170 op0 = force_reg (mode, op0);
38171 if (!address_operand (op1, VOIDmode))
38173 op2 = convert_memory_address (Pmode, op1);
38174 op1 = copy_addr_to_reg (op2);
38176 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38177 return 0;
38179 default:
38180 break;
38183 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38184 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38186 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38187 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38188 target);
38191 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38192 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38194 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38195 switch (fcode)
38197 case IX86_BUILTIN_FABSQ:
38198 case IX86_BUILTIN_COPYSIGNQ:
38199 if (!TARGET_SSE)
38200 /* Emit a normal call if SSE isn't available. */
38201 return expand_call (exp, target, ignore);
38202 /* FALLTHRU */
38203 default:
38204 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38208 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38209 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38211 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38212 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38213 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38214 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38215 int masked = 1;
38216 machine_mode mode, wide_mode, nar_mode;
38218 nar_mode = V4SFmode;
38219 mode = V16SFmode;
38220 wide_mode = V64SFmode;
38221 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38222 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38224 switch (fcode)
38226 case IX86_BUILTIN_4FMAPS:
38227 fcn = gen_avx5124fmaddps_4fmaddps;
38228 masked = 0;
38229 goto v4fma_expand;
38231 case IX86_BUILTIN_4DPWSSD:
38232 nar_mode = V4SImode;
38233 mode = V16SImode;
38234 wide_mode = V64SImode;
38235 fcn = gen_avx5124vnniw_vp4dpwssd;
38236 masked = 0;
38237 goto v4fma_expand;
38239 case IX86_BUILTIN_4DPWSSDS:
38240 nar_mode = V4SImode;
38241 mode = V16SImode;
38242 wide_mode = V64SImode;
38243 fcn = gen_avx5124vnniw_vp4dpwssds;
38244 masked = 0;
38245 goto v4fma_expand;
38247 case IX86_BUILTIN_4FNMAPS:
38248 fcn = gen_avx5124fmaddps_4fnmaddps;
38249 masked = 0;
38250 goto v4fma_expand;
38252 case IX86_BUILTIN_4FNMAPS_MASK:
38253 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38254 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38255 goto v4fma_expand;
38257 case IX86_BUILTIN_4DPWSSD_MASK:
38258 nar_mode = V4SImode;
38259 mode = V16SImode;
38260 wide_mode = V64SImode;
38261 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38262 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38263 goto v4fma_expand;
38265 case IX86_BUILTIN_4DPWSSDS_MASK:
38266 nar_mode = V4SImode;
38267 mode = V16SImode;
38268 wide_mode = V64SImode;
38269 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38270 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38271 goto v4fma_expand;
38273 case IX86_BUILTIN_4FMAPS_MASK:
38275 tree args[4];
38276 rtx ops[4];
38277 rtx wide_reg;
38278 rtx accum;
38279 rtx addr;
38280 rtx mem;
38282 v4fma_expand:
38283 wide_reg = gen_reg_rtx (wide_mode);
38284 for (i = 0; i < 4; i++)
38286 args[i] = CALL_EXPR_ARG (exp, i);
38287 ops[i] = expand_normal (args[i]);
38289 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38290 ops[i]);
38293 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38294 accum = force_reg (mode, accum);
38296 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38297 addr = force_reg (Pmode, addr);
38299 mem = gen_rtx_MEM (nar_mode, addr);
38301 target = gen_reg_rtx (mode);
38303 emit_move_insn (target, accum);
38305 if (! masked)
38306 emit_insn (fcn (target, accum, wide_reg, mem));
38307 else
38309 rtx merge, mask;
38310 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38312 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38314 if (CONST_INT_P (mask))
38315 mask = fixup_modeless_constant (mask, HImode);
38317 mask = force_reg (HImode, mask);
38319 if (GET_MODE (mask) != HImode)
38320 mask = gen_rtx_SUBREG (HImode, mask, 0);
38322 /* If merge is 0 then we're about to emit z-masked variant. */
38323 if (const0_operand (merge, mode))
38324 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38325 /* If merge is the same as accum then emit merge-masked variant. */
38326 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38328 merge = force_reg (mode, merge);
38329 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38331 /* Merge with something unknown might happen if we z-mask w/ -O0. */
38332 else
38334 target = gen_reg_rtx (mode);
38335 emit_move_insn (target, merge);
38336 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38339 return target;
38342 case IX86_BUILTIN_4FNMASS:
38343 fcn = gen_avx5124fmaddps_4fnmaddss;
38344 masked = 0;
38345 goto s4fma_expand;
38347 case IX86_BUILTIN_4FMASS:
38348 fcn = gen_avx5124fmaddps_4fmaddss;
38349 masked = 0;
38350 goto s4fma_expand;
38352 case IX86_BUILTIN_4FNMASS_MASK:
38353 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38354 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38355 goto s4fma_expand;
38357 case IX86_BUILTIN_4FMASS_MASK:
38359 tree args[4];
38360 rtx ops[4];
38361 rtx wide_reg;
38362 rtx accum;
38363 rtx addr;
38364 rtx mem;
38366 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38367 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38369 s4fma_expand:
38370 mode = V4SFmode;
38371 wide_reg = gen_reg_rtx (V64SFmode);
38372 for (i = 0; i < 4; i++)
38374 rtx tmp;
38375 args[i] = CALL_EXPR_ARG (exp, i);
38376 ops[i] = expand_normal (args[i]);
38378 tmp = gen_reg_rtx (SFmode);
38379 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38381 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38382 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38385 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38386 accum = force_reg (V4SFmode, accum);
38388 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38389 addr = force_reg (Pmode, addr);
38391 mem = gen_rtx_MEM (V4SFmode, addr);
38393 target = gen_reg_rtx (V4SFmode);
38395 emit_move_insn (target, accum);
38397 if (! masked)
38398 emit_insn (fcn (target, accum, wide_reg, mem));
38399 else
38401 rtx merge, mask;
38402 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38404 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38406 if (CONST_INT_P (mask))
38407 mask = fixup_modeless_constant (mask, QImode);
38409 mask = force_reg (QImode, mask);
38411 if (GET_MODE (mask) != QImode)
38412 mask = gen_rtx_SUBREG (QImode, mask, 0);
38414 /* If merge is 0 then we're about to emit z-masked variant. */
38415 if (const0_operand (merge, mode))
38416 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38417 /* If merge is the same as accum then emit merge-masked
38418 variant. */
38419 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38421 merge = force_reg (mode, merge);
38422 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38424 /* Merge with something unknown might happen if we z-mask
38425 w/ -O0. */
38426 else
38428 target = gen_reg_rtx (mode);
38429 emit_move_insn (target, merge);
38430 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38433 return target;
38435 case IX86_BUILTIN_RDPID:
38436 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38437 target);
38438 default:
38439 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38443 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38444 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38446 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38447 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38450 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38451 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38453 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38454 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38457 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38458 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38460 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38461 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38464 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38465 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38467 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38468 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38471 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38472 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38474 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38475 const struct builtin_description *d = bdesc_multi_arg + i;
38476 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38477 (enum ix86_builtin_func_type)
38478 d->flag, d->comparison);
38481 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38482 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38484 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38485 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38486 target);
38489 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38490 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38492 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38493 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
38494 target);
38497 gcc_unreachable ();
38500 /* This returns the target-specific builtin with code CODE if
38501 current_function_decl has visibility on this builtin, which is checked
38502 using isa flags. Returns NULL_TREE otherwise. */
38504 static tree ix86_get_builtin (enum ix86_builtins code)
38506 struct cl_target_option *opts;
38507 tree target_tree = NULL_TREE;
38509 /* Determine the isa flags of current_function_decl. */
38511 if (current_function_decl)
38512 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38514 if (target_tree == NULL)
38515 target_tree = target_option_default_node;
38517 opts = TREE_TARGET_OPTION (target_tree);
38519 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38520 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38521 return ix86_builtin_decl (code, true);
38522 else
38523 return NULL_TREE;
38526 /* Return the function decl of the target-specific builtin
38527 corresponding to the MPX builtin passed in FCODE. */
38528 static tree
38529 ix86_builtin_mpx_function (unsigned fcode)
38531 switch (fcode)
38533 case BUILT_IN_CHKP_BNDMK:
38534 return ix86_builtins[IX86_BUILTIN_BNDMK];
38536 case BUILT_IN_CHKP_BNDSTX:
38537 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38539 case BUILT_IN_CHKP_BNDLDX:
38540 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38542 case BUILT_IN_CHKP_BNDCL:
38543 return ix86_builtins[IX86_BUILTIN_BNDCL];
38545 case BUILT_IN_CHKP_BNDCU:
38546 return ix86_builtins[IX86_BUILTIN_BNDCU];
38548 case BUILT_IN_CHKP_BNDRET:
38549 return ix86_builtins[IX86_BUILTIN_BNDRET];
38551 case BUILT_IN_CHKP_INTERSECT:
38552 return ix86_builtins[IX86_BUILTIN_BNDINT];
38554 case BUILT_IN_CHKP_NARROW:
38555 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38557 case BUILT_IN_CHKP_SIZEOF:
38558 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38560 case BUILT_IN_CHKP_EXTRACT_LOWER:
38561 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38563 case BUILT_IN_CHKP_EXTRACT_UPPER:
38564 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38566 default:
38567 return NULL_TREE;
38570 gcc_unreachable ();
38573 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38575 Return an address to be used to load/store bounds for pointer
38576 passed in SLOT.
38578 SLOT_NO is an integer constant holding number of a target
38579 dependent special slot to be used in case SLOT is not a memory.
38581 SPECIAL_BASE is a pointer to be used as a base of fake address
38582 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38583 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38585 static rtx
38586 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38588 rtx addr = NULL;
38590 /* NULL slot means we pass bounds for pointer not passed to the
38591 function at all. Register slot means we pass pointer in a
38592 register. In both these cases bounds are passed via Bounds
38593 Table. Since we do not have actual pointer stored in memory,
38594 we have to use fake addresses to access Bounds Table. We
38595 start with (special_base - sizeof (void*)) and decrease this
38596 address by pointer size to get addresses for other slots. */
38597 if (!slot || REG_P (slot))
38599 gcc_assert (CONST_INT_P (slot_no));
38600 addr = plus_constant (Pmode, special_base,
38601 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38603 /* If pointer is passed in a memory then its address is used to
38604 access Bounds Table. */
38605 else if (MEM_P (slot))
38607 addr = XEXP (slot, 0);
38608 if (!register_operand (addr, Pmode))
38609 addr = copy_addr_to_reg (addr);
38611 else
38612 gcc_unreachable ();
38614 return addr;
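The fake-address arithmetic is easiest to see with numbers: on a 64-bit target, where GET_MODE_SIZE (Pmode) is 8, special slot 0 is addressed at special_base - 8, slot 1 at special_base - 16, and so on. A sketch of the computation performed above:

  addr = special_base - (INTVAL (slot_no) + 1) * 8;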
38617 /* Expand pass uses this hook to load bounds for function parameter
38618 PTR passed in SLOT in case its bounds are not passed in a register.
38620 If SLOT is a memory, then bounds are loaded as for regular pointer
38621 loaded from memory. PTR may be NULL in case SLOT is a memory.
38622 In such case value of PTR (if required) may be loaded from SLOT.
38624 If SLOT is NULL or a register then SLOT_NO is an integer constant
38625 holding number of the target dependent special slot which should be
38626 used to obtain bounds.
38628 Return loaded bounds. */
38630 static rtx
38631 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38633 rtx reg = gen_reg_rtx (BNDmode);
38634 rtx addr;
38636 /* Get address to be used to access Bounds Table. Special slots start
38637 at the location of return address of the current function. */
38638 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38640 /* Load pointer value from a memory if we don't have it. */
38641 if (!ptr)
38643 gcc_assert (MEM_P (slot));
38644 ptr = copy_addr_to_reg (slot);
38647 if (!register_operand (ptr, Pmode))
38648 ptr = ix86_zero_extend_to_Pmode (ptr);
38650 emit_insn (BNDmode == BND64mode
38651 ? gen_bnd64_ldx (reg, addr, ptr)
38652 : gen_bnd32_ldx (reg, addr, ptr));
38654 return reg;
38657 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38658 passed in SLOT in case BOUNDS are not passed in a register.
38660 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38661 stored in memory. PTR may be NULL in case SLOT is a memory.
38662 In such case value of PTR (if required) may be loaded from SLOT.
38664 If SLOT is NULL or a register then SLOT_NO is an integer constant
38665 holding number of the target dependent special slot which should be
38666 used to store BOUNDS. */
38668 static void
38669 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38671 rtx addr;
38673 /* Get address to be used to access Bounds Table. Special slots start
38674 at the location of return address of a called function. */
38675 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38677 /* Load pointer value from a memory if we don't have it. */
38678 if (!ptr)
38680 gcc_assert (MEM_P (slot));
38681 ptr = copy_addr_to_reg (slot);
38684 if (!register_operand (ptr, Pmode))
38685 ptr = ix86_zero_extend_to_Pmode (ptr);
38687 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38688 if (!register_operand (bounds, BNDmode))
38689 bounds = copy_to_mode_reg (BNDmode, bounds);
38691 emit_insn (BNDmode == BND64mode
38692 ? gen_bnd64_stx (addr, ptr, bounds)
38693 : gen_bnd32_stx (addr, ptr, bounds));
38696 /* Load and return bounds returned by function in SLOT. */
38698 static rtx
38699 ix86_load_returned_bounds (rtx slot)
38701 rtx res;
38703 gcc_assert (REG_P (slot));
38704 res = gen_reg_rtx (BNDmode);
38705 emit_move_insn (res, slot);
38707 return res;
38710 /* Store BOUNDS returned by function into SLOT. */
38712 static void
38713 ix86_store_returned_bounds (rtx slot, rtx bounds)
38715 gcc_assert (REG_P (slot));
38716 emit_move_insn (slot, bounds);
38719 /* Returns a function decl for a vectorized version of the combined function
38720 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38721 if it is not available. */
38723 static tree
38724 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38725 tree type_in)
38727 machine_mode in_mode, out_mode;
38728 int in_n, out_n;
38730 if (TREE_CODE (type_out) != VECTOR_TYPE
38731 || TREE_CODE (type_in) != VECTOR_TYPE)
38732 return NULL_TREE;
38734 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38735 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38736 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38737 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38739 switch (fn)
38741 CASE_CFN_EXP2:
38742 if (out_mode == SFmode && in_mode == SFmode)
38744 if (out_n == 16 && in_n == 16)
38745 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38747 break;
38749 CASE_CFN_IFLOOR:
38750 CASE_CFN_LFLOOR:
38751 CASE_CFN_LLFLOOR:
38752 /* The round insn does not trap on denormals. */
38753 if (flag_trapping_math || !TARGET_SSE4_1)
38754 break;
38756 if (out_mode == SImode && in_mode == DFmode)
38758 if (out_n == 4 && in_n == 2)
38759 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38760 else if (out_n == 8 && in_n == 4)
38761 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38762 else if (out_n == 16 && in_n == 8)
38763 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38765 if (out_mode == SImode && in_mode == SFmode)
38767 if (out_n == 4 && in_n == 4)
38768 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38769 else if (out_n == 8 && in_n == 8)
38770 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38771 else if (out_n == 16 && in_n == 16)
38772 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38774 break;
38776 CASE_CFN_ICEIL:
38777 CASE_CFN_LCEIL:
38778 CASE_CFN_LLCEIL:
38779 /* The round insn does not trap on denormals. */
38780 if (flag_trapping_math || !TARGET_SSE4_1)
38781 break;
38783 if (out_mode == SImode && in_mode == DFmode)
38785 if (out_n == 4 && in_n == 2)
38786 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38787 else if (out_n == 8 && in_n == 4)
38788 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38789 else if (out_n == 16 && in_n == 8)
38790 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38792 if (out_mode == SImode && in_mode == SFmode)
38794 if (out_n == 4 && in_n == 4)
38795 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38796 else if (out_n == 8 && in_n == 8)
38797 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38798 else if (out_n == 16 && in_n == 16)
38799 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38801 break;
38803 CASE_CFN_IRINT:
38804 CASE_CFN_LRINT:
38805 CASE_CFN_LLRINT:
38806 if (out_mode == SImode && in_mode == DFmode)
38808 if (out_n == 4 && in_n == 2)
38809 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38810 else if (out_n == 8 && in_n == 4)
38811 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38812 else if (out_n == 16 && in_n == 8)
38813 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38815 if (out_mode == SImode && in_mode == SFmode)
38817 if (out_n == 4 && in_n == 4)
38818 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38819 else if (out_n == 8 && in_n == 8)
38820 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38821 else if (out_n == 16 && in_n == 16)
38822 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38824 break;
38826 CASE_CFN_IROUND:
38827 CASE_CFN_LROUND:
38828 CASE_CFN_LLROUND:
38829 /* The round insn does not trap on denormals. */
38830 if (flag_trapping_math || !TARGET_SSE4_1)
38831 break;
38833 if (out_mode == SImode && in_mode == DFmode)
38835 if (out_n == 4 && in_n == 2)
38836 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38837 else if (out_n == 8 && in_n == 4)
38838 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38839 else if (out_n == 16 && in_n == 8)
38840 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38842 if (out_mode == SImode && in_mode == SFmode)
38844 if (out_n == 4 && in_n == 4)
38845 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38846 else if (out_n == 8 && in_n == 8)
38847 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38848 else if (out_n == 16 && in_n == 16)
38849 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38851 break;
38853 CASE_CFN_FLOOR:
38854 /* The round insn does not trap on denormals. */
38855 if (flag_trapping_math || !TARGET_SSE4_1)
38856 break;
38858 if (out_mode == DFmode && in_mode == DFmode)
38860 if (out_n == 2 && in_n == 2)
38861 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38862 else if (out_n == 4 && in_n == 4)
38863 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38864 else if (out_n == 8 && in_n == 8)
38865 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38867 if (out_mode == SFmode && in_mode == SFmode)
38869 if (out_n == 4 && in_n == 4)
38870 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38871 else if (out_n == 8 && in_n == 8)
38872 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38873 else if (out_n == 16 && in_n == 16)
38874 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38876 break;
38878 CASE_CFN_CEIL:
38879 /* The round insn does not trap on denormals. */
38880 if (flag_trapping_math || !TARGET_SSE4_1)
38881 break;
38883 if (out_mode == DFmode && in_mode == DFmode)
38885 if (out_n == 2 && in_n == 2)
38886 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38887 else if (out_n == 4 && in_n == 4)
38888 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38889 else if (out_n == 8 && in_n == 8)
38890 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38892 if (out_mode == SFmode && in_mode == SFmode)
38894 if (out_n == 4 && in_n == 4)
38895 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38896 else if (out_n == 8 && in_n == 8)
38897 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38898 else if (out_n == 16 && in_n == 16)
38899 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38901 break;
38903 CASE_CFN_TRUNC:
38904 /* The round insn does not trap on denormals. */
38905 if (flag_trapping_math || !TARGET_SSE4_1)
38906 break;
38908 if (out_mode == DFmode && in_mode == DFmode)
38910 if (out_n == 2 && in_n == 2)
38911 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38912 else if (out_n == 4 && in_n == 4)
38913 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38914 else if (out_n == 8 && in_n == 8)
38915 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38917 if (out_mode == SFmode && in_mode == SFmode)
38919 if (out_n == 4 && in_n == 4)
38920 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38921 else if (out_n == 8 && in_n == 8)
38922 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38923 else if (out_n == 16 && in_n == 16)
38924 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38926 break;
38928 CASE_CFN_RINT:
38929 /* The round insn does not trap on denormals. */
38930 if (flag_trapping_math || !TARGET_SSE4_1)
38931 break;
38933 if (out_mode == DFmode && in_mode == DFmode)
38935 if (out_n == 2 && in_n == 2)
38936 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38937 else if (out_n == 4 && in_n == 4)
38938 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38940 if (out_mode == SFmode && in_mode == SFmode)
38942 if (out_n == 4 && in_n == 4)
38943 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38944 else if (out_n == 8 && in_n == 8)
38945 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38947 break;
38949 CASE_CFN_FMA:
38950 if (out_mode == DFmode && in_mode == DFmode)
38952 if (out_n == 2 && in_n == 2)
38953 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38954 if (out_n == 4 && in_n == 4)
38955 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38957 if (out_mode == SFmode && in_mode == SFmode)
38959 if (out_n == 4 && in_n == 4)
38960 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38961 if (out_n == 8 && in_n == 8)
38962 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38964 break;
38966 default:
38967 break;
38970 /* Dispatch to a handler for a vectorization library. */
38971 if (ix86_veclib_handler)
38972 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38974 return NULL_TREE;
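As an example of how this hook is consulted (illustrative only): with options along the lines of -O3 -mavx -fno-trapping-math, the vectorizer asks for CFN_FLOOR with eight-element SFmode vectors and is handed IX86_BUILTIN_FLOORPS256 for a loop such as:

void floor_all (float *x, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = __builtin_floorf (x[i]);
}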
38977 /* Handler for an SVML-style interface to
38978 a library with vectorized intrinsics. */
38980 static tree
38981 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38983 char name[20];
38984 tree fntype, new_fndecl, args;
38985 unsigned arity;
38986 const char *bname;
38987 machine_mode el_mode, in_mode;
38988 int n, in_n;
38990 /* The SVML is suitable for unsafe math only. */
38991 if (!flag_unsafe_math_optimizations)
38992 return NULL_TREE;
38994 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38995 n = TYPE_VECTOR_SUBPARTS (type_out);
38996 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38997 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38998 if (el_mode != in_mode
38999 || n != in_n)
39000 return NULL_TREE;
39002 switch (fn)
39004 CASE_CFN_EXP:
39005 CASE_CFN_LOG:
39006 CASE_CFN_LOG10:
39007 CASE_CFN_POW:
39008 CASE_CFN_TANH:
39009 CASE_CFN_TAN:
39010 CASE_CFN_ATAN:
39011 CASE_CFN_ATAN2:
39012 CASE_CFN_ATANH:
39013 CASE_CFN_CBRT:
39014 CASE_CFN_SINH:
39015 CASE_CFN_SIN:
39016 CASE_CFN_ASINH:
39017 CASE_CFN_ASIN:
39018 CASE_CFN_COSH:
39019 CASE_CFN_COS:
39020 CASE_CFN_ACOSH:
39021 CASE_CFN_ACOS:
39022 if ((el_mode != DFmode || n != 2)
39023 && (el_mode != SFmode || n != 4))
39024 return NULL_TREE;
39025 break;
39027 default:
39028 return NULL_TREE;
39031 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39032 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39034 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39035 strcpy (name, "vmlsLn4");
39036 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39037 strcpy (name, "vmldLn2");
39038 else if (n == 4)
39040 sprintf (name, "vmls%s", bname+10);
39041 name[strlen (name)-1] = '4';
39043 else
39044 sprintf (name, "vmld%s2", bname+10);
39046 /* Convert to uppercase. */
39047 name[4] &= ~0x20;
39049 arity = 0;
39050 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39051 arity++;
39053 if (arity == 1)
39054 fntype = build_function_type_list (type_out, type_in, NULL);
39055 else
39056 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39058 /* Build a function declaration for the vectorized function. */
39059 new_fndecl = build_decl (BUILTINS_LOCATION,
39060 FUNCTION_DECL, get_identifier (name), fntype);
39061 TREE_PUBLIC (new_fndecl) = 1;
39062 DECL_EXTERNAL (new_fndecl) = 1;
39063 DECL_IS_NOVOPS (new_fndecl) = 1;
39064 TREE_READONLY (new_fndecl) = 1;
39066 return new_fndecl;
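A worked trace of the mangling above, assuming -mveclibabi=svml and a sinf call vectorized over four float lanes:

  bname                      = "__builtin_sinf"
  "vmls" + (bname + 10)      => "vmlssinf"
  last character set to '4'  => "vmlssin4"
  name[4] &= ~0x20           => "vmlsSin4"   /* the SVML entry point */

The double-precision path builds "vmldSin2" the same way, and log/logf are special-cased to "vmldLn2" / "vmlsLn4" to match SVML's "Ln" naming.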
39069 /* Handler for an ACML-style interface to
39070 a library with vectorized intrinsics. */
39072 static tree
39073 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39075 char name[20] = "__vr.._";
39076 tree fntype, new_fndecl, args;
39077 unsigned arity;
39078 const char *bname;
39079 machine_mode el_mode, in_mode;
39080 int n, in_n;
39082 /* ACML is 64-bit only and suitable for unsafe math only, as
39083 it does not correctly support parts of IEEE (such as denormals)
39084 with the required precision. */
39085 if (!TARGET_64BIT
39086 || !flag_unsafe_math_optimizations)
39087 return NULL_TREE;
39089 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39090 n = TYPE_VECTOR_SUBPARTS (type_out);
39091 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39092 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39093 if (el_mode != in_mode
39094 || n != in_n)
39095 return NULL_TREE;
39097 switch (fn)
39099 CASE_CFN_SIN:
39100 CASE_CFN_COS:
39101 CASE_CFN_EXP:
39102 CASE_CFN_LOG:
39103 CASE_CFN_LOG2:
39104 CASE_CFN_LOG10:
39105 if (el_mode == DFmode && n == 2)
39107 name[4] = 'd';
39108 name[5] = '2';
39110 else if (el_mode == SFmode && n == 4)
39112 name[4] = 's';
39113 name[5] = '4';
39115 else
39116 return NULL_TREE;
39117 break;
39119 default:
39120 return NULL_TREE;
39123 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39124 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39125 sprintf (name + 7, "%s", bname+10);
39127 arity = 0;
39128 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39129 arity++;
39131 if (arity == 1)
39132 fntype = build_function_type_list (type_out, type_in, NULL);
39133 else
39134 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39136 /* Build a function declaration for the vectorized function. */
39137 new_fndecl = build_decl (BUILTINS_LOCATION,
39138 FUNCTION_DECL, get_identifier (name), fntype);
39139 TREE_PUBLIC (new_fndecl) = 1;
39140 DECL_EXTERNAL (new_fndecl) = 1;
39141 DECL_IS_NOVOPS (new_fndecl) = 1;
39142 TREE_READONLY (new_fndecl) = 1;
39144 return new_fndecl;
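The ACML names fall out of the "__vr.._" template in the same way (a worked trace, assuming -mveclibabi=acml): for sin on two doubles, name[4] and name[5] become 'd' and '2', giving "__vrd2_", and appending bname + 10 yields "__vrd2_sin"; the four-lane float variant of sinf becomes "__vrs4_sinf".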
39147 /* Returns a decl of a function that implements gather load with
39148 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
39149 Return NULL_TREE if it is not available. */
39151 static tree
39152 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39153 const_tree index_type, int scale)
39155 bool si;
39156 enum ix86_builtins code;
39158 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39159 return NULL_TREE;
39161 if ((TREE_CODE (index_type) != INTEGER_TYPE
39162 && !POINTER_TYPE_P (index_type))
39163 || (TYPE_MODE (index_type) != SImode
39164 && TYPE_MODE (index_type) != DImode))
39165 return NULL_TREE;
39167 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39168 return NULL_TREE;
39170 /* v*gather* insn sign extends index to pointer mode. */
39171 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39172 && TYPE_UNSIGNED (index_type))
39173 return NULL_TREE;
39175 if (scale <= 0
39176 || scale > 8
39177 || (scale & (scale - 1)) != 0)
39178 return NULL_TREE;
39180 si = TYPE_MODE (index_type) == SImode;
39181 switch (TYPE_MODE (mem_vectype))
39183 case E_V2DFmode:
39184 if (TARGET_AVX512VL)
39185 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39186 else
39187 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39188 break;
39189 case E_V4DFmode:
39190 if (TARGET_AVX512VL)
39191 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39192 else
39193 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39194 break;
39195 case E_V2DImode:
39196 if (TARGET_AVX512VL)
39197 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39198 else
39199 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39200 break;
39201 case E_V4DImode:
39202 if (TARGET_AVX512VL)
39203 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39204 else
39205 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39206 break;
39207 case E_V4SFmode:
39208 if (TARGET_AVX512VL)
39209 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39210 else
39211 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39212 break;
39213 case E_V8SFmode:
39214 if (TARGET_AVX512VL)
39215 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39216 else
39217 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39218 break;
39219 case E_V4SImode:
39220 if (TARGET_AVX512VL)
39221 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39222 else
39223 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39224 break;
39225 case E_V8SImode:
39226 if (TARGET_AVX512VL)
39227 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39228 else
39229 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39230 break;
39231 case E_V8DFmode:
39232 if (TARGET_AVX512F)
39233 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39234 else
39235 return NULL_TREE;
39236 break;
39237 case E_V8DImode:
39238 if (TARGET_AVX512F)
39239 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39240 else
39241 return NULL_TREE;
39242 break;
39243 case E_V16SFmode:
39244 if (TARGET_AVX512F)
39245 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39246 else
39247 return NULL_TREE;
39248 break;
39249 case E_V16SImode:
39250 if (TARGET_AVX512F)
39251 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39252 else
39253 return NULL_TREE;
39254 break;
39255 default:
39256 return NULL_TREE;
39259 return ix86_get_builtin (code);
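Illustrative only: the kind of indexed load the vectorizer maps onto these gather builtins, assuming -O3 -mavx2 with a tuning that keeps gathers enabled:

void gather_loop (double *out, const double *data, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = data[idx[i]];   /* SImode index, DFmode data */
}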
39262 /* Returns a decl of a function that implements scatter store with
39263 register type VECTYPE and index type INDEX_TYPE and SCALE.
39264 Return NULL_TREE if it is not available. */
39266 static tree
39267 ix86_vectorize_builtin_scatter (const_tree vectype,
39268 const_tree index_type, int scale)
39270 bool si;
39271 enum ix86_builtins code;
39273 if (!TARGET_AVX512F)
39274 return NULL_TREE;
39276 if ((TREE_CODE (index_type) != INTEGER_TYPE
39277 && !POINTER_TYPE_P (index_type))
39278 || (TYPE_MODE (index_type) != SImode
39279 && TYPE_MODE (index_type) != DImode))
39280 return NULL_TREE;
39282 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39283 return NULL_TREE;
39285 /* v*scatter* insn sign extends index to pointer mode. */
39286 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39287 && TYPE_UNSIGNED (index_type))
39288 return NULL_TREE;
39290 /* Scale can be 1, 2, 4 or 8. */
39291 if (scale <= 0
39292 || scale > 8
39293 || (scale & (scale - 1)) != 0)
39294 return NULL_TREE;
39296 si = TYPE_MODE (index_type) == SImode;
39297 switch (TYPE_MODE (vectype))
39299 case E_V8DFmode:
39300 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39301 break;
39302 case E_V8DImode:
39303 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39304 break;
39305 case E_V16SFmode:
39306 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39307 break;
39308 case E_V16SImode:
39309 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39310 break;
39311 default:
39312 return NULL_TREE;
39315 return ix86_builtins[code];
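And the store-side counterpart (illustrative only, assuming -O3 -mavx512f), which this hook maps onto the scatter builtins above:

void scatter_loop (double *out, const double *data, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    out[idx[i]] = data[i];
}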
39318 /* Return true if it is safe to use the rsqrt optabs to optimize
39319 1.0/sqrt. */
39321 static bool
39322 use_rsqrt_p ()
39324 return (TARGET_SSE_MATH
39325 && flag_finite_math_only
39326 && !flag_trapping_math
39327 && flag_unsafe_math_optimizations);
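The guarded transformation is the usual reciprocal-square-root rewrite (illustrative only): with -ffast-math, which sets all three flags tested above, and SSE math, code like the following may be compiled with rsqrtss plus a Newton-Raphson refinement instead of sqrtss and a divide:

float inv_sqrt (float x)
{
  return 1.0f / __builtin_sqrtf (x);
}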
39330 /* Returns the decl of a target-specific builtin that implements
39331 the reciprocal of the function, or NULL_TREE if not available. */
39333 static tree
39334 ix86_builtin_reciprocal (tree fndecl)
39336 switch (DECL_FUNCTION_CODE (fndecl))
39338 /* Vectorized version of sqrt to rsqrt conversion. */
39339 case IX86_BUILTIN_SQRTPS_NR:
39340 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39342 case IX86_BUILTIN_SQRTPS_NR256:
39343 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39345 default:
39346 return NULL_TREE;
39350 /* Helper for avx_vpermilps256_operand et al. This is also used by
39351 the expansion functions to turn the parallel back into a mask.
39352 The return value is 0 for no match and the imm8+1 for a match. */
39355 int avx_vpermilp_parallel (rtx par, machine_mode mode)
39357 unsigned i, nelt = GET_MODE_NUNITS (mode);
39358 unsigned mask = 0;
39359 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39361 if (XVECLEN (par, 0) != (int) nelt)
39362 return 0;
39364 /* Validate that all of the elements are constants, and not totally
39365 out of range. Copy the data into an integral array to make the
39366 subsequent checks easier. */
39367 for (i = 0; i < nelt; ++i)
39369 rtx er = XVECEXP (par, 0, i);
39370 unsigned HOST_WIDE_INT ei;
39372 if (!CONST_INT_P (er))
39373 return 0;
39374 ei = INTVAL (er);
39375 if (ei >= nelt)
39376 return 0;
39377 ipar[i] = ei;
39380 switch (mode)
39382 case E_V8DFmode:
39383 /* In the 512-bit DFmode case, we can only move elements within
39384 a 128-bit lane. First fill the second part of the mask,
39385 then fallthru. */
39386 for (i = 4; i < 6; ++i)
39388 if (ipar[i] < 4 || ipar[i] >= 6)
39389 return 0;
39390 mask |= (ipar[i] - 4) << i;
39392 for (i = 6; i < 8; ++i)
39394 if (ipar[i] < 6)
39395 return 0;
39396 mask |= (ipar[i] - 6) << i;
39398 /* FALLTHRU */
39400 case E_V4DFmode:
39401 /* In the 256-bit DFmode case, we can only move elements within
39402 a 128-bit lane. */
39403 for (i = 0; i < 2; ++i)
39405 if (ipar[i] >= 2)
39406 return 0;
39407 mask |= ipar[i] << i;
39409 for (i = 2; i < 4; ++i)
39411 if (ipar[i] < 2)
39412 return 0;
39413 mask |= (ipar[i] - 2) << i;
39415 break;
39417 case E_V16SFmode:
39418 /* In 512 bit SFmode case, permutation in the upper 256 bits
39419 must mirror the permutation in the lower 256-bits. */
39420 for (i = 0; i < 8; ++i)
39421 if (ipar[i] + 8 != ipar[i + 8])
39422 return 0;
39423 /* FALLTHRU */
39425 case E_V8SFmode:
39426 /* In 256 bit SFmode case, we have full freedom of
39427 movement within the low 128-bit lane, but the high 128-bit
39428 lane must mirror the exact same pattern. */
39429 for (i = 0; i < 4; ++i)
39430 if (ipar[i] + 4 != ipar[i + 4])
39431 return 0;
39432 nelt = 4;
39433 /* FALLTHRU */
39435 case E_V2DFmode:
39436 case E_V4SFmode:
39437 /* In the 128-bit case, we've full freedom in the placement of
39438 the elements from the source operand. */
39439 for (i = 0; i < nelt; ++i)
39440 mask |= ipar[i] << (i * (nelt / 2));
39441 break;
39443 default:
39444 gcc_unreachable ();
39447 /* Make sure success has a non-zero value by adding one. */
39448 return mask + 1;
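A worked V4SFmode example: for the parallel (3 2 1 0) the loop above builds

  mask = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6 = 0x1b

and the function returns 0x1b + 1 = 0x1c; callers subtract one to recover the vpermilps imm8.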
39451 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39452 the expansion functions to turn the parallel back into a mask.
39453 The return value is 0 for no match and the imm8+1 for a match. */
39456 int avx_vperm2f128_parallel (rtx par, machine_mode mode)
39458 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39459 unsigned mask = 0;
39460 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39462 if (XVECLEN (par, 0) != (int) nelt)
39463 return 0;
39465 /* Validate that all of the elements are constants, and not totally
39466 out of range. Copy the data into an integral array to make the
39467 subsequent checks easier. */
39468 for (i = 0; i < nelt; ++i)
39470 rtx er = XVECEXP (par, 0, i);
39471 unsigned HOST_WIDE_INT ei;
39473 if (!CONST_INT_P (er))
39474 return 0;
39475 ei = INTVAL (er);
39476 if (ei >= 2 * nelt)
39477 return 0;
39478 ipar[i] = ei;
39481 /* Validate that the halves of the permute are halves. */
39482 for (i = 0; i < nelt2 - 1; ++i)
39483 if (ipar[i] + 1 != ipar[i + 1])
39484 return 0;
39485 for (i = nelt2; i < nelt - 1; ++i)
39486 if (ipar[i] + 1 != ipar[i + 1])
39487 return 0;
39489 /* Reconstruct the mask. */
39490 for (i = 0; i < 2; ++i)
39492 unsigned e = ipar[i * nelt2];
39493 if (e % nelt2)
39494 return 0;
39495 e /= nelt2;
39496 mask |= e << (i * 4);
39499 /* Make sure success has a non-zero value by adding one. */
39500 return mask + 1;
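A worked V4DFmode example: the parallel (2 3 4 5) selects the high half of the first operand followed by the low half of the second. Both halves are contiguous, the half selectors are 2 / 2 = 1 and 4 / 2 = 2, so

  mask = 1 | (2 << 4) = 0x21

and the function returns 0x22, i.e. the vperm2f128 imm8 plus one.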
39503 /* Return a register priority for hard reg REGNO. */
39504 static int
39505 ix86_register_priority (int hard_regno)
39507 /* ebp and r13 as the base always want a displacement, and r12 as the
39508 base always wants an index. So discourage their use in an
39509 address. */
39510 if (hard_regno == R12_REG || hard_regno == R13_REG)
39511 return 0;
39512 if (hard_regno == BP_REG)
39513 return 1;
39514 /* New x86-64 int registers result in bigger code size. Discourage
39515 them. */
39516 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39517 return 2;
39518 /* New x86-64 SSE registers result in bigger code size. Discourage
39519 them. */
39520 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39521 return 2;
39522 /* Usage of AX register results in smaller code. Prefer it. */
39523 if (hard_regno == AX_REG)
39524 return 4;
39525 return 3;
39528 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39530 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39531 QImode must go into class Q_REGS.
39532 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39533 movdf to do mem-to-mem moves through integer regs. */
39535 static reg_class_t
39536 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39538 machine_mode mode = GET_MODE (x);
39540 /* We're only allowed to return a subclass of CLASS. Many of the
39541 following checks fail for NO_REGS, so eliminate that early. */
39542 if (regclass == NO_REGS)
39543 return NO_REGS;
39545 /* All classes can load zeros. */
39546 if (x == CONST0_RTX (mode))
39547 return regclass;
39549 /* Force constants into memory if we are loading a (nonzero) constant into
39550 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39551 instructions to load from a constant. */
39552 if (CONSTANT_P (x)
39553 && (MAYBE_MMX_CLASS_P (regclass)
39554 || MAYBE_SSE_CLASS_P (regclass)
39555 || MAYBE_MASK_CLASS_P (regclass)))
39556 return NO_REGS;
39558 /* Floating-point constants need more complex checks. */
39559 if (CONST_DOUBLE_P (x))
39561 /* General regs can load everything. */
39562 if (INTEGER_CLASS_P (regclass))
39563 return regclass;
39565 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39566 zero above. We only want to wind up preferring 80387 registers if
39567 we plan on doing computation with them. */
39568 if (IS_STACK_MODE (mode)
39569 && standard_80387_constant_p (x) > 0)
39571 /* Limit class to FP regs. */
39572 if (FLOAT_CLASS_P (regclass))
39573 return FLOAT_REGS;
39574 else if (regclass == FP_TOP_SSE_REGS)
39575 return FP_TOP_REG;
39576 else if (regclass == FP_SECOND_SSE_REGS)
39577 return FP_SECOND_REG;
39580 return NO_REGS;
39583 /* Prefer SSE regs only, if we can use them for math. */
39584 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39585 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39587 /* Generally when we see PLUS here, it's the function invariant
39588 (plus soft-fp const_int), which can only be computed into general
39589 regs. */
39590 if (GET_CODE (x) == PLUS)
39591 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39593 /* QImode constants are easy to load, but non-constant QImode data
39594 must go into Q_REGS. */
39595 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39597 if (Q_CLASS_P (regclass))
39598 return regclass;
39599 else if (reg_class_subset_p (Q_REGS, regclass))
39600 return Q_REGS;
39601 else
39602 return NO_REGS;
39605 return regclass;
39608 /* Discourage putting floating-point values in SSE registers unless
39609 SSE math is being used, and likewise for the 387 registers. */
39610 static reg_class_t
39611 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39613 machine_mode mode = GET_MODE (x);
39615 /* Restrict the output reload class to the register bank that we are doing
39616 math on. If we would like not to return a subset of CLASS, reject this
39617 alternative: if reload cannot do this, it will still use its choice. */
39618 mode = GET_MODE (x);
39619 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39620 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39622 if (IS_STACK_MODE (mode))
39624 if (regclass == FP_TOP_SSE_REGS)
39625 return FP_TOP_REG;
39626 else if (regclass == FP_SECOND_SSE_REGS)
39627 return FP_SECOND_REG;
39628 else
39629 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39632 return regclass;
39635 static reg_class_t
39636 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39637 machine_mode mode, secondary_reload_info *sri)
39639 /* Double-word spills from general registers to non-offsettable memory
39640 references (zero-extended addresses) require special handling. */
39641 if (TARGET_64BIT
39642 && MEM_P (x)
39643 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39644 && INTEGER_CLASS_P (rclass)
39645 && !offsettable_memref_p (x))
39647 sri->icode = (in_p
39648 ? CODE_FOR_reload_noff_load
39649 : CODE_FOR_reload_noff_store);
39650 /* Add the cost of moving address to a temporary. */
39651 sri->extra_cost = 1;
39653 return NO_REGS;
39656 /* QImode spills from non-QI registers require an
39657 intermediate register on 32-bit targets. */
39658 if (mode == QImode
39659 && ((!TARGET_64BIT && !in_p
39660 && INTEGER_CLASS_P (rclass)
39661 && MAYBE_NON_Q_CLASS_P (rclass))
39662 || (!TARGET_AVX512DQ
39663 && MAYBE_MASK_CLASS_P (rclass))))
39665 int regno = true_regnum (x);
39667 /* Return Q_REGS if the operand is in memory. */
39668 if (regno == -1)
39669 return Q_REGS;
39671 return NO_REGS;
39674 /* This condition handles corner case where an expression involving
39675 pointers gets vectorized. We're trying to use the address of a
39676 stack slot as a vector initializer.
39678 (set (reg:V2DI 74 [ vect_cst_.2 ])
39679 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39681 Eventually frame gets turned into sp+offset like this:
39683 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39684 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39685 (const_int 392 [0x188]))))
39687 That later gets turned into:
39689 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39690 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39691 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39693 We'll have the following reload recorded:
39695 Reload 0: reload_in (DI) =
39696 (plus:DI (reg/f:DI 7 sp)
39697 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39698 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39699 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39700 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39701 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39702 reload_reg_rtx: (reg:V2DI 22 xmm1)
39704 Which isn't going to work since SSE instructions can't handle scalar
39705 additions. Returning GENERAL_REGS forces the addition into integer
39706 register and reload can handle subsequent reloads without problems. */
39708 if (in_p && GET_CODE (x) == PLUS
39709 && SSE_CLASS_P (rclass)
39710 && SCALAR_INT_MODE_P (mode))
39711 return GENERAL_REGS;
39713 return NO_REGS;
39716 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39718 static bool
39719 ix86_class_likely_spilled_p (reg_class_t rclass)
39721 switch (rclass)
39723 case AREG:
39724 case DREG:
39725 case CREG:
39726 case BREG:
39727 case AD_REGS:
39728 case SIREG:
39729 case DIREG:
39730 case SSE_FIRST_REG:
39731 case FP_TOP_REG:
39732 case FP_SECOND_REG:
39733 case BND_REGS:
39734 return true;
39736 default:
39737 break;
39740 return false;
39743 /* If we are copying between registers from different register sets
39744 (e.g. FP and integer), we may need a memory location.
39746 The function can't work reliably when one of the CLASSES is a class
39747 containing registers from multiple sets. We avoid this by never combining
39748 different sets in a single alternative in the machine description.
39749 Ensure that this constraint holds to avoid unexpected surprises.
39751 When STRICT is false, we are being called from REGISTER_MOVE_COST,
39752 so do not enforce these sanity checks.
39754 To optimize register_move_cost performance, define inline variant. */
39756 static inline bool
39757 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39758 reg_class_t class2, int strict)
39760 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39761 return false;
39763 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39764 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39765 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39766 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39767 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39768 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
39769 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
39770 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
39772 gcc_assert (!strict || lra_in_progress);
39773 return true;
39776 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39777 return true;
39779 /* Between mask and general, we have moves no larger than word size. */
39780 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
39781 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39782 return true;
39784 /* ??? This is a lie. We do have moves between mmx/general, and for
39785 mmx/sse2. But by saying we need secondary memory we discourage the
39786 register allocator from using the mmx registers unless needed. */
39787 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39788 return true;
39790 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39792 /* SSE1 doesn't have any direct moves from other classes. */
39793 if (!TARGET_SSE2)
39794 return true;
39796 /* If the target says that inter-unit moves are more expensive
39797 than moving through memory, then don't generate them. */
39798 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39799 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39800 return true;
39802 /* Between SSE and general, we have moves no larger than word size. */
39803 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39804 return true;
39807 return false;
39810 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
39812 static bool
39813 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39814 reg_class_t class2)
39816 return inline_secondary_memory_needed (mode, class1, class2, true);
39819 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
39821 get_secondary_mem widens integral modes to BITS_PER_WORD.
39822 There is no need to emit full 64 bit move on 64 bit targets
39823 for integral modes that can be moved using 32 bit move. */
39825 static machine_mode
39826 ix86_secondary_memory_needed_mode (machine_mode mode)
39828 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
39829 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
39830 return mode;
39833 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39835 On the 80386, this is the size of MODE in words,
39836 except in the FP regs, where a single reg is always enough. */
39838 static unsigned char
39839 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39841 if (MAYBE_INTEGER_CLASS_P (rclass))
39843 if (mode == XFmode)
39844 return (TARGET_64BIT ? 2 : 3);
39845 else if (mode == XCmode)
39846 return (TARGET_64BIT ? 4 : 6);
39847 else
39848 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39850 else
39852 if (COMPLEX_MODE_P (mode))
39853 return 2;
39854 else
39855 return 1;
39859 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
39861 static bool
39862 ix86_can_change_mode_class (machine_mode from, machine_mode to,
39863 reg_class_t regclass)
39865 if (from == to)
39866 return true;
39868 /* x87 registers can't do subreg at all, as all values are reformatted
39869 to extended precision. */
39870 if (MAYBE_FLOAT_CLASS_P (regclass))
39871 return false;
39873 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39875 /* Vector registers do not support QI or HImode loads. If we don't
39876 disallow a change to these modes, reload will assume it's ok to
39877 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39878 the vec_dupv4hi pattern. */
39879 if (GET_MODE_SIZE (from) < 4)
39880 return false;
39883 return true;
39886 /* Return index of MODE in the sse load/store tables. */
39888 static inline int
39889 sse_store_index (machine_mode mode)
39891 switch (GET_MODE_SIZE (mode))
39893 case 4:
39894 return 0;
39895 case 8:
39896 return 1;
39897 case 16:
39898 return 2;
39899 case 32:
39900 return 3;
39901 case 64:
39902 return 4;
39903 default:
39904 return -1;
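/* A stand-alone sketch of the mapping above, written against plain byte
   sizes so it can be exercised outside the compiler; the helper name is
   made up and nothing in GCC uses it.  */
#if 0
static int
sse_cost_table_index_from_size (int size_in_bytes)
{
  switch (size_in_bytes)
    {
    case 4:  return 0;	/* 32-bit scalars, e.g. SFmode  */
    case 8:  return 1;	/* 64-bit scalars, e.g. DFmode  */
    case 16: return 2;	/* 128-bit vectors  */
    case 32: return 3;	/* 256-bit vectors  */
    case 64: return 4;	/* 512-bit vectors  */
    default: return -1;	/* no sse_load/sse_store entry  */
    }
}
#endif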
39908 /* Return the cost of moving data of mode M between a
39909 register and memory. A value of 2 is the default; this cost is
39910 relative to those in `REGISTER_MOVE_COST'.
39912 This function is used extensively by register_move_cost, which is used
39913 to build tables at startup, so make it inline in this case.
39914 When IN is 2, return maximum of in and out move cost.
39916 If moving between registers and memory is more expensive than
39917 between two registers, you should define this macro to express the
39918 relative cost.
39920 Also model the increased cost of moving QImode registers in non-Q_REGS
39921 classes. */
39923 static inline int
39924 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39925 int in)
39927 int cost;
39928 if (FLOAT_CLASS_P (regclass))
39930 int index;
39931 switch (mode)
39933 case E_SFmode:
39934 index = 0;
39935 break;
39936 case E_DFmode:
39937 index = 1;
39938 break;
39939 case E_XFmode:
39940 index = 2;
39941 break;
39942 default:
39943 return 100;
39945 if (in == 2)
39946 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39947 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39949 if (SSE_CLASS_P (regclass))
39951 int index = sse_store_index (mode);
39952 if (index == -1)
39953 return 100;
39954 if (in == 2)
39955 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39956 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39958 if (MMX_CLASS_P (regclass))
39960 int index;
39961 switch (GET_MODE_SIZE (mode))
39963 case 4:
39964 index = 0;
39965 break;
39966 case 8:
39967 index = 1;
39968 break;
39969 default:
39970 return 100;
39972 if (in == 2)
39973 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39974 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39976 switch (GET_MODE_SIZE (mode))
39978 case 1:
39979 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39981 if (!in)
39982 return ix86_cost->int_store[0];
39983 if (TARGET_PARTIAL_REG_DEPENDENCY
39984 && optimize_function_for_speed_p (cfun))
39985 cost = ix86_cost->movzbl_load;
39986 else
39987 cost = ix86_cost->int_load[0];
39988 if (in == 2)
39989 return MAX (cost, ix86_cost->int_store[0]);
39990 return cost;
39992 else
39994 if (in == 2)
39995 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39996 if (in)
39997 return ix86_cost->movzbl_load;
39998 else
39999 return ix86_cost->int_store[0] + 4;
40001 break;
40002 case 2:
40003 if (in == 2)
40004 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40005 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40006 default:
40007 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
40008 if (mode == TFmode)
40009 mode = XFmode;
40010 if (in == 2)
40011 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40012 else if (in)
40013 cost = ix86_cost->int_load[2];
40014 else
40015 cost = ix86_cost->int_store[2];
40016 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
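/* Worked example for the integer fall-through above: a DImode value in
   GENERAL_REGS on a 32-bit target uses the int_load[2]/int_store[2]
   entries and is moved as CEIL (8, 4) = 2 word-sized pieces, so a load
   costs 2 * int_load[2].  A TFmode value is first treated as XFmode
   before the same per-word scaling is applied.  */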
40020 static int
40021 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40022 bool in)
40024 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40028 /* Return the cost of moving data from a register in class CLASS1 to
40029 one in class CLASS2.
40031 It is not required that the cost always equal 2 when FROM is the same as TO;
40032 on some machines it is expensive to move between registers if they are not
40033 general registers. */
40035 static int
40036 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40037 reg_class_t class2_i)
40039 enum reg_class class1 = (enum reg_class) class1_i;
40040 enum reg_class class2 = (enum reg_class) class2_i;
40042 /* In case we require secondary memory, compute cost of the store followed
40043 by load. In order to avoid bad register allocation choices, we need
40044 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40046 if (inline_secondary_memory_needed (mode, class1, class2, false))
40048 int cost = 1;
40050 cost += inline_memory_move_cost (mode, class1, 2);
40051 cost += inline_memory_move_cost (mode, class2, 2);
40053 /* In case of copying from a general purpose register we may emit multiple
40054 stores followed by a single load, causing a memory size mismatch stall.
40055 Count this as an arbitrarily high cost of 20. */
40056 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40057 && TARGET_MEMORY_MISMATCH_STALL
40058 && targetm.class_max_nregs (class1, mode)
40059 > targetm.class_max_nregs (class2, mode))
40060 cost += 20;
40062 /* In the case of FP/MMX moves, the registers actually overlap, and we
40063 have to switch modes in order to treat them differently. */
40064 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40065 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40066 cost += 20;
40068 return cost;
40071 /* Moves between SSE/MMX and integer unit are expensive. */
40072 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40073 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40075 /* ??? By keeping returned value relatively high, we limit the number
40076 of moves between integer and MMX/SSE registers for all targets.
40077 Additionally, high value prevents problem with x86_modes_tieable_p(),
40078 where integer modes in MMX/SSE registers are not tieable
40079 because of missing QImode and HImode moves to, from or between
40080 MMX/SSE registers. */
40081 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40082 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40084 if (MAYBE_FLOAT_CLASS_P (class1))
40085 return ix86_cost->fp_move;
40086 if (MAYBE_SSE_CLASS_P (class1))
40088 if (GET_MODE_BITSIZE (mode) <= 128)
40089 return ix86_cost->xmm_move;
40090 if (GET_MODE_BITSIZE (mode) <= 256)
40091 return ix86_cost->ymm_move;
40092 return ix86_cost->zmm_move;
40094 if (MAYBE_MMX_CLASS_P (class1))
40095 return ix86_cost->mmx_move;
40096 return 2;
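/* Rough trace of the function above for an SImode copy between
   GENERAL_REGS and SSE_REGS on an SSE2 target that allows inter-unit
   moves: no secondary memory is needed, so the result is
   MAX (8, ssemmx_to_integer), which keeps such cross-unit moves
   noticeably more expensive than the GPR-to-GPR cost of 2 returned at
   the end.  */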
40099 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
40100 words of a value of mode MODE but can be less for certain modes in
40101 special long registers.
40103 Actually there are no two word move instructions for consecutive
40104 registers. And only registers 0-3 may have mov byte instructions
40105 applied to them. */
40107 static unsigned int
40108 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40110 if (GENERAL_REGNO_P (regno))
40112 if (mode == XFmode)
40113 return TARGET_64BIT ? 2 : 3;
40114 if (mode == XCmode)
40115 return TARGET_64BIT ? 4 : 6;
40116 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40118 if (COMPLEX_MODE_P (mode))
40119 return 2;
40120 if (mode == V64SFmode || mode == V64SImode)
40121 return 4;
40122 return 1;
40125 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40127 static bool
40128 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40130 /* Flags, and only flags, can hold CCmode values. */
40131 if (CC_REGNO_P (regno))
40132 return GET_MODE_CLASS (mode) == MODE_CC;
40133 if (GET_MODE_CLASS (mode) == MODE_CC
40134 || GET_MODE_CLASS (mode) == MODE_RANDOM
40135 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40136 return false;
40137 if (STACK_REGNO_P (regno))
40138 return VALID_FP_MODE_P (mode);
40139 if (MASK_REGNO_P (regno))
40140 return (VALID_MASK_REG_MODE (mode)
40141 || (TARGET_AVX512BW
40142 && VALID_MASK_AVX512BW_MODE (mode)));
40143 if (BND_REGNO_P (regno))
40144 return VALID_BND_REG_MODE (mode);
40145 if (SSE_REGNO_P (regno))
40147 /* We implement the move patterns for all vector modes into and
40148 out of SSE registers, even when no operation instructions
40149 are available. */
40151 /* For AVX-512 we allow, regardless of regno:
40152 - XI mode
40153 - any of 512-bit wide vector mode
40154 - any scalar mode. */
40155 if (TARGET_AVX512F
40156 && (mode == XImode
40157 || VALID_AVX512F_REG_MODE (mode)
40158 || VALID_AVX512F_SCALAR_MODE (mode)))
40159 return true;
40161 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40162 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40163 && MOD4_SSE_REGNO_P (regno)
40164 && mode == V64SFmode)
40165 return true;
40167 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40168 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40169 && MOD4_SSE_REGNO_P (regno)
40170 && mode == V64SImode)
40171 return true;
40173 /* TODO check for QI/HI scalars. */
40174 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
40175 if (TARGET_AVX512VL
40176 && (mode == OImode
40177 || mode == TImode
40178 || VALID_AVX256_REG_MODE (mode)
40179 || VALID_AVX512VL_128_REG_MODE (mode)))
40180 return true;
40182 /* xmm16-xmm31 are only available for AVX-512. */
40183 if (EXT_REX_SSE_REGNO_P (regno))
40184 return false;
40186 /* OImode and AVX modes are available only when AVX is enabled. */
40187 return ((TARGET_AVX
40188 && VALID_AVX256_REG_OR_OI_MODE (mode))
40189 || VALID_SSE_REG_MODE (mode)
40190 || VALID_SSE2_REG_MODE (mode)
40191 || VALID_MMX_REG_MODE (mode)
40192 || VALID_MMX_REG_MODE_3DNOW (mode));
40194 if (MMX_REGNO_P (regno))
40196 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40197 so if the register is available at all, then we can move data of
40198 the given mode into or out of it. */
40199 return (VALID_MMX_REG_MODE (mode)
40200 || VALID_MMX_REG_MODE_3DNOW (mode));
40203 if (mode == QImode)
40205 /* Take care for QImode values - they can be in non-QI regs,
40206 but then they do cause partial register stalls. */
40207 if (ANY_QI_REGNO_P (regno))
40208 return true;
40209 if (!TARGET_PARTIAL_REG_STALL)
40210 return true;
40211 /* LRA checks if the hard register is OK for the given mode.
40212 QImode values can live in non-QI regs, so we allow all
40213 registers here. */
40214 if (lra_in_progress)
40215 return true;
40216 return !can_create_pseudo_p ();
40218 /* We handle both integer and floats in the general purpose registers. */
40219 else if (VALID_INT_MODE_P (mode))
40220 return true;
40221 else if (VALID_FP_MODE_P (mode))
40222 return true;
40223 else if (VALID_DFP_MODE_P (mode))
40224 return true;
40225 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40226 on to use that value in smaller contexts, this can easily force a
40227 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40228 supporting DImode, allow it. */
40229 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40230 return true;
40232 return false;
40235 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40236 saves SSE registers across calls is Win64 (thus no need to check the
40237 current ABI here), and with AVX enabled Win64 only guarantees that
40238 the low 16 bytes are saved. */
40240 static bool
40241 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40243 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40246 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40247 tieable integer mode. */
40249 static bool
40250 ix86_tieable_integer_mode_p (machine_mode mode)
40252 switch (mode)
40254 case E_HImode:
40255 case E_SImode:
40256 return true;
40258 case E_QImode:
40259 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40261 case E_DImode:
40262 return TARGET_64BIT;
40264 default:
40265 return false;
40269 /* Implement TARGET_MODES_TIEABLE_P.
40271 Return true if MODE1 is accessible in a register that can hold MODE2
40272 without copying. That is, all register classes that can hold MODE2
40273 can also hold MODE1. */
40275 static bool
40276 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40278 if (mode1 == mode2)
40279 return true;
40281 if (ix86_tieable_integer_mode_p (mode1)
40282 && ix86_tieable_integer_mode_p (mode2))
40283 return true;
40285 /* MODE2 being XFmode implies fp stack or general regs, which means we
40286 can tie any smaller floating point modes to it. Note that we do not
40287 tie this with TFmode. */
40288 if (mode2 == XFmode)
40289 return mode1 == SFmode || mode1 == DFmode;
40291 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40292 that we can tie it with SFmode. */
40293 if (mode2 == DFmode)
40294 return mode1 == SFmode;
40296 /* If MODE2 is only appropriate for an SSE register, then tie with
40297 any other mode acceptable to SSE registers. */
40298 if (GET_MODE_SIZE (mode2) == 32
40299 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40300 return (GET_MODE_SIZE (mode1) == 32
40301 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40302 if (GET_MODE_SIZE (mode2) == 16
40303 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40304 return (GET_MODE_SIZE (mode1) == 16
40305 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40307 /* If MODE2 is appropriate for an MMX register, then tie
40308 with any other mode acceptable to MMX registers. */
40309 if (GET_MODE_SIZE (mode2) == 8
40310 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40311 return (GET_MODE_SIZE (mode1) == 8
40312 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40314 return false;
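/* Examples of the rules above: SImode and HImode tie with each other
   (both are tieable integer modes); SFmode ties with DFmode and XFmode,
   since anything that can hold the wider FP mode can hold the narrower
   one; V4SFmode and V2DImode tie (both are 16-byte SSE modes); but
   DImode does not tie with V2DImode, because their sizes differ.  */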
40317 /* Return the cost of moving between two registers of mode MODE. */
40319 static int
40320 ix86_set_reg_reg_cost (machine_mode mode)
40322 unsigned int units = UNITS_PER_WORD;
40324 switch (GET_MODE_CLASS (mode))
40326 default:
40327 break;
40329 case MODE_CC:
40330 units = GET_MODE_SIZE (CCmode);
40331 break;
40333 case MODE_FLOAT:
40334 if ((TARGET_SSE && mode == TFmode)
40335 || (TARGET_80387 && mode == XFmode)
40336 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40337 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40338 units = GET_MODE_SIZE (mode);
40339 break;
40341 case MODE_COMPLEX_FLOAT:
40342 if ((TARGET_SSE && mode == TCmode)
40343 || (TARGET_80387 && mode == XCmode)
40344 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40345 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40346 units = GET_MODE_SIZE (mode);
40347 break;
40349 case MODE_VECTOR_INT:
40350 case MODE_VECTOR_FLOAT:
40351 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40352 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40353 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40354 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40355 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40356 units = GET_MODE_SIZE (mode);
40359 /* Return the cost of moving between two registers of mode MODE,
40360 assuming that the move will be in pieces of at most UNITS bytes. */
40361 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
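/* Worked examples of the division above: a DImode copy on a 32-bit
   target is CEIL (8, 4) = 2 insns; a V4SFmode copy with SSE enabled
   uses units = 16, so CEIL (16, 16) = 1 insn; a DImode copy on a
   64-bit target is likewise a single insn.  */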
40364 /* Return the cost of a vector operation in MODE given that the scalar
40365 version has cost COST. If PARALLEL is true, assume that the CPU has
40366 more than one unit performing the operation. */
40368 static int
40369 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40371 if (!VECTOR_MODE_P (mode))
40372 return cost;
40374 if (!parallel)
40375 return cost * GET_MODE_NUNITS (mode);
40376 if (GET_MODE_BITSIZE (mode) == 128
40377 && TARGET_SSE_SPLIT_REGS)
40378 return cost * 2;
40379 if (GET_MODE_BITSIZE (mode) > 128
40380 && TARGET_AVX128_OPTIMAL)
40381 return cost * GET_MODE_BITSIZE (mode) / 128;
40382 return cost;
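/* A stand-alone numeric sketch of the scaling above, with made-up
   parameters standing in for the target flags; it only illustrates the
   arithmetic and is not used anywhere.  */
#if 0
static int
vec_cost_sketch (int scalar_cost, int nunits, int bitsize,
		 bool parallel, bool split_128, bool avx128_optimal)
{
  if (!parallel)
    return scalar_cost * nunits;		/* one unit at a time  */
  if (bitsize == 128 && split_128)
    return scalar_cost * 2;			/* 128 bits done in halves  */
  if (bitsize > 128 && avx128_optimal)
    return scalar_cost * bitsize / 128;		/* wide op split into 128-bit ops  */
  return scalar_cost;				/* full-width vector unit  */
}
#endif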
40385 /* Return cost of multiplication in MODE. */
40387 static int
40388 ix86_multiplication_cost (const struct processor_costs *cost,
40389 enum machine_mode mode)
40391 machine_mode inner_mode = mode;
40392 if (VECTOR_MODE_P (mode))
40393 inner_mode = GET_MODE_INNER (mode);
40395 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40396 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40397 else if (X87_FLOAT_MODE_P (mode))
40398 return cost->fmul;
40399 else if (FLOAT_MODE_P (mode))
40400 return ix86_vec_cost (mode,
40401 inner_mode == DFmode
40402 ? cost->mulsd : cost->mulss, true);
40403 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40405 /* V*QImode is emulated with 7-13 insns. */
40406 if (mode == V16QImode || mode == V32QImode)
40408 int extra = 11;
40409 if (TARGET_XOP && mode == V16QImode)
40410 extra = 5;
40411 else if (TARGET_SSSE3)
40412 extra = 6;
40413 return ix86_vec_cost (mode,
40414 cost->mulss * 2 + cost->sse_op * extra,
40415 true);
40417 /* V*DImode is emulated with 5-8 insns. */
40418 else if (mode == V2DImode || mode == V4DImode)
40420 if (TARGET_XOP && mode == V2DImode)
40421 return ix86_vec_cost (mode,
40422 cost->mulss * 2 + cost->sse_op * 3,
40423 true);
40424 else
40425 return ix86_vec_cost (mode,
40426 cost->mulss * 3 + cost->sse_op * 5,
40427 true);
40429 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40430 insns, including two PMULUDQ. */
40431 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40432 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40433 true);
40434 else
40435 return ix86_vec_cost (mode, cost->mulss, true);
40437 else
40438 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
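/* Worked example for the vector-integer branch above: a V16QImode
   multiply on a plain SSE2 target (no XOP, no SSSE3) is costed as
   mulss * 2 + sse_op * 11, then scaled by ix86_vec_cost; with SSSE3
   the sse_op factor drops to 6, and with XOP to 5.  */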
40441 /* Return cost of division in MODE. */
40443 static int
40444 ix86_division_cost (const struct processor_costs *cost,
40445 enum machine_mode mode)
40447 machine_mode inner_mode = mode;
40448 if (VECTOR_MODE_P (mode))
40449 inner_mode = GET_MODE_INNER (mode);
40451 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40452 return inner_mode == DFmode ? cost->divsd : cost->divss;
40453 else if (X87_FLOAT_MODE_P (mode))
40454 return cost->fdiv;
40455 else if (FLOAT_MODE_P (mode))
40456 return ix86_vec_cost (mode,
40457 inner_mode == DFmode ? cost->divsd : cost->divss,
40458 true);
40459 else
40460 return cost->divide[MODE_INDEX (mode)];
40463 /* Return cost of shift in MODE.
40464 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40465 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40466 SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND.
40468 SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
40470 static int
40471 ix86_shift_rotate_cost (const struct processor_costs *cost,
40472 enum machine_mode mode, bool constant_op1,
40473 HOST_WIDE_INT op1_val,
40474 bool speed,
40475 bool and_in_op1,
40476 bool shift_and_truncate,
40477 bool *skip_op0, bool *skip_op1)
40479 if (skip_op0)
40480 *skip_op0 = *skip_op1 = false;
40481 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40483 /* V*QImode is emulated with 1-11 insns. */
40484 if (mode == V16QImode || mode == V32QImode)
40486 int count = 11;
40487 if (TARGET_XOP && mode == V16QImode)
40489 /* For XOP we use vpshab, which requires a broadcast of the
40490 value to the variable shift insn. For constants this
40491 means a V16Q const in mem; even when we can perform the
40492 shift with one insn set the cost to prefer paddb. */
40493 if (constant_op1)
40495 if (skip_op1)
40496 *skip_op1 = true;
40497 return ix86_vec_cost (mode,
40498 cost->sse_op
40499 + (speed
40501 : COSTS_N_BYTES
40502 (GET_MODE_UNIT_SIZE (mode))), true);
40504 count = 3;
40506 else if (TARGET_SSSE3)
40507 count = 7;
40508 return ix86_vec_cost (mode, cost->sse_op * count, true);
40510 else
40511 return ix86_vec_cost (mode, cost->sse_op, true);
40513 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40515 if (constant_op1)
40517 if (op1_val > 32)
40518 return cost->shift_const + COSTS_N_INSNS (2);
40519 else
40520 return cost->shift_const * 2;
40522 else
40524 if (and_in_op1)
40525 return cost->shift_var * 2;
40526 else
40527 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40530 else
40532 if (constant_op1)
40533 return cost->shift_const;
40534 else if (shift_and_truncate)
40536 if (skip_op0)
40537 *skip_op0 = *skip_op1 = true;
40538 /* Return the cost after shift-and truncation. */
40539 return cost->shift_var;
40541 else
40542 return cost->shift_var;
40544 return cost->shift_const;
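/* Worked examples for the scalar branches above: a DImode shift by a
   constant on a 32-bit target costs shift_const * 2 when the count is
   at most 32 and shift_const + COSTS_N_INSNS (2) when it is larger; a
   variable-count DImode shift on the same target costs
   shift_var * 6 + COSTS_N_INSNS (2) unless the count is already masked
   by an AND, in which case it is shift_var * 2.  */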
40547 /* Compute a (partial) cost for rtx X. Return true if the complete
40548 cost has been computed, and false if subexpressions should be
40549 scanned. In either case, *TOTAL contains the cost result. */
40551 static bool
40552 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40553 int *total, bool speed)
40555 rtx mask;
40556 enum rtx_code code = GET_CODE (x);
40557 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40558 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40559 int src_cost;
40561 switch (code)
40563 case SET:
40564 if (register_operand (SET_DEST (x), VOIDmode)
40565 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40567 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40568 return true;
40571 if (register_operand (SET_SRC (x), VOIDmode))
40572 /* Avoid potentially incorrect high cost from rtx_costs
40573 for non-tieable SUBREGs. */
40574 src_cost = 0;
40575 else
40577 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40579 if (CONSTANT_P (SET_SRC (x)))
40580 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40581 a small value, possibly zero for cheap constants. */
40582 src_cost += COSTS_N_INSNS (1);
40585 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40586 return true;
40588 case CONST_INT:
40589 case CONST:
40590 case LABEL_REF:
40591 case SYMBOL_REF:
40592 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40593 *total = 3;
40594 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40595 *total = 2;
40596 else if (flag_pic && SYMBOLIC_CONST (x)
40597 && !(TARGET_64BIT
40598 && (GET_CODE (x) == LABEL_REF
40599 || (GET_CODE (x) == SYMBOL_REF
40600 && SYMBOL_REF_LOCAL_P (x))))
40601 /* Use 0 cost for CONST to improve its propagation. */
40602 && (TARGET_64BIT || GET_CODE (x) != CONST))
40603 *total = 1;
40604 else
40605 *total = 0;
40606 return true;
40608 case CONST_DOUBLE:
40609 if (IS_STACK_MODE (mode))
40610 switch (standard_80387_constant_p (x))
40612 case -1:
40613 case 0:
40614 break;
40615 case 1: /* 0.0 */
40616 *total = 1;
40617 return true;
40618 default: /* Other constants */
40619 *total = 2;
40620 return true;
40622 /* FALLTHRU */
40624 case CONST_VECTOR:
40625 switch (standard_sse_constant_p (x, mode))
40627 case 0:
40628 break;
40629 case 1: /* 0: xor eliminates false dependency */
40630 *total = 0;
40631 return true;
40632 default: /* -1: cmp contains false dependency */
40633 *total = 1;
40634 return true;
40636 /* FALLTHRU */
40638 case CONST_WIDE_INT:
40639 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40640 it'll probably end up. Add a penalty for size. */
40641 *total = (COSTS_N_INSNS (1)
40642 + (!TARGET_64BIT && flag_pic)
40643 + (GET_MODE_SIZE (mode) <= 4
40644 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40645 return true;
40647 case ZERO_EXTEND:
40648 /* The zero extension is often completely free on x86_64, so make
40649 it as cheap as possible. */
40650 if (TARGET_64BIT && mode == DImode
40651 && GET_MODE (XEXP (x, 0)) == SImode)
40652 *total = 1;
40653 else if (TARGET_ZERO_EXTEND_WITH_AND)
40654 *total = cost->add;
40655 else
40656 *total = cost->movzx;
40657 return false;
40659 case SIGN_EXTEND:
40660 *total = cost->movsx;
40661 return false;
40663 case ASHIFT:
40664 if (SCALAR_INT_MODE_P (mode)
40665 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40666 && CONST_INT_P (XEXP (x, 1)))
40668 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40669 if (value == 1)
40671 *total = cost->add;
40672 return false;
40674 if ((value == 2 || value == 3)
40675 && cost->lea <= cost->shift_const)
40677 *total = cost->lea;
40678 return false;
40681 /* FALLTHRU */
40683 case ROTATE:
40684 case ASHIFTRT:
40685 case LSHIFTRT:
40686 case ROTATERT:
40687 bool skip_op0, skip_op1;
40688 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40689 CONST_INT_P (XEXP (x, 1))
40690 ? INTVAL (XEXP (x, 1)) : -1,
40691 speed,
40692 GET_CODE (XEXP (x, 1)) == AND,
40693 SUBREG_P (XEXP (x, 1))
40694 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40695 &skip_op0, &skip_op1);
40696 if (skip_op0 || skip_op1)
40698 if (!skip_op0)
40699 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40700 if (!skip_op1)
40701 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40702 return true;
40704 return false;
40706 case FMA:
40708 rtx sub;
40710 gcc_assert (FLOAT_MODE_P (mode));
40711 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40713 *total = ix86_vec_cost (mode,
40714 mode == SFmode ? cost->fmass : cost->fmasd,
40715 true);
40716 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40718 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
40719 sub = XEXP (x, 0);
40720 if (GET_CODE (sub) == NEG)
40721 sub = XEXP (sub, 0);
40722 *total += rtx_cost (sub, mode, FMA, 0, speed);
40724 sub = XEXP (x, 2);
40725 if (GET_CODE (sub) == NEG)
40726 sub = XEXP (sub, 0);
40727 *total += rtx_cost (sub, mode, FMA, 2, speed);
40728 return true;
40731 case MULT:
40732 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40734 rtx op0 = XEXP (x, 0);
40735 rtx op1 = XEXP (x, 1);
40736 int nbits;
40737 if (CONST_INT_P (XEXP (x, 1)))
40739 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40740 for (nbits = 0; value != 0; value &= value - 1)
40741 nbits++;
40743 else
40744 /* This is arbitrary. */
40745 nbits = 7;
40747 /* Compute costs correctly for widening multiplication. */
40748 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40749 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40750 == GET_MODE_SIZE (mode))
40752 int is_mulwiden = 0;
40753 machine_mode inner_mode = GET_MODE (op0);
40755 if (GET_CODE (op0) == GET_CODE (op1))
40756 is_mulwiden = 1, op1 = XEXP (op1, 0);
40757 else if (CONST_INT_P (op1))
40759 if (GET_CODE (op0) == SIGN_EXTEND)
40760 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40761 == INTVAL (op1);
40762 else
40763 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40766 if (is_mulwiden)
40767 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40770 *total = (cost->mult_init[MODE_INDEX (mode)]
40771 + nbits * cost->mult_bit
40772 + rtx_cost (op0, mode, outer_code, opno, speed)
40773 + rtx_cost (op1, mode, outer_code, opno, speed));
40775 return true;
40777 *total = ix86_multiplication_cost (cost, mode);
40778 return false;
40780 case DIV:
40781 case UDIV:
40782 case MOD:
40783 case UMOD:
40784 *total = ix86_division_cost (cost, mode);
40785 return false;
40787 case PLUS:
40788 if (GET_MODE_CLASS (mode) == MODE_INT
40789 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40791 if (GET_CODE (XEXP (x, 0)) == PLUS
40792 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40793 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40794 && CONSTANT_P (XEXP (x, 1)))
40796 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40797 if (val == 2 || val == 4 || val == 8)
40799 *total = cost->lea;
40800 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40801 outer_code, opno, speed);
40802 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40803 outer_code, opno, speed);
40804 *total += rtx_cost (XEXP (x, 1), mode,
40805 outer_code, opno, speed);
40806 return true;
40809 else if (GET_CODE (XEXP (x, 0)) == MULT
40810 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40812 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40813 if (val == 2 || val == 4 || val == 8)
40815 *total = cost->lea;
40816 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40817 outer_code, opno, speed);
40818 *total += rtx_cost (XEXP (x, 1), mode,
40819 outer_code, opno, speed);
40820 return true;
40823 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40825 /* Add with carry, ignore the cost of adding a carry flag. */
40826 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
40827 *total = cost->add;
40828 else
40830 *total = cost->lea;
40831 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40832 outer_code, opno, speed);
40835 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40836 outer_code, opno, speed);
40837 *total += rtx_cost (XEXP (x, 1), mode,
40838 outer_code, opno, speed);
40839 return true;
40842 /* FALLTHRU */
40844 case MINUS:
40845 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
40846 if (GET_MODE_CLASS (mode) == MODE_INT
40847 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
40848 && GET_CODE (XEXP (x, 0)) == MINUS
40849 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
40851 *total = cost->add;
40852 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40853 outer_code, opno, speed);
40854 *total += rtx_cost (XEXP (x, 1), mode,
40855 outer_code, opno, speed);
40856 return true;
40859 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40861 *total = cost->addss;
40862 return false;
40864 else if (X87_FLOAT_MODE_P (mode))
40866 *total = cost->fadd;
40867 return false;
40869 else if (FLOAT_MODE_P (mode))
40871 *total = ix86_vec_cost (mode, cost->addss, true);
40872 return false;
40874 /* FALLTHRU */
40876 case AND:
40877 case IOR:
40878 case XOR:
40879 if (GET_MODE_CLASS (mode) == MODE_INT
40880 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40882 *total = (cost->add * 2
40883 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40884 << (GET_MODE (XEXP (x, 0)) != DImode))
40885 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40886 << (GET_MODE (XEXP (x, 1)) != DImode)));
40887 return true;
40889 /* FALLTHRU */
40891 case NEG:
40892 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40894 *total = cost->sse_op;
40895 return false;
40897 else if (X87_FLOAT_MODE_P (mode))
40899 *total = cost->fchs;
40900 return false;
40902 else if (FLOAT_MODE_P (mode))
40904 *total = ix86_vec_cost (mode, cost->sse_op, true);
40905 return false;
40907 /* FALLTHRU */
40909 case NOT:
40910 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40911 *total = ix86_vec_cost (mode, cost->sse_op, true);
40912 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40913 *total = cost->add * 2;
40914 else
40915 *total = cost->add;
40916 return false;
40918 case COMPARE:
40919 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40920 && XEXP (XEXP (x, 0), 1) == const1_rtx
40921 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40922 && XEXP (x, 1) == const0_rtx)
40924 /* This kind of construct is implemented using test[bwl].
40925 Treat it as if we had an AND. */
40926 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40927 *total = (cost->add
40928 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40929 opno, speed)
40930 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40931 return true;
40934 /* The embedded comparison operand is completely free. */
40935 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40936 && XEXP (x, 1) == const0_rtx)
40937 *total = 0;
40939 return false;
40941 case FLOAT_EXTEND:
40942 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40943 *total = 0;
40944 else
40945 *total = ix86_vec_cost (mode, cost->addss, true);
40946 return false;
40948 case FLOAT_TRUNCATE:
40949 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40950 *total = cost->fadd;
40951 else
40952 *total = ix86_vec_cost (mode, cost->addss, true);
40953 return false;
40955 case ABS:
40956 /* SSE requires memory load for the constant operand. It may make
40957 sense to account for this. Of course the constant operand may or
40958 may not be reused. */
40959 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40960 *total = cost->sse_op;
40961 else if (X87_FLOAT_MODE_P (mode))
40962 *total = cost->fabs;
40963 else if (FLOAT_MODE_P (mode))
40964 *total = ix86_vec_cost (mode, cost->sse_op, true);
40965 return false;
40967 case SQRT:
40968 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40969 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40970 else if (X87_FLOAT_MODE_P (mode))
40971 *total = cost->fsqrt;
40972 else if (FLOAT_MODE_P (mode))
40973 *total = ix86_vec_cost (mode,
40974 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40975 true);
40976 return false;
40978 case UNSPEC:
40979 if (XINT (x, 1) == UNSPEC_TP)
40980 *total = 0;
40981 return false;
40983 case VEC_SELECT:
40984 case VEC_CONCAT:
40985 case VEC_DUPLICATE:
40986 /* ??? Assume all of these vector manipulation patterns are
40987 recognizable. In which case they all pretty much have the
40988 same cost. */
40989 *total = cost->sse_op;
40990 return true;
40991 case VEC_MERGE:
40992 mask = XEXP (x, 2);
40993 /* This is a masked instruction; assume the same cost
40994 as the non-masked variant. */
40995 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40996 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40997 else
40998 *total = cost->sse_op;
40999 return true;
41001 default:
41002 return false;
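/* Worked example for the PLUS handling above: the address-style
   expression (plus (mult (reg) (const_int 4)) (reg)) is recognized as a
   single lea, so its cost is cost->lea plus the cost of the two inner
   register operands, rather than a multiply followed by an add.  */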
41006 #if TARGET_MACHO
41008 static int current_machopic_label_num;
41010 /* Given a symbol name and its associated stub, write out the
41011 definition of the stub. */
41013 void
41014 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41016 unsigned int length;
41017 char *binder_name, *symbol_name, lazy_ptr_name[32];
41018 int label = ++current_machopic_label_num;
41020 /* For 64-bit we shouldn't get here. */
41021 gcc_assert (!TARGET_64BIT);
41023 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41024 symb = targetm.strip_name_encoding (symb);
41026 length = strlen (stub);
41027 binder_name = XALLOCAVEC (char, length + 32);
41028 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41030 length = strlen (symb);
41031 symbol_name = XALLOCAVEC (char, length + 32);
41032 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41034 sprintf (lazy_ptr_name, "L%d$lz", label);
41036 if (MACHOPIC_ATT_STUB)
41037 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41038 else if (MACHOPIC_PURE)
41039 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41040 else
41041 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41043 fprintf (file, "%s:\n", stub);
41044 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41046 if (MACHOPIC_ATT_STUB)
41048 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41050 else if (MACHOPIC_PURE)
41052 /* PIC stub. */
41053 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41054 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41055 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41056 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41057 label, lazy_ptr_name, label);
41058 fprintf (file, "\tjmp\t*%%ecx\n");
41060 else
41061 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41063 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41064 it needs no stub-binding-helper. */
41065 if (MACHOPIC_ATT_STUB)
41066 return;
41068 fprintf (file, "%s:\n", binder_name);
41070 if (MACHOPIC_PURE)
41072 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41073 fprintf (file, "\tpushl\t%%ecx\n");
41075 else
41076 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41078 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41080 /* N.B. Keep the correspondence of these
41081 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41082 old-pic/new-pic/non-pic stubs; altering this will break
41083 compatibility with existing dylibs. */
41084 if (MACHOPIC_PURE)
41086 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41087 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41089 else
41090 /* 16-byte -mdynamic-no-pic stub. */
41091 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41093 fprintf (file, "%s:\n", lazy_ptr_name);
41094 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41095 fprintf (file, ASM_LONG "%s\n", binder_name);
41097 #endif /* TARGET_MACHO */
41099 /* Order the registers for register allocator. */
41101 void
41102 x86_order_regs_for_local_alloc (void)
41104 int pos = 0;
41105 int i;
41107 /* First allocate the local general purpose registers. */
41108 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41109 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41110 reg_alloc_order [pos++] = i;
41112 /* Global general purpose registers. */
41113 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41114 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41115 reg_alloc_order [pos++] = i;
41117 /* x87 registers come first in case we are doing FP math
41118 using them. */
41119 if (!TARGET_SSE_MATH)
41120 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41121 reg_alloc_order [pos++] = i;
41123 /* SSE registers. */
41124 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41125 reg_alloc_order [pos++] = i;
41126 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41127 reg_alloc_order [pos++] = i;
41129 /* Extended REX SSE registers. */
41130 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41131 reg_alloc_order [pos++] = i;
41133 /* Mask register. */
41134 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41135 reg_alloc_order [pos++] = i;
41137 /* MPX bound registers. */
41138 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41139 reg_alloc_order [pos++] = i;
41141 /* x87 registers. */
41142 if (TARGET_SSE_MATH)
41143 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41144 reg_alloc_order [pos++] = i;
41146 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41147 reg_alloc_order [pos++] = i;
41149 /* Initialize the rest of array as we do not allocate some registers
41150 at all. */
41151 while (pos < FIRST_PSEUDO_REGISTER)
41152 reg_alloc_order [pos++] = 0;
41155 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41156 in struct attribute_spec handler. */
41157 static tree
41158 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41159 bool *no_add_attrs)
41161 if (TREE_CODE (*node) != FUNCTION_TYPE
41162 && TREE_CODE (*node) != METHOD_TYPE
41163 && TREE_CODE (*node) != FIELD_DECL
41164 && TREE_CODE (*node) != TYPE_DECL)
41166 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41167 name);
41168 *no_add_attrs = true;
41169 return NULL_TREE;
41171 if (TARGET_64BIT)
41173 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41174 name);
41175 *no_add_attrs = true;
41176 return NULL_TREE;
41178 if (is_attribute_p ("callee_pop_aggregate_return", name))
41180 tree cst;
41182 cst = TREE_VALUE (args);
41183 if (TREE_CODE (cst) != INTEGER_CST)
41185 warning (OPT_Wattributes,
41186 "%qE attribute requires an integer constant argument",
41187 name);
41188 *no_add_attrs = true;
41190 else if (compare_tree_int (cst, 0) != 0
41191 && compare_tree_int (cst, 1) != 0)
41193 warning (OPT_Wattributes,
41194 "argument to %qE attribute is neither zero, nor one",
41195 name);
41196 *no_add_attrs = true;
41199 return NULL_TREE;
41202 return NULL_TREE;
41205 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
41206 struct attribute_spec.handler. */
41207 static tree
41208 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41209 bool *no_add_attrs)
41211 if (TREE_CODE (*node) != FUNCTION_TYPE
41212 && TREE_CODE (*node) != METHOD_TYPE
41213 && TREE_CODE (*node) != FIELD_DECL
41214 && TREE_CODE (*node) != TYPE_DECL)
41216 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41217 name);
41218 *no_add_attrs = true;
41219 return NULL_TREE;
41222 /* Can combine regparm with all attributes but fastcall. */
41223 if (is_attribute_p ("ms_abi", name))
41225 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41227 error ("ms_abi and sysv_abi attributes are not compatible");
41230 return NULL_TREE;
41232 else if (is_attribute_p ("sysv_abi", name))
41234 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41236 error ("ms_abi and sysv_abi attributes are not compatible");
41239 return NULL_TREE;
41242 return NULL_TREE;
41245 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41246 struct attribute_spec.handler. */
41247 static tree
41248 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41249 bool *no_add_attrs)
41251 tree *type = NULL;
41252 if (DECL_P (*node))
41254 if (TREE_CODE (*node) == TYPE_DECL)
41255 type = &TREE_TYPE (*node);
41257 else
41258 type = node;
41260 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41262 warning (OPT_Wattributes, "%qE attribute ignored",
41263 name);
41264 *no_add_attrs = true;
41267 else if ((is_attribute_p ("ms_struct", name)
41268 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41269 || ((is_attribute_p ("gcc_struct", name)
41270 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41272 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41273 name);
41274 *no_add_attrs = true;
41277 return NULL_TREE;
41280 static tree
41281 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41282 bool *no_add_attrs)
41284 if (TREE_CODE (*node) != FUNCTION_DECL)
41286 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41287 name);
41288 *no_add_attrs = true;
41291 if (is_attribute_p ("indirect_branch", name))
41293 tree cst = TREE_VALUE (args);
41294 if (TREE_CODE (cst) != STRING_CST)
41296 warning (OPT_Wattributes,
41297 "%qE attribute requires a string constant argument",
41298 name);
41299 *no_add_attrs = true;
41301 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41302 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41303 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41304 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41306 warning (OPT_Wattributes,
41307 "argument to %qE attribute is not "
41308 "(keep|thunk|thunk-inline|thunk-extern)", name);
41309 *no_add_attrs = true;
41313 if (is_attribute_p ("function_return", name))
41315 tree cst = TREE_VALUE (args);
41316 if (TREE_CODE (cst) != STRING_CST)
41318 warning (OPT_Wattributes,
41319 "%qE attribute requires a string constant argument",
41320 name);
41321 *no_add_attrs = true;
41323 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41324 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41325 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41326 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41328 warning (OPT_Wattributes,
41329 "argument to %qE attribute is not "
41330 "(keep|thunk|thunk-inline|thunk-extern)", name);
41331 *no_add_attrs = true;
41335 return NULL_TREE;
41338 static tree
41339 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41340 int, bool *)
41342 return NULL_TREE;
41345 static tree
41346 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41348 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41349 but the function type contains args and return type data. */
41350 tree func_type = *node;
41351 tree return_type = TREE_TYPE (func_type);
41353 int nargs = 0;
41354 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41355 while (current_arg_type
41356 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41358 if (nargs == 0)
41360 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41361 error ("interrupt service routine should have a pointer "
41362 "as the first argument");
41364 else if (nargs == 1)
41366 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41367 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41368 error ("interrupt service routine should have unsigned %s"
41369 "int as the second argument",
41370 TARGET_64BIT
41371 ? (TARGET_X32 ? "long long " : "long ")
41372 : "");
41374 nargs++;
41375 current_arg_type = TREE_CHAIN (current_arg_type);
41377 if (!nargs || nargs > 2)
41378 error ("interrupt service routine can only have a pointer argument "
41379 "and an optional integer argument");
41380 if (! VOID_TYPE_P (return_type))
41381 error ("interrupt service routine can't have non-void return value");
41383 return NULL_TREE;
41386 static bool
41387 ix86_ms_bitfield_layout_p (const_tree record_type)
41389 return ((TARGET_MS_BITFIELD_LAYOUT
41390 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41391 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41394 /* Returns an expression indicating where the this parameter is
41395 located on entry to the FUNCTION. */
41397 static rtx
41398 x86_this_parameter (tree function)
41400 tree type = TREE_TYPE (function);
41401 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41402 int nregs;
41404 if (TARGET_64BIT)
41406 const int *parm_regs;
41408 if (ix86_function_type_abi (type) == MS_ABI)
41409 parm_regs = x86_64_ms_abi_int_parameter_registers;
41410 else
41411 parm_regs = x86_64_int_parameter_registers;
41412 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41415 nregs = ix86_function_regparm (type, function);
41417 if (nregs > 0 && !stdarg_p (type))
41419 int regno;
41420 unsigned int ccvt = ix86_get_callcvt (type);
41422 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41423 regno = aggr ? DX_REG : CX_REG;
41424 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41426 regno = CX_REG;
41427 if (aggr)
41428 return gen_rtx_MEM (SImode,
41429 plus_constant (Pmode, stack_pointer_rtx, 4));
41431 else
41433 regno = AX_REG;
41434 if (aggr)
41436 regno = DX_REG;
41437 if (nregs == 1)
41438 return gen_rtx_MEM (SImode,
41439 plus_constant (Pmode,
41440 stack_pointer_rtx, 4));
41443 return gen_rtx_REG (SImode, regno);
41446 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41447 aggr ? 8 : 4));
41450 /* Determine whether x86_output_mi_thunk can succeed. */
41452 static bool
41453 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41454 const_tree function)
41456 /* 64-bit can handle anything. */
41457 if (TARGET_64BIT)
41458 return true;
41460 /* For 32-bit, everything's fine if we have one free register. */
41461 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41462 return true;
41464 /* Need a free register for vcall_offset. */
41465 if (vcall_offset)
41466 return false;
41468 /* Need a free register for GOT references. */
41469 if (flag_pic && !targetm.binds_local_p (function))
41470 return false;
41472 /* Otherwise ok. */
41473 return true;
41476 /* Output the assembler code for a thunk function. THUNK_DECL is the
41477 declaration for the thunk function itself, FUNCTION is the decl for
41478 the target function. DELTA is an immediate constant offset to be
41479 added to THIS. If VCALL_OFFSET is nonzero, the word at
41480 *(*this + vcall_offset) should be added to THIS. */
41482 static void
41483 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41484 HOST_WIDE_INT vcall_offset, tree function)
41486 rtx this_param = x86_this_parameter (function);
41487 rtx this_reg, tmp, fnaddr;
41488 unsigned int tmp_regno;
41489 rtx_insn *insn;
41491 if (TARGET_64BIT)
41492 tmp_regno = R10_REG;
41493 else
41495 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41496 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41497 tmp_regno = AX_REG;
41498 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41499 tmp_regno = DX_REG;
41500 else
41501 tmp_regno = CX_REG;
41504 emit_note (NOTE_INSN_PROLOGUE_END);
41506 /* If CET branch protection is enabled, insert an ENDBR instruction. */
41507 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
41508 emit_insn (gen_nop_endbr ());
41510 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41511 pull it in now and let DELTA benefit. */
41512 if (REG_P (this_param))
41513 this_reg = this_param;
41514 else if (vcall_offset)
41516 /* Put the this parameter into %eax. */
41517 this_reg = gen_rtx_REG (Pmode, AX_REG);
41518 emit_move_insn (this_reg, this_param);
41520 else
41521 this_reg = NULL_RTX;
41523 /* Adjust the this parameter by a fixed constant. */
41524 if (delta)
41526 rtx delta_rtx = GEN_INT (delta);
41527 rtx delta_dst = this_reg ? this_reg : this_param;
41529 if (TARGET_64BIT)
41531 if (!x86_64_general_operand (delta_rtx, Pmode))
41533 tmp = gen_rtx_REG (Pmode, tmp_regno);
41534 emit_move_insn (tmp, delta_rtx);
41535 delta_rtx = tmp;
41539 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41542 /* Adjust the this parameter by a value stored in the vtable. */
41543 if (vcall_offset)
41545 rtx vcall_addr, vcall_mem, this_mem;
41547 tmp = gen_rtx_REG (Pmode, tmp_regno);
41549 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41550 if (Pmode != ptr_mode)
41551 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41552 emit_move_insn (tmp, this_mem);
41554 /* Adjust the this parameter. */
41555 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41556 if (TARGET_64BIT
41557 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41559 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41560 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41561 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41564 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41565 if (Pmode != ptr_mode)
41566 emit_insn (gen_addsi_1_zext (this_reg,
41567 gen_rtx_REG (ptr_mode,
41568 REGNO (this_reg)),
41569 vcall_mem));
41570 else
41571 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41574 /* If necessary, drop THIS back to its stack slot. */
41575 if (this_reg && this_reg != this_param)
41576 emit_move_insn (this_param, this_reg);
41578 fnaddr = XEXP (DECL_RTL (function), 0);
41579 if (TARGET_64BIT)
41581 if (!flag_pic || targetm.binds_local_p (function)
41582 || TARGET_PECOFF)
41584 else
41586 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41587 tmp = gen_rtx_CONST (Pmode, tmp);
41588 fnaddr = gen_const_mem (Pmode, tmp);
41591 else
41593 if (!flag_pic || targetm.binds_local_p (function))
41595 #if TARGET_MACHO
41596 else if (TARGET_MACHO)
41598 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41599 fnaddr = XEXP (fnaddr, 0);
41601 #endif /* TARGET_MACHO */
41602 else
41604 tmp = gen_rtx_REG (Pmode, CX_REG);
41605 output_set_got (tmp, NULL_RTX);
41607 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41608 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41609 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41610 fnaddr = gen_const_mem (Pmode, fnaddr);
41614 /* Our sibling call patterns do not allow memories, because we have no
41615 predicate that can distinguish between frame and non-frame memory.
41616 For our purposes here, we can get away with (ab)using a jump pattern,
41617 because we're going to do no optimization. */
41618 if (MEM_P (fnaddr))
41620 if (sibcall_insn_operand (fnaddr, word_mode))
41622 fnaddr = XEXP (DECL_RTL (function), 0);
41623 tmp = gen_rtx_MEM (QImode, fnaddr);
41624 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41625 tmp = emit_call_insn (tmp);
41626 SIBLING_CALL_P (tmp) = 1;
41628 else
41629 emit_jump_insn (gen_indirect_jump (fnaddr));
41631 else
41633 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41635 // CM_LARGE_PIC always uses pseudo PIC register which is
41636 // uninitialized. Since FUNCTION is local and calling it
41637 // doesn't go through PLT, we use scratch register %r11 as
41638 // PIC register and initialize it here.
41639 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41640 ix86_init_large_pic_reg (tmp_regno);
41641 fnaddr = legitimize_pic_address (fnaddr,
41642 gen_rtx_REG (Pmode, tmp_regno));
41645 if (!sibcall_insn_operand (fnaddr, word_mode))
41647 tmp = gen_rtx_REG (word_mode, tmp_regno);
41648 if (GET_MODE (fnaddr) != word_mode)
41649 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41650 emit_move_insn (tmp, fnaddr);
41651 fnaddr = tmp;
41654 tmp = gen_rtx_MEM (QImode, fnaddr);
41655 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41656 tmp = emit_call_insn (tmp);
41657 SIBLING_CALL_P (tmp) = 1;
41659 emit_barrier ();
41661 /* Emit just enough of rest_of_compilation to get the insns emitted.
41662 Note that use_thunk calls assemble_start_function et al. */
41663 insn = get_insns ();
41664 shorten_branches (insn);
41665 final_start_function (insn, file, 1);
41666 final (insn, file, 1);
41667 final_end_function ();
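/* Illustration (not part of the code): for a simple 64-bit thunk with
   DELTA = -16 and no VCALL_OFFSET, the sequence generated above boils
   down to roughly

	addq	$-16, %rdi	# adjust the this pointer
	jmp	target_function	# sibling call, never returns here

   with the vtable-indirect adjustment only emitted when VCALL_OFFSET is
   nonzero.  */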
41670 static void
41671 x86_file_start (void)
41673 default_file_start ();
41674 if (TARGET_16BIT)
41675 fputs ("\t.code16gcc\n", asm_out_file);
41676 #if TARGET_MACHO
41677 darwin_file_start ();
41678 #endif
41679 if (X86_FILE_START_VERSION_DIRECTIVE)
41680 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41681 if (X86_FILE_START_FLTUSED)
41682 fputs ("\t.global\t__fltused\n", asm_out_file);
41683 if (ix86_asm_dialect == ASM_INTEL)
41684 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41688 x86_field_alignment (tree type, int computed)
41690 machine_mode mode;
41692 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41693 return computed;
41694 if (TARGET_IAMCU)
41695 return iamcu_alignment (type, computed);
41696 mode = TYPE_MODE (strip_array_types (type));
41697 if (mode == DFmode || mode == DCmode
41698 || GET_MODE_CLASS (mode) == MODE_INT
41699 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41700 return MIN (32, computed);
41701 return computed;
41704 /* Print call to TARGET to FILE. */
41706 static void
41707 x86_print_call_or_nop (FILE *file, const char *target)
41709 if (flag_nop_mcount)
41710 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41711 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41712 else
41713 fprintf (file, "1:\tcall\t%s\n", target);
41716 /* Output assembler code to FILE to increment profiler label # LABELNO
41717 for profiling a function entry. */
41718 void
41719 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41721 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41722 : MCOUNT_NAME);
41723 if (TARGET_64BIT)
41725 #ifndef NO_PROFILE_COUNTERS
41726 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41727 #endif
41729 if (!TARGET_PECOFF && flag_pic)
41730 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41731 else
41732 x86_print_call_or_nop (file, mcount_name);
41734 else if (flag_pic)
41736 #ifndef NO_PROFILE_COUNTERS
41737 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41738 LPREFIX, labelno);
41739 #endif
41740 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41742 else
41744 #ifndef NO_PROFILE_COUNTERS
41745 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41746 LPREFIX, labelno);
41747 #endif
41748 x86_print_call_or_nop (file, mcount_name);
41751 if (flag_record_mcount)
41753 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41754 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41755 fprintf (file, "\t.previous\n");
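/* Illustration of the output above for a 64-bit, non-PIC, -pg build
   without profile counters: each instrumented function starts with

	1:	call	mcount

   and, when -mrecord-mcount is in use, the address of that call site is
   recorded in the __mcount_loc section via the ".quad 1b" directive.  */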
41759 /* We don't have exact information about the insn sizes, but we may assume
41760 quite safely that we are informed about all 1 byte insns and memory
41761 address sizes. This is enough to eliminate unnecessary padding in
41762 99% of cases. */
41765 ix86_min_insn_size (rtx_insn *insn)
41767 int l = 0, len;
41769 if (!INSN_P (insn) || !active_insn_p (insn))
41770 return 0;
41772 /* Discard alignments we've emitted and jump instructions. */
41773 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41774 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41775 return 0;
41777 /* Important case - calls are always 5 bytes.
41778 It is common to have many calls in a row. */
41779 if (CALL_P (insn)
41780 && symbolic_reference_mentioned_p (PATTERN (insn))
41781 && !SIBLING_CALL_P (insn))
41782 return 5;
41783 len = get_attr_length (insn);
41784 if (len <= 1)
41785 return 1;
41787 /* For normal instructions we rely on get_attr_length being exact,
41788 with a few exceptions. */
41789 if (!JUMP_P (insn))
41791 enum attr_type type = get_attr_type (insn);
41793 switch (type)
41795 case TYPE_MULTI:
41796 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41797 || asm_noperands (PATTERN (insn)) >= 0)
41798 return 0;
41799 break;
41800 case TYPE_OTHER:
41801 case TYPE_FCMP:
41802 break;
41803 default:
41804 /* Otherwise trust get_attr_length. */
41805 return len;
41808 l = get_attr_length_address (insn);
41809 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41810 l = 4;
41812 if (l)
41813 return 1+l;
41814 else
41815 return 2;
41818 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41820 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41821 16-byte window. */
41823 static void
41824 ix86_avoid_jump_mispredicts (void)
41826 rtx_insn *insn, *start = get_insns ();
41827 int nbytes = 0, njumps = 0;
41828 bool isjump = false;
41830 /* Look for all minimal intervals of instructions containing 4 jumps.
41831 The intervals are bounded by START and INSN.  NBYTES is the total
41832 size of the instructions in the interval, including INSN and not
41833 including START.  When NBYTES is smaller than 16, it is possible
41834 that the ends of START and INSN land in the same 16-byte window.
41836 The smallest offset in the window at which INSN can start is when START
41837 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
41838 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41840 Don't consider an asm goto as a jump: while it can contain a jump, it
41841 doesn't have to, control can transfer to its label(s) by other means,
41842 and we also estimate the minimum length of all asm stmts as 0.  */
41843 for (insn = start; insn; insn = NEXT_INSN (insn))
41845 int min_size;
41847 if (LABEL_P (insn))
41849 int align = label_to_alignment (insn);
41850 int max_skip = label_to_max_skip (insn);
41852 if (max_skip > 15)
41853 max_skip = 15;
41854 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41855 already in the current 16 byte page, because otherwise
41856 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41857 bytes to reach 16 byte boundary. */
41858 if (align <= 0
41859 || (align <= 3 && max_skip != (1 << align) - 1))
41860 max_skip = 0;
41861 if (dump_file)
41862 fprintf (dump_file, "Label %i with max_skip %i\n",
41863 INSN_UID (insn), max_skip);
41864 if (max_skip)
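/* Account for up to MAX_SKIP bytes of alignment padding before the label
   by dropping instructions from the front of the window until
   NBYTES + MAX_SKIP fits within 16 bytes.  */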
41866 while (nbytes + max_skip >= 16)
41868 start = NEXT_INSN (start);
41869 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41870 || CALL_P (start))
41871 njumps--, isjump = true;
41872 else
41873 isjump = false;
41874 nbytes -= ix86_min_insn_size (start);
41877 continue;
41880 min_size = ix86_min_insn_size (insn);
41881 nbytes += min_size;
41882 if (dump_file)
41883 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41884 INSN_UID (insn), min_size);
41885 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41886 || CALL_P (insn))
41887 njumps++;
41888 else
41889 continue;
41891 while (njumps > 3)
41893 start = NEXT_INSN (start);
41894 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41895 || CALL_P (start))
41896 njumps--, isjump = true;
41897 else
41898 isjump = false;
41899 nbytes -= ix86_min_insn_size (start);
41901 gcc_assert (njumps >= 0);
41902 if (dump_file)
41903 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41904 INSN_UID (start), INSN_UID (insn), nbytes);
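/* Four jumps (the three in the current window plus the one just dropped
   from its front) could otherwise share one 16-byte window; pad before
   INSN so that it starts in the next window.  */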
41906 if (njumps == 3 && isjump && nbytes < 16)
41908 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
41910 if (dump_file)
41911 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41912 INSN_UID (insn), padsize);
41913 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41917 #endif
41919 /* AMD Athlon works faster
41920 when RET is not the destination of a conditional jump or directly preceded
41921 by another jump instruction.  We avoid the penalty by inserting NOP just
41922 before the RET instructions in such cases.  */
41923 static void
41924 ix86_pad_returns (void)
41926 edge e;
41927 edge_iterator ei;
41929 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41931 basic_block bb = e->src;
41932 rtx_insn *ret = BB_END (bb);
41933 rtx_insn *prev;
41934 bool replace = false;
41936 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41937 || optimize_bb_for_size_p (bb))
41938 continue;
41939 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41940 if (active_insn_p (prev) || LABEL_P (prev))
41941 break;
41942 if (prev && LABEL_P (prev))
41944 edge e;
41945 edge_iterator ei;
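/* The return follows a label: if any predecessor reaches that label by a
   taken branch rather than by falling through, the return can be the
   target of a jump and must be rewritten.  */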
41947 FOR_EACH_EDGE (e, ei, bb->preds)
41948 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41949 && !(e->flags & EDGE_FALLTHRU))
41951 replace = true;
41952 break;
41955 if (!replace)
41957 prev = prev_active_insn (ret);
41958 if (prev
41959 && ((JUMP_P (prev) && any_condjump_p (prev))
41960 || CALL_P (prev)))
41961 replace = true;
41962 /* Empty functions get a branch mispredict even when
41963 the jump destination is not visible to us.  */
41964 if (!prev && !optimize_function_for_size_p (cfun))
41965 replace = true;
41967 if (replace)
41969 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41970 delete_insn (ret);
41975 /* Count the minimum number of instructions in BB. Return 4 if the
41976 number of instructions >= 4. */
41978 static int
41979 ix86_count_insn_bb (basic_block bb)
41981 rtx_insn *insn;
41982 int insn_count = 0;
41984 /* Count number of instructions in this block. Return 4 if the number
41985 of instructions >= 4. */
41986 FOR_BB_INSNS (bb, insn)
41988 /* This only happens in exit blocks.  */
41989 if (JUMP_P (insn)
41990 && ANY_RETURN_P (PATTERN (insn)))
41991 break;
41993 if (NONDEBUG_INSN_P (insn)
41994 && GET_CODE (PATTERN (insn)) != USE
41995 && GET_CODE (PATTERN (insn)) != CLOBBER)
41997 insn_count++;
41998 if (insn_count >= 4)
41999 return insn_count;
42003 return insn_count;
42007 /* Count the minimum number of instructions in a code path through BB.
42008 Return 4 if the number of instructions >= 4.  */
42010 static int
42011 ix86_count_insn (basic_block bb)
42013 edge e;
42014 edge_iterator ei;
42015 int min_prev_count;
42017 /* Only bother counting instructions along paths with no
42018 more than 2 basic blocks between entry and exit. Given
42019 that BB has an edge to exit, determine if a predecessor
42020 of BB has an edge from entry. If so, compute the number
42021 of instructions in the predecessor block. If there
42022 happen to be multiple such blocks, compute the minimum. */
42023 min_prev_count = 4;
42024 FOR_EACH_EDGE (e, ei, bb->preds)
42026 edge prev_e;
42027 edge_iterator prev_ei;
42029 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42031 min_prev_count = 0;
42032 break;
42034 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42036 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42038 int count = ix86_count_insn_bb (e->src);
42039 if (count < min_prev_count)
42040 min_prev_count = count;
42041 break;
42046 if (min_prev_count < 4)
42047 min_prev_count += ix86_count_insn_bb (bb);
42049 return min_prev_count;
42052 /* Pad short function to 4 instructions. */
42054 static void
42055 ix86_pad_short_function (void)
42057 edge e;
42058 edge_iterator ei;
42060 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42062 rtx_insn *ret = BB_END (e->src);
42063 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42065 int insn_count = ix86_count_insn (e->src);
42067 /* Pad short function. */
42068 if (insn_count < 4)
42070 rtx_insn *insn = ret;
42072 /* Find epilogue. */
42073 while (insn
42074 && (!NOTE_P (insn)
42075 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42076 insn = PREV_INSN (insn);
42078 if (!insn)
42079 insn = ret;
42081 /* Two NOPs count as one instruction. */
42082 insn_count = 2 * (4 - insn_count);
42083 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42089 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42090 the epilogue, the Windows system unwinder will apply epilogue logic and
42091 produce incorrect offsets. This can be avoided by adding a nop between
42092 the last insn that can throw and the first insn of the epilogue. */
42094 static void
42095 ix86_seh_fixup_eh_fallthru (void)
42097 edge e;
42098 edge_iterator ei;
42100 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42102 rtx_insn *insn, *next;
42104 /* Find the beginning of the epilogue. */
42105 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42106 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42107 break;
42108 if (insn == NULL)
42109 continue;
42111 /* We only care about preceding insns that can throw. */
42112 insn = prev_active_insn (insn);
42113 if (insn == NULL || !can_throw_internal (insn))
42114 continue;
42116 /* Do not separate calls from their debug information. */
42117 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42118 if (NOTE_P (next)
42119 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42120 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42121 insn = next;
42122 else
42123 break;
42125 emit_insn_after (gen_nops (const1_rtx), insn);
42129 /* Given a register number BASE, the lowest of a group of registers, update
42130 regsets IN and OUT with the registers that should be avoided in input
42131 and output operands respectively when trying to avoid generating a modr/m
42132 byte for -mmitigate-rop. */
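/* A register-register modr/m byte that matches a return opcode (0xc2,
   0xc3, 0xca or 0xcb) has 0 or 1 in its reg field and 2 or 3 in its r/m
   field; following the convention set up here, the first two registers of
   each group are treated as risky outputs and the next two as risky
   inputs.  */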
42134 static void
42135 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42137 SET_HARD_REG_BIT (out, base);
42138 SET_HARD_REG_BIT (out, base + 1);
42139 SET_HARD_REG_BIT (in, base + 2);
42140 SET_HARD_REG_BIT (in, base + 3);
42143 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42144 that certain encodings of modr/m bytes do not occur. */
42145 static void
42146 ix86_mitigate_rop (void)
42148 HARD_REG_SET input_risky;
42149 HARD_REG_SET output_risky;
42150 HARD_REG_SET inout_risky;
42152 CLEAR_HARD_REG_SET (output_risky);
42153 CLEAR_HARD_REG_SET (input_risky);
42154 SET_HARD_REG_BIT (output_risky, AX_REG);
42155 SET_HARD_REG_BIT (output_risky, CX_REG);
42156 SET_HARD_REG_BIT (input_risky, BX_REG);
42157 SET_HARD_REG_BIT (input_risky, DX_REG);
42158 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42159 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42160 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42161 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42162 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42163 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42164 COPY_HARD_REG_SET (inout_risky, input_risky);
42165 IOR_HARD_REG_SET (inout_risky, output_risky);
42167 df_note_add_problem ();
42168 /* Fix up what stack-regs did. */
42169 df_insn_rescan_all ();
42170 df_analyze ();
42172 regrename_init (true);
42173 regrename_analyze (NULL);
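/* First pass: collect the register-renaming chains feeding the operands
   that would produce a risky modr/m byte; renaming such a chain fixes
   every insn that references it.  */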
42175 auto_vec<du_head_p> cands;
42177 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42179 if (!NONDEBUG_INSN_P (insn))
42180 continue;
42182 if (GET_CODE (PATTERN (insn)) == USE
42183 || GET_CODE (PATTERN (insn)) == CLOBBER)
42184 continue;
42186 extract_insn (insn);
42188 int opno0, opno1;
42189 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42190 recog_data.n_operands, &opno0,
42191 &opno1);
42193 if (!ix86_rop_should_change_byte_p (modrm))
42194 continue;
42196 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42198 /* This happens when regrename has to fail a block. */
42199 if (!info->op_info)
42200 continue;
42202 if (info->op_info[opno0].n_chains != 0)
42204 gcc_assert (info->op_info[opno0].n_chains == 1);
42205 du_head_p op0c;
42206 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42207 if (op0c->target_data_1 + op0c->target_data_2 == 0
42208 && !op0c->cannot_rename)
42209 cands.safe_push (op0c);
42211 op0c->target_data_1++;
42213 if (info->op_info[opno1].n_chains != 0)
42215 gcc_assert (info->op_info[opno1].n_chains == 1);
42216 du_head_p op1c;
42217 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42218 if (op1c->target_data_1 + op1c->target_data_2 == 0
42219 && !op1c->cannot_rename)
42220 cands.safe_push (op1c);
42222 op1c->target_data_2++;
42226 int i;
42227 du_head_p head;
42228 FOR_EACH_VEC_ELT (cands, i, head)
42230 int old_reg, best_reg;
42231 HARD_REG_SET unavailable;
42233 CLEAR_HARD_REG_SET (unavailable);
42234 if (head->target_data_1)
42235 IOR_HARD_REG_SET (unavailable, output_risky);
42236 if (head->target_data_2)
42237 IOR_HARD_REG_SET (unavailable, input_risky);
42239 int n_uses;
42240 reg_class superclass = regrename_find_superclass (head, &n_uses,
42241 &unavailable);
42242 old_reg = head->regno;
42243 best_reg = find_rename_reg (head, superclass, &unavailable,
42244 old_reg, false);
42245 bool ok = regrename_do_replace (head, best_reg);
42246 gcc_assert (ok);
42247 if (dump_file)
42248 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42249 reg_names[best_reg], reg_class_names[superclass]);
42253 regrename_finish ();
42255 df_analyze ();
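/* Second pass: for insns that still have a risky modr/m byte (e.g. because
   the chain could not be renamed), copy the offending input operand into a
   safe register immediately before the insn.  */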
42257 basic_block bb;
42258 regset_head live;
42260 INIT_REG_SET (&live);
42262 FOR_EACH_BB_FN (bb, cfun)
42264 rtx_insn *insn;
42266 COPY_REG_SET (&live, DF_LR_OUT (bb));
42267 df_simulate_initialize_backwards (bb, &live);
42269 FOR_BB_INSNS_REVERSE (bb, insn)
42271 if (!NONDEBUG_INSN_P (insn))
42272 continue;
42274 df_simulate_one_insn_backwards (bb, insn, &live);
42276 if (GET_CODE (PATTERN (insn)) == USE
42277 || GET_CODE (PATTERN (insn)) == CLOBBER)
42278 continue;
42280 extract_insn (insn);
42281 constrain_operands_cached (insn, reload_completed);
42282 int opno0, opno1;
42283 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42284 recog_data.n_operands, &opno0,
42285 &opno1);
42286 if (modrm < 0
42287 || !ix86_rop_should_change_byte_p (modrm)
42288 || opno0 == opno1)
42289 continue;
42291 rtx oldreg = recog_data.operand[opno1];
42292 preprocess_constraints (insn);
42293 const operand_alternative *alt = which_op_alt ();
42295 int i;
42296 for (i = 0; i < recog_data.n_operands; i++)
42297 if (i != opno1
42298 && alt[i].earlyclobber
42299 && reg_overlap_mentioned_p (recog_data.operand[i],
42300 oldreg))
42301 break;
42303 if (i < recog_data.n_operands)
42304 continue;
42306 if (dump_file)
42307 fprintf (dump_file,
42308 "attempting to fix modrm byte in insn %d:"
42309 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42310 reg_class_names[alt[opno1].cl]);
42312 HARD_REG_SET unavailable;
42313 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42314 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42315 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42316 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42317 IOR_HARD_REG_SET (unavailable, output_risky);
42318 IOR_COMPL_HARD_REG_SET (unavailable,
42319 reg_class_contents[alt[opno1].cl]);
42321 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42322 if (!TEST_HARD_REG_BIT (unavailable, i))
42323 break;
42324 if (i == FIRST_PSEUDO_REGISTER)
42326 if (dump_file)
42327 fprintf (dump_file, ", none available\n");
42328 continue;
42330 if (dump_file)
42331 fprintf (dump_file, " -> %d\n", i);
42332 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42333 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42334 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42339 /* Implement machine specific optimizations.  We implement padding of returns
42340 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window.  */
42341 static void
42342 ix86_reorg (void)
42344 /* We are freeing block_for_insn in the toplev to keep compatibility
42345 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42346 compute_bb_for_insn ();
42348 if (flag_mitigate_rop)
42349 ix86_mitigate_rop ();
42351 if (TARGET_SEH && current_function_has_exception_handlers ())
42352 ix86_seh_fixup_eh_fallthru ();
42354 if (optimize && optimize_function_for_speed_p (cfun))
42356 if (TARGET_PAD_SHORT_FUNCTION)
42357 ix86_pad_short_function ();
42358 else if (TARGET_PAD_RETURNS)
42359 ix86_pad_returns ();
42360 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42361 if (TARGET_FOUR_JUMP_LIMIT)
42362 ix86_avoid_jump_mispredicts ();
42363 #endif
42367 /* Return nonzero when a QImode register that must be represented via a REX
42368 prefix is used.  */
42369 bool
42370 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42372 int i;
42373 extract_insn_cached (insn);
42374 for (i = 0; i < recog_data.n_operands; i++)
42375 if (GENERAL_REG_P (recog_data.operand[i])
42376 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42377 return true;
42378 return false;
42381 /* Return true when INSN mentions a register that must be encoded using a
42382 REX prefix.  */
42383 bool
42384 x86_extended_reg_mentioned_p (rtx insn)
42386 subrtx_iterator::array_type array;
42387 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42389 const_rtx x = *iter;
42390 if (REG_P (x)
42391 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42392 return true;
42394 return false;
42397 /* If profitable, negate (without causing overflow) the integer constant
42398 of mode MODE at location LOC.  Return true if it was negated.  */
42399 bool
42400 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42402 HOST_WIDE_INT val;
42404 if (!CONST_INT_P (*loc))
42405 return false;
42407 switch (mode)
42409 case E_DImode:
42410 /* DImode x86_64 constants must fit in 32 bits. */
42411 gcc_assert (x86_64_immediate_operand (*loc, mode));
42413 mode = SImode;
42414 break;
42416 case E_SImode:
42417 case E_HImode:
42418 case E_QImode:
42419 break;
42421 default:
42422 gcc_unreachable ();
42425 /* Avoid overflows. */
42426 if (mode_signbit_p (mode, *loc))
42427 return false;
42429 val = INTVAL (*loc);
42431 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
42432 Exception: -128 encodes smaller than 128 (sign-extended imm8), so keep -128 and negate 128 instead.  */
42433 if ((val < 0 && val != -128)
42434 || val == 128)
42436 *loc = GEN_INT (-val);
42437 return true;
42440 return false;
42443 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42444 optabs would emit if we didn't have TFmode patterns. */
42446 void
42447 x86_emit_floatuns (rtx operands[2])
42449 rtx_code_label *neglab, *donelab;
42450 rtx i0, i1, f0, in, out;
42451 machine_mode mode, inmode;
42453 inmode = GET_MODE (operands[1]);
42454 gcc_assert (inmode == SImode || inmode == DImode);
42456 out = operands[0];
42457 in = force_reg (inmode, operands[1]);
42458 mode = GET_MODE (out);
42459 neglab = gen_label_rtx ();
42460 donelab = gen_label_rtx ();
42461 f0 = gen_reg_rtx (mode);
42463 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42465 expand_float (out, in, 0);
42467 emit_jump_insn (gen_jump (donelab));
42468 emit_barrier ();
42470 emit_label (neglab);
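/* The input has its most significant bit set, so it cannot be converted
   directly as a signed value.  Convert (in >> 1) | (in & 1) instead and
   double the result; OR-ing the low bit back in preserves rounding.  */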
42472 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42473 1, OPTAB_DIRECT);
42474 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42475 1, OPTAB_DIRECT);
42476 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42478 expand_float (f0, i0, 0);
42480 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42482 emit_label (donelab);
42485 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42486 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42487 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42488 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42490 /* Get a vector mode of the same size as the original but with elements
42491 twice as wide. This is only guaranteed to apply to integral vectors. */
42493 static inline machine_mode
42494 get_mode_wider_vector (machine_mode o)
42496 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42497 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42498 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42499 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42500 return n;
42503 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42504 fill target with val via vec_duplicate. */
42506 static bool
42507 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42509 bool ok;
42510 rtx_insn *insn;
42511 rtx dup;
42513 /* First attempt to recognize VAL as-is. */
42514 dup = gen_vec_duplicate (mode, val);
42515 insn = emit_insn (gen_rtx_SET (target, dup));
42516 if (recog_memoized (insn) < 0)
42518 rtx_insn *seq;
42519 machine_mode innermode = GET_MODE_INNER (mode);
42520 rtx reg;
42522 /* If that fails, force VAL into a register. */
42524 start_sequence ();
42525 reg = force_reg (innermode, val);
42526 if (GET_MODE (reg) != innermode)
42527 reg = gen_lowpart (innermode, reg);
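/* Patch the source of the insn emitted above to duplicate the register
   instead, and try to recognize it again.  */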
42528 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42529 seq = get_insns ();
42530 end_sequence ();
42531 if (seq)
42532 emit_insn_before (seq, insn);
42534 ok = recog_memoized (insn) >= 0;
42535 gcc_assert (ok);
42537 return true;
42540 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42541 with all elements equal to VAR. Return true if successful. */
42543 static bool
42544 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42545 rtx target, rtx val)
42547 bool ok;
42549 switch (mode)
42551 case E_V2SImode:
42552 case E_V2SFmode:
42553 if (!mmx_ok)
42554 return false;
42555 /* FALLTHRU */
42557 case E_V4DFmode:
42558 case E_V4DImode:
42559 case E_V8SFmode:
42560 case E_V8SImode:
42561 case E_V2DFmode:
42562 case E_V2DImode:
42563 case E_V4SFmode:
42564 case E_V4SImode:
42565 case E_V16SImode:
42566 case E_V8DImode:
42567 case E_V16SFmode:
42568 case E_V8DFmode:
42569 return ix86_vector_duplicate_value (mode, target, val);
42571 case E_V4HImode:
42572 if (!mmx_ok)
42573 return false;
42574 if (TARGET_SSE || TARGET_3DNOW_A)
42576 rtx x;
42578 val = gen_lowpart (SImode, val);
42579 x = gen_rtx_TRUNCATE (HImode, val);
42580 x = gen_rtx_VEC_DUPLICATE (mode, x);
42581 emit_insn (gen_rtx_SET (target, x));
42582 return true;
42584 goto widen;
42586 case E_V8QImode:
42587 if (!mmx_ok)
42588 return false;
42589 goto widen;
42591 case E_V8HImode:
42592 if (TARGET_AVX2)
42593 return ix86_vector_duplicate_value (mode, target, val);
42595 if (TARGET_SSE2)
42597 struct expand_vec_perm_d dperm;
42598 rtx tmp1, tmp2;
42600 permute:
42601 memset (&dperm, 0, sizeof (dperm));
42602 dperm.target = target;
42603 dperm.vmode = mode;
42604 dperm.nelt = GET_MODE_NUNITS (mode);
42605 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42606 dperm.one_operand_p = true;
42608 /* Extend to SImode using a paradoxical SUBREG. */
42609 tmp1 = gen_reg_rtx (SImode);
42610 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42612 /* Insert the SImode value as low element of a V4SImode vector. */
42613 tmp2 = gen_reg_rtx (V4SImode);
42614 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42615 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42617 ok = (expand_vec_perm_1 (&dperm)
42618 || expand_vec_perm_broadcast_1 (&dperm));
42619 gcc_assert (ok);
42620 return ok;
42622 goto widen;
42624 case E_V16QImode:
42625 if (TARGET_AVX2)
42626 return ix86_vector_duplicate_value (mode, target, val);
42628 if (TARGET_SSE2)
42629 goto permute;
42630 goto widen;
42632 widen:
42633 /* Replicate the value once into the next wider mode and recurse. */
42635 machine_mode smode, wsmode, wvmode;
42636 rtx x;
42638 smode = GET_MODE_INNER (mode);
42639 wvmode = get_mode_wider_vector (mode);
42640 wsmode = GET_MODE_INNER (wvmode);
42642 val = convert_modes (wsmode, smode, val, true);
42643 x = expand_simple_binop (wsmode, ASHIFT, val,
42644 GEN_INT (GET_MODE_BITSIZE (smode)),
42645 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42646 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42648 x = gen_reg_rtx (wvmode);
42649 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42650 gcc_assert (ok);
42651 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42652 return ok;
42655 case E_V16HImode:
42656 case E_V32QImode:
42657 if (TARGET_AVX2)
42658 return ix86_vector_duplicate_value (mode, target, val);
42659 else
42661 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42662 rtx x = gen_reg_rtx (hvmode);
42664 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42665 gcc_assert (ok);
42667 x = gen_rtx_VEC_CONCAT (mode, x, x);
42668 emit_insn (gen_rtx_SET (target, x));
42670 return true;
42672 case E_V64QImode:
42673 case E_V32HImode:
42674 if (TARGET_AVX512BW)
42675 return ix86_vector_duplicate_value (mode, target, val);
42676 else
42678 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42679 rtx x = gen_reg_rtx (hvmode);
42681 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42682 gcc_assert (ok);
42684 x = gen_rtx_VEC_CONCAT (mode, x, x);
42685 emit_insn (gen_rtx_SET (target, x));
42687 return true;
42689 default:
42690 return false;
42694 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42695 whose ONE_VAR element is VAR, and other elements are zero. Return true
42696 if successful. */
42698 static bool
42699 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42700 rtx target, rtx var, int one_var)
42702 machine_mode vsimode;
42703 rtx new_target;
42704 rtx x, tmp;
42705 bool use_vector_set = false;
42706 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42708 switch (mode)
42710 case E_V2DImode:
42711 /* For SSE4.1, we normally use vector set. But if the second
42712 element is zero and inter-unit moves are OK, we use movq
42713 instead. */
42714 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42715 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42716 && one_var == 0));
42717 break;
42718 case E_V16QImode:
42719 case E_V4SImode:
42720 case E_V4SFmode:
42721 use_vector_set = TARGET_SSE4_1;
42722 break;
42723 case E_V8HImode:
42724 use_vector_set = TARGET_SSE2;
42725 break;
42726 case E_V4HImode:
42727 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42728 break;
42729 case E_V32QImode:
42730 case E_V16HImode:
42731 use_vector_set = TARGET_AVX;
42732 break;
42733 case E_V8SImode:
42734 use_vector_set = TARGET_AVX;
42735 gen_vec_set_0 = gen_vec_setv8si_0;
42736 break;
42737 case E_V8SFmode:
42738 use_vector_set = TARGET_AVX;
42739 gen_vec_set_0 = gen_vec_setv8sf_0;
42740 break;
42741 case E_V4DFmode:
42742 use_vector_set = TARGET_AVX;
42743 gen_vec_set_0 = gen_vec_setv4df_0;
42744 break;
42745 case E_V4DImode:
42746 /* Use ix86_expand_vector_set in 64bit mode only. */
42747 use_vector_set = TARGET_AVX && TARGET_64BIT;
42748 gen_vec_set_0 = gen_vec_setv4di_0;
42749 break;
42750 case E_V16SImode:
42751 use_vector_set = TARGET_AVX512F && one_var == 0;
42752 gen_vec_set_0 = gen_vec_setv16si_0;
42753 break;
42754 case E_V16SFmode:
42755 use_vector_set = TARGET_AVX512F && one_var == 0;
42756 gen_vec_set_0 = gen_vec_setv16sf_0;
42757 break;
42758 case E_V8DFmode:
42759 use_vector_set = TARGET_AVX512F && one_var == 0;
42760 gen_vec_set_0 = gen_vec_setv8df_0;
42761 break;
42762 case E_V8DImode:
42763 /* Use ix86_expand_vector_set in 64bit mode only. */
42764 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
42765 gen_vec_set_0 = gen_vec_setv8di_0;
42766 break;
42767 default:
42768 break;
42771 if (use_vector_set)
42773 if (gen_vec_set_0 && one_var == 0)
42775 var = force_reg (GET_MODE_INNER (mode), var);
42776 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
42777 return true;
42779 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42780 var = force_reg (GET_MODE_INNER (mode), var);
42781 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42782 return true;
42785 switch (mode)
42787 case E_V2SFmode:
42788 case E_V2SImode:
42789 if (!mmx_ok)
42790 return false;
42791 /* FALLTHRU */
42793 case E_V2DFmode:
42794 case E_V2DImode:
42795 if (one_var != 0)
42796 return false;
42797 var = force_reg (GET_MODE_INNER (mode), var);
42798 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42799 emit_insn (gen_rtx_SET (target, x));
42800 return true;
42802 case E_V4SFmode:
42803 case E_V4SImode:
42804 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42805 new_target = gen_reg_rtx (mode);
42806 else
42807 new_target = target;
42808 var = force_reg (GET_MODE_INNER (mode), var);
42809 x = gen_rtx_VEC_DUPLICATE (mode, var);
42810 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42811 emit_insn (gen_rtx_SET (new_target, x));
42812 if (one_var != 0)
42814 /* We need to shuffle the value to the correct position, so
42815 create a new pseudo to store the intermediate result. */
42817 /* With SSE2, we can use the integer shuffle insns. */
42818 if (mode != V4SFmode && TARGET_SSE2)
42820 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42821 const1_rtx,
42822 GEN_INT (one_var == 1 ? 0 : 1),
42823 GEN_INT (one_var == 2 ? 0 : 1),
42824 GEN_INT (one_var == 3 ? 0 : 1)));
42825 if (target != new_target)
42826 emit_move_insn (target, new_target);
42827 return true;
42830 /* Otherwise convert the intermediate result to V4SFmode and
42831 use the SSE1 shuffle instructions. */
42832 if (mode != V4SFmode)
42834 tmp = gen_reg_rtx (V4SFmode);
42835 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42837 else
42838 tmp = new_target;
42840 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42841 const1_rtx,
42842 GEN_INT (one_var == 1 ? 0 : 1),
42843 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42844 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42846 if (mode != V4SFmode)
42847 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42848 else if (tmp != target)
42849 emit_move_insn (target, tmp);
42851 else if (target != new_target)
42852 emit_move_insn (target, new_target);
42853 return true;
42855 case E_V8HImode:
42856 case E_V16QImode:
42857 vsimode = V4SImode;
42858 goto widen;
42859 case E_V4HImode:
42860 case E_V8QImode:
42861 if (!mmx_ok)
42862 return false;
42863 vsimode = V2SImode;
42864 goto widen;
42865 widen:
42866 if (one_var != 0)
42867 return false;
42869 /* Zero extend the variable element to SImode and recurse. */
42870 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42872 x = gen_reg_rtx (vsimode);
42873 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42874 var, one_var))
42875 gcc_unreachable ();
42877 emit_move_insn (target, gen_lowpart (mode, x));
42878 return true;
42880 default:
42881 return false;
42885 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42886 consisting of the values in VALS. It is known that all elements
42887 except ONE_VAR are constants. Return true if successful. */
42889 static bool
42890 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42891 rtx target, rtx vals, int one_var)
42893 rtx var = XVECEXP (vals, 0, one_var);
42894 machine_mode wmode;
42895 rtx const_vec, x;
42897 const_vec = copy_rtx (vals);
42898 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42899 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42901 switch (mode)
42903 case E_V2DFmode:
42904 case E_V2DImode:
42905 case E_V2SFmode:
42906 case E_V2SImode:
42907 /* For the two element vectors, it's just as easy to use
42908 the general case. */
42909 return false;
42911 case E_V4DImode:
42912 /* Use ix86_expand_vector_set in 64bit mode only. */
42913 if (!TARGET_64BIT)
42914 return false;
42915 /* FALLTHRU */
42916 case E_V4DFmode:
42917 case E_V8SFmode:
42918 case E_V8SImode:
42919 case E_V16HImode:
42920 case E_V32QImode:
42921 case E_V4SFmode:
42922 case E_V4SImode:
42923 case E_V8HImode:
42924 case E_V4HImode:
42925 break;
42927 case E_V16QImode:
42928 if (TARGET_SSE4_1)
42929 break;
42930 wmode = V8HImode;
42931 goto widen;
42932 case E_V8QImode:
42933 wmode = V4HImode;
42934 goto widen;
42935 widen:
42936 /* There's no way to set one QImode entry easily. Combine
42937 the variable value with its adjacent constant value, and
42938 promote to an HImode set. */
42939 x = XVECEXP (vals, 0, one_var ^ 1);
42940 if (one_var & 1)
42942 var = convert_modes (HImode, QImode, var, true);
42943 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42944 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42945 x = GEN_INT (INTVAL (x) & 0xff);
42947 else
42949 var = convert_modes (HImode, QImode, var, true);
42950 x = gen_int_mode (INTVAL (x) << 8, HImode);
42952 if (x != const0_rtx)
42953 var = expand_simple_binop (HImode, IOR, var, x, var,
42954 1, OPTAB_LIB_WIDEN);
42956 x = gen_reg_rtx (wmode);
42957 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42958 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42960 emit_move_insn (target, gen_lowpart (mode, x));
42961 return true;
42963 default:
42964 return false;
42967 emit_move_insn (target, const_vec);
42968 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42969 return true;
42972 /* A subroutine of ix86_expand_vector_init_general. Use vector
42973 concatenate to handle the most general case: all values variable,
42974 and none identical. */
42976 static void
42977 ix86_expand_vector_init_concat (machine_mode mode,
42978 rtx target, rtx *ops, int n)
42980 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42981 rtx first[16], second[8], third[4];
42982 rtvec v;
42983 int i, j;
42985 switch (n)
42987 case 2:
42988 switch (mode)
42990 case E_V16SImode:
42991 cmode = V8SImode;
42992 break;
42993 case E_V16SFmode:
42994 cmode = V8SFmode;
42995 break;
42996 case E_V8DImode:
42997 cmode = V4DImode;
42998 break;
42999 case E_V8DFmode:
43000 cmode = V4DFmode;
43001 break;
43002 case E_V8SImode:
43003 cmode = V4SImode;
43004 break;
43005 case E_V8SFmode:
43006 cmode = V4SFmode;
43007 break;
43008 case E_V4DImode:
43009 cmode = V2DImode;
43010 break;
43011 case E_V4DFmode:
43012 cmode = V2DFmode;
43013 break;
43014 case E_V4SImode:
43015 cmode = V2SImode;
43016 break;
43017 case E_V4SFmode:
43018 cmode = V2SFmode;
43019 break;
43020 case E_V2DImode:
43021 cmode = DImode;
43022 break;
43023 case E_V2SImode:
43024 cmode = SImode;
43025 break;
43026 case E_V2DFmode:
43027 cmode = DFmode;
43028 break;
43029 case E_V2SFmode:
43030 cmode = SFmode;
43031 break;
43032 default:
43033 gcc_unreachable ();
43036 if (!register_operand (ops[1], cmode))
43037 ops[1] = force_reg (cmode, ops[1]);
43038 if (!register_operand (ops[0], cmode))
43039 ops[0] = force_reg (cmode, ops[0]);
43040 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43041 ops[1])));
43042 break;
43044 case 4:
43045 switch (mode)
43047 case E_V4DImode:
43048 cmode = V2DImode;
43049 break;
43050 case E_V4DFmode:
43051 cmode = V2DFmode;
43052 break;
43053 case E_V4SImode:
43054 cmode = V2SImode;
43055 break;
43056 case E_V4SFmode:
43057 cmode = V2SFmode;
43058 break;
43059 default:
43060 gcc_unreachable ();
43062 goto half;
43064 case 8:
43065 switch (mode)
43067 case E_V8DImode:
43068 cmode = V2DImode;
43069 hmode = V4DImode;
43070 break;
43071 case E_V8DFmode:
43072 cmode = V2DFmode;
43073 hmode = V4DFmode;
43074 break;
43075 case E_V8SImode:
43076 cmode = V2SImode;
43077 hmode = V4SImode;
43078 break;
43079 case E_V8SFmode:
43080 cmode = V2SFmode;
43081 hmode = V4SFmode;
43082 break;
43083 default:
43084 gcc_unreachable ();
43086 goto half;
43088 case 16:
43089 switch (mode)
43091 case E_V16SImode:
43092 cmode = V2SImode;
43093 hmode = V4SImode;
43094 gmode = V8SImode;
43095 break;
43096 case E_V16SFmode:
43097 cmode = V2SFmode;
43098 hmode = V4SFmode;
43099 gmode = V8SFmode;
43100 break;
43101 default:
43102 gcc_unreachable ();
43104 goto half;
43106 half:
43107 /* FIXME: We process inputs backward to help RA. PR 36222. */
43108 i = n - 1;
43109 j = (n >> 1) - 1;
43110 for (; i > 0; i -= 2, j--)
43112 first[j] = gen_reg_rtx (cmode);
43113 v = gen_rtvec (2, ops[i - 1], ops[i]);
43114 ix86_expand_vector_init (false, first[j],
43115 gen_rtx_PARALLEL (cmode, v));
43118 n >>= 1;
43119 if (n > 4)
43121 gcc_assert (hmode != VOIDmode);
43122 gcc_assert (gmode != VOIDmode);
43123 for (i = j = 0; i < n; i += 2, j++)
43125 second[j] = gen_reg_rtx (hmode);
43126 ix86_expand_vector_init_concat (hmode, second [j],
43127 &first [i], 2);
43129 n >>= 1;
43130 for (i = j = 0; i < n; i += 2, j++)
43132 third[j] = gen_reg_rtx (gmode);
43133 ix86_expand_vector_init_concat (gmode, third[j],
43134 &second[i], 2);
43136 n >>= 1;
43137 ix86_expand_vector_init_concat (mode, target, third, n);
43139 else if (n > 2)
43141 gcc_assert (hmode != VOIDmode);
43142 for (i = j = 0; i < n; i += 2, j++)
43144 second[j] = gen_reg_rtx (hmode);
43145 ix86_expand_vector_init_concat (hmode, second [j],
43146 &first [i], 2);
43148 n >>= 1;
43149 ix86_expand_vector_init_concat (mode, target, second, n);
43151 else
43152 ix86_expand_vector_init_concat (mode, target, first, n);
43153 break;
43155 default:
43156 gcc_unreachable ();
43160 /* A subroutine of ix86_expand_vector_init_general. Use vector
43161 interleave to handle the most general case: all values variable,
43162 and none identical. */
43164 static void
43165 ix86_expand_vector_init_interleave (machine_mode mode,
43166 rtx target, rtx *ops, int n)
43168 machine_mode first_imode, second_imode, third_imode, inner_mode;
43169 int i, j;
43170 rtx op0, op1;
43171 rtx (*gen_load_even) (rtx, rtx, rtx);
43172 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43173 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43175 switch (mode)
43177 case E_V8HImode:
43178 gen_load_even = gen_vec_setv8hi;
43179 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43180 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43181 inner_mode = HImode;
43182 first_imode = V4SImode;
43183 second_imode = V2DImode;
43184 third_imode = VOIDmode;
43185 break;
43186 case E_V16QImode:
43187 gen_load_even = gen_vec_setv16qi;
43188 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43189 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43190 inner_mode = QImode;
43191 first_imode = V8HImode;
43192 second_imode = V4SImode;
43193 third_imode = V2DImode;
43194 break;
43195 default:
43196 gcc_unreachable ();
43199 for (i = 0; i < n; i++)
43201 /* Extend the odd element to SImode using a paradoxical SUBREG.  */
43202 op0 = gen_reg_rtx (SImode);
43203 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43205 /* Insert the SImode value as low element of V4SImode vector. */
43206 op1 = gen_reg_rtx (V4SImode);
43207 op0 = gen_rtx_VEC_MERGE (V4SImode,
43208 gen_rtx_VEC_DUPLICATE (V4SImode,
43209 op0),
43210 CONST0_RTX (V4SImode),
43211 const1_rtx);
43212 emit_insn (gen_rtx_SET (op1, op0));
43214 /* Cast the V4SImode vector back to a vector in the original mode.  */
43215 op0 = gen_reg_rtx (mode);
43216 emit_move_insn (op0, gen_lowpart (mode, op1));
43218 /* Load even elements into the second position. */
43219 emit_insn (gen_load_even (op0,
43220 force_reg (inner_mode,
43221 ops [i + i + 1]),
43222 const1_rtx));
43224 /* Cast vector to FIRST_IMODE vector. */
43225 ops[i] = gen_reg_rtx (first_imode);
43226 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43229 /* Interleave low FIRST_IMODE vectors. */
43230 for (i = j = 0; i < n; i += 2, j++)
43232 op0 = gen_reg_rtx (first_imode);
43233 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43235 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43236 ops[j] = gen_reg_rtx (second_imode);
43237 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43240 /* Interleave low SECOND_IMODE vectors. */
43241 switch (second_imode)
43243 case E_V4SImode:
43244 for (i = j = 0; i < n / 2; i += 2, j++)
43246 op0 = gen_reg_rtx (second_imode);
43247 emit_insn (gen_interleave_second_low (op0, ops[i],
43248 ops[i + 1]));
43250 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43251 vector. */
43252 ops[j] = gen_reg_rtx (third_imode);
43253 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43255 second_imode = V2DImode;
43256 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43257 /* FALLTHRU */
43259 case E_V2DImode:
43260 op0 = gen_reg_rtx (second_imode);
43261 emit_insn (gen_interleave_second_low (op0, ops[0],
43262 ops[1]));
43264 /* Cast the SECOND_IMODE vector back to a vector in the original
43265 mode.  */
43266 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43267 break;
43269 default:
43270 gcc_unreachable ();
43274 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43275 all values variable, and none identical. */
43277 static void
43278 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43279 rtx target, rtx vals)
43281 rtx ops[64], op0, op1, op2, op3, op4, op5;
43282 machine_mode half_mode = VOIDmode;
43283 machine_mode quarter_mode = VOIDmode;
43284 int n, i;
43286 switch (mode)
43288 case E_V2SFmode:
43289 case E_V2SImode:
43290 if (!mmx_ok && !TARGET_SSE)
43291 break;
43292 /* FALLTHRU */
43294 case E_V16SImode:
43295 case E_V16SFmode:
43296 case E_V8DFmode:
43297 case E_V8DImode:
43298 case E_V8SFmode:
43299 case E_V8SImode:
43300 case E_V4DFmode:
43301 case E_V4DImode:
43302 case E_V4SFmode:
43303 case E_V4SImode:
43304 case E_V2DFmode:
43305 case E_V2DImode:
43306 n = GET_MODE_NUNITS (mode);
43307 for (i = 0; i < n; i++)
43308 ops[i] = XVECEXP (vals, 0, i);
43309 ix86_expand_vector_init_concat (mode, target, ops, n);
43310 return;
43312 case E_V2TImode:
43313 for (i = 0; i < 2; i++)
43314 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43315 op0 = gen_reg_rtx (V4DImode);
43316 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43317 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43318 return;
43320 case E_V4TImode:
43321 for (i = 0; i < 4; i++)
43322 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43323 ops[4] = gen_reg_rtx (V4DImode);
43324 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43325 ops[5] = gen_reg_rtx (V4DImode);
43326 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43327 op0 = gen_reg_rtx (V8DImode);
43328 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43329 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43330 return;
43332 case E_V32QImode:
43333 half_mode = V16QImode;
43334 goto half;
43336 case E_V16HImode:
43337 half_mode = V8HImode;
43338 goto half;
43340 half:
43341 n = GET_MODE_NUNITS (mode);
43342 for (i = 0; i < n; i++)
43343 ops[i] = XVECEXP (vals, 0, i);
43344 op0 = gen_reg_rtx (half_mode);
43345 op1 = gen_reg_rtx (half_mode);
43346 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43347 n >> 2);
43348 ix86_expand_vector_init_interleave (half_mode, op1,
43349 &ops [n >> 1], n >> 2);
43350 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43351 return;
43353 case E_V64QImode:
43354 quarter_mode = V16QImode;
43355 half_mode = V32QImode;
43356 goto quarter;
43358 case E_V32HImode:
43359 quarter_mode = V8HImode;
43360 half_mode = V16HImode;
43361 goto quarter;
43363 quarter:
43364 n = GET_MODE_NUNITS (mode);
43365 for (i = 0; i < n; i++)
43366 ops[i] = XVECEXP (vals, 0, i);
43367 op0 = gen_reg_rtx (quarter_mode);
43368 op1 = gen_reg_rtx (quarter_mode);
43369 op2 = gen_reg_rtx (quarter_mode);
43370 op3 = gen_reg_rtx (quarter_mode);
43371 op4 = gen_reg_rtx (half_mode);
43372 op5 = gen_reg_rtx (half_mode);
43373 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43374 n >> 3);
43375 ix86_expand_vector_init_interleave (quarter_mode, op1,
43376 &ops [n >> 2], n >> 3);
43377 ix86_expand_vector_init_interleave (quarter_mode, op2,
43378 &ops [n >> 1], n >> 3);
43379 ix86_expand_vector_init_interleave (quarter_mode, op3,
43380 &ops [(n >> 1) | (n >> 2)], n >> 3);
43381 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43382 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43383 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43384 return;
43386 case E_V16QImode:
43387 if (!TARGET_SSE4_1)
43388 break;
43389 /* FALLTHRU */
43391 case E_V8HImode:
43392 if (!TARGET_SSE2)
43393 break;
43395 /* Don't use ix86_expand_vector_init_interleave if we can't
43396 move from GPR to SSE register directly. */
43397 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43398 break;
43400 n = GET_MODE_NUNITS (mode);
43401 for (i = 0; i < n; i++)
43402 ops[i] = XVECEXP (vals, 0, i);
43403 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43404 return;
43406 case E_V4HImode:
43407 case E_V8QImode:
43408 break;
43410 default:
43411 gcc_unreachable ();
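/* Fallback for the remaining narrow-element cases: build each word-sized
   chunk of the vector with integer shifts and IORs, then assemble the
   vector from the word-sized pieces.  */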
43415 int i, j, n_elts, n_words, n_elt_per_word;
43416 machine_mode inner_mode;
43417 rtx words[4], shift;
43419 inner_mode = GET_MODE_INNER (mode);
43420 n_elts = GET_MODE_NUNITS (mode);
43421 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43422 n_elt_per_word = n_elts / n_words;
43423 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43425 for (i = 0; i < n_words; ++i)
43427 rtx word = NULL_RTX;
43429 for (j = 0; j < n_elt_per_word; ++j)
43431 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43432 elt = convert_modes (word_mode, inner_mode, elt, true);
43434 if (j == 0)
43435 word = elt;
43436 else
43438 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43439 word, 1, OPTAB_LIB_WIDEN);
43440 word = expand_simple_binop (word_mode, IOR, word, elt,
43441 word, 1, OPTAB_LIB_WIDEN);
43445 words[i] = word;
43448 if (n_words == 1)
43449 emit_move_insn (target, gen_lowpart (mode, words[0]));
43450 else if (n_words == 2)
43452 rtx tmp = gen_reg_rtx (mode);
43453 emit_clobber (tmp);
43454 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43455 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43456 emit_move_insn (target, tmp);
43458 else if (n_words == 4)
43460 rtx tmp = gen_reg_rtx (V4SImode);
43461 gcc_assert (word_mode == SImode);
43462 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43463 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43464 emit_move_insn (target, gen_lowpart (mode, tmp));
43466 else
43467 gcc_unreachable ();
43471 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43472 instructions unless MMX_OK is true. */
43474 void
43475 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43477 machine_mode mode = GET_MODE (target);
43478 machine_mode inner_mode = GET_MODE_INNER (mode);
43479 int n_elts = GET_MODE_NUNITS (mode);
43480 int n_var = 0, one_var = -1;
43481 bool all_same = true, all_const_zero = true;
43482 int i;
43483 rtx x;
43485 /* First handle initialization from a PARALLEL of vector (not scalar) elements.  */
43486 if (n_elts != XVECLEN (vals, 0))
43488 rtx subtarget = target;
43489 x = XVECEXP (vals, 0, 0);
43490 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43491 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43493 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43494 if (inner_mode == QImode || inner_mode == HImode)
43496 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43497 mode = mode_for_vector (SImode, n_bits / 4).require ();
43498 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43499 ops[0] = gen_lowpart (inner_mode, ops[0]);
43500 ops[1] = gen_lowpart (inner_mode, ops[1]);
43501 subtarget = gen_reg_rtx (mode);
43503 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43504 if (subtarget != target)
43505 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43506 return;
43508 gcc_unreachable ();
43511 for (i = 0; i < n_elts; ++i)
43513 x = XVECEXP (vals, 0, i);
43514 if (!(CONST_SCALAR_INT_P (x)
43515 || CONST_DOUBLE_P (x)
43516 || CONST_FIXED_P (x)))
43517 n_var++, one_var = i;
43518 else if (x != CONST0_RTX (inner_mode))
43519 all_const_zero = false;
43520 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43521 all_same = false;
43524 /* Constants are best loaded from the constant pool. */
43525 if (n_var == 0)
43527 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43528 return;
43531 /* If all values are identical, broadcast the value. */
43532 if (all_same
43533 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43534 XVECEXP (vals, 0, 0)))
43535 return;
43537 /* Values where only one field is non-constant are best loaded from
43538 the pool and overwritten via move later. */
43539 if (n_var == 1)
43541 if (all_const_zero
43542 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43543 XVECEXP (vals, 0, one_var),
43544 one_var))
43545 return;
43547 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43548 return;
43551 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43554 void
43555 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43557 machine_mode mode = GET_MODE (target);
43558 machine_mode inner_mode = GET_MODE_INNER (mode);
43559 machine_mode half_mode;
43560 bool use_vec_merge = false;
43561 rtx tmp;
43562 static rtx (*gen_extract[6][2]) (rtx, rtx)
43564 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43565 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43566 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43567 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43568 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43569 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43571 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43573 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43574 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43575 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43576 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43577 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43578 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43580 int i, j, n;
43581 machine_mode mmode = VOIDmode;
43582 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43584 switch (mode)
43586 case E_V2SFmode:
43587 case E_V2SImode:
43588 if (mmx_ok)
43590 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43591 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43592 if (elt == 0)
43593 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43594 else
43595 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43596 emit_insn (gen_rtx_SET (target, tmp));
43597 return;
43599 break;
43601 case E_V2DImode:
43602 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43603 if (use_vec_merge)
43604 break;
43606 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43607 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43608 if (elt == 0)
43609 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43610 else
43611 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43612 emit_insn (gen_rtx_SET (target, tmp));
43613 return;
43615 case E_V2DFmode:
43617 rtx op0, op1;
43619 /* For the two element vectors, we implement a VEC_CONCAT with
43620 the extraction of the other element. */
43622 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43623 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43625 if (elt == 0)
43626 op0 = val, op1 = tmp;
43627 else
43628 op0 = tmp, op1 = val;
43630 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43631 emit_insn (gen_rtx_SET (target, tmp));
43633 return;
43635 case E_V4SFmode:
43636 use_vec_merge = TARGET_SSE4_1;
43637 if (use_vec_merge)
43638 break;
43640 switch (elt)
43642 case 0:
43643 use_vec_merge = true;
43644 break;
43646 case 1:
43647 /* tmp = target = A B C D */
43648 tmp = copy_to_reg (target);
43649 /* target = A A B B */
43650 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43651 /* target = X A B B */
43652 ix86_expand_vector_set (false, target, val, 0);
43653 /* target = A X C D */
43654 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43655 const1_rtx, const0_rtx,
43656 GEN_INT (2+4), GEN_INT (3+4)));
43657 return;
43659 case 2:
43660 /* tmp = target = A B C D */
43661 tmp = copy_to_reg (target);
43662 /* tmp = X B C D */
43663 ix86_expand_vector_set (false, tmp, val, 0);
43664 /* target = A B X D */
43665 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43666 const0_rtx, const1_rtx,
43667 GEN_INT (0+4), GEN_INT (3+4)));
43668 return;
43670 case 3:
43671 /* tmp = target = A B C D */
43672 tmp = copy_to_reg (target);
43673 /* tmp = X B C D */
43674 ix86_expand_vector_set (false, tmp, val, 0);
43675 /* target = A B C X */
43676 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43677 const0_rtx, const1_rtx,
43678 GEN_INT (2+4), GEN_INT (0+4)));
43679 return;
43681 default:
43682 gcc_unreachable ();
43684 break;
43686 case E_V4SImode:
43687 use_vec_merge = TARGET_SSE4_1;
43688 if (use_vec_merge)
43689 break;
43691 /* Element 0 handled by vec_merge below. */
43692 if (elt == 0)
43694 use_vec_merge = true;
43695 break;
43698 if (TARGET_SSE2)
43700 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43701 store into element 0, then shuffle them back. */
43703 rtx order[4];
43705 order[0] = GEN_INT (elt);
43706 order[1] = const1_rtx;
43707 order[2] = const2_rtx;
43708 order[3] = GEN_INT (3);
43709 order[elt] = const0_rtx;
43711 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43712 order[1], order[2], order[3]));
43714 ix86_expand_vector_set (false, target, val, 0);
43716 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43717 order[1], order[2], order[3]));
43719 else
43721 /* For SSE1, we have to reuse the V4SF code. */
43722 rtx t = gen_reg_rtx (V4SFmode);
43723 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43724 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43725 emit_move_insn (target, gen_lowpart (mode, t));
43727 return;
43729 case E_V8HImode:
43730 use_vec_merge = TARGET_SSE2;
43731 break;
43732 case E_V4HImode:
43733 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43734 break;
43736 case E_V16QImode:
43737 use_vec_merge = TARGET_SSE4_1;
43738 break;
43740 case E_V8QImode:
43741 break;
43743 case E_V32QImode:
43744 half_mode = V16QImode;
43745 j = 0;
43746 n = 16;
43747 goto half;
43749 case E_V16HImode:
43750 half_mode = V8HImode;
43751 j = 1;
43752 n = 8;
43753 goto half;
43755 case E_V8SImode:
43756 half_mode = V4SImode;
43757 j = 2;
43758 n = 4;
43759 goto half;
43761 case E_V4DImode:
43762 half_mode = V2DImode;
43763 j = 3;
43764 n = 2;
43765 goto half;
43767 case E_V8SFmode:
43768 half_mode = V4SFmode;
43769 j = 4;
43770 n = 4;
43771 goto half;
43773 case E_V4DFmode:
43774 half_mode = V2DFmode;
43775 j = 5;
43776 n = 2;
43777 goto half;
43779 half:
43780 /* Compute offset. */
43781 i = elt / n;
43782 elt %= n;
43784 gcc_assert (i <= 1);
43786 /* Extract the half. */
43787 tmp = gen_reg_rtx (half_mode);
43788 emit_insn (gen_extract[j][i] (tmp, target));
43790 /* Put val in tmp at elt. */
43791 ix86_expand_vector_set (false, tmp, val, elt);
43793 /* Put it back. */
43794 emit_insn (gen_insert[j][i] (target, target, tmp));
43795 return;
43797 case E_V8DFmode:
43798 if (TARGET_AVX512F)
43800 mmode = QImode;
43801 gen_blendm = gen_avx512f_blendmv8df;
43803 break;
43805 case E_V8DImode:
43806 if (TARGET_AVX512F)
43808 mmode = QImode;
43809 gen_blendm = gen_avx512f_blendmv8di;
43811 break;
43813 case E_V16SFmode:
43814 if (TARGET_AVX512F)
43816 mmode = HImode;
43817 gen_blendm = gen_avx512f_blendmv16sf;
43819 break;
43821 case E_V16SImode:
43822 if (TARGET_AVX512F)
43824 mmode = HImode;
43825 gen_blendm = gen_avx512f_blendmv16si;
43827 break;
43829 case E_V32HImode:
43830 if (TARGET_AVX512F && TARGET_AVX512BW)
43832 mmode = SImode;
43833 gen_blendm = gen_avx512bw_blendmv32hi;
43835 break;
43837 case E_V64QImode:
43838 if (TARGET_AVX512F && TARGET_AVX512BW)
43840 mmode = DImode;
43841 gen_blendm = gen_avx512bw_blendmv64qi;
43843 break;
43845 default:
43846 break;
43849 if (mmode != VOIDmode)
43851 tmp = gen_reg_rtx (mode);
43852 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43853 /* The avx512*_blendm<mode> expanders have different operand order
43854 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43855 elements where the mask is set and second input operand otherwise,
43856 in {sse,avx}*_*blend* the first input operand is used for elements
43857 where the mask is clear and second input operand otherwise. */
43858 emit_insn (gen_blendm (target, target, tmp,
43859 force_reg (mmode,
43860 gen_int_mode (1 << elt, mmode))));
43862 else if (use_vec_merge)
43864 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43865 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43866 emit_insn (gen_rtx_SET (target, tmp));
43868 else
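/* No suitable single instruction: spill the vector to a stack temporary,
   store the new element into it, and load the vector back.  */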
43870 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43872 emit_move_insn (mem, target);
43874 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43875 emit_move_insn (tmp, val);
43877 emit_move_insn (target, mem);
43881 void
43882 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43884 machine_mode mode = GET_MODE (vec);
43885 machine_mode inner_mode = GET_MODE_INNER (mode);
43886 bool use_vec_extr = false;
43887 rtx tmp;
43889 switch (mode)
43891 case E_V2SImode:
43892 case E_V2SFmode:
43893 if (!mmx_ok)
43894 break;
43895 /* FALLTHRU */
43897 case E_V2DFmode:
43898 case E_V2DImode:
43899 case E_V2TImode:
43900 case E_V4TImode:
43901 use_vec_extr = true;
43902 break;
43904 case E_V4SFmode:
43905 use_vec_extr = TARGET_SSE4_1;
43906 if (use_vec_extr)
43907 break;
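/* Without SSE4.1, shuffle the requested element into position 0 and
   extract element 0 via the generic path below.  */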
43909 switch (elt)
43911 case 0:
43912 tmp = vec;
43913 break;
43915 case 1:
43916 case 3:
43917 tmp = gen_reg_rtx (mode);
43918 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43919 GEN_INT (elt), GEN_INT (elt),
43920 GEN_INT (elt+4), GEN_INT (elt+4)));
43921 break;
43923 case 2:
43924 tmp = gen_reg_rtx (mode);
43925 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43926 break;
43928 default:
43929 gcc_unreachable ();
43931 vec = tmp;
43932 use_vec_extr = true;
43933 elt = 0;
43934 break;
43936 case E_V4SImode:
43937 use_vec_extr = TARGET_SSE4_1;
43938 if (use_vec_extr)
43939 break;
43941 if (TARGET_SSE2)
43943 switch (elt)
43945 case 0:
43946 tmp = vec;
43947 break;
43949 case 1:
43950 case 3:
43951 tmp = gen_reg_rtx (mode);
43952 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43953 GEN_INT (elt), GEN_INT (elt),
43954 GEN_INT (elt), GEN_INT (elt)));
43955 break;
43957 case 2:
43958 tmp = gen_reg_rtx (mode);
43959 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43960 break;
43962 default:
43963 gcc_unreachable ();
43965 vec = tmp;
43966 use_vec_extr = true;
43967 elt = 0;
43969 else
43971 /* For SSE1, we have to reuse the V4SF code. */
43972 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43973 gen_lowpart (V4SFmode, vec), elt);
43974 return;
43976 break;
43978 case E_V8HImode:
43979 use_vec_extr = TARGET_SSE2;
43980 break;
43981 case E_V4HImode:
43982 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43983 break;
43985 case E_V16QImode:
43986 use_vec_extr = TARGET_SSE4_1;
43987 break;
43989 case E_V8SFmode:
43990 if (TARGET_AVX)
43992 tmp = gen_reg_rtx (V4SFmode);
43993 if (elt < 4)
43994 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43995 else
43996 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43997 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43998 return;
44000 break;
44002 case E_V4DFmode:
44003 if (TARGET_AVX)
44005 tmp = gen_reg_rtx (V2DFmode);
44006 if (elt < 2)
44007 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44008 else
44009 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44010 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44011 return;
44013 break;
44015 case E_V32QImode:
44016 if (TARGET_AVX)
44018 tmp = gen_reg_rtx (V16QImode);
44019 if (elt < 16)
44020 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44021 else
44022 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44023 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44024 return;
44026 break;
44028 case E_V16HImode:
44029 if (TARGET_AVX)
44031 tmp = gen_reg_rtx (V8HImode);
44032 if (elt < 8)
44033 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44034 else
44035 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44036 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44037 return;
44039 break;
44041 case E_V8SImode:
44042 if (TARGET_AVX)
44044 tmp = gen_reg_rtx (V4SImode);
44045 if (elt < 4)
44046 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44047 else
44048 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44049 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44050 return;
44052 break;
44054 case E_V4DImode:
44055 if (TARGET_AVX)
44057 tmp = gen_reg_rtx (V2DImode);
44058 if (elt < 2)
44059 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44060 else
44061 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44062 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44063 return;
44065 break;
44067 case E_V32HImode:
44068 if (TARGET_AVX512BW)
44070 tmp = gen_reg_rtx (V16HImode);
44071 if (elt < 16)
44072 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44073 else
44074 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44075 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44076 return;
44078 break;
44080 case E_V64QImode:
44081 if (TARGET_AVX512BW)
44083 tmp = gen_reg_rtx (V32QImode);
44084 if (elt < 32)
44085 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44086 else
44087 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44088 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44089 return;
44091 break;
44093 case E_V16SFmode:
44094 tmp = gen_reg_rtx (V8SFmode);
44095 if (elt < 8)
44096 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44097 else
44098 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44099 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44100 return;
44102 case E_V8DFmode:
44103 tmp = gen_reg_rtx (V4DFmode);
44104 if (elt < 4)
44105 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44106 else
44107 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44108 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44109 return;
44111 case E_V16SImode:
44112 tmp = gen_reg_rtx (V8SImode);
44113 if (elt < 8)
44114 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44115 else
44116 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44117 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44118 return;
44120 case E_V8DImode:
44121 tmp = gen_reg_rtx (V4DImode);
44122 if (elt < 4)
44123 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44124 else
44125 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44126 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44127 return;
44129 case E_V8QImode:
44130 /* ??? Could extract the appropriate HImode element and shift. */
44131 default:
44132 break;
44135 if (use_vec_extr)
44137 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44138 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44140 /* Let the rtl optimizers know about the zero extension performed. */
44141 if (inner_mode == QImode || inner_mode == HImode)
44143 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44144 target = gen_lowpart (SImode, target);
44147 emit_insn (gen_rtx_SET (target, tmp));
44149 else
44151 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44153 emit_move_insn (mem, vec);
44155 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44156 emit_move_insn (target, tmp);
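/* The stack-temporary fallback just above is, conceptually, nothing more
   than the following GNU C (a hedged illustration only; v4si is a
   stand-in vector type, not something this file defines):

     typedef int v4si __attribute__ ((vector_size (16)));

     int
     extract_via_memory (v4si vec, int elt)
     {
       union { v4si v; int e[4]; } u;
       u.v = vec;
       return u.e[elt];
     }

   i.e. spill the whole vector to a stack slot and reload the selected
   element in its scalar (inner) mode.  */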
44160 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44161 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44162 The upper bits of DEST are undefined, though they shouldn't cause
44163 exceptions (some bits from src or all zeros are ok). */
44165 static void
44166 emit_reduc_half (rtx dest, rtx src, int i)
44168 rtx tem, d = dest;
44169 switch (GET_MODE (src))
44171 case E_V4SFmode:
44172 if (i == 128)
44173 tem = gen_sse_movhlps (dest, src, src);
44174 else
44175 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44176 GEN_INT (1 + 4), GEN_INT (1 + 4));
44177 break;
44178 case E_V2DFmode:
44179 tem = gen_vec_interleave_highv2df (dest, src, src);
44180 break;
44181 case E_V16QImode:
44182 case E_V8HImode:
44183 case E_V4SImode:
44184 case E_V2DImode:
44185 d = gen_reg_rtx (V1TImode);
44186 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44187 GEN_INT (i / 2));
44188 break;
44189 case E_V8SFmode:
44190 if (i == 256)
44191 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44192 else
44193 tem = gen_avx_shufps256 (dest, src, src,
44194 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44195 break;
44196 case E_V4DFmode:
44197 if (i == 256)
44198 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44199 else
44200 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44201 break;
44202 case E_V32QImode:
44203 case E_V16HImode:
44204 case E_V8SImode:
44205 case E_V4DImode:
44206 if (i == 256)
44208 if (GET_MODE (dest) != V4DImode)
44209 d = gen_reg_rtx (V4DImode);
44210 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44211 gen_lowpart (V4DImode, src),
44212 const1_rtx);
44214 else
44216 d = gen_reg_rtx (V2TImode);
44217 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44218 GEN_INT (i / 2));
44220 break;
44221 case E_V64QImode:
44222 case E_V32HImode:
44223 case E_V16SImode:
44224 case E_V16SFmode:
44225 case E_V8DImode:
44226 case E_V8DFmode:
44227 if (i > 128)
44228 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44229 gen_lowpart (V16SImode, src),
44230 gen_lowpart (V16SImode, src),
44231 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44232 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44233 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44234 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44235 GEN_INT (0xC), GEN_INT (0xD),
44236 GEN_INT (0xE), GEN_INT (0xF),
44237 GEN_INT (0x10), GEN_INT (0x11),
44238 GEN_INT (0x12), GEN_INT (0x13),
44239 GEN_INT (0x14), GEN_INT (0x15),
44240 GEN_INT (0x16), GEN_INT (0x17));
44241 else
44242 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44243 gen_lowpart (V16SImode, src),
44244 GEN_INT (i == 128 ? 0x2 : 0x1),
44245 GEN_INT (0x3),
44246 GEN_INT (0x3),
44247 GEN_INT (0x3),
44248 GEN_INT (i == 128 ? 0x6 : 0x5),
44249 GEN_INT (0x7),
44250 GEN_INT (0x7),
44251 GEN_INT (0x7),
44252 GEN_INT (i == 128 ? 0xA : 0x9),
44253 GEN_INT (0xB),
44254 GEN_INT (0xB),
44255 GEN_INT (0xB),
44256 GEN_INT (i == 128 ? 0xE : 0xD),
44257 GEN_INT (0xF),
44258 GEN_INT (0xF),
44259 GEN_INT (0xF));
44260 break;
44261 default:
44262 gcc_unreachable ();
44264 emit_insn (tem);
44265 if (d != dest)
44266 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44269 /* Expand a vector reduction. FN is the binary pattern to reduce;
44270 DEST is the destination; IN is the input vector. */
44272 void
44273 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44275 rtx half, dst, vec = in;
44276 machine_mode mode = GET_MODE (in);
44277 int i;
44279 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44280 if (TARGET_SSE4_1
44281 && mode == V8HImode
44282 && fn == gen_uminv8hi3)
44284 emit_insn (gen_sse4_1_phminposuw (dest, in));
44285 return;
44288 for (i = GET_MODE_BITSIZE (mode);
44289 i > GET_MODE_UNIT_BITSIZE (mode);
44290 i >>= 1)
44292 half = gen_reg_rtx (mode);
44293 emit_reduc_half (half, vec, i);
44294 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44295 dst = dest;
44296 else
44297 dst = gen_reg_rtx (mode);
44298 emit_insn (fn (dst, half, vec));
44299 vec = dst;
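/* The loop above is a log2(N) reduction: each emit_reduc_half call moves
   the upper half of the still-live lanes down, and FN folds it into the
   lower half, so an N-element vector needs only log2(N) vector
   operations.  A scalar sketch of the same idea (illustration only, not
   code used by this file):

     float
     reduce_add_v4 (float v[4])
     {
       v[0] += v[2];  v[1] += v[3];	/* fold upper 64 bits into lower */
       v[0] += v[1];			/* final pair */
       return v[0];
     }
*/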
44303 /* Target hook for scalar_mode_supported_p. */
44304 static bool
44305 ix86_scalar_mode_supported_p (scalar_mode mode)
44307 if (DECIMAL_FLOAT_MODE_P (mode))
44308 return default_decimal_float_supported_p ();
44309 else if (mode == TFmode)
44310 return true;
44311 else
44312 return default_scalar_mode_supported_p (mode);
44315 /* Implements target hook vector_mode_supported_p. */
44316 static bool
44317 ix86_vector_mode_supported_p (machine_mode mode)
44319 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44320 return true;
44321 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44322 return true;
44323 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44324 return true;
44325 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44326 return true;
44327 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44328 return true;
44329 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44330 return true;
44331 return false;
44334 /* Target hook for c_mode_for_suffix. */
44335 static machine_mode
44336 ix86_c_mode_for_suffix (char suffix)
44338 if (suffix == 'q')
44339 return TFmode;
44340 if (suffix == 'w')
44341 return XFmode;
44343 return VOIDmode;
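/* These suffixes correspond to the constant spellings the C-family front
   ends accept on x86 (illustrative only; see the "Additional Floating
   Types" documentation for the exact rules):

     __float128 q = 1.0q;	/* or 1.0Q, TFmode */
     __float80  w = 1.0w;	/* or 1.0W, XFmode */
*/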
44346 /* Worker function for TARGET_MD_ASM_ADJUST.
44348 We implement asm flag outputs, and maintain source compatibility
44349 with the old cc0-based compiler. */
44351 static rtx_insn *
44352 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44353 vec<const char *> &constraints,
44354 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44356 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44357 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44359 bool saw_asm_flag = false;
44361 start_sequence ();
44362 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44364 const char *con = constraints[i];
44365 if (strncmp (con, "=@cc", 4) != 0)
44366 continue;
44367 con += 4;
44368 if (strchr (con, ',') != NULL)
44370 error ("alternatives not allowed in asm flag output");
44371 continue;
44374 bool invert = false;
44375 if (con[0] == 'n')
44376 invert = true, con++;
44378 machine_mode mode = CCmode;
44379 rtx_code code = UNKNOWN;
44381 switch (con[0])
44383 case 'a':
44384 if (con[1] == 0)
44385 mode = CCAmode, code = EQ;
44386 else if (con[1] == 'e' && con[2] == 0)
44387 mode = CCCmode, code = NE;
44388 break;
44389 case 'b':
44390 if (con[1] == 0)
44391 mode = CCCmode, code = EQ;
44392 else if (con[1] == 'e' && con[2] == 0)
44393 mode = CCAmode, code = NE;
44394 break;
44395 case 'c':
44396 if (con[1] == 0)
44397 mode = CCCmode, code = EQ;
44398 break;
44399 case 'e':
44400 if (con[1] == 0)
44401 mode = CCZmode, code = EQ;
44402 break;
44403 case 'g':
44404 if (con[1] == 0)
44405 mode = CCGCmode, code = GT;
44406 else if (con[1] == 'e' && con[2] == 0)
44407 mode = CCGCmode, code = GE;
44408 break;
44409 case 'l':
44410 if (con[1] == 0)
44411 mode = CCGCmode, code = LT;
44412 else if (con[1] == 'e' && con[2] == 0)
44413 mode = CCGCmode, code = LE;
44414 break;
44415 case 'o':
44416 if (con[1] == 0)
44417 mode = CCOmode, code = EQ;
44418 break;
44419 case 'p':
44420 if (con[1] == 0)
44421 mode = CCPmode, code = EQ;
44422 break;
44423 case 's':
44424 if (con[1] == 0)
44425 mode = CCSmode, code = EQ;
44426 break;
44427 case 'z':
44428 if (con[1] == 0)
44429 mode = CCZmode, code = EQ;
44430 break;
44432 if (code == UNKNOWN)
44434 error ("unknown asm flag output %qs", constraints[i]);
44435 continue;
44437 if (invert)
44438 code = reverse_condition (code);
44440 rtx dest = outputs[i];
44441 if (!saw_asm_flag)
44443 /* This is the first asm flag output. Here we put the flags
44444 register in as the real output and adjust the condition to
44445 allow it. */
44446 constraints[i] = "=Bf";
44447 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44448 saw_asm_flag = true;
44450 else
44452 /* We don't need the flags register as output twice. */
44453 constraints[i] = "=X";
44454 outputs[i] = gen_rtx_SCRATCH (SImode);
44457 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44458 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44460 machine_mode dest_mode = GET_MODE (dest);
44461 if (!SCALAR_INT_MODE_P (dest_mode))
44463 error ("invalid type for asm flag output");
44464 continue;
44467 if (dest_mode == DImode && !TARGET_64BIT)
44468 dest_mode = SImode;
44470 if (dest_mode != QImode)
44472 rtx destqi = gen_reg_rtx (QImode);
44473 emit_insn (gen_rtx_SET (destqi, x));
44475 if (TARGET_ZERO_EXTEND_WITH_AND
44476 && optimize_function_for_speed_p (cfun))
44478 x = force_reg (dest_mode, const0_rtx);
44480 emit_insn (gen_movstrictqi
44481 (gen_lowpart (QImode, x), destqi));
44483 else
44484 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44487 if (dest_mode != GET_MODE (dest))
44489 rtx tmp = gen_reg_rtx (SImode);
44491 emit_insn (gen_rtx_SET (tmp, x));
44492 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44494 else
44495 emit_insn (gen_rtx_SET (dest, x));
44497 rtx_insn *seq = get_insns ();
44498 end_sequence ();
44500 if (saw_asm_flag)
44501 return seq;
44502 else
44504 /* If we had no asm flag outputs, clobber the flags. */
44505 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44506 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44507 return NULL;
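/* The "=@cc<cond>" constraints handled above are what user code writes to
   read a condition straight out of an asm's flags.  A minimal example
   (illustration only):

     int
     equal_p (unsigned a, unsigned b)
     {
       int eq;
       asm ("cmpl %2, %1" : "=@ccz" (eq) : "r" (a), "r" (b));
       return eq;	/* 1 if ZF was set by the cmp, 0 otherwise.  */
     }

   For the first such output the constraint becomes "=Bf" on the flags
   register itself, and a setcc plus zero-extension materializes the 0/1
   value into the output's real destination.  */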
44511 /* Implements the target hook targetm.asm.encode_section_info. */
44513 static void ATTRIBUTE_UNUSED
44514 ix86_encode_section_info (tree decl, rtx rtl, int first)
44516 default_encode_section_info (decl, rtl, first);
44518 if (ix86_in_large_data_p (decl))
44519 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44522 /* Worker function for REVERSE_CONDITION. */
44524 enum rtx_code
44525 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44527 return (mode == CCFPmode
44528 ? reverse_condition_maybe_unordered (code)
44529 : reverse_condition (code));
44532 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44533 to OPERANDS[0]. */
44535 const char *
44536 output_387_reg_move (rtx_insn *insn, rtx *operands)
44538 if (REG_P (operands[0]))
44540 if (REG_P (operands[1])
44541 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44543 if (REGNO (operands[0]) == FIRST_STACK_REG)
44544 return output_387_ffreep (operands, 0);
44545 return "fstp\t%y0";
44547 if (STACK_TOP_P (operands[0]))
44548 return "fld%Z1\t%y1";
44549 return "fst\t%y0";
44551 else if (MEM_P (operands[0]))
44553 gcc_assert (REG_P (operands[1]));
44554 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44555 return "fstp%Z0\t%y0";
44556 else
44558 /* There is no non-popping store to memory for XFmode.
44559 So if we need one, follow the store with a load. */
44560 if (GET_MODE (operands[0]) == XFmode)
44561 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44562 else
44563 return "fst%Z0\t%y0";
44566 else
44567 gcc_unreachable();
44570 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44571 FP status register is set. */
44573 void
44574 ix86_emit_fp_unordered_jump (rtx label)
44576 rtx reg = gen_reg_rtx (HImode);
44577 rtx temp;
44579 emit_insn (gen_x86_fnstsw_1 (reg));
44581 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44583 emit_insn (gen_x86_sahf_1 (reg));
44585 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44586 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44588 else
44590 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44592 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44593 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44596 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44597 gen_rtx_LABEL_REF (VOIDmode, label),
44598 pc_rtx);
44599 temp = gen_rtx_SET (pc_rtx, temp);
44601 emit_jump_insn (temp);
44602 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44605 /* Output code to perform a log1p XFmode calculation. */
44607 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44609 rtx_code_label *label1 = gen_label_rtx ();
44610 rtx_code_label *label2 = gen_label_rtx ();
44612 rtx tmp = gen_reg_rtx (XFmode);
44613 rtx tmp2 = gen_reg_rtx (XFmode);
44614 rtx test;
44616 emit_insn (gen_absxf2 (tmp, op1));
44617 test = gen_rtx_GE (VOIDmode, tmp,
44618 const_double_from_real_value (
44619 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44620 XFmode));
44621 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44623 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44624 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44625 emit_jump (label2);
44627 emit_label (label1);
44628 emit_move_insn (tmp, CONST1_RTX (XFmode));
44629 emit_insn (gen_addxf3 (tmp, op1, tmp));
44630 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44631 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44633 emit_label (label2);
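/* Background on the split above: the x87 FYL2XP1 instruction computes
   y * log2 (x + 1) but is only specified for |x| < 1 - sqrt(2)/2, and
   0.29289321881345247... is exactly that bound.  So for small |op1| the
   first arm uses

     log1p (x) = ln (2) * log2 (x + 1)		(fyl2xp1, more accurate)

   while the fallback arm forms 1 + x explicitly and uses FYL2X:

     log1p (x) = ln (2) * log2 (1 + x)

   with ln (2) loaded by fldln2 in both arms.  */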
44636 /* Output code for an i387 round calculation: op0 = round (op1),
    rounding halfway cases away from zero. */
44637 void ix86_emit_i387_round (rtx op0, rtx op1)
44639 machine_mode inmode = GET_MODE (op1);
44640 machine_mode outmode = GET_MODE (op0);
44641 rtx e1, e2, res, tmp, tmp1, half;
44642 rtx scratch = gen_reg_rtx (HImode);
44643 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44644 rtx_code_label *jump_label = gen_label_rtx ();
44645 rtx insn;
44646 rtx (*gen_abs) (rtx, rtx);
44647 rtx (*gen_neg) (rtx, rtx);
44649 switch (inmode)
44651 case E_SFmode:
44652 gen_abs = gen_abssf2;
44653 break;
44654 case E_DFmode:
44655 gen_abs = gen_absdf2;
44656 break;
44657 case E_XFmode:
44658 gen_abs = gen_absxf2;
44659 break;
44660 default:
44661 gcc_unreachable ();
44664 switch (outmode)
44666 case E_SFmode:
44667 gen_neg = gen_negsf2;
44668 break;
44669 case E_DFmode:
44670 gen_neg = gen_negdf2;
44671 break;
44672 case E_XFmode:
44673 gen_neg = gen_negxf2;
44674 break;
44675 case E_HImode:
44676 gen_neg = gen_neghi2;
44677 break;
44678 case E_SImode:
44679 gen_neg = gen_negsi2;
44680 break;
44681 case E_DImode:
44682 gen_neg = gen_negdi2;
44683 break;
44684 default:
44685 gcc_unreachable ();
44688 e1 = gen_reg_rtx (inmode);
44689 e2 = gen_reg_rtx (inmode);
44690 res = gen_reg_rtx (outmode);
44692 half = const_double_from_real_value (dconsthalf, inmode);
44694 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44696 /* scratch = fxam(op1) */
44697 emit_insn (gen_rtx_SET (scratch,
44698 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44699 UNSPEC_FXAM)));
44700 /* e1 = fabs(op1) */
44701 emit_insn (gen_abs (e1, op1));
44703 /* e2 = e1 + 0.5 */
44704 half = force_reg (inmode, half);
44705 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44707 /* res = floor(e2) */
44708 if (inmode != XFmode)
44710 tmp1 = gen_reg_rtx (XFmode);
44712 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44714 else
44715 tmp1 = e2;
44717 switch (outmode)
44719 case E_SFmode:
44720 case E_DFmode:
44722 rtx tmp0 = gen_reg_rtx (XFmode);
44724 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44726 emit_insn (gen_rtx_SET (res,
44727 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44728 UNSPEC_TRUNC_NOOP)));
44730 break;
44731 case E_XFmode:
44732 emit_insn (gen_frndintxf2_floor (res, tmp1));
44733 break;
44734 case E_HImode:
44735 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44736 break;
44737 case E_SImode:
44738 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44739 break;
44740 case E_DImode:
44741 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44742 break;
44743 default:
44744 gcc_unreachable ();
44747 /* flags = signbit(a) */
44748 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44750 /* if (flags) then res = -res */
44751 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44752 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44753 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44754 pc_rtx);
44755 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44756 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44757 JUMP_LABEL (insn) = jump_label;
44759 emit_insn (gen_neg (res, res));
44761 emit_label (jump_label);
44762 LABEL_NUSES (jump_label) = 1;
44764 emit_move_insn (op0, res);
44767 /* Output code to perform a Newton-Raphson approximation of a single precision
44768 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44770 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44772 rtx x0, x1, e0, e1;
44774 x0 = gen_reg_rtx (mode);
44775 e0 = gen_reg_rtx (mode);
44776 e1 = gen_reg_rtx (mode);
44777 x1 = gen_reg_rtx (mode);
44779 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
44781 b = force_reg (mode, b);
44783 /* x0 = rcp(b) estimate */
44784 if (mode == V16SFmode || mode == V8DFmode)
44786 if (TARGET_AVX512ER)
44788 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44789 UNSPEC_RCP28)));
44790 /* res = a * x0 */
44791 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44792 return;
44794 else
44795 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44796 UNSPEC_RCP14)));
44798 else
44799 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44800 UNSPEC_RCP)));
44802 /* e0 = x0 * b */
44803 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44805 /* e0 = x0 * e0 */
44806 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44808 /* e1 = x0 + x0 */
44809 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44811 /* x1 = e1 - e0 */
44812 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44814 /* res = a * x1 */
44815 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
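/* Derivation of the sequence above: one Newton-Raphson step for
   f(x) = 1/x - b refines a reciprocal estimate x0 as

     x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0

   which is exactly e1 - e0 (e1 = x0 + x0, e0 = b * x0 * x0), and the
   final multiply by a gives a/b.  The rcp estimate is good to roughly
   12 bits, so a single step reaches approximately single precision; the
   RCP28 path skips the step because that estimate is already accurate
   enough.  */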
44818 /* Output code to perform a Newton-Raphson approximation of a
44819 single precision floating point [reciprocal] square root. */
44821 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44823 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44824 REAL_VALUE_TYPE r;
44825 int unspec;
44827 x0 = gen_reg_rtx (mode);
44828 e0 = gen_reg_rtx (mode);
44829 e1 = gen_reg_rtx (mode);
44830 e2 = gen_reg_rtx (mode);
44831 e3 = gen_reg_rtx (mode);
44833 if (TARGET_AVX512ER && mode == V16SFmode)
44835 if (recip)
44836 /* res = rsqrt28(a) estimate */
44837 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44838 UNSPEC_RSQRT28)));
44839 else
44841 /* x0 = rsqrt28(a) estimate */
44842 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44843 UNSPEC_RSQRT28)));
44844 /* res = rcp28(x0) estimate */
44845 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44846 UNSPEC_RCP28)));
44848 return;
44851 real_from_integer (&r, VOIDmode, -3, SIGNED);
44852 mthree = const_double_from_real_value (r, SFmode);
44854 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44855 mhalf = const_double_from_real_value (r, SFmode);
44856 unspec = UNSPEC_RSQRT;
44858 if (VECTOR_MODE_P (mode))
44860 mthree = ix86_build_const_vector (mode, true, mthree);
44861 mhalf = ix86_build_const_vector (mode, true, mhalf);
44862 /* There is no 512-bit rsqrt. There is however rsqrt14. */
44863 if (GET_MODE_SIZE (mode) == 64)
44864 unspec = UNSPEC_RSQRT14;
44867 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44868 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
44870 a = force_reg (mode, a);
44872 /* x0 = rsqrt(a) estimate */
44873 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44874 unspec)));
44876 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
44877 if (!recip)
44879 rtx zero = force_reg (mode, CONST0_RTX(mode));
44880 rtx mask;
44882 /* Handle masked compare. */
44883 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44885 mask = gen_reg_rtx (HImode);
44886 /* Imm value 0x4 corresponds to not-equal comparison. */
44887 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44888 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44890 else
44892 mask = gen_reg_rtx (mode);
44893 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44894 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44898 /* e0 = x0 * a */
44899 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44900 /* e1 = e0 * x0 */
44901 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44903 /* e2 = e1 - 3. */
44904 mthree = force_reg (mode, mthree);
44905 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44907 mhalf = force_reg (mode, mhalf);
44908 if (recip)
44909 /* e3 = -.5 * x0 */
44910 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44911 else
44912 /* e3 = -.5 * e0 */
44913 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44914 /* ret = e2 * e3 */
44915 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
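/* Derivation of the sequence above: one Newton-Raphson step for
   f(x) = 1/x**2 - a refines an rsqrt estimate x0 as

     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)

   which is e3 * e2 in the recip case (e3 = -0.5 * x0, e2 = a*x0*x0 - 3).
   For sqrt the -0.5 is applied to e0 = a * x0 instead, using the
   identity sqrt (a) = a * rsqrt (a).  */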
44918 #ifdef TARGET_SOLARIS
44919 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44921 static void
44922 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44923 tree decl)
44925 /* With Binutils 2.15, the "@unwind" marker must be specified on
44926 every occurrence of the ".eh_frame" section, not just the first
44927 one. */
44928 if (TARGET_64BIT
44929 && strcmp (name, ".eh_frame") == 0)
44931 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44932 flags & SECTION_WRITE ? "aw" : "a");
44933 return;
44936 #ifndef USE_GAS
44937 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44939 solaris_elf_asm_comdat_section (name, flags, decl);
44940 return;
44942 #endif
44944 default_elf_asm_named_section (name, flags, decl);
44946 #endif /* TARGET_SOLARIS */
44948 /* Return the mangling of TYPE if it is an extended fundamental type. */
44950 static const char *
44951 ix86_mangle_type (const_tree type)
44953 type = TYPE_MAIN_VARIANT (type);
44955 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44956 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44957 return NULL;
44959 switch (TYPE_MODE (type))
44961 case E_TFmode:
44962 /* __float128 is "g". */
44963 return "g";
44964 case E_XFmode:
44965 /* "long double" or __float80 is "e". */
44966 return "e";
44967 default:
44968 return NULL;
44972 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
44974 static tree
44975 ix86_stack_protect_guard (void)
44977 if (TARGET_SSP_TLS_GUARD)
44979 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
44980 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
44981 tree type = build_qualified_type (type_node, qual);
44982 tree t;
44984 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
44986 t = ix86_tls_stack_chk_guard_decl;
44988 if (t == NULL)
44990 rtx x;
44992 t = build_decl
44993 (UNKNOWN_LOCATION, VAR_DECL,
44994 get_identifier (ix86_stack_protector_guard_symbol_str),
44995 type);
44996 TREE_STATIC (t) = 1;
44997 TREE_PUBLIC (t) = 1;
44998 DECL_EXTERNAL (t) = 1;
44999 TREE_USED (t) = 1;
45000 TREE_THIS_VOLATILE (t) = 1;
45001 DECL_ARTIFICIAL (t) = 1;
45002 DECL_IGNORED_P (t) = 1;
45004 /* Do not share RTL as the declaration is visible outside of
45005 current function. */
45006 x = DECL_RTL (t);
45007 RTX_FLAG (x, used) = 1;
45009 ix86_tls_stack_chk_guard_decl = t;
45012 else
45014 tree asptrtype = build_pointer_type (type);
45016 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45017 t = build2 (MEM_REF, asptrtype, t,
45018 build_int_cst (asptrtype, 0));
45021 return t;
45024 return default_stack_protect_guard ();
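/* For reference, the TLS guard built here is the usual canary load,
   e.g. on GNU/Linux

     movq %fs:40, %rax	 # 64-bit default
     movl %gs:20, %eax	 # 32-bit default

   and the register, offset and symbol pieces above are what the
   -mstack-protector-guard-reg=, -mstack-protector-guard-offset= and
   -mstack-protector-guard-symbol= options (spellings as assumed here;
   see the invoke documentation) let the user override.  */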
45027 /* For 32-bit code we can save PIC register setup by using
45028 __stack_chk_fail_local hidden function instead of calling
45029 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
45030 register, so it is better to call __stack_chk_fail directly. */
45032 static tree ATTRIBUTE_UNUSED
45033 ix86_stack_protect_fail (void)
45035 return TARGET_64BIT
45036 ? default_external_stack_protect_fail ()
45037 : default_hidden_stack_protect_fail ();
45040 /* Select a format to encode pointers in exception handling data. CODE
45041 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45042 true if the symbol may be affected by dynamic relocations.
45044 ??? All x86 object file formats are capable of representing this.
45045 After all, the relocation needed is the same as for the call insn.
45046 Whether or not a particular assembler allows us to enter such, I
45047 guess we'll have to see. */
45048 int
45049 asm_preferred_eh_data_format (int code, int global)
45051 if (flag_pic)
45053 int type = DW_EH_PE_sdata8;
45054 if (!TARGET_64BIT
45055 || ix86_cmodel == CM_SMALL_PIC
45056 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45057 type = DW_EH_PE_sdata4;
45058 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45060 if (ix86_cmodel == CM_SMALL
45061 || (ix86_cmodel == CM_MEDIUM && code))
45062 return DW_EH_PE_udata4;
45063 return DW_EH_PE_absptr;
45066 /* Expand copysign from SIGN to the positive value ABS_VALUE
45067 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
45068 the sign-bit. */
45069 static void
45070 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45072 machine_mode mode = GET_MODE (sign);
45073 rtx sgn = gen_reg_rtx (mode);
45074 if (mask == NULL_RTX)
45076 machine_mode vmode;
45078 if (mode == SFmode)
45079 vmode = V4SFmode;
45080 else if (mode == DFmode)
45081 vmode = V2DFmode;
45082 else
45083 vmode = mode;
45085 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45086 if (!VECTOR_MODE_P (mode))
45088 /* We need to generate a scalar mode mask in this case. */
45089 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45090 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45091 mask = gen_reg_rtx (mode);
45092 emit_insn (gen_rtx_SET (mask, tmp));
45095 else
45096 mask = gen_rtx_NOT (mode, mask);
45097 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45098 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
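/* Bit-level effect of the helper above, in scalar terms (hedged
   illustration with a hypothetical helper, not code from this file;
   assumes a 64-bit double and that ABS_VALUE already has a clear sign
   bit):

     double
     copysign_to_positive (double abs_value, double sign)
     {
       unsigned long long av, s, signbit = 1ULL << 63;
       __builtin_memcpy (&av, &abs_value, 8);
       __builtin_memcpy (&s, &sign, 8);
       av |= s & signbit;	/* OR in the sign bit of SIGN.  */
       __builtin_memcpy (&abs_value, &av, 8);
       return abs_value;
     }
*/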
45101 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45102 mask for masking out the sign-bit is stored in *SMASK, if that is
45103 non-null. */
45104 static rtx
45105 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45107 machine_mode vmode, mode = GET_MODE (op0);
45108 rtx xa, mask;
45110 xa = gen_reg_rtx (mode);
45111 if (mode == SFmode)
45112 vmode = V4SFmode;
45113 else if (mode == DFmode)
45114 vmode = V2DFmode;
45115 else
45116 vmode = mode;
45117 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45118 if (!VECTOR_MODE_P (mode))
45120 /* We need to generate a scalar mode mask in this case. */
45121 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45122 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45123 mask = gen_reg_rtx (mode);
45124 emit_insn (gen_rtx_SET (mask, tmp));
45126 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45128 if (smask)
45129 *smask = mask;
45131 return xa;
45134 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45135 swapping the operands if SWAP_OPERANDS is true. The expanded
45136 code is a forward jump to a newly created label in case the
45137 comparison is true. The generated label rtx is returned. */
45138 static rtx_code_label *
45139 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45140 bool swap_operands)
45142 bool unordered_compare = ix86_unordered_fp_compare (code);
45143 rtx_code_label *label;
45144 rtx tmp, reg;
45146 if (swap_operands)
45147 std::swap (op0, op1);
45149 label = gen_label_rtx ();
45150 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45151 if (unordered_compare)
45152 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45153 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45154 emit_insn (gen_rtx_SET (reg, tmp));
45155 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45156 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45157 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45158 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45159 JUMP_LABEL (tmp) = label;
45161 return label;
45164 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45165 using comparison code CODE. Operands are swapped for the comparison if
45166 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45167 static rtx
45168 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45169 bool swap_operands)
45171 rtx (*insn)(rtx, rtx, rtx, rtx);
45172 machine_mode mode = GET_MODE (op0);
45173 rtx mask = gen_reg_rtx (mode);
45175 if (swap_operands)
45176 std::swap (op0, op1);
45178 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45180 emit_insn (insn (mask, op0, op1,
45181 gen_rtx_fmt_ee (code, mode, op0, op1)));
45182 return mask;
45185 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45186 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45187 static rtx
45188 ix86_gen_TWO52 (machine_mode mode)
45190 REAL_VALUE_TYPE TWO52r;
45191 rtx TWO52;
45193 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45194 TWO52 = const_double_from_real_value (TWO52r, mode);
45195 TWO52 = force_reg (mode, TWO52);
45197 return TWO52;
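/* Why 2**52: a double carries a 52-bit mantissa, so for 0 <= x < 2**52
   the sum x + 2**52 has no fractional bits left; the addition itself
   rounds x to an integer in the current rounding mode, and subtracting
   2**52 again yields that integer as a double.  For SFmode the analogous
   constant is 2**23.  A scalar sketch (illustration only; the volatile
   guards against -ffast-math reassociating the add/sub away):

     double
     round_current_mode (double x)	/* requires 0 <= x < 2**52 */
     {
       volatile double t = x + 0x1p52;
       return t - 0x1p52;
     }
*/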
45200 /* Expand SSE sequence for computing lround from OP1 storing
45201 into OP0. */
45202 void
45203 ix86_expand_lround (rtx op0, rtx op1)
45205 /* C code for the stuff we're doing below:
45206 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45207 return (long)tmp;
45209 machine_mode mode = GET_MODE (op1);
45210 const struct real_format *fmt;
45211 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45212 rtx adj;
45214 /* load nextafter (0.5, 0.0) */
45215 fmt = REAL_MODE_FORMAT (mode);
45216 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45217 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45219 /* adj = copysign (0.5, op1) */
45220 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45221 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45223 /* adj = op1 + adj */
45224 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45226 /* op0 = (imode)adj */
45227 expand_fix (op0, adj, 0);
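/* Why nextafter (0.5, 0.0) rather than 0.5: for the largest double below
   0.5 (0.49999999999999994), adding exactly 0.5 rounds up to 1.0, so the
   truncation would return 1 instead of 0.  Adding the predecessor of 0.5
   avoids that, while 0.5 itself still reaches 1.0 in the addition (the
   halfway sum rounds to even, i.e. to 1.0).  The same constant is used
   again by ix86_expand_round and ix86_expand_round_sse4 below.  */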
45230 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
45231 into OPERAND0. */
45232 void
45233 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45235 /* C code for the stuff we're doing below (for do_floor):
45236 xi = (long)op1;
45237 xi -= (double)xi > op1 ? 1 : 0;
45238 return xi;
45240 machine_mode fmode = GET_MODE (op1);
45241 machine_mode imode = GET_MODE (op0);
45242 rtx ireg, freg, tmp;
45243 rtx_code_label *label;
45245 /* reg = (long)op1 */
45246 ireg = gen_reg_rtx (imode);
45247 expand_fix (ireg, op1, 0);
45249 /* freg = (double)reg */
45250 freg = gen_reg_rtx (fmode);
45251 expand_float (freg, ireg, 0);
45253 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45254 label = ix86_expand_sse_compare_and_jump (UNLE,
45255 freg, op1, !do_floor);
45256 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45257 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45258 emit_move_insn (ireg, tmp);
45260 emit_label (label);
45261 LABEL_NUSES (label) = 1;
45263 emit_move_insn (op0, ireg);
45266 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45267 void
45268 ix86_expand_rint (rtx operand0, rtx operand1)
45270 /* C code for the stuff we're doing below:
45271 xa = fabs (operand1);
45272 if (!isless (xa, 2**52))
45273 return operand1;
45274 two52 = 2**52;
45275 if (flag_rounding_math)
45277 two52 = copysign (two52, operand1);
45278 xa = operand1;
45280 xa = xa + two52 - two52;
45281 return copysign (xa, operand1);
45283 machine_mode mode = GET_MODE (operand0);
45284 rtx res, xa, TWO52, two52, mask;
45285 rtx_code_label *label;
45287 res = gen_reg_rtx (mode);
45288 emit_move_insn (res, operand1);
45290 /* xa = abs (operand1) */
45291 xa = ix86_expand_sse_fabs (res, &mask);
45293 /* if (!isless (xa, TWO52)) goto label; */
45294 TWO52 = ix86_gen_TWO52 (mode);
45295 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45297 two52 = TWO52;
45298 if (flag_rounding_math)
45300 two52 = gen_reg_rtx (mode);
45301 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45302 xa = res;
45305 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45306 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45308 ix86_sse_copysign_to_positive (res, xa, res, mask);
45310 emit_label (label);
45311 LABEL_NUSES (label) = 1;
45313 emit_move_insn (operand0, res);
45316 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45317 into OPERAND0. */
45318 void
45319 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45321 /* C code for the stuff we expand below.
45322 double xa = fabs (x), x2;
45323 if (!isless (xa, TWO52))
45324 return x;
45325 xa = xa + TWO52 - TWO52;
45326 x2 = copysign (xa, x);
45327 Compensate. Floor:
45328 if (x2 > x)
45329 x2 -= 1;
45330 Compensate. Ceil:
45331 if (x2 < x)
45332 x2 -= -1;
45333 return x2;
45335 machine_mode mode = GET_MODE (operand0);
45336 rtx xa, TWO52, tmp, one, res, mask;
45337 rtx_code_label *label;
45339 TWO52 = ix86_gen_TWO52 (mode);
45341 /* Temporary for holding the result, initialized to the input
45342 operand to ease control flow. */
45343 res = gen_reg_rtx (mode);
45344 emit_move_insn (res, operand1);
45346 /* xa = abs (operand1) */
45347 xa = ix86_expand_sse_fabs (res, &mask);
45349 /* if (!isless (xa, TWO52)) goto label; */
45350 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45352 /* xa = xa + TWO52 - TWO52; */
45353 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45354 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45356 /* xa = copysign (xa, operand1) */
45357 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45359 /* generate 1.0 or -1.0 */
45360 one = force_reg (mode,
45361 const_double_from_real_value (do_floor
45362 ? dconst1 : dconstm1, mode));
45364 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45365 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45366 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45367 /* We always need to subtract here to preserve signed zero. */
45368 tmp = expand_simple_binop (mode, MINUS,
45369 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45370 emit_move_insn (res, tmp);
45372 emit_label (label);
45373 LABEL_NUSES (label) = 1;
45375 emit_move_insn (operand0, res);
45378 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45379 into OPERAND0. */
45380 void
45381 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45383 /* C code for the stuff we expand below.
45384 double xa = fabs (x), x2;
45385 if (!isless (xa, TWO52))
45386 return x;
45387 x2 = (double)(long)x;
45388 Compensate. Floor:
45389 if (x2 > x)
45390 x2 -= 1;
45391 Compensate. Ceil:
45392 if (x2 < x)
45393 x2 += 1;
45394 if (HONOR_SIGNED_ZEROS (mode))
45395 return copysign (x2, x);
45396 return x2;
45398 machine_mode mode = GET_MODE (operand0);
45399 rtx xa, xi, TWO52, tmp, one, res, mask;
45400 rtx_code_label *label;
45402 TWO52 = ix86_gen_TWO52 (mode);
45404 /* Temporary for holding the result, initialized to the input
45405 operand to ease control flow. */
45406 res = gen_reg_rtx (mode);
45407 emit_move_insn (res, operand1);
45409 /* xa = abs (operand1) */
45410 xa = ix86_expand_sse_fabs (res, &mask);
45412 /* if (!isless (xa, TWO52)) goto label; */
45413 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45415 /* xa = (double)(long)x */
45416 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45417 expand_fix (xi, res, 0);
45418 expand_float (xa, xi, 0);
45420 /* generate 1.0 */
45421 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45423 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45424 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45425 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45426 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45427 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45428 emit_move_insn (res, tmp);
45430 if (HONOR_SIGNED_ZEROS (mode))
45431 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45433 emit_label (label);
45434 LABEL_NUSES (label) = 1;
45436 emit_move_insn (operand0, res);
45439 /* Expand SSE sequence for computing round from OPERAND1 storing
45440 into OPERAND0. Sequence that works without relying on DImode truncation
45441 via cvttsd2siq, which is only available on 64-bit targets. */
45442 void
45443 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45445 /* C code for the stuff we expand below.
45446 double xa = fabs (x), xa2, x2;
45447 if (!isless (xa, TWO52))
45448 return x;
45449 Using the absolute value and copying back sign makes
45450 -0.0 -> -0.0 correct.
45451 xa2 = xa + TWO52 - TWO52;
45452 Compensate.
45453 dxa = xa2 - xa;
45454 if (dxa <= -0.5)
45455 xa2 += 1;
45456 else if (dxa > 0.5)
45457 xa2 -= 1;
45458 x2 = copysign (xa2, x);
45459 return x2;
45461 machine_mode mode = GET_MODE (operand0);
45462 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45463 rtx_code_label *label;
45465 TWO52 = ix86_gen_TWO52 (mode);
45467 /* Temporary for holding the result, initialized to the input
45468 operand to ease control flow. */
45469 res = gen_reg_rtx (mode);
45470 emit_move_insn (res, operand1);
45472 /* xa = abs (operand1) */
45473 xa = ix86_expand_sse_fabs (res, &mask);
45475 /* if (!isless (xa, TWO52)) goto label; */
45476 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45478 /* xa2 = xa + TWO52 - TWO52; */
45479 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45480 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45482 /* dxa = xa2 - xa; */
45483 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45485 /* generate 0.5, 1.0 and -0.5 */
45486 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45487 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45488 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45489 0, OPTAB_DIRECT);
45491 /* Compensate. */
45492 tmp = gen_reg_rtx (mode);
45493 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45494 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45495 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45496 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45497 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45498 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45499 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45500 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45502 /* res = copysign (xa2, operand1) */
45503 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45505 emit_label (label);
45506 LABEL_NUSES (label) = 1;
45508 emit_move_insn (operand0, res);
45511 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45512 into OPERAND0. */
45513 void
45514 ix86_expand_trunc (rtx operand0, rtx operand1)
45516 /* C code for SSE variant we expand below.
45517 double xa = fabs (x), x2;
45518 if (!isless (xa, TWO52))
45519 return x;
45520 x2 = (double)(long)x;
45521 if (HONOR_SIGNED_ZEROS (mode))
45522 return copysign (x2, x);
45523 return x2;
45525 machine_mode mode = GET_MODE (operand0);
45526 rtx xa, xi, TWO52, res, mask;
45527 rtx_code_label *label;
45529 TWO52 = ix86_gen_TWO52 (mode);
45531 /* Temporary for holding the result, initialized to the input
45532 operand to ease control flow. */
45533 res = gen_reg_rtx (mode);
45534 emit_move_insn (res, operand1);
45536 /* xa = abs (operand1) */
45537 xa = ix86_expand_sse_fabs (res, &mask);
45539 /* if (!isless (xa, TWO52)) goto label; */
45540 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45542 /* x = (double)(long)x */
45543 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45544 expand_fix (xi, res, 0);
45545 expand_float (res, xi, 0);
45547 if (HONOR_SIGNED_ZEROS (mode))
45548 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45550 emit_label (label);
45551 LABEL_NUSES (label) = 1;
45553 emit_move_insn (operand0, res);
45556 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45557 into OPERAND0. */
45558 void
45559 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45561 machine_mode mode = GET_MODE (operand0);
45562 rtx xa, mask, TWO52, one, res, smask, tmp;
45563 rtx_code_label *label;
45565 /* C code for SSE variant we expand below.
45566 double xa = fabs (x), x2;
45567 if (!isless (xa, TWO52))
45568 return x;
45569 xa2 = xa + TWO52 - TWO52;
45570 Compensate:
45571 if (xa2 > xa)
45572 xa2 -= 1.0;
45573 x2 = copysign (xa2, x);
45574 return x2;
45577 TWO52 = ix86_gen_TWO52 (mode);
45579 /* Temporary for holding the result, initialized to the input
45580 operand to ease control flow. */
45581 res = gen_reg_rtx (mode);
45582 emit_move_insn (res, operand1);
45584 /* xa = abs (operand1) */
45585 xa = ix86_expand_sse_fabs (res, &smask);
45587 /* if (!isless (xa, TWO52)) goto label; */
45588 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45590 /* res = xa + TWO52 - TWO52; */
45591 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45592 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45593 emit_move_insn (res, tmp);
45595 /* generate 1.0 */
45596 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45598 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45599 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45600 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45601 tmp = expand_simple_binop (mode, MINUS,
45602 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45603 emit_move_insn (res, tmp);
45605 /* res = copysign (res, operand1) */
45606 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45608 emit_label (label);
45609 LABEL_NUSES (label) = 1;
45611 emit_move_insn (operand0, res);
45614 /* Expand SSE sequence for computing round from OPERAND1 storing
45615 into OPERAND0. */
45616 void
45617 ix86_expand_round (rtx operand0, rtx operand1)
45619 /* C code for the stuff we're doing below:
45620 double xa = fabs (x);
45621 if (!isless (xa, TWO52))
45622 return x;
45623 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45624 return copysign (xa, x);
45626 machine_mode mode = GET_MODE (operand0);
45627 rtx res, TWO52, xa, xi, half, mask;
45628 rtx_code_label *label;
45629 const struct real_format *fmt;
45630 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45632 /* Temporary for holding the result, initialized to the input
45633 operand to ease control flow. */
45634 res = gen_reg_rtx (mode);
45635 emit_move_insn (res, operand1);
45637 TWO52 = ix86_gen_TWO52 (mode);
45638 xa = ix86_expand_sse_fabs (res, &mask);
45639 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45641 /* load nextafter (0.5, 0.0) */
45642 fmt = REAL_MODE_FORMAT (mode);
45643 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45644 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45646 /* xa = xa + 0.5 */
45647 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45648 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45650 /* xa = (double)(int64_t)xa */
45651 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45652 expand_fix (xi, xa, 0);
45653 expand_float (xa, xi, 0);
45655 /* res = copysign (xa, operand1) */
45656 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45658 emit_label (label);
45659 LABEL_NUSES (label) = 1;
45661 emit_move_insn (operand0, res);
45664 /* Expand SSE sequence for computing round
45665 from OP1 storing into OP0 using sse4 round insn. */
45666 void
45667 ix86_expand_round_sse4 (rtx op0, rtx op1)
45669 machine_mode mode = GET_MODE (op0);
45670 rtx e1, e2, res, half;
45671 const struct real_format *fmt;
45672 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45673 rtx (*gen_copysign) (rtx, rtx, rtx);
45674 rtx (*gen_round) (rtx, rtx, rtx);
45676 switch (mode)
45678 case E_SFmode:
45679 gen_copysign = gen_copysignsf3;
45680 gen_round = gen_sse4_1_roundsf2;
45681 break;
45682 case E_DFmode:
45683 gen_copysign = gen_copysigndf3;
45684 gen_round = gen_sse4_1_rounddf2;
45685 break;
45686 default:
45687 gcc_unreachable ();
45690 /* round (a) = trunc (a + copysign (0.5, a)) */
45692 /* load nextafter (0.5, 0.0) */
45693 fmt = REAL_MODE_FORMAT (mode);
45694 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45695 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45696 half = const_double_from_real_value (pred_half, mode);
45698 /* e1 = copysign (0.5, op1) */
45699 e1 = gen_reg_rtx (mode);
45700 emit_insn (gen_copysign (e1, half, op1));
45702 /* e2 = op1 + e1 */
45703 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45705 /* res = trunc (e2) */
45706 res = gen_reg_rtx (mode);
45707 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45709 emit_move_insn (op0, res);
45713 /* Table of valid machine attributes. */
45714 static const struct attribute_spec ix86_attribute_table[] =
45716 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
45717 affects_type_identity, handler, exclude } */
45718 /* Stdcall attribute says callee is responsible for popping arguments
45719 if they are not variable. */
45720 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45721 NULL },
45722 /* Fastcall attribute says callee is responsible for popping arguments
45723 if they are not variable. */
45724 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45725 NULL },
45726 /* Thiscall attribute says callee is responsible for popping arguments
45727 if they are not variable. */
45728 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45729 NULL },
45730 /* Cdecl attribute says the callee is a normal C declaration */
45731 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45732 NULL },
45733 /* Regparm attribute specifies how many integer arguments are to be
45734 passed in registers. */
45735 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
45736 NULL },
45737 /* Sseregparm attribute says we are using x86_64 calling conventions
45738 for FP arguments. */
45739 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45740 NULL },
45741 /* The transactional memory builtins are implicitly regparm or fastcall
45742 depending on the ABI. Override the generic do-nothing attribute that
45743 these builtins were declared with. */
45744 { "*tm regparm", 0, 0, false, true, true, true,
45745 ix86_handle_tm_regparm_attribute, NULL },
45746 /* force_align_arg_pointer says this function realigns the stack at entry. */
45747 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45748 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
45749 NULL },
45750 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45751 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
45752 NULL },
45753 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
45754 NULL },
45755 { "shared", 0, 0, true, false, false, false,
45756 ix86_handle_shared_attribute, NULL },
45757 #endif
45758 { "ms_struct", 0, 0, false, false, false, false,
45759 ix86_handle_struct_attribute, NULL },
45760 { "gcc_struct", 0, 0, false, false, false, false,
45761 ix86_handle_struct_attribute, NULL },
45762 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45763 SUBTARGET_ATTRIBUTE_TABLE,
45764 #endif
45765 /* ms_abi and sysv_abi calling convention function attributes. */
45766 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
45767 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
45768 NULL },
45769 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45770 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45771 { "ms_hook_prologue", 0, 0, true, false, false, false,
45772 ix86_handle_fndecl_attribute, NULL },
45773 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
45774 ix86_handle_callee_pop_aggregate_return, NULL },
45775 { "interrupt", 0, 0, false, true, true, false,
45776 ix86_handle_interrupt_attribute, NULL },
45777 { "no_caller_saved_registers", 0, 0, false, true, true, false,
45778 ix86_handle_no_caller_saved_registers_attribute, NULL },
45779 { "naked", 0, 0, true, false, false, false,
45780 ix86_handle_fndecl_attribute, NULL },
45781 { "indirect_branch", 1, 1, true, false, false, false,
45782 ix86_handle_fndecl_attribute, NULL },
45783 { "function_return", 1, 1, true, false, false, false,
45784 ix86_handle_fndecl_attribute, NULL },
45786 /* End element. */
45787 { NULL, 0, 0, false, false, false, false, NULL, NULL }
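/* A couple of the attributes above as they appear in user code
   (illustration only, not an exhaustive list):

     __attribute__ ((regparm (3)))	/* first three int args in registers */
     int sum3 (int a, int b, int c) { return a + b + c; }

     __attribute__ ((ms_abi))		/* force the Microsoft x86_64 ABI */
     int win_callee (int x);

   Each table entry's handler validates where the attribute may appear
   (declaration vs. type) and how many arguments it takes.  */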
45790 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45791 static int
45792 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45793 tree vectype, int)
45795 bool fp = false;
45796 machine_mode mode = TImode;
45797 int index;
45798 if (vectype != NULL)
45800 fp = FLOAT_TYPE_P (vectype);
45801 mode = TYPE_MODE (vectype);
45804 switch (type_of_cost)
45806 case scalar_stmt:
45807 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
45809 case scalar_load:
45810 /* Load/store costs are relative to a register move, which costs 2. Recompute
45811 them to COSTS_N_INSNS so everything has the same base. */
45812 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
45813 : ix86_cost->int_load [2]) / 2;
45815 case scalar_store:
45816 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
45817 : ix86_cost->int_store [2]) / 2;
45819 case vector_stmt:
45820 return ix86_vec_cost (mode,
45821 fp ? ix86_cost->addss : ix86_cost->sse_op,
45822 true);
45824 case vector_load:
45825 index = sse_store_index (mode);
45826 /* See PR82713 - we may end up being called on a non-vector type. */
45827 if (index < 0)
45828 index = 2;
45829 return ix86_vec_cost (mode,
45830 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
45831 true);
45833 case vector_store:
45834 index = sse_store_index (mode);
45835 /* See PR82713 - we may end up being called on a non-vector type. */
45836 if (index < 0)
45837 index = 2;
45838 return ix86_vec_cost (mode,
45839 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
45840 true);
45842 case vec_to_scalar:
45843 case scalar_to_vec:
45844 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
45846 /* We should have separate costs for unaligned loads and gather/scatter.
45847 Do that incrementally. */
45848 case unaligned_load:
45849 index = sse_store_index (mode);
45851 /* See PR82713 - we may end up being called on a non-vector type. */
45851 if (index < 0)
45852 index = 2;
45853 return ix86_vec_cost (mode,
45854 COSTS_N_INSNS
45855 (ix86_cost->sse_unaligned_load[index]) / 2,
45856 true);
45858 case unaligned_store:
45859 index = sse_store_index (mode);
45861 /* See PR82713 - we may end up being called on a non-vector type. */
45861 if (index < 0)
45862 index = 2;
45863 return ix86_vec_cost (mode,
45864 COSTS_N_INSNS
45865 (ix86_cost->sse_unaligned_store[index]) / 2,
45866 true);
45868 case vector_gather_load:
45869 return ix86_vec_cost (mode,
45870 COSTS_N_INSNS
45871 (ix86_cost->gather_static
45872 + ix86_cost->gather_per_elt
45873 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
45874 true);
45876 case vector_scatter_store:
45877 return ix86_vec_cost (mode,
45878 COSTS_N_INSNS
45879 (ix86_cost->scatter_static
45880 + ix86_cost->scatter_per_elt
45881 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
45882 true);
45884 case cond_branch_taken:
45885 return ix86_cost->cond_taken_branch_cost;
45887 case cond_branch_not_taken:
45888 return ix86_cost->cond_not_taken_branch_cost;
45890 case vec_perm:
45891 case vec_promote_demote:
45892 return ix86_vec_cost (mode,
45893 ix86_cost->sse_op, true);
45895 case vec_construct:
45896 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
45898 default:
45899 gcc_unreachable ();
45903 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45904 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45905 insn every time. */
45907 static GTY(()) rtx_insn *vselect_insn;
45909 /* Initialize vselect_insn. */
45911 static void
45912 init_vselect_insn (void)
45914 unsigned i;
45915 rtx x;
45917 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45918 for (i = 0; i < MAX_VECT_LEN; ++i)
45919 XVECEXP (x, 0, i) = const0_rtx;
45920 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45921 const0_rtx), x);
45922 x = gen_rtx_SET (const0_rtx, x);
45923 start_sequence ();
45924 vselect_insn = emit_insn (x);
45925 end_sequence ();
45928 /* Construct (set target (vec_select op0 (parallel perm))) and
45929 return true if that's a valid instruction in the active ISA. */
45931 static bool
45932 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45933 unsigned nelt, bool testing_p)
45935 unsigned int i;
45936 rtx x, save_vconcat;
45937 int icode;
45939 if (vselect_insn == NULL_RTX)
45940 init_vselect_insn ();
45942 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45943 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45944 for (i = 0; i < nelt; ++i)
45945 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45946 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45947 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45948 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45949 SET_DEST (PATTERN (vselect_insn)) = target;
45950 icode = recog_memoized (vselect_insn);
45952 if (icode >= 0 && !testing_p)
45953 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45955 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45956 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45957 INSN_CODE (vselect_insn) = -1;
45959 return icode >= 0;
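/* Worked example (for illustration): with target and op0 in V4SFmode and
   perm = {2, 3, 0, 1}, the cached insn is temporarily rewritten to
     (set target (vec_select:V4SF op0 (parallel [2 3 0 1])))
   and recog_memoized reports whether some pattern in sse.md accepts that RTL;
   when TESTING_P the insn is only recognized, never emitted.  */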
45962 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45964 static bool
45965 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45966 const unsigned char *perm, unsigned nelt,
45967 bool testing_p)
45969 machine_mode v2mode;
45970 rtx x;
45971 bool ok;
45973 if (vselect_insn == NULL_RTX)
45974 init_vselect_insn ();
45976 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
45977 return false;
45978 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45979 PUT_MODE (x, v2mode);
45980 XEXP (x, 0) = op0;
45981 XEXP (x, 1) = op1;
45982 ok = expand_vselect (target, x, perm, nelt, testing_p);
45983 XEXP (x, 0) = const0_rtx;
45984 XEXP (x, 1) = const0_rtx;
45985 return ok;
45988 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45989 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45991 static bool
45992 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45994 machine_mode mmode, vmode = d->vmode;
45995 unsigned i, mask, nelt = d->nelt;
45996 rtx target, op0, op1, maskop, x;
45997 rtx rperm[32], vperm;
45999 if (d->one_operand_p)
46000 return false;
46001 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46002 && (TARGET_AVX512BW
46003 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46005 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46007 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46009 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46011 else
46012 return false;
46014 /* This is a blend, not a permute. Elements must stay in their
46015 respective lanes. */
46016 for (i = 0; i < nelt; ++i)
46018 unsigned e = d->perm[i];
46019 if (!(e == i || e == i + nelt))
46020 return false;
46023 if (d->testing_p)
46024 return true;
46026 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46027 decision should be extracted elsewhere, so that we only try that
46028 sequence once all budget==3 options have been tried. */
46029 target = d->target;
46030 op0 = d->op0;
46031 op1 = d->op1;
46032 mask = 0;
46034 switch (vmode)
46036 case E_V8DFmode:
46037 case E_V16SFmode:
46038 case E_V4DFmode:
46039 case E_V8SFmode:
46040 case E_V2DFmode:
46041 case E_V4SFmode:
46042 case E_V8HImode:
46043 case E_V8SImode:
46044 case E_V32HImode:
46045 case E_V64QImode:
46046 case E_V16SImode:
46047 case E_V8DImode:
46048 for (i = 0; i < nelt; ++i)
46049 mask |= (d->perm[i] >= nelt) << i;
46050 break;
46052 case E_V2DImode:
46053 for (i = 0; i < 2; ++i)
46054 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46055 vmode = V8HImode;
46056 goto do_subreg;
46058 case E_V4SImode:
46059 for (i = 0; i < 4; ++i)
46060 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46061 vmode = V8HImode;
46062 goto do_subreg;
46064 case E_V16QImode:
46065 /* See if bytes move in pairs so we can use pblendw with
46066 an immediate argument, rather than pblendvb with a vector
46067 argument. */
46068 for (i = 0; i < 16; i += 2)
46069 if (d->perm[i] + 1 != d->perm[i + 1])
46071 use_pblendvb:
46072 for (i = 0; i < nelt; ++i)
46073 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46075 finish_pblendvb:
46076 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46077 vperm = force_reg (vmode, vperm);
46079 if (GET_MODE_SIZE (vmode) == 16)
46080 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46081 else
46082 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46083 if (target != d->target)
46084 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46085 return true;
46088 for (i = 0; i < 8; ++i)
46089 mask |= (d->perm[i * 2] >= 16) << i;
46090 vmode = V8HImode;
46091 /* FALLTHRU */
46093 do_subreg:
46094 target = gen_reg_rtx (vmode);
46095 op0 = gen_lowpart (vmode, op0);
46096 op1 = gen_lowpart (vmode, op1);
46097 break;
46099 case E_V32QImode:
46100 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46101 for (i = 0; i < 32; i += 2)
46102 if (d->perm[i] + 1 != d->perm[i + 1])
46103 goto use_pblendvb;
46104 /* See if bytes move in quadruplets. If yes, vpblendd
46105 with immediate can be used. */
46106 for (i = 0; i < 32; i += 4)
46107 if (d->perm[i] + 2 != d->perm[i + 2])
46108 break;
46109 if (i < 32)
46111 /* See if bytes move the same in both lanes. If yes,
46112 vpblendw with immediate can be used. */
46113 for (i = 0; i < 16; i += 2)
46114 if (d->perm[i] + 16 != d->perm[i + 16])
46115 goto use_pblendvb;
46117 /* Use vpblendw. */
46118 for (i = 0; i < 16; ++i)
46119 mask |= (d->perm[i * 2] >= 32) << i;
46120 vmode = V16HImode;
46121 goto do_subreg;
46124 /* Use vpblendd. */
46125 for (i = 0; i < 8; ++i)
46126 mask |= (d->perm[i * 4] >= 32) << i;
46127 vmode = V8SImode;
46128 goto do_subreg;
46130 case E_V16HImode:
46131 /* See if words move in pairs. If yes, vpblendd can be used. */
46132 for (i = 0; i < 16; i += 2)
46133 if (d->perm[i] + 1 != d->perm[i + 1])
46134 break;
46135 if (i < 16)
46137 /* See if words move the same in both lanes. If not,
46138 vpblendvb must be used. */
46139 for (i = 0; i < 8; i++)
46140 if (d->perm[i] + 8 != d->perm[i + 8])
46142 /* Use vpblendvb. */
46143 for (i = 0; i < 32; ++i)
46144 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46146 vmode = V32QImode;
46147 nelt = 32;
46148 target = gen_reg_rtx (vmode);
46149 op0 = gen_lowpart (vmode, op0);
46150 op1 = gen_lowpart (vmode, op1);
46151 goto finish_pblendvb;
46154 /* Use vpblendw. */
46155 for (i = 0; i < 16; ++i)
46156 mask |= (d->perm[i] >= 16) << i;
46157 break;
46160 /* Use vpblendd. */
46161 for (i = 0; i < 8; ++i)
46162 mask |= (d->perm[i * 2] >= 16) << i;
46163 vmode = V8SImode;
46164 goto do_subreg;
46166 case E_V4DImode:
46167 /* Use vpblendd. */
46168 for (i = 0; i < 4; ++i)
46169 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46170 vmode = V8SImode;
46171 goto do_subreg;
46173 default:
46174 gcc_unreachable ();
46177 switch (vmode)
46179 case E_V8DFmode:
46180 case E_V8DImode:
46181 mmode = QImode;
46182 break;
46183 case E_V16SFmode:
46184 case E_V16SImode:
46185 mmode = HImode;
46186 break;
46187 case E_V32HImode:
46188 mmode = SImode;
46189 break;
46190 case E_V64QImode:
46191 mmode = DImode;
46192 break;
46193 default:
46194 mmode = VOIDmode;
46197 if (mmode != VOIDmode)
46198 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46199 else
46200 maskop = GEN_INT (mask);
46202 /* This matches five different patterns with the different modes. */
46203 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46204 x = gen_rtx_SET (target, x);
46205 emit_insn (x);
46206 if (target != d->target)
46207 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46209 return true;
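/* Worked example (for illustration): for V8SImode with
   d->perm = {0, 9, 2, 11, 4, 13, 6, 15}, every odd element comes from op1,
   so the first switch above computes mask = 0b10101010 = 0xaa, mmode stays
   VOIDmode, and the insn emitted is
     (set target (vec_merge:V8SI op1 op0 (const_int 0xaa)))
   i.e. a single vpblendd with immediate 0xaa.  */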
46212 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46213 in terms of the variable form of vpermilps.
46215 Note that we will have already failed the immediate input vpermilps,
46216 which requires that the high and low part shuffle be identical; the
46217 variable form doesn't require that. */
46219 static bool
46220 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46222 rtx rperm[8], vperm;
46223 unsigned i;
46225 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46226 return false;
46228 /* We can only permute within the 128-bit lane. */
46229 for (i = 0; i < 8; ++i)
46231 unsigned e = d->perm[i];
46232 if (i < 4 ? e >= 4 : e < 4)
46233 return false;
46236 if (d->testing_p)
46237 return true;
46239 for (i = 0; i < 8; ++i)
46241 unsigned e = d->perm[i];
46243 /* Within each 128-bit lane, the elements of op0 are numbered
46244 from 0 and the elements of op1 are numbered from 4. */
46245 if (e >= 8 + 4)
46246 e -= 8;
46247 else if (e >= 4)
46248 e -= 4;
46250 rperm[i] = GEN_INT (e);
46253 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46254 vperm = force_reg (V8SImode, vperm);
46255 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46257 return true;
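/* Worked example (for illustration): for the one-operand V8SFmode
   permutation {3, 2, 1, 0, 7, 6, 5, 4}, the loop above reduces every index
   to its within-lane value, giving the control vector
   {3, 2, 1, 0, 3, 2, 1, 0}; the variable vpermilps then reverses the four
   floats inside each 128-bit lane.  */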
46260 /* Return true if permutation D can be performed as a VMODE permutation
46261 instead. */
46263 static bool
46264 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46266 unsigned int i, j, chunk;
46268 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46269 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46270 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46271 return false;
46273 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46274 return true;
46276 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46277 for (i = 0; i < d->nelt; i += chunk)
46278 if (d->perm[i] & (chunk - 1))
46279 return false;
46280 else
46281 for (j = 1; j < chunk; ++j)
46282 if (d->perm[i] + j != d->perm[i + j])
46283 return false;
46285 return true;
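/* Worked example (for illustration): the V16QImode permutation
   {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} is valid as a
   V8HImode permutation (every chunk of 2 starts at an even index and is
   consecutive), namely {1, 0, 3, 2, 5, 4, 7, 6}, but not as a V4SImode one,
   because d->perm[0] = 2 is not a multiple of 4.  */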
46288 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46289 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46291 static bool
46292 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46294 unsigned i, nelt, eltsz, mask;
46295 unsigned char perm[64];
46296 machine_mode vmode = V16QImode;
46297 rtx rperm[64], vperm, target, op0, op1;
46299 nelt = d->nelt;
46301 if (!d->one_operand_p)
46303 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46305 if (TARGET_AVX2
46306 && valid_perm_using_mode_p (V2TImode, d))
46308 if (d->testing_p)
46309 return true;
46311 /* Use vperm2i128 insn. The pattern uses
46312 V4DImode instead of V2TImode. */
46313 target = d->target;
46314 if (d->vmode != V4DImode)
46315 target = gen_reg_rtx (V4DImode);
46316 op0 = gen_lowpart (V4DImode, d->op0);
46317 op1 = gen_lowpart (V4DImode, d->op1);
46318 rperm[0]
46319 = GEN_INT ((d->perm[0] / (nelt / 2))
46320 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46321 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46322 if (target != d->target)
46323 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46324 return true;
46326 return false;
46329 else
46331 if (GET_MODE_SIZE (d->vmode) == 16)
46333 if (!TARGET_SSSE3)
46334 return false;
46336 else if (GET_MODE_SIZE (d->vmode) == 32)
46338 if (!TARGET_AVX2)
46339 return false;
46341 /* V4DImode should already be handled through
46342 expand_vselect by the vpermq instruction. */
46343 gcc_assert (d->vmode != V4DImode);
46345 vmode = V32QImode;
46346 if (d->vmode == V8SImode
46347 || d->vmode == V16HImode
46348 || d->vmode == V32QImode)
46350 /* First see if vpermq can be used for
46351 V8SImode/V16HImode/V32QImode. */
46352 if (valid_perm_using_mode_p (V4DImode, d))
46354 for (i = 0; i < 4; i++)
46355 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46356 if (d->testing_p)
46357 return true;
46358 target = gen_reg_rtx (V4DImode);
46359 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46360 perm, 4, false))
46362 emit_move_insn (d->target,
46363 gen_lowpart (d->vmode, target));
46364 return true;
46366 return false;
46369 /* Next see if vpermd can be used. */
46370 if (valid_perm_using_mode_p (V8SImode, d))
46371 vmode = V8SImode;
46373 /* Or if vpermps can be used. */
46374 else if (d->vmode == V8SFmode)
46375 vmode = V8SImode;
46377 if (vmode == V32QImode)
46379 /* vpshufb only works within lanes; it is not
46380 possible to shuffle bytes between the lanes. */
46381 for (i = 0; i < nelt; ++i)
46382 if ((d->perm[i] ^ i) & (nelt / 2))
46383 return false;
46386 else if (GET_MODE_SIZE (d->vmode) == 64)
46388 if (!TARGET_AVX512BW)
46389 return false;
46391 /* If vpermq didn't work, vpshufb won't work either. */
46392 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46393 return false;
46395 vmode = V64QImode;
46396 if (d->vmode == V16SImode
46397 || d->vmode == V32HImode
46398 || d->vmode == V64QImode)
46400 /* First see if vpermq can be used for
46401 V16SImode/V32HImode/V64QImode. */
46402 if (valid_perm_using_mode_p (V8DImode, d))
46404 for (i = 0; i < 8; i++)
46405 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46406 if (d->testing_p)
46407 return true;
46408 target = gen_reg_rtx (V8DImode);
46409 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46410 perm, 8, false))
46412 emit_move_insn (d->target,
46413 gen_lowpart (d->vmode, target));
46414 return true;
46416 return false;
46419 /* Next see if vpermd can be used. */
46420 if (valid_perm_using_mode_p (V16SImode, d))
46421 vmode = V16SImode;
46423 /* Or if vpermps can be used. */
46424 else if (d->vmode == V16SFmode)
46425 vmode = V16SImode;
46426 if (vmode == V64QImode)
46428 /* vpshufb only works within lanes; it is not
46429 possible to shuffle bytes between the lanes. */
46430 for (i = 0; i < nelt; ++i)
46431 if ((d->perm[i] ^ i) & (nelt / 4))
46432 return false;
46435 else
46436 return false;
46439 if (d->testing_p)
46440 return true;
46442 if (vmode == V8SImode)
46443 for (i = 0; i < 8; ++i)
46444 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46445 else if (vmode == V16SImode)
46446 for (i = 0; i < 16; ++i)
46447 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46448 else
46450 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46451 if (!d->one_operand_p)
46452 mask = 2 * nelt - 1;
46453 else if (vmode == V16QImode)
46454 mask = nelt - 1;
46455 else if (vmode == V64QImode)
46456 mask = nelt / 4 - 1;
46457 else
46458 mask = nelt / 2 - 1;
46460 for (i = 0; i < nelt; ++i)
46462 unsigned j, e = d->perm[i] & mask;
46463 for (j = 0; j < eltsz; ++j)
46464 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46468 vperm = gen_rtx_CONST_VECTOR (vmode,
46469 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46470 vperm = force_reg (vmode, vperm);
46472 target = d->target;
46473 if (d->vmode != vmode)
46474 target = gen_reg_rtx (vmode);
46475 op0 = gen_lowpart (vmode, d->op0);
46476 if (d->one_operand_p)
46478 if (vmode == V16QImode)
46479 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46480 else if (vmode == V32QImode)
46481 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46482 else if (vmode == V64QImode)
46483 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46484 else if (vmode == V8SFmode)
46485 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46486 else if (vmode == V8SImode)
46487 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46488 else if (vmode == V16SFmode)
46489 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46490 else if (vmode == V16SImode)
46491 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46492 else
46493 gcc_unreachable ();
46495 else
46497 op1 = gen_lowpart (vmode, d->op1);
46498 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46500 if (target != d->target)
46501 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46503 return true;
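/* Worked example (for illustration): for a one-operand V8HImode permutation
   {1, 0, 3, 2, 5, 4, 7, 6} expanded via V16QImode pshufb, eltsz is 2 and
   mask is nelt - 1 = 7, so the byte-level control vector built above is
   {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}: each word index e
   expands to the byte pair {2*e, 2*e + 1}.  */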
46506 /* For V*[QHS]Imode permutations, check whether the same permutation
46507 can be performed in a 2x, 4x or 8x wider inner mode. */
46509 static bool
46510 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46511 struct expand_vec_perm_d *nd)
46513 int i;
46514 machine_mode mode = VOIDmode;
46516 switch (d->vmode)
46518 case E_V16QImode: mode = V8HImode; break;
46519 case E_V32QImode: mode = V16HImode; break;
46520 case E_V64QImode: mode = V32HImode; break;
46521 case E_V8HImode: mode = V4SImode; break;
46522 case E_V16HImode: mode = V8SImode; break;
46523 case E_V32HImode: mode = V16SImode; break;
46524 case E_V4SImode: mode = V2DImode; break;
46525 case E_V8SImode: mode = V4DImode; break;
46526 case E_V16SImode: mode = V8DImode; break;
46527 default: return false;
46529 for (i = 0; i < d->nelt; i += 2)
46530 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46531 return false;
46532 nd->vmode = mode;
46533 nd->nelt = d->nelt / 2;
46534 for (i = 0; i < nd->nelt; i++)
46535 nd->perm[i] = d->perm[2 * i] / 2;
46536 if (GET_MODE_INNER (mode) != DImode)
46537 canonicalize_vector_int_perm (nd, nd);
46538 if (nd != d)
46540 nd->one_operand_p = d->one_operand_p;
46541 nd->testing_p = d->testing_p;
46542 if (d->op0 == d->op1)
46543 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46544 else
46546 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46547 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46549 if (d->testing_p)
46550 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46551 else
46552 nd->target = gen_reg_rtx (nd->vmode);
46554 return true;
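/* Worked example (for illustration): the V16QImode permutation
   {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} is first narrowed
   to the V8HImode permutation {2, 3, 0, 1, 6, 7, 4, 5} and then, recursively,
   to the V4SImode permutation {1, 0, 3, 2}; the recursion stops there because
   1 is odd, so ND ends up describing a pshufd-style V4SImode shuffle.  */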
46557 /* Try to expand a one-operand permutation with a constant mask. */
46559 static bool
46560 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46562 machine_mode mode = GET_MODE (d->op0);
46563 machine_mode maskmode = mode;
46564 rtx (*gen) (rtx, rtx, rtx) = NULL;
46565 rtx target, op0, mask;
46566 rtx vec[64];
46568 if (!rtx_equal_p (d->op0, d->op1))
46569 return false;
46571 if (!TARGET_AVX512F)
46572 return false;
46574 switch (mode)
46576 case E_V16SImode:
46577 gen = gen_avx512f_permvarv16si;
46578 break;
46579 case E_V16SFmode:
46580 gen = gen_avx512f_permvarv16sf;
46581 maskmode = V16SImode;
46582 break;
46583 case E_V8DImode:
46584 gen = gen_avx512f_permvarv8di;
46585 break;
46586 case E_V8DFmode:
46587 gen = gen_avx512f_permvarv8df;
46588 maskmode = V8DImode;
46589 break;
46590 default:
46591 return false;
46594 target = d->target;
46595 op0 = d->op0;
46596 for (int i = 0; i < d->nelt; ++i)
46597 vec[i] = GEN_INT (d->perm[i]);
46598 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46599 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46600 return true;
46603 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46604 in a single instruction. */
46606 static bool
46607 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46609 unsigned i, nelt = d->nelt;
46610 struct expand_vec_perm_d nd;
46612 /* Check plain VEC_SELECT first, because AVX has instructions that could
46613 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46614 input where SEL+CONCAT may not. */
46615 if (d->one_operand_p)
46617 int mask = nelt - 1;
46618 bool identity_perm = true;
46619 bool broadcast_perm = true;
46621 for (i = 0; i < nelt; i++)
46623 nd.perm[i] = d->perm[i] & mask;
46624 if (nd.perm[i] != i)
46625 identity_perm = false;
46626 if (nd.perm[i])
46627 broadcast_perm = false;
46630 if (identity_perm)
46632 if (!d->testing_p)
46633 emit_move_insn (d->target, d->op0);
46634 return true;
46636 else if (broadcast_perm && TARGET_AVX2)
46638 /* Use vpbroadcast{b,w,d}. */
46639 rtx (*gen) (rtx, rtx) = NULL;
46640 switch (d->vmode)
46642 case E_V64QImode:
46643 if (TARGET_AVX512BW)
46644 gen = gen_avx512bw_vec_dupv64qi_1;
46645 break;
46646 case E_V32QImode:
46647 gen = gen_avx2_pbroadcastv32qi_1;
46648 break;
46649 case E_V32HImode:
46650 if (TARGET_AVX512BW)
46651 gen = gen_avx512bw_vec_dupv32hi_1;
46652 break;
46653 case E_V16HImode:
46654 gen = gen_avx2_pbroadcastv16hi_1;
46655 break;
46656 case E_V16SImode:
46657 if (TARGET_AVX512F)
46658 gen = gen_avx512f_vec_dupv16si_1;
46659 break;
46660 case E_V8SImode:
46661 gen = gen_avx2_pbroadcastv8si_1;
46662 break;
46663 case E_V16QImode:
46664 gen = gen_avx2_pbroadcastv16qi;
46665 break;
46666 case E_V8HImode:
46667 gen = gen_avx2_pbroadcastv8hi;
46668 break;
46669 case E_V16SFmode:
46670 if (TARGET_AVX512F)
46671 gen = gen_avx512f_vec_dupv16sf_1;
46672 break;
46673 case E_V8SFmode:
46674 gen = gen_avx2_vec_dupv8sf_1;
46675 break;
46676 case E_V8DFmode:
46677 if (TARGET_AVX512F)
46678 gen = gen_avx512f_vec_dupv8df_1;
46679 break;
46680 case E_V8DImode:
46681 if (TARGET_AVX512F)
46682 gen = gen_avx512f_vec_dupv8di_1;
46683 break;
46684 /* For other modes, prefer the other shuffles this function creates. */
46685 default: break;
46687 if (gen != NULL)
46689 if (!d->testing_p)
46690 emit_insn (gen (d->target, d->op0));
46691 return true;
46695 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46696 return true;
46698 /* There are plenty of patterns in sse.md that are written for
46699 SEL+CONCAT and are not replicated for a single op. Perhaps
46700 that should be changed, to avoid the nastiness here. */
46702 /* Recognize interleave style patterns, which means incrementing
46703 every other permutation operand. */
46704 for (i = 0; i < nelt; i += 2)
46706 nd.perm[i] = d->perm[i] & mask;
46707 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46709 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46710 d->testing_p))
46711 return true;
46713 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46714 if (nelt >= 4)
46716 for (i = 0; i < nelt; i += 4)
46718 nd.perm[i + 0] = d->perm[i + 0] & mask;
46719 nd.perm[i + 1] = d->perm[i + 1] & mask;
46720 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46721 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46724 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46725 d->testing_p))
46726 return true;
46730 /* Finally, try the fully general two operand permute. */
46731 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46732 d->testing_p))
46733 return true;
46735 /* Recognize interleave style patterns with reversed operands. */
46736 if (!d->one_operand_p)
46738 for (i = 0; i < nelt; ++i)
46740 unsigned e = d->perm[i];
46741 if (e >= nelt)
46742 e -= nelt;
46743 else
46744 e += nelt;
46745 nd.perm[i] = e;
46748 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46749 d->testing_p))
46750 return true;
46753 /* Try the SSE4.1 blend variable merge instructions. */
46754 if (expand_vec_perm_blend (d))
46755 return true;
46757 /* Try one of the AVX vpermil variable permutations. */
46758 if (expand_vec_perm_vpermil (d))
46759 return true;
46761 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46762 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46763 if (expand_vec_perm_pshufb (d))
46764 return true;
46766 /* Try the AVX2 vpalignr instruction. */
46767 if (expand_vec_perm_palignr (d, true))
46768 return true;
46770 /* Try the AVX512F vperm{s,d} instructions. */
46771 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46772 return true;
46774 /* Try the AVX512F vpermt2/vpermi2 instructions. */
46775 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46776 return true;
46778 /* See if we can get the same permutation in a different vector integer
46779 mode. */
46780 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46782 if (!d->testing_p)
46783 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46784 return true;
46786 return false;
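/* Worked example (for illustration): the interleave recognition above turns
   the one-operand V4SImode permutation {0, 0, 1, 1} into the SEL+CONCAT
   permutation {0, 4, 1, 5} on (op0, op0), which is exactly the
   vec_select/vec_concat form matched by the interleave-low (punpckldq)
   pattern, so the whole permutation expands to a single insn.  */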
46789 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46790 in terms of a pair of pshuflw + pshufhw instructions. */
46792 static bool
46793 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46795 unsigned char perm2[MAX_VECT_LEN];
46796 unsigned i;
46797 bool ok;
46799 if (d->vmode != V8HImode || !d->one_operand_p)
46800 return false;
46802 /* The two permutations only operate in 64-bit lanes. */
46803 for (i = 0; i < 4; ++i)
46804 if (d->perm[i] >= 4)
46805 return false;
46806 for (i = 4; i < 8; ++i)
46807 if (d->perm[i] < 4)
46808 return false;
46810 if (d->testing_p)
46811 return true;
46813 /* Emit the pshuflw. */
46814 memcpy (perm2, d->perm, 4);
46815 for (i = 4; i < 8; ++i)
46816 perm2[i] = i;
46817 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46818 gcc_assert (ok);
46820 /* Emit the pshufhw. */
46821 memcpy (perm2 + 4, d->perm + 4, 4);
46822 for (i = 0; i < 4; ++i)
46823 perm2[i] = i;
46824 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46825 gcc_assert (ok);
46827 return true;
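/* Worked example (for illustration): for the V8HImode permutation
   {3, 2, 1, 0, 7, 6, 5, 4}, the pshuflw step uses {3, 2, 1, 0, 4, 5, 6, 7}
   (low quadword reversed, high quadword untouched) and the pshufhw step then
   uses {0, 1, 2, 3, 7, 6, 5, 4}, together reversing the words inside each
   64-bit half.  */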
46830 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46831 the permutation using the SSSE3 palignr instruction. This succeeds
46832 when all of the elements in PERM fit within one vector and we merely
46833 need to shift them down so that a single vector permutation has a
46834 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
46835 the vpalignr instruction itself can perform the requested permutation. */
46837 static bool
46838 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46840 unsigned i, nelt = d->nelt;
46841 unsigned min, max, minswap, maxswap;
46842 bool in_order, ok, swap = false;
46843 rtx shift, target;
46844 struct expand_vec_perm_d dcopy;
46846 /* Even with AVX, palignr only operates on 128-bit vectors;
46847 with AVX2, palignr operates on both 128-bit lanes. */
46848 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46849 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46850 return false;
46852 min = 2 * nelt;
46853 max = 0;
46854 minswap = 2 * nelt;
46855 maxswap = 0;
46856 for (i = 0; i < nelt; ++i)
46858 unsigned e = d->perm[i];
46859 unsigned eswap = d->perm[i] ^ nelt;
46860 if (GET_MODE_SIZE (d->vmode) == 32)
46862 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46863 eswap = e ^ (nelt / 2);
46865 if (e < min)
46866 min = e;
46867 if (e > max)
46868 max = e;
46869 if (eswap < minswap)
46870 minswap = eswap;
46871 if (eswap > maxswap)
46872 maxswap = eswap;
46874 if (min == 0
46875 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46877 if (d->one_operand_p
46878 || minswap == 0
46879 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46880 ? nelt / 2 : nelt))
46881 return false;
46882 swap = true;
46883 min = minswap;
46884 max = maxswap;
46887 /* Given that we have SSSE3, we know we'll be able to implement the
46888 single operand permutation after the palignr with pshufb for
46889 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46890 first. */
46891 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46892 return true;
46894 dcopy = *d;
46895 if (swap)
46897 dcopy.op0 = d->op1;
46898 dcopy.op1 = d->op0;
46899 for (i = 0; i < nelt; ++i)
46900 dcopy.perm[i] ^= nelt;
46903 in_order = true;
46904 for (i = 0; i < nelt; ++i)
46906 unsigned e = dcopy.perm[i];
46907 if (GET_MODE_SIZE (d->vmode) == 32
46908 && e >= nelt
46909 && (e & (nelt / 2 - 1)) < min)
46910 e = e - min - (nelt / 2);
46911 else
46912 e = e - min;
46913 if (e != i)
46914 in_order = false;
46915 dcopy.perm[i] = e;
46917 dcopy.one_operand_p = true;
46919 if (single_insn_only_p && !in_order)
46920 return false;
46922 /* For AVX2, test whether we can permute the result in one instruction. */
46923 if (d->testing_p)
46925 if (in_order)
46926 return true;
46927 dcopy.op1 = dcopy.op0;
46928 return expand_vec_perm_1 (&dcopy);
46931 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46932 if (GET_MODE_SIZE (d->vmode) == 16)
46934 target = gen_reg_rtx (TImode);
46935 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46936 gen_lowpart (TImode, dcopy.op0), shift));
46938 else
46940 target = gen_reg_rtx (V2TImode);
46941 emit_insn (gen_avx2_palignrv2ti (target,
46942 gen_lowpart (V2TImode, dcopy.op1),
46943 gen_lowpart (V2TImode, dcopy.op0),
46944 shift));
46947 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46949 /* Test for the degenerate case where the alignment by itself
46950 produces the desired permutation. */
46951 if (in_order)
46953 emit_move_insn (d->target, dcopy.op0);
46954 return true;
46957 ok = expand_vec_perm_1 (&dcopy);
46958 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46960 return ok;
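/* Worked example (for illustration): the two-operand V16QImode permutation
   {3, 4, ..., 18} (i.e. perm[i] = i + 3) has min = 3 and max = 18, so a
   single palignr with a shift of 3 * 8 = 24 bits over the (op1, op0)
   concatenation already produces the bytes in order; in_order is true and no
   follow-up one-operand permutation is needed.  */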
46963 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46964 the permutation using the SSE4_1 pblendv instruction. Potentially
46965 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and 1 pblendv. */
46967 static bool
46968 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46970 unsigned i, which, nelt = d->nelt;
46971 struct expand_vec_perm_d dcopy, dcopy1;
46972 machine_mode vmode = d->vmode;
46973 bool ok;
46975 /* Use the same checks as in expand_vec_perm_blend. */
46976 if (d->one_operand_p)
46977 return false;
46978 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46980 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46982 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46984 else
46985 return false;
46987 /* Figure out which permutation elements do not stay in their
46988 respective lanes. */
46989 for (i = 0, which = 0; i < nelt; ++i)
46991 unsigned e = d->perm[i];
46992 if (e != i)
46993 which |= (e < nelt ? 1 : 2);
46995 /* We can pblend the part whose elements do not stay in their
46996 respective lanes only when those elements all come from one
46997 half of the permutation.
46998 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
46999 lanes, but both are >= 8.
47000 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47001 respective lanes, and 8 >= 8 but 2 is not. */
47002 if (which != 1 && which != 2)
47003 return false;
47004 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47005 return true;
47007 /* First apply a one-operand permutation to the part whose
47008 elements do not stay in their respective lanes. */
47009 dcopy = *d;
47010 if (which == 2)
47011 dcopy.op0 = dcopy.op1 = d->op1;
47012 else
47013 dcopy.op0 = dcopy.op1 = d->op0;
47014 if (!d->testing_p)
47015 dcopy.target = gen_reg_rtx (vmode);
47016 dcopy.one_operand_p = true;
47018 for (i = 0; i < nelt; ++i)
47019 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47021 ok = expand_vec_perm_1 (&dcopy);
47022 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47023 return false;
47024 else
47025 gcc_assert (ok);
47026 if (d->testing_p)
47027 return true;
47029 /* Next we put permuted elements into their positions. */
47030 dcopy1 = *d;
47031 if (which == 2)
47032 dcopy1.op1 = dcopy.target;
47033 else
47034 dcopy1.op0 = dcopy.target;
47036 for (i = 0; i < nelt; ++i)
47037 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47039 ok = expand_vec_perm_blend (&dcopy1);
47040 gcc_assert (ok);
47042 return true;
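/* Worked example (for illustration): for the V8SFmode permutation
   {0, 1, 2, 11, 4, 5, 6, 14}, the two misplaced elements 11 and 14 both come
   from op1, so which == 2; op1 is first permuted by the in-lane one-operand
   permutation {0, 1, 2, 3, 4, 5, 6, 6}, and a blendps selecting elements 3
   and 7 from that intermediate result finishes the job.  */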
47045 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47047 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47048 a two vector permutation into a single vector permutation by using
47049 an interleave operation to merge the vectors. */
47051 static bool
47052 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47054 struct expand_vec_perm_d dremap, dfinal;
47055 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47056 unsigned HOST_WIDE_INT contents;
47057 unsigned char remap[2 * MAX_VECT_LEN];
47058 rtx_insn *seq;
47059 bool ok, same_halves = false;
47061 if (GET_MODE_SIZE (d->vmode) == 16)
47063 if (d->one_operand_p)
47064 return false;
47066 else if (GET_MODE_SIZE (d->vmode) == 32)
47068 if (!TARGET_AVX)
47069 return false;
47070 /* For 32-byte modes allow even d->one_operand_p.
47071 The lack of cross-lane shuffling in some instructions
47072 might prevent a single insn shuffle. */
47073 dfinal = *d;
47074 dfinal.testing_p = true;
47075 /* If expand_vec_perm_interleave3 can expand this into
47076 a 3 insn sequence, give up and let it be expanded as
47077 a 3 insn sequence. While that is one insn longer,
47078 it doesn't need a memory operand, and in the common
47079 case where both the interleave low and interleave high
47080 permutations with the same operands are adjacent, it needs 4 insns
47081 for both after CSE. */
47082 if (expand_vec_perm_interleave3 (&dfinal))
47083 return false;
47085 else
47086 return false;
47088 /* Examine from whence the elements come. */
47089 contents = 0;
47090 for (i = 0; i < nelt; ++i)
47091 contents |= HOST_WIDE_INT_1U << d->perm[i];
47093 memset (remap, 0xff, sizeof (remap));
47094 dremap = *d;
47096 if (GET_MODE_SIZE (d->vmode) == 16)
47098 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47100 /* Split the two input vectors into 4 halves. */
47101 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47102 h2 = h1 << nelt2;
47103 h3 = h2 << nelt2;
47104 h4 = h3 << nelt2;
47106 /* If the elements come from the low halves, use interleave low; similarly
47107 for interleave high. If the elements are from mismatched halves, we
47108 can use shufps for V4SF/V4SI or do a DImode shuffle. */
47109 if ((contents & (h1 | h3)) == contents)
47111 /* punpckl* */
47112 for (i = 0; i < nelt2; ++i)
47114 remap[i] = i * 2;
47115 remap[i + nelt] = i * 2 + 1;
47116 dremap.perm[i * 2] = i;
47117 dremap.perm[i * 2 + 1] = i + nelt;
47119 if (!TARGET_SSE2 && d->vmode == V4SImode)
47120 dremap.vmode = V4SFmode;
47122 else if ((contents & (h2 | h4)) == contents)
47124 /* punpckh* */
47125 for (i = 0; i < nelt2; ++i)
47127 remap[i + nelt2] = i * 2;
47128 remap[i + nelt + nelt2] = i * 2 + 1;
47129 dremap.perm[i * 2] = i + nelt2;
47130 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47132 if (!TARGET_SSE2 && d->vmode == V4SImode)
47133 dremap.vmode = V4SFmode;
47135 else if ((contents & (h1 | h4)) == contents)
47137 /* shufps */
47138 for (i = 0; i < nelt2; ++i)
47140 remap[i] = i;
47141 remap[i + nelt + nelt2] = i + nelt2;
47142 dremap.perm[i] = i;
47143 dremap.perm[i + nelt2] = i + nelt + nelt2;
47145 if (nelt != 4)
47147 /* shufpd */
47148 dremap.vmode = V2DImode;
47149 dremap.nelt = 2;
47150 dremap.perm[0] = 0;
47151 dremap.perm[1] = 3;
47154 else if ((contents & (h2 | h3)) == contents)
47156 /* shufps */
47157 for (i = 0; i < nelt2; ++i)
47159 remap[i + nelt2] = i;
47160 remap[i + nelt] = i + nelt2;
47161 dremap.perm[i] = i + nelt2;
47162 dremap.perm[i + nelt2] = i + nelt;
47164 if (nelt != 4)
47166 /* shufpd */
47167 dremap.vmode = V2DImode;
47168 dremap.nelt = 2;
47169 dremap.perm[0] = 1;
47170 dremap.perm[1] = 2;
47173 else
47174 return false;
47176 else
47178 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47179 unsigned HOST_WIDE_INT q[8];
47180 unsigned int nonzero_halves[4];
47182 /* Split the two input vectors into 8 quarters. */
47183 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47184 for (i = 1; i < 8; ++i)
47185 q[i] = q[0] << (nelt4 * i);
47186 for (i = 0; i < 4; ++i)
47187 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47189 nonzero_halves[nzcnt] = i;
47190 ++nzcnt;
47193 if (nzcnt == 1)
47195 gcc_assert (d->one_operand_p);
47196 nonzero_halves[1] = nonzero_halves[0];
47197 same_halves = true;
47199 else if (d->one_operand_p)
47201 gcc_assert (nonzero_halves[0] == 0);
47202 gcc_assert (nonzero_halves[1] == 1);
47205 if (nzcnt <= 2)
47207 if (d->perm[0] / nelt2 == nonzero_halves[1])
47209 /* Attempt to increase the likelihood that dfinal
47210 shuffle will be intra-lane. */
47211 std::swap (nonzero_halves[0], nonzero_halves[1]);
47214 /* vperm2f128 or vperm2i128. */
47215 for (i = 0; i < nelt2; ++i)
47217 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47218 remap[i + nonzero_halves[0] * nelt2] = i;
47219 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47220 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47223 if (d->vmode != V8SFmode
47224 && d->vmode != V4DFmode
47225 && d->vmode != V8SImode)
47227 dremap.vmode = V8SImode;
47228 dremap.nelt = 8;
47229 for (i = 0; i < 4; ++i)
47231 dremap.perm[i] = i + nonzero_halves[0] * 4;
47232 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47236 else if (d->one_operand_p)
47237 return false;
47238 else if (TARGET_AVX2
47239 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47241 /* vpunpckl* */
47242 for (i = 0; i < nelt4; ++i)
47244 remap[i] = i * 2;
47245 remap[i + nelt] = i * 2 + 1;
47246 remap[i + nelt2] = i * 2 + nelt2;
47247 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47248 dremap.perm[i * 2] = i;
47249 dremap.perm[i * 2 + 1] = i + nelt;
47250 dremap.perm[i * 2 + nelt2] = i + nelt2;
47251 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47254 else if (TARGET_AVX2
47255 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47257 /* vpunpckh* */
47258 for (i = 0; i < nelt4; ++i)
47260 remap[i + nelt4] = i * 2;
47261 remap[i + nelt + nelt4] = i * 2 + 1;
47262 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47263 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47264 dremap.perm[i * 2] = i + nelt4;
47265 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47266 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47267 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47270 else
47271 return false;
47274 /* Use the remapping array set up above to move the elements from their
47275 swizzled locations into their final destinations. */
47276 dfinal = *d;
47277 for (i = 0; i < nelt; ++i)
47279 unsigned e = remap[d->perm[i]];
47280 gcc_assert (e < nelt);
47281 /* If same_halves is true, both halves of the remapped vector are the
47282 same. Avoid cross-lane accesses if possible. */
47283 if (same_halves && i >= nelt2)
47285 gcc_assert (e < nelt2);
47286 dfinal.perm[i] = e + nelt2;
47288 else
47289 dfinal.perm[i] = e;
47291 if (!d->testing_p)
47293 dremap.target = gen_reg_rtx (dremap.vmode);
47294 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47296 dfinal.op1 = dfinal.op0;
47297 dfinal.one_operand_p = true;
47299 /* Test if the final remap can be done with a single insn. For V4SFmode or
47300 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47301 start_sequence ();
47302 ok = expand_vec_perm_1 (&dfinal);
47303 seq = get_insns ();
47304 end_sequence ();
47306 if (!ok)
47307 return false;
47309 if (d->testing_p)
47310 return true;
47312 if (dremap.vmode != dfinal.vmode)
47314 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47315 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47318 ok = expand_vec_perm_1 (&dremap);
47319 gcc_assert (ok);
47321 emit_insn (seq);
47322 return true;
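/* Worked example (for illustration): for the two-operand V8HImode permutation
   {0, 8, 2, 10, 1, 9, 3, 11}, all elements come from the low halves, so
   dremap becomes the interleave-low permutation {0, 8, 1, 9, 2, 10, 3, 11}
   (punpcklwd) and dfinal becomes the one-operand permutation
   {0, 1, 4, 5, 2, 3, 6, 7} on its result, which is a single pshufd once it is
   canonicalized to the V4SImode permutation {0, 2, 1, 3}.  */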
47325 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47326 a single vector cross-lane permutation into vpermq followed
47327 by any of the single insn permutations. */
47329 static bool
47330 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47332 struct expand_vec_perm_d dremap, dfinal;
47333 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47334 unsigned contents[2];
47335 bool ok;
47337 if (!(TARGET_AVX2
47338 && (d->vmode == V32QImode || d->vmode == V16HImode)
47339 && d->one_operand_p))
47340 return false;
47342 contents[0] = 0;
47343 contents[1] = 0;
47344 for (i = 0; i < nelt2; ++i)
47346 contents[0] |= 1u << (d->perm[i] / nelt4);
47347 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47350 for (i = 0; i < 2; ++i)
47352 unsigned int cnt = 0;
47353 for (j = 0; j < 4; ++j)
47354 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47355 return false;
47358 if (d->testing_p)
47359 return true;
47361 dremap = *d;
47362 dremap.vmode = V4DImode;
47363 dremap.nelt = 4;
47364 dremap.target = gen_reg_rtx (V4DImode);
47365 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47366 dremap.op1 = dremap.op0;
47367 dremap.one_operand_p = true;
47368 for (i = 0; i < 2; ++i)
47370 unsigned int cnt = 0;
47371 for (j = 0; j < 4; ++j)
47372 if ((contents[i] & (1u << j)) != 0)
47373 dremap.perm[2 * i + cnt++] = j;
47374 for (; cnt < 2; ++cnt)
47375 dremap.perm[2 * i + cnt] = 0;
47378 dfinal = *d;
47379 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47380 dfinal.op1 = dfinal.op0;
47381 dfinal.one_operand_p = true;
47382 for (i = 0, j = 0; i < nelt; ++i)
47384 if (i == nelt2)
47385 j = 2;
47386 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47387 if ((d->perm[i] / nelt4) == dremap.perm[j])
47389 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47390 dfinal.perm[i] |= nelt4;
47391 else
47392 gcc_unreachable ();
47395 ok = expand_vec_perm_1 (&dremap);
47396 gcc_assert (ok);
47398 ok = expand_vec_perm_1 (&dfinal);
47399 gcc_assert (ok);
47401 return true;
47404 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47405 a vector permutation using two instructions, vperm2f128 resp.
47406 vperm2i128 followed by any single in-lane permutation. */
47408 static bool
47409 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47411 struct expand_vec_perm_d dfirst, dsecond;
47412 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47413 bool ok;
47415 if (!TARGET_AVX
47416 || GET_MODE_SIZE (d->vmode) != 32
47417 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47418 return false;
47420 dsecond = *d;
47421 dsecond.one_operand_p = false;
47422 dsecond.testing_p = true;
47424 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47425 immediate. For perm < 16 the second permutation uses
47426 d->op0 as first operand, for perm >= 16 it uses d->op1
47427 as first operand. The second operand is the result of
47428 vperm2[fi]128. */
47429 for (perm = 0; perm < 32; perm++)
47431 /* Ignore permutations which do not move anything cross-lane. */
47432 if (perm < 16)
47434 /* The second shuffle for e.g. V4DFmode has
47435 0123 and ABCD operands.
47436 Ignore AB23, as 23 is already in the second lane
47437 of the first operand. */
47438 if ((perm & 0xc) == (1 << 2)) continue;
47439 /* And 01CD, as 01 is in the first lane of the first
47440 operand. */
47441 if ((perm & 3) == 0) continue;
47442 /* And 4567, as then the vperm2[fi]128 doesn't change
47443 anything on the original 4567 second operand. */
47444 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47446 else
47448 /* The second shuffle for e.g. V4DFmode has
47449 4567 and ABCD operands.
47450 Ignore AB67, as 67 is already in the second lane
47451 of the first operand. */
47452 if ((perm & 0xc) == (3 << 2)) continue;
47453 /* And 45CD, as 45 is in the first lane of the first
47454 operand. */
47455 if ((perm & 3) == 2) continue;
47456 /* And 0123, as then the vperm2[fi]128 doesn't change
47457 anything on the original 0123 first operand. */
47458 if ((perm & 0xf) == (1 << 2)) continue;
47461 for (i = 0; i < nelt; i++)
47463 j = d->perm[i] / nelt2;
47464 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47465 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47466 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47467 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47468 else
47469 break;
47472 if (i == nelt)
47474 start_sequence ();
47475 ok = expand_vec_perm_1 (&dsecond);
47476 end_sequence ();
47478 else
47479 ok = false;
47481 if (ok)
47483 if (d->testing_p)
47484 return true;
47486 /* Found a usable second shuffle. dfirst will be
47487 vperm2f128 on d->op0 and d->op1. */
47488 dsecond.testing_p = false;
47489 dfirst = *d;
47490 dfirst.target = gen_reg_rtx (d->vmode);
47491 for (i = 0; i < nelt; i++)
47492 dfirst.perm[i] = (i & (nelt2 - 1))
47493 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47495 canonicalize_perm (&dfirst);
47496 ok = expand_vec_perm_1 (&dfirst);
47497 gcc_assert (ok);
47499 /* And dsecond is some single insn shuffle, taking
47500 d->op0 and result of vperm2f128 (if perm < 16) or
47501 d->op1 and result of vperm2f128 (otherwise). */
47502 if (perm >= 16)
47503 dsecond.op0 = dsecond.op1;
47504 dsecond.op1 = dfirst.target;
47506 ok = expand_vec_perm_1 (&dsecond);
47507 gcc_assert (ok);
47509 return true;
47512 /* For one operand, the only useful vperm2f128 permutation is 0x01,
47513 i.e. swapping the two 128-bit lanes. */
47514 if (d->one_operand_p)
47515 return false;
47518 return false;
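/* Note (for illustration): the vperm2[fi]128 immediate built above encodes
   the result's low 128-bit lane in bits 0-1 and its high lane in bits 4-5
   (0/1 = low/high lane of the first operand, 2/3 = low/high lane of the
   second).  For example, the loop value perm = 9 yields
   ((9 << 2) | 9) & 0x33 = 0x21: low lane from op0's high lane, high lane
   from op1's low lane; dsecond then only has to handle the remaining in-lane
   shuffle.  */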
47521 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47522 a two vector permutation using 2 intra-lane interleave insns
47523 and cross-lane shuffle for 32-byte vectors. */
47525 static bool
47526 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47528 unsigned i, nelt;
47529 rtx (*gen) (rtx, rtx, rtx);
47531 if (d->one_operand_p)
47532 return false;
47533 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47535 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47537 else
47538 return false;
47540 nelt = d->nelt;
47541 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47542 return false;
47543 for (i = 0; i < nelt; i += 2)
47544 if (d->perm[i] != d->perm[0] + i / 2
47545 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47546 return false;
47548 if (d->testing_p)
47549 return true;
47551 switch (d->vmode)
47553 case E_V32QImode:
47554 if (d->perm[0])
47555 gen = gen_vec_interleave_highv32qi;
47556 else
47557 gen = gen_vec_interleave_lowv32qi;
47558 break;
47559 case E_V16HImode:
47560 if (d->perm[0])
47561 gen = gen_vec_interleave_highv16hi;
47562 else
47563 gen = gen_vec_interleave_lowv16hi;
47564 break;
47565 case E_V8SImode:
47566 if (d->perm[0])
47567 gen = gen_vec_interleave_highv8si;
47568 else
47569 gen = gen_vec_interleave_lowv8si;
47570 break;
47571 case E_V4DImode:
47572 if (d->perm[0])
47573 gen = gen_vec_interleave_highv4di;
47574 else
47575 gen = gen_vec_interleave_lowv4di;
47576 break;
47577 case E_V8SFmode:
47578 if (d->perm[0])
47579 gen = gen_vec_interleave_highv8sf;
47580 else
47581 gen = gen_vec_interleave_lowv8sf;
47582 break;
47583 case E_V4DFmode:
47584 if (d->perm[0])
47585 gen = gen_vec_interleave_highv4df;
47586 else
47587 gen = gen_vec_interleave_lowv4df;
47588 break;
47589 default:
47590 gcc_unreachable ();
47593 emit_insn (gen (d->target, d->op0, d->op1));
47594 return true;
47597 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47598 a single vector permutation using a single intra-lane vector
47599 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47600 the non-swapped and swapped vectors together. */
47602 static bool
47603 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47605 struct expand_vec_perm_d dfirst, dsecond;
47606 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47607 rtx_insn *seq;
47608 bool ok;
47609 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47611 if (!TARGET_AVX
47612 || TARGET_AVX2
47613 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47614 || !d->one_operand_p)
47615 return false;
47617 dfirst = *d;
47618 for (i = 0; i < nelt; i++)
47619 dfirst.perm[i] = 0xff;
47620 for (i = 0, msk = 0; i < nelt; i++)
47622 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47623 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47624 return false;
47625 dfirst.perm[j] = d->perm[i];
47626 if (j != i)
47627 msk |= (1 << i);
47629 for (i = 0; i < nelt; i++)
47630 if (dfirst.perm[i] == 0xff)
47631 dfirst.perm[i] = i;
47633 if (!d->testing_p)
47634 dfirst.target = gen_reg_rtx (dfirst.vmode);
47636 start_sequence ();
47637 ok = expand_vec_perm_1 (&dfirst);
47638 seq = get_insns ();
47639 end_sequence ();
47641 if (!ok)
47642 return false;
47644 if (d->testing_p)
47645 return true;
47647 emit_insn (seq);
47649 dsecond = *d;
47650 dsecond.op0 = dfirst.target;
47651 dsecond.op1 = dfirst.target;
47652 dsecond.one_operand_p = true;
47653 dsecond.target = gen_reg_rtx (dsecond.vmode);
47654 for (i = 0; i < nelt; i++)
47655 dsecond.perm[i] = i ^ nelt2;
47657 ok = expand_vec_perm_1 (&dsecond);
47658 gcc_assert (ok);
47660 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47661 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47662 return true;
47665 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47666 permutation using two vperm2f128, followed by a vshufpd insn blending
47667 the two vectors together. */
47669 static bool
47670 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47672 struct expand_vec_perm_d dfirst, dsecond, dthird;
47673 bool ok;
47675 if (!TARGET_AVX || (d->vmode != V4DFmode))
47676 return false;
47678 if (d->testing_p)
47679 return true;
47681 dfirst = *d;
47682 dsecond = *d;
47683 dthird = *d;
47685 dfirst.perm[0] = (d->perm[0] & ~1);
47686 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47687 dfirst.perm[2] = (d->perm[2] & ~1);
47688 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47689 dsecond.perm[0] = (d->perm[1] & ~1);
47690 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47691 dsecond.perm[2] = (d->perm[3] & ~1);
47692 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47693 dthird.perm[0] = (d->perm[0] % 2);
47694 dthird.perm[1] = (d->perm[1] % 2) + 4;
47695 dthird.perm[2] = (d->perm[2] % 2) + 2;
47696 dthird.perm[3] = (d->perm[3] % 2) + 6;
47698 dfirst.target = gen_reg_rtx (dfirst.vmode);
47699 dsecond.target = gen_reg_rtx (dsecond.vmode);
47700 dthird.op0 = dfirst.target;
47701 dthird.op1 = dsecond.target;
47702 dthird.one_operand_p = false;
47704 canonicalize_perm (&dfirst);
47705 canonicalize_perm (&dsecond);
47707 ok = expand_vec_perm_1 (&dfirst)
47708 && expand_vec_perm_1 (&dsecond)
47709 && expand_vec_perm_1 (&dthird);
47711 gcc_assert (ok);
47713 return true;
47716 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47717 permutation with two pshufb insns and an ior. We should have already
47718 failed all two instruction sequences. */
47720 static bool
47721 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47723 rtx rperm[2][16], vperm, l, h, op, m128;
47724 unsigned int i, nelt, eltsz;
47726 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47727 return false;
47728 gcc_assert (!d->one_operand_p);
47730 if (d->testing_p)
47731 return true;
47733 nelt = d->nelt;
47734 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47736 /* Generate two permutation masks. If the required element is within
47737 the given vector it is shuffled into the proper lane. If the required
47738 element is in the other vector, force a zero into the lane by setting
47739 bit 7 in the permutation mask. */
47740 m128 = GEN_INT (-128);
47741 for (i = 0; i < nelt; ++i)
47743 unsigned j, e = d->perm[i];
47744 unsigned which = (e >= nelt);
47745 if (e >= nelt)
47746 e -= nelt;
47748 for (j = 0; j < eltsz; ++j)
47750 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47751 rperm[1-which][i*eltsz + j] = m128;
47755 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47756 vperm = force_reg (V16QImode, vperm);
47758 l = gen_reg_rtx (V16QImode);
47759 op = gen_lowpart (V16QImode, d->op0);
47760 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47762 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47763 vperm = force_reg (V16QImode, vperm);
47765 h = gen_reg_rtx (V16QImode);
47766 op = gen_lowpart (V16QImode, d->op1);
47767 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47769 op = d->target;
47770 if (d->vmode != V16QImode)
47771 op = gen_reg_rtx (V16QImode);
47772 emit_insn (gen_iorv16qi3 (op, l, h));
47773 if (op != d->target)
47774 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47776 return true;
47779 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
47780 with two vpshufb insns, vpermq and vpor. We should have already failed
47781 all two or three instruction sequences. */
47783 static bool
47784 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47786 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47787 unsigned int i, nelt, eltsz;
47789 if (!TARGET_AVX2
47790 || !d->one_operand_p
47791 || (d->vmode != V32QImode && d->vmode != V16HImode))
47792 return false;
47794 if (d->testing_p)
47795 return true;
47797 nelt = d->nelt;
47798 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47800 /* Generate two permutation masks. If the required element is within
47801 the same lane, it is shuffled in. If the required element is from the
47802 other lane, force a zero by setting bit 7 in the permutation mask.
47803 The other mask has non-negative elements where an element
47804 is requested from the other lane, but moved to the other lane,
47805 so that the result of vpshufb can have the two V2TImode halves
47806 swapped. */
47807 m128 = GEN_INT (-128);
47808 for (i = 0; i < nelt; ++i)
47810 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47811 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47813 for (j = 0; j < eltsz; ++j)
47815 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47816 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47820 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47821 vperm = force_reg (V32QImode, vperm);
47823 h = gen_reg_rtx (V32QImode);
47824 op = gen_lowpart (V32QImode, d->op0);
47825 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47827 /* Swap the 128-bit lanes of h into hp. */
47828 hp = gen_reg_rtx (V4DImode);
47829 op = gen_lowpart (V4DImode, h);
47830 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47831 const1_rtx));
47833 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47834 vperm = force_reg (V32QImode, vperm);
47836 l = gen_reg_rtx (V32QImode);
47837 op = gen_lowpart (V32QImode, d->op0);
47838 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47840 op = d->target;
47841 if (d->vmode != V32QImode)
47842 op = gen_reg_rtx (V32QImode);
47843 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47844 if (op != d->target)
47845 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47847 return true;
47850 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47851 and extract-odd permutations of two V32QImode or V16HImode operands
47852 with two vpshufb insns, vpor and vpermq. We should have already
47853 failed all two or three instruction sequences. */
47855 static bool
47856 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47858 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47859 unsigned int i, nelt, eltsz;
47861 if (!TARGET_AVX2
47862 || d->one_operand_p
47863 || (d->vmode != V32QImode && d->vmode != V16HImode))
47864 return false;
47866 for (i = 0; i < d->nelt; ++i)
47867 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47868 return false;
47870 if (d->testing_p)
47871 return true;
47873 nelt = d->nelt;
47874 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47876 /* Generate two permutation masks. In the first permutation mask
47877 the first quarter will contain indexes for the first half
47878 of the op0, the second quarter will contain bit 7 set, third quarter
47879 will contain indexes for the second half of the op0 and the
47880 last quarter bit 7 set. In the second permutation mask
47881 the first quarter will contain bit 7 set, the second quarter
47882 indexes for the first half of the op1, the third quarter bit 7 set
47883 and last quarter indexes for the second half of the op1.
47884 I.e. the first mask e.g. for V32QImode extract even will be:
47885 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47886 (all values masked with 0xf except for -128) and second mask
47887 for extract even will be
47888 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47889 m128 = GEN_INT (-128);
47890 for (i = 0; i < nelt; ++i)
47892 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47893 unsigned which = d->perm[i] >= nelt;
47894 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
47896 for (j = 0; j < eltsz; ++j)
47898 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47899 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47903 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47904 vperm = force_reg (V32QImode, vperm);
47906 l = gen_reg_rtx (V32QImode);
47907 op = gen_lowpart (V32QImode, d->op0);
47908 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47910 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47911 vperm = force_reg (V32QImode, vperm);
47913 h = gen_reg_rtx (V32QImode);
47914 op = gen_lowpart (V32QImode, d->op1);
47915 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47917 ior = gen_reg_rtx (V32QImode);
47918 emit_insn (gen_iorv32qi3 (ior, l, h));
47920 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47921 op = gen_reg_rtx (V4DImode);
47922 ior = gen_lowpart (V4DImode, ior);
47923 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47924 const1_rtx, GEN_INT (3)));
47925 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47927 return true;
47930 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47931 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47932 with two "and" and "pack" or two "shift" and "pack" insns. We should
47933 have already failed all two instruction sequences. */
47935 static bool
47936 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47938 rtx op, dop0, dop1, t;
47939 unsigned i, odd, c, s, nelt = d->nelt;
47940 bool end_perm = false;
47941 machine_mode half_mode;
47942 rtx (*gen_and) (rtx, rtx, rtx);
47943 rtx (*gen_pack) (rtx, rtx, rtx);
47944 rtx (*gen_shift) (rtx, rtx, rtx);
47946 if (d->one_operand_p)
47947 return false;
47949 switch (d->vmode)
47951 case E_V8HImode:
47952 /* Required for "pack". */
47953 if (!TARGET_SSE4_1)
47954 return false;
47955 c = 0xffff;
47956 s = 16;
47957 half_mode = V4SImode;
47958 gen_and = gen_andv4si3;
47959 gen_pack = gen_sse4_1_packusdw;
47960 gen_shift = gen_lshrv4si3;
47961 break;
47962 case E_V16QImode:
47963 /* No check as all instructions are SSE2. */
47964 c = 0xff;
47965 s = 8;
47966 half_mode = V8HImode;
47967 gen_and = gen_andv8hi3;
47968 gen_pack = gen_sse2_packuswb;
47969 gen_shift = gen_lshrv8hi3;
47970 break;
47971 case E_V16HImode:
47972 if (!TARGET_AVX2)
47973 return false;
47974 c = 0xffff;
47975 s = 16;
47976 half_mode = V8SImode;
47977 gen_and = gen_andv8si3;
47978 gen_pack = gen_avx2_packusdw;
47979 gen_shift = gen_lshrv8si3;
47980 end_perm = true;
47981 break;
47982 case E_V32QImode:
47983 if (!TARGET_AVX2)
47984 return false;
47985 c = 0xff;
47986 s = 8;
47987 half_mode = V16HImode;
47988 gen_and = gen_andv16hi3;
47989 gen_pack = gen_avx2_packuswb;
47990 gen_shift = gen_lshrv16hi3;
47991 end_perm = true;
47992 break;
47993 default:
47994 /* Only for V8HI, V16QI, V16HI and V32QI modes is this more profitable
47995 than general shuffles.  */
47996 return false;
47999 /* Check that permutation is even or odd. */
48000 odd = d->perm[0];
48001 if (odd > 1)
48002 return false;
48004 for (i = 1; i < nelt; ++i)
48005 if (d->perm[i] != 2 * i + odd)
48006 return false;
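/* E.g. for V8HImode this accepts only { 0 2 4 6 8 10 12 14 } (even)
   or { 1 3 5 7 9 11 13 15 } (odd).  */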
48008 if (d->testing_p)
48009 return true;
48011 dop0 = gen_reg_rtx (half_mode);
48012 dop1 = gen_reg_rtx (half_mode);
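/* Viewing each operand as a vector of double-width elements: for the even
   extraction, mask every element with C to keep its low half; for the odd
   extraction, shift right by S bits to move the high half down.  The
   unsigned-saturating pack then narrows both operands into the result.  */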
48013 if (odd == 0)
48015 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48016 t = force_reg (half_mode, t);
48017 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48018 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48020 else
48022 emit_insn (gen_shift (dop0,
48023 gen_lowpart (half_mode, d->op0),
48024 GEN_INT (s)));
48025 emit_insn (gen_shift (dop1,
48026 gen_lowpart (half_mode, d->op1),
48027 GEN_INT (s)));
48029 /* For the AVX2 256-bit case we need to permute the pack result.  */
48030 if (TARGET_AVX2 && end_perm)
48032 op = gen_reg_rtx (d->vmode);
48033 t = gen_reg_rtx (V4DImode);
48034 emit_insn (gen_pack (op, dop0, dop1));
48035 emit_insn (gen_avx2_permv4di_1 (t,
48036 gen_lowpart (V4DImode, op),
48037 const0_rtx,
48038 const2_rtx,
48039 const1_rtx,
48040 GEN_INT (3)));
48041 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48043 else
48044 emit_insn (gen_pack (d->target, dop0, dop1));
48046 return true;
48049 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48050 and extract-odd permutations of two V64QI operands
48051 with two "shifts", two "truncs" and one "concat" insns for "odd"
48052 and two "truncs" and one "concat" insn for "even".
48053 We should have already failed all two-instruction sequences.  */
48055 static bool
48056 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48058 rtx t1, t2, t3, t4;
48059 unsigned i, odd, nelt = d->nelt;
48061 if (!TARGET_AVX512BW
48062 || d->one_operand_p
48063 || d->vmode != V64QImode)
48064 return false;
48066 /* Check that permutation is even or odd. */
48067 odd = d->perm[0];
48068 if (odd > 1)
48069 return false;
48071 for (i = 1; i < nelt; ++i)
48072 if (d->perm[i] != 2 * i + odd)
48073 return false;
48075 if (d->testing_p)
48076 return true;
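/* View the V64QImode operands as V32HImode.  For the odd extraction first
   shift each 16-bit element right by 8 so the odd byte moves to the low
   byte; the truncations below then keep the low byte of every word, and
   the two V32QImode halves are concatenated into the V64QImode result.  */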
48079 if (odd)
48081 t1 = gen_reg_rtx (V32HImode);
48082 t2 = gen_reg_rtx (V32HImode);
48083 emit_insn (gen_lshrv32hi3 (t1,
48084 gen_lowpart (V32HImode, d->op0),
48085 GEN_INT (8)));
48086 emit_insn (gen_lshrv32hi3 (t2,
48087 gen_lowpart (V32HImode, d->op1),
48088 GEN_INT (8)));
48090 else
48092 t1 = gen_lowpart (V32HImode, d->op0);
48093 t2 = gen_lowpart (V32HImode, d->op1);
48096 t3 = gen_reg_rtx (V32QImode);
48097 t4 = gen_reg_rtx (V32QImode);
48098 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48099 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48100 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48102 return true;
48105 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48106 and extract-odd permutations. */
48108 static bool
48109 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48111 rtx t1, t2, t3, t4, t5;
48113 switch (d->vmode)
48115 case E_V4DFmode:
48116 if (d->testing_p)
48117 break;
48118 t1 = gen_reg_rtx (V4DFmode);
48119 t2 = gen_reg_rtx (V4DFmode);
48121 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48122 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48123 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48125 /* Now an unpck[lh]pd will produce the result required. */
48126 if (odd)
48127 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48128 else
48129 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48130 emit_insn (t3);
48131 break;
48133 case E_V8SFmode:
48135 int mask = odd ? 0xdd : 0x88;
48137 if (d->testing_p)
48138 break;
48139 t1 = gen_reg_rtx (V8SFmode);
48140 t2 = gen_reg_rtx (V8SFmode);
48141 t3 = gen_reg_rtx (V8SFmode);
48143 /* Shuffle within the 128-bit lanes to produce:
48144 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48145 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48146 GEN_INT (mask)));
48148 /* Shuffle the lanes around to produce:
48149 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48150 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48151 GEN_INT (0x3)));
48153 /* Shuffle within the 128-bit lanes to produce:
48154 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48155 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48157 /* Shuffle within the 128-bit lanes to produce:
48158 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48159 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48161 /* Shuffle the lanes around to produce:
48162 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48163 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48164 GEN_INT (0x20)));
48166 break;
48168 case E_V2DFmode:
48169 case E_V4SFmode:
48170 case E_V2DImode:
48171 case E_V4SImode:
48172 /* These are always directly implementable by expand_vec_perm_1. */
48173 gcc_unreachable ();
48175 case E_V8HImode:
48176 if (TARGET_SSE4_1)
48177 return expand_vec_perm_even_odd_pack (d);
48178 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48179 return expand_vec_perm_pshufb2 (d);
48180 else
48182 if (d->testing_p)
48183 break;
48184 /* We need 2*log2(N)-1 operations to achieve odd/even
48185 with interleave. */
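/* E.g. with op0 = { 0 1 2 3 4 5 6 7 } and op1 = { 8 9 a b c d e f } the
   first pair of interleaves leaves { 0 8 1 9 2 a 3 b } in the target and
   { 4 c 5 d 6 e 7 f } in t1, the second pair leaves { 0 4 8 c 1 5 9 d } in
   the target and { 2 6 a e 3 7 b f } in t2, and the final interleave
   produces { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f }.  */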
48186 t1 = gen_reg_rtx (V8HImode);
48187 t2 = gen_reg_rtx (V8HImode);
48188 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48189 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48190 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48191 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48192 if (odd)
48193 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48194 else
48195 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48196 emit_insn (t3);
48198 break;
48200 case E_V16QImode:
48201 return expand_vec_perm_even_odd_pack (d);
48203 case E_V16HImode:
48204 case E_V32QImode:
48205 return expand_vec_perm_even_odd_pack (d);
48207 case E_V64QImode:
48208 return expand_vec_perm_even_odd_trunc (d);
48210 case E_V4DImode:
48211 if (!TARGET_AVX2)
48213 struct expand_vec_perm_d d_copy = *d;
48214 d_copy.vmode = V4DFmode;
48215 if (d->testing_p)
48216 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48217 else
48218 d_copy.target = gen_reg_rtx (V4DFmode);
48219 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48220 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48221 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48223 if (!d->testing_p)
48224 emit_move_insn (d->target,
48225 gen_lowpart (V4DImode, d_copy.target));
48226 return true;
48228 return false;
48231 if (d->testing_p)
48232 break;
48234 t1 = gen_reg_rtx (V4DImode);
48235 t2 = gen_reg_rtx (V4DImode);
48237 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48238 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48239 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48241 /* Now a vpunpck[lh]qdq will produce the result required.  */
48242 if (odd)
48243 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48244 else
48245 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48246 emit_insn (t3);
48247 break;
48249 case E_V8SImode:
48250 if (!TARGET_AVX2)
48252 struct expand_vec_perm_d d_copy = *d;
48253 d_copy.vmode = V8SFmode;
48254 if (d->testing_p)
48255 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48256 else
48257 d_copy.target = gen_reg_rtx (V8SFmode);
48258 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48259 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48260 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48262 if (!d->testing_p)
48263 emit_move_insn (d->target,
48264 gen_lowpart (V8SImode, d_copy.target));
48265 return true;
48267 return false;
48270 if (d->testing_p)
48271 break;
48273 t1 = gen_reg_rtx (V8SImode);
48274 t2 = gen_reg_rtx (V8SImode);
48275 t3 = gen_reg_rtx (V4DImode);
48276 t4 = gen_reg_rtx (V4DImode);
48277 t5 = gen_reg_rtx (V4DImode);
48279 /* Shuffle the lanes around into
48280 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48281 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48282 gen_lowpart (V4DImode, d->op1),
48283 GEN_INT (0x20)));
48284 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48285 gen_lowpart (V4DImode, d->op1),
48286 GEN_INT (0x31)));
48288 /* Swap the 2nd and 3rd position in each lane into
48289 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48290 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48291 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48292 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48293 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48295 /* Now a vpunpck[lh]qdq will produce
48296 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48297 if (odd)
48298 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48299 gen_lowpart (V4DImode, t2));
48300 else
48301 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48302 gen_lowpart (V4DImode, t2));
48303 emit_insn (t3);
48304 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48305 break;
48307 default:
48308 gcc_unreachable ();
48311 return true;
48314 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48315 extract-even and extract-odd permutations. */
48317 static bool
48318 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48320 unsigned i, odd, nelt = d->nelt;
48322 odd = d->perm[0];
48323 if (odd != 0 && odd != 1)
48324 return false;
48326 for (i = 1; i < nelt; ++i)
48327 if (d->perm[i] != 2 * i + odd)
48328 return false;
48330 return expand_vec_perm_even_odd_1 (d, odd);
48333 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48334 permutations. We assume that expand_vec_perm_1 has already failed. */
48336 static bool
48337 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48339 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48340 machine_mode vmode = d->vmode;
48341 unsigned char perm2[4];
48342 rtx op0 = d->op0, dest;
48343 bool ok;
48345 switch (vmode)
48347 case E_V4DFmode:
48348 case E_V8SFmode:
48349 /* These are special-cased in sse.md so that we can optionally
48350 use the vbroadcast instruction. They expand to two insns
48351 if the input happens to be in a register. */
48352 gcc_unreachable ();
48354 case E_V2DFmode:
48355 case E_V2DImode:
48356 case E_V4SFmode:
48357 case E_V4SImode:
48358 /* These are always implementable using standard shuffle patterns. */
48359 gcc_unreachable ();
48361 case E_V8HImode:
48362 case E_V16QImode:
48363 /* These can be implemented via interleave. We save one insn by
48364 stopping once we have promoted to V4SImode and then use pshufd. */
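/* For instance, broadcasting element 5 of a V8HImode vector interleaves
   the high halves to get { 4 4 5 5 6 6 7 7 }, then, viewing that as
   V4SImode, broadcasts 32-bit element 1 with pshufd, yielding
   { 5 5 5 5 5 5 5 5 }.  */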
48365 if (d->testing_p)
48366 return true;
48369 rtx dest;
48370 rtx (*gen) (rtx, rtx, rtx)
48371 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48372 : gen_vec_interleave_lowv8hi;
48374 if (elt >= nelt2)
48376 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48377 : gen_vec_interleave_highv8hi;
48378 elt -= nelt2;
48380 nelt2 /= 2;
48382 dest = gen_reg_rtx (vmode);
48383 emit_insn (gen (dest, op0, op0));
48384 vmode = get_mode_wider_vector (vmode);
48385 op0 = gen_lowpart (vmode, dest);
48387 while (vmode != V4SImode);
48389 memset (perm2, elt, 4);
48390 dest = gen_reg_rtx (V4SImode);
48391 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48392 gcc_assert (ok);
48393 if (!d->testing_p)
48394 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48395 return true;
48397 case E_V64QImode:
48398 case E_V32QImode:
48399 case E_V16HImode:
48400 case E_V8SImode:
48401 case E_V4DImode:
48402 /* For AVX2 broadcasts of the first element vpbroadcast* or
48403 vpermq should be used by expand_vec_perm_1. */
48404 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48405 return false;
48407 default:
48408 gcc_unreachable ();
48412 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48413 broadcast permutations. */
48415 static bool
48416 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48418 unsigned i, elt, nelt = d->nelt;
48420 if (!d->one_operand_p)
48421 return false;
48423 elt = d->perm[0];
48424 for (i = 1; i < nelt; ++i)
48425 if (d->perm[i] != elt)
48426 return false;
48428 return expand_vec_perm_broadcast_1 (d);
48431 /* Implement arbitrary permutations of two V64QImode operands
48432 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48433 static bool
48434 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48436 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48437 return false;
48439 if (d->testing_p)
48440 return true;
48442 struct expand_vec_perm_d ds[2];
48443 rtx rperm[128], vperm, target0, target1;
48444 unsigned int i, nelt;
48445 machine_mode vmode;
48447 nelt = d->nelt;
48448 vmode = V64QImode;
48450 for (i = 0; i < 2; i++)
48452 ds[i] = *d;
48453 ds[i].vmode = V32HImode;
48454 ds[i].nelt = 32;
48455 ds[i].target = gen_reg_rtx (V32HImode);
48456 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48457 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48460 /* Prepare permutations such that the first one takes care of
48461 putting the even bytes into the right positions or one position
48462 higher (ds[0]) and the second one takes care of
48463 putting the odd bytes into the right positions or one position
48464 lower (ds[1]).  */
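/* The vpshufb masks built below then select the right byte out of each
   16-bit word: for an even result byte I, mask element I is
   (I & 14) + (d->perm[I] & 1), picking the low or high byte of the word
   that ds[0] placed at word position I / 2, and the odd result bytes are
   handled the same way through ds[1]; all other mask bytes are -1 so the
   final vpor simply merges the two halves.  */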
48466 for (i = 0; i < nelt; i++)
48468 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48469 if (i & 1)
48471 rperm[i] = constm1_rtx;
48472 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48474 else
48476 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48477 rperm[i + 64] = constm1_rtx;
48481 bool ok = expand_vec_perm_1 (&ds[0]);
48482 gcc_assert (ok);
48483 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48485 ok = expand_vec_perm_1 (&ds[1]);
48486 gcc_assert (ok);
48487 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48489 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48490 vperm = force_reg (vmode, vperm);
48491 target0 = gen_reg_rtx (V64QImode);
48492 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48494 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48495 vperm = force_reg (vmode, vperm);
48496 target1 = gen_reg_rtx (V64QImode);
48497 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48499 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48500 return true;
48503 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
48504 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48505 all the shorter instruction sequences. */
48507 static bool
48508 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48510 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48511 unsigned int i, nelt, eltsz;
48512 bool used[4];
48514 if (!TARGET_AVX2
48515 || d->one_operand_p
48516 || (d->vmode != V32QImode && d->vmode != V16HImode))
48517 return false;
48519 if (d->testing_p)
48520 return true;
48522 nelt = d->nelt;
48523 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48525 /* Generate 4 permutation masks. If the required element is within
48526 the same lane, it is shuffled in.  If the required element is in the
48527 other lane, force a zero by setting bit 7 in the permutation mask.
48528 In the other mask the mask has non-negative elements if the element
48529 is requested from the other lane, but also moved to the other lane,
48530 so that the result of vpshufb can have the two V2TImode halves
48531 swapped. */
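/* Masks 0 and 2 thus handle the bytes of op0 resp. op1 that stay in their
   own 128-bit lane, while masks 1 and 3 handle the bytes that have to
   cross lanes; the vpshufb results for the latter are lane-swapped with
   vpermq { 2, 3, 0, 1 } below before everything is combined with vpor.  */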
48532 m128 = GEN_INT (-128);
48533 for (i = 0; i < 32; ++i)
48535 rperm[0][i] = m128;
48536 rperm[1][i] = m128;
48537 rperm[2][i] = m128;
48538 rperm[3][i] = m128;
48540 used[0] = false;
48541 used[1] = false;
48542 used[2] = false;
48543 used[3] = false;
48544 for (i = 0; i < nelt; ++i)
48546 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48547 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48548 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48550 for (j = 0; j < eltsz; ++j)
48551 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48552 used[which] = true;
48555 for (i = 0; i < 2; ++i)
48557 if (!used[2 * i + 1])
48559 h[i] = NULL_RTX;
48560 continue;
48562 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48563 gen_rtvec_v (32, rperm[2 * i + 1]));
48564 vperm = force_reg (V32QImode, vperm);
48565 h[i] = gen_reg_rtx (V32QImode);
48566 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48567 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48570 /* Swap the 128-bit lanes of h[X].  */
48571 for (i = 0; i < 2; ++i)
48573 if (h[i] == NULL_RTX)
48574 continue;
48575 op = gen_reg_rtx (V4DImode);
48576 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48577 const2_rtx, GEN_INT (3), const0_rtx,
48578 const1_rtx));
48579 h[i] = gen_lowpart (V32QImode, op);
48582 for (i = 0; i < 2; ++i)
48584 if (!used[2 * i])
48586 l[i] = NULL_RTX;
48587 continue;
48589 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48590 vperm = force_reg (V32QImode, vperm);
48591 l[i] = gen_reg_rtx (V32QImode);
48592 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48593 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48596 for (i = 0; i < 2; ++i)
48598 if (h[i] && l[i])
48600 op = gen_reg_rtx (V32QImode);
48601 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48602 l[i] = op;
48604 else if (h[i])
48605 l[i] = h[i];
48608 gcc_assert (l[0] && l[1]);
48609 op = d->target;
48610 if (d->vmode != V32QImode)
48611 op = gen_reg_rtx (V32QImode);
48612 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48613 if (op != d->target)
48614 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48615 return true;
48618 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48619 taken care of, perform the expansion in D and return true on success. */
48621 static bool
48622 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48624 /* Try a single instruction expansion. */
48625 if (expand_vec_perm_1 (d))
48626 return true;
48628 /* Try sequences of two instructions. */
48630 if (expand_vec_perm_pshuflw_pshufhw (d))
48631 return true;
48633 if (expand_vec_perm_palignr (d, false))
48634 return true;
48636 if (expand_vec_perm_interleave2 (d))
48637 return true;
48639 if (expand_vec_perm_broadcast (d))
48640 return true;
48642 if (expand_vec_perm_vpermq_perm_1 (d))
48643 return true;
48645 if (expand_vec_perm_vperm2f128 (d))
48646 return true;
48648 if (expand_vec_perm_pblendv (d))
48649 return true;
48651 /* Try sequences of three instructions. */
48653 if (expand_vec_perm_even_odd_pack (d))
48654 return true;
48656 if (expand_vec_perm_2vperm2f128_vshuf (d))
48657 return true;
48659 if (expand_vec_perm_pshufb2 (d))
48660 return true;
48662 if (expand_vec_perm_interleave3 (d))
48663 return true;
48665 if (expand_vec_perm_vperm2f128_vblend (d))
48666 return true;
48668 /* Try sequences of four instructions. */
48670 if (expand_vec_perm_even_odd_trunc (d))
48671 return true;
48672 if (expand_vec_perm_vpshufb2_vpermq (d))
48673 return true;
48675 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48676 return true;
48678 if (expand_vec_perm_vpermt2_vpshub2 (d))
48679 return true;
48681 /* ??? Look for narrow permutations whose element orderings would
48682 allow the promotion to a wider mode. */
48684 /* ??? Look for sequences of interleave or a wider permute that place
48685 the data into the correct lanes for a half-vector shuffle like
48686 pshuf[lh]w or vpermilps. */
48688 /* ??? Look for sequences of interleave that produce the desired results.
48689 The combinatorics of punpck[lh] get pretty ugly... */
48691 if (expand_vec_perm_even_odd (d))
48692 return true;
48694 /* Even longer sequences. */
48695 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48696 return true;
48698 /* See if we can get the same permutation in a different vector integer
48699 mode. */
48700 struct expand_vec_perm_d nd;
48701 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48703 if (!d->testing_p)
48704 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48705 return true;
48708 return false;
48711 /* If a permutation only uses one operand, make it clear. Returns true
48712 if the permutation references both operands. */
48714 static bool
48715 canonicalize_perm (struct expand_vec_perm_d *d)
48717 int i, which, nelt = d->nelt;
48719 for (i = which = 0; i < nelt; ++i)
48720 which |= (d->perm[i] < nelt ? 1 : 2);
48722 d->one_operand_p = true;
48723 switch (which)
48725 default:
48726 gcc_unreachable();
48728 case 3:
48729 if (!rtx_equal_p (d->op0, d->op1))
48731 d->one_operand_p = false;
48732 break;
48734 /* The elements of PERM do not suggest that only the first operand
48735 is used, but both operands are identical. Allow easier matching
48736 of the permutation by folding the permutation into the single
48737 input vector. */
48738 /* FALLTHRU */
48740 case 2:
48741 for (i = 0; i < nelt; ++i)
48742 d->perm[i] &= nelt - 1;
48743 d->op0 = d->op1;
48744 break;
48746 case 1:
48747 d->op1 = d->op0;
48748 break;
48751 return (which == 3);
48754 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
48756 static bool
48757 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
48758 rtx op1, const vec_perm_indices &sel)
48760 struct expand_vec_perm_d d;
48761 unsigned char perm[MAX_VECT_LEN];
48762 unsigned int i, nelt, which;
48763 bool two_args;
48765 d.target = target;
48766 d.op0 = op0;
48767 d.op1 = op1;
48769 d.vmode = vmode;
48770 gcc_assert (VECTOR_MODE_P (d.vmode));
48771 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48772 d.testing_p = !target;
48774 gcc_assert (sel.length () == nelt);
48775 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48777 /* Given sufficient ISA support we can just return true here
48778 for selected vector modes. */
48779 switch (d.vmode)
48781 case E_V16SFmode:
48782 case E_V16SImode:
48783 case E_V8DImode:
48784 case E_V8DFmode:
48785 if (!TARGET_AVX512F)
48786 return false;
48787 /* All implementable with a single vperm[it]2 insn. */
48788 if (d.testing_p)
48789 return true;
48790 break;
48791 case E_V32HImode:
48792 if (!TARGET_AVX512BW)
48793 return false;
48794 if (d.testing_p)
48795 /* All implementable with a single vperm[it]2 insn. */
48796 return true;
48797 break;
48798 case E_V64QImode:
48799 if (!TARGET_AVX512BW)
48800 return false;
48801 if (d.testing_p)
48802 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
48803 return true;
48804 break;
48805 case E_V8SImode:
48806 case E_V8SFmode:
48807 case E_V4DFmode:
48808 case E_V4DImode:
48809 if (!TARGET_AVX)
48810 return false;
48811 if (d.testing_p && TARGET_AVX512VL)
48812 /* All implementable with a single vperm[it]2 insn. */
48813 return true;
48814 break;
48815 case E_V16HImode:
48816 if (!TARGET_SSE2)
48817 return false;
48818 if (d.testing_p && TARGET_AVX2)
48819 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48820 return true;
48821 break;
48822 case E_V32QImode:
48823 if (!TARGET_SSE2)
48824 return false;
48825 if (d.testing_p && TARGET_AVX2)
48826 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48827 return true;
48828 break;
48829 case E_V8HImode:
48830 case E_V16QImode:
48831 if (!TARGET_SSE2)
48832 return false;
48833 /* Fall through. */
48834 case E_V4SImode:
48835 case E_V4SFmode:
48836 if (!TARGET_SSE)
48837 return false;
48838 /* All implementable with a single vpperm insn. */
48839 if (d.testing_p && TARGET_XOP)
48840 return true;
48841 /* All implementable with 2 pshufb + 1 ior. */
48842 if (d.testing_p && TARGET_SSSE3)
48843 return true;
48844 break;
48845 case E_V2DImode:
48846 case E_V2DFmode:
48847 if (!TARGET_SSE)
48848 return false;
48849 /* All implementable with shufpd or unpck[lh]pd. */
48850 if (d.testing_p)
48851 return true;
48852 break;
48853 default:
48854 return false;
48857 for (i = which = 0; i < nelt; ++i)
48859 unsigned char e = sel[i];
48860 gcc_assert (e < 2 * nelt);
48861 d.perm[i] = e;
48862 perm[i] = e;
48863 which |= (e < nelt ? 1 : 2);
48866 if (d.testing_p)
48868 /* For all elements from the second vector, fold the elements to the first.  */
48869 if (which == 2)
48870 for (i = 0; i < nelt; ++i)
48871 d.perm[i] -= nelt;
48873 /* Check whether the mask can be applied to the vector type. */
48874 d.one_operand_p = (which != 3);
48876 /* Implementable with shufps or pshufd. */
48877 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48878 return true;
48880 /* Otherwise we have to go through the motions and see if we can
48881 figure out how to generate the requested permutation. */
48882 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48883 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48884 if (!d.one_operand_p)
48885 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48887 start_sequence ();
48888 bool ret = ix86_expand_vec_perm_const_1 (&d);
48889 end_sequence ();
48891 return ret;
48894 two_args = canonicalize_perm (&d);
48896 if (ix86_expand_vec_perm_const_1 (&d))
48897 return true;
48899 /* If the selector says both arguments are needed, but the operands are the
48900 same, the above tried to expand with one_operand_p and flattened selector.
48901 If that didn't work, retry without one_operand_p; we succeeded with that
48902 during testing. */
48903 if (two_args && d.one_operand_p)
48905 d.one_operand_p = false;
48906 memcpy (d.perm, perm, sizeof (perm));
48907 return ix86_expand_vec_perm_const_1 (&d);
48910 return false;
48913 void
48914 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48916 struct expand_vec_perm_d d;
48917 unsigned i, nelt;
48919 d.target = targ;
48920 d.op0 = op0;
48921 d.op1 = op1;
48922 d.vmode = GET_MODE (targ);
48923 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48924 d.one_operand_p = false;
48925 d.testing_p = false;
48927 for (i = 0; i < nelt; ++i)
48928 d.perm[i] = i * 2 + odd;
48930 /* We'll either be able to implement the permutation directly... */
48931 if (expand_vec_perm_1 (&d))
48932 return;
48934 /* ... or we use the special-case patterns. */
48935 expand_vec_perm_even_odd_1 (&d, odd);
48938 static void
48939 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48941 struct expand_vec_perm_d d;
48942 unsigned i, nelt, base;
48943 bool ok;
48945 d.target = targ;
48946 d.op0 = op0;
48947 d.op1 = op1;
48948 d.vmode = GET_MODE (targ);
48949 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48950 d.one_operand_p = false;
48951 d.testing_p = false;
48953 base = high_p ? nelt / 2 : 0;
48954 for (i = 0; i < nelt / 2; ++i)
48956 d.perm[i * 2] = i + base;
48957 d.perm[i * 2 + 1] = i + base + nelt;
48960 /* Note that for AVX this isn't one instruction. */
48961 ok = ix86_expand_vec_perm_const_1 (&d);
48962 gcc_assert (ok);
48966 /* Expand a vector operation CODE for a V*QImode in terms of the
48967 same operation on V*HImode. */
48969 void
48970 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48972 machine_mode qimode = GET_MODE (dest);
48973 machine_mode himode;
48974 rtx (*gen_il) (rtx, rtx, rtx);
48975 rtx (*gen_ih) (rtx, rtx, rtx);
48976 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48977 struct expand_vec_perm_d d;
48978 bool ok, full_interleave;
48979 bool uns_p = false;
48980 int i;
48982 switch (qimode)
48984 case E_V16QImode:
48985 himode = V8HImode;
48986 gen_il = gen_vec_interleave_lowv16qi;
48987 gen_ih = gen_vec_interleave_highv16qi;
48988 break;
48989 case E_V32QImode:
48990 himode = V16HImode;
48991 gen_il = gen_avx2_interleave_lowv32qi;
48992 gen_ih = gen_avx2_interleave_highv32qi;
48993 break;
48994 case E_V64QImode:
48995 himode = V32HImode;
48996 gen_il = gen_avx512bw_interleave_lowv64qi;
48997 gen_ih = gen_avx512bw_interleave_highv64qi;
48998 break;
48999 default:
49000 gcc_unreachable ();
49003 op2_l = op2_h = op2;
49004 switch (code)
49006 case MULT:
49007 /* Unpack data such that we've got a source byte in each low byte of
49008 each word. We don't care what goes into the high byte of each word.
49009 Rather than trying to get zero in there, it is most convenient to let
49010 it be a copy of the low byte. */
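/* E.g. for a V16QImode MULT this performs two V8HImode multiplies on the
   interleaved copies; the low byte of each 16-bit product is exactly the
   wanted QImode product, and the even-byte extraction at the end of this
   function gathers those bytes back into DEST.  */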
49011 op2_l = gen_reg_rtx (qimode);
49012 op2_h = gen_reg_rtx (qimode);
49013 emit_insn (gen_il (op2_l, op2, op2));
49014 emit_insn (gen_ih (op2_h, op2, op2));
49016 op1_l = gen_reg_rtx (qimode);
49017 op1_h = gen_reg_rtx (qimode);
49018 emit_insn (gen_il (op1_l, op1, op1));
49019 emit_insn (gen_ih (op1_h, op1, op1));
49020 full_interleave = qimode == V16QImode;
49021 break;
49023 case ASHIFT:
49024 case LSHIFTRT:
49025 uns_p = true;
49026 /* FALLTHRU */
49027 case ASHIFTRT:
49028 op1_l = gen_reg_rtx (himode);
49029 op1_h = gen_reg_rtx (himode);
49030 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49031 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49032 full_interleave = true;
49033 break;
49034 default:
49035 gcc_unreachable ();
49038 /* Perform the operation. */
49039 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49040 1, OPTAB_DIRECT);
49041 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49042 1, OPTAB_DIRECT);
49043 gcc_assert (res_l && res_h);
49045 /* Merge the data back into the right place. */
49046 d.target = dest;
49047 d.op0 = gen_lowpart (qimode, res_l);
49048 d.op1 = gen_lowpart (qimode, res_h);
49049 d.vmode = qimode;
49050 d.nelt = GET_MODE_NUNITS (qimode);
49051 d.one_operand_p = false;
49052 d.testing_p = false;
49054 if (full_interleave)
49056 /* For SSE2, we used a full interleave, so the desired
49057 results are in the even elements. */
49058 for (i = 0; i < d.nelt; ++i)
49059 d.perm[i] = i * 2;
49061 else
49063 /* For AVX, the interleave used above was not cross-lane. So the
49064 extraction is evens but with the second and third quarter swapped.
49065 Happily, that is even one insn shorter than even extraction.
49066 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49067 always first from the first and then from the second source operand,
49068 the index bits above the low 4 bits remain the same.
49069 Thus, for d.nelt == 32 we want permutation
49070 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49071 and for d.nelt == 64 we want permutation
49072 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49073 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49074 for (i = 0; i < d.nelt; ++i)
49075 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49078 ok = ix86_expand_vec_perm_const_1 (&d);
49079 gcc_assert (ok);
49081 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49082 gen_rtx_fmt_ee (code, qimode, op1, op2));
49085 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49086 if op is CONST_VECTOR with all odd elements equal to their
49087 preceding element. */
49089 static bool
49090 const_vector_equal_evenodd_p (rtx op)
49092 machine_mode mode = GET_MODE (op);
49093 int i, nunits = GET_MODE_NUNITS (mode);
49094 if (GET_CODE (op) != CONST_VECTOR
49095 || nunits != CONST_VECTOR_NUNITS (op))
49096 return false;
49097 for (i = 0; i < nunits; i += 2)
49098 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49099 return false;
49100 return true;
49103 void
49104 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49105 bool uns_p, bool odd_p)
49107 machine_mode mode = GET_MODE (op1);
49108 machine_mode wmode = GET_MODE (dest);
49109 rtx x;
49110 rtx orig_op1 = op1, orig_op2 = op2;
49112 if (!nonimmediate_operand (op1, mode))
49113 op1 = force_reg (mode, op1);
49114 if (!nonimmediate_operand (op2, mode))
49115 op2 = force_reg (mode, op2);
49117 /* We only play even/odd games with vectors of SImode. */
49118 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49120 /* If we're looking for the odd results, shift those members down to
49121 the even slots.  For some CPUs this is faster than a PSHUFD.  */
49122 if (odd_p)
49124 /* For XOP use vpmacsdqh, but only for smult, as it is only
49125 signed. */
49126 if (TARGET_XOP && mode == V4SImode && !uns_p)
49128 x = force_reg (wmode, CONST0_RTX (wmode));
49129 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49130 return;
49133 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49134 if (!const_vector_equal_evenodd_p (orig_op1))
49135 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49136 x, NULL, 1, OPTAB_DIRECT);
49137 if (!const_vector_equal_evenodd_p (orig_op2))
49138 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49139 x, NULL, 1, OPTAB_DIRECT);
49140 op1 = gen_lowpart (mode, op1);
49141 op2 = gen_lowpart (mode, op2);
49144 if (mode == V16SImode)
49146 if (uns_p)
49147 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49148 else
49149 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49151 else if (mode == V8SImode)
49153 if (uns_p)
49154 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49155 else
49156 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49158 else if (uns_p)
49159 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49160 else if (TARGET_SSE4_1)
49161 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49162 else
49164 rtx s1, s2, t0, t1, t2;
49166 /* The easiest way to implement this without PMULDQ is to go through
49167 the motions as if we were performing a full 64-bit multiply, except
49168 that we need to do less shuffling of the elements.  */
49170 /* Compute the sign-extension, aka highparts, of the two operands. */
49171 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49172 op1, pc_rtx, pc_rtx);
49173 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49174 op2, pc_rtx, pc_rtx);
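/* Conceptually the signed 64-bit value of OP1 is OP1 + (S1 << 32), since
   S1 is all ones exactly when OP1 is negative, and likewise for OP2 and
   S2.  Multiplying out, the signed product modulo 2^64 is
   OP1*OP2 + ((S1*OP2 + S2*OP1) << 32), with all products taken as
   unsigned 32x32->64 even multiplies; that is what is computed below.  */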
49176 /* Multiply LO(A) * HI(B), and vice-versa. */
49177 t1 = gen_reg_rtx (wmode);
49178 t2 = gen_reg_rtx (wmode);
49179 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49180 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49182 /* Multiply LO(A) * LO(B). */
49183 t0 = gen_reg_rtx (wmode);
49184 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49186 /* Combine and shift the highparts into place. */
49187 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49188 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49189 1, OPTAB_DIRECT);
49191 /* Combine high and low parts. */
49192 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49193 return;
49195 emit_insn (x);
49198 void
49199 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49200 bool uns_p, bool high_p)
49202 machine_mode wmode = GET_MODE (dest);
49203 machine_mode mode = GET_MODE (op1);
49204 rtx t1, t2, t3, t4, mask;
49206 switch (mode)
49208 case E_V4SImode:
49209 t1 = gen_reg_rtx (mode);
49210 t2 = gen_reg_rtx (mode);
49211 if (TARGET_XOP && !uns_p)
49213 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49214 shuffle the elements once so that all elements are in the right
49215 place for immediate use: { A C B D }. */
49216 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49217 const1_rtx, GEN_INT (3)));
49218 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49219 const1_rtx, GEN_INT (3)));
49221 else
49223 /* Put the elements into place for the multiply. */
49224 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49225 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49226 high_p = false;
49228 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49229 break;
49231 case E_V8SImode:
49232 /* Shuffle the elements between the lanes. After this we
49233 have { A B E F | C D G H } for each operand. */
49234 t1 = gen_reg_rtx (V4DImode);
49235 t2 = gen_reg_rtx (V4DImode);
49236 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49237 const0_rtx, const2_rtx,
49238 const1_rtx, GEN_INT (3)));
49239 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49240 const0_rtx, const2_rtx,
49241 const1_rtx, GEN_INT (3)));
49243 /* Shuffle the elements within the lanes. After this we
49244 have { A A B B | C C D D } or { E E F F | G G H H }. */
49245 t3 = gen_reg_rtx (V8SImode);
49246 t4 = gen_reg_rtx (V8SImode);
49247 mask = GEN_INT (high_p
49248 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49249 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49250 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49251 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49253 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49254 break;
49256 case E_V8HImode:
49257 case E_V16HImode:
49258 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49259 uns_p, OPTAB_DIRECT);
49260 t2 = expand_binop (mode,
49261 uns_p ? umul_highpart_optab : smul_highpart_optab,
49262 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49263 gcc_assert (t1 && t2);
49265 t3 = gen_reg_rtx (mode);
49266 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49267 emit_move_insn (dest, gen_lowpart (wmode, t3));
49268 break;
49270 case E_V16QImode:
49271 case E_V32QImode:
49272 case E_V32HImode:
49273 case E_V16SImode:
49274 case E_V64QImode:
49275 t1 = gen_reg_rtx (wmode);
49276 t2 = gen_reg_rtx (wmode);
49277 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49278 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49280 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49281 break;
49283 default:
49284 gcc_unreachable ();
49288 void
49289 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49291 rtx res_1, res_2, res_3, res_4;
49293 res_1 = gen_reg_rtx (V4SImode);
49294 res_2 = gen_reg_rtx (V4SImode);
49295 res_3 = gen_reg_rtx (V2DImode);
49296 res_4 = gen_reg_rtx (V2DImode);
49297 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49298 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49300 /* Move the results in element 2 down to element 1; we don't care
49301 what goes in elements 2 and 3. Then we can merge the parts
49302 back together with an interleave.
49304 Note that two other sequences were tried:
49305 (1) Use interleaves at the start instead of psrldq, which allows
49306 us to use a single shufps to merge things back at the end.
49307 (2) Use shufps here to combine the two vectors, then pshufd to
49308 put the elements in the correct order.
49309 In both cases the cost of the reformatting stall was too high
49310 and the overall sequence slower. */
49312 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49313 const0_rtx, const2_rtx,
49314 const0_rtx, const0_rtx));
49315 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49316 const0_rtx, const2_rtx,
49317 const0_rtx, const0_rtx));
49318 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49320 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49323 void
49324 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49326 machine_mode mode = GET_MODE (op0);
49327 rtx t1, t2, t3, t4, t5, t6;
49329 if (TARGET_AVX512DQ && mode == V8DImode)
49330 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49331 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49332 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49333 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49334 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49335 else if (TARGET_XOP && mode == V2DImode)
49337 /* op1: A,B,C,D, op2: E,F,G,H */
49338 op1 = gen_lowpart (V4SImode, op1);
49339 op2 = gen_lowpart (V4SImode, op2);
49341 t1 = gen_reg_rtx (V4SImode);
49342 t2 = gen_reg_rtx (V4SImode);
49343 t3 = gen_reg_rtx (V2DImode);
49344 t4 = gen_reg_rtx (V2DImode);
49346 /* t1: B,A,D,C */
49347 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49348 GEN_INT (1),
49349 GEN_INT (0),
49350 GEN_INT (3),
49351 GEN_INT (2)));
49353 /* t2: (B*E),(A*F),(D*G),(C*H) */
49354 emit_insn (gen_mulv4si3 (t2, t1, op2));
49356 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49357 emit_insn (gen_xop_phadddq (t3, t2));
49359 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49360 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49362 /* Multiply lower parts and add all */
49363 t5 = gen_reg_rtx (V2DImode);
49364 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49365 gen_lowpart (V4SImode, op1),
49366 gen_lowpart (V4SImode, op2)));
49367 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49370 else
49372 machine_mode nmode;
49373 rtx (*umul) (rtx, rtx, rtx);
49375 if (mode == V2DImode)
49377 umul = gen_vec_widen_umult_even_v4si;
49378 nmode = V4SImode;
49380 else if (mode == V4DImode)
49382 umul = gen_vec_widen_umult_even_v8si;
49383 nmode = V8SImode;
49385 else if (mode == V8DImode)
49387 umul = gen_vec_widen_umult_even_v16si;
49388 nmode = V16SImode;
49390 else
49391 gcc_unreachable ();
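/* Compute OP1 * OP2 modulo 2^64 per element as
   LO(OP1)*LO(OP2) + ((LO(OP1)*HI(OP2) + HI(OP1)*LO(OP2)) << 32),
   using three widening unsigned multiplies of the even 32-bit halves.  */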
49394 /* Multiply low parts. */
49395 t1 = gen_reg_rtx (mode);
49396 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49398 /* Shift input vectors right 32 bits so we can multiply high parts. */
49399 t6 = GEN_INT (32);
49400 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49401 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49403 /* Multiply high parts by low parts. */
49404 t4 = gen_reg_rtx (mode);
49405 t5 = gen_reg_rtx (mode);
49406 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49407 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49409 /* Combine and shift the highparts back. */
49410 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49411 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49413 /* Combine high and low parts. */
49414 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49417 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49418 gen_rtx_MULT (mode, op1, op2));
49421 /* Return 1 if control transfer instruction INSN
49422 should be encoded with bnd prefix.
49423 If insn is NULL then return 1 when control
49424 transfer instructions should be prefixed with
49425 bnd by default for current function. */
49427 bool
49428 ix86_bnd_prefixed_insn_p (rtx insn)
49430 /* For call insns check special flag. */
49431 if (insn && CALL_P (insn))
49433 rtx call = get_call_rtx_from (insn);
49434 if (call)
49435 return CALL_EXPR_WITH_BOUNDS_P (call);
49438 /* All other insns are prefixed only if function is instrumented. */
49439 return chkp_function_instrumented_p (current_function_decl);
49442 /* Return 1 if control transfer instruction INSN
49443 should be encoded with notrack prefix. */
49445 static bool
49446 ix86_notrack_prefixed_insn_p (rtx insn)
49448 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
49449 return false;
49451 if (CALL_P (insn))
49453 rtx call = get_call_rtx_from (insn);
49454 gcc_assert (call != NULL_RTX);
49455 rtx addr = XEXP (call, 0);
49457 /* Do not emit 'notrack' if it's not an indirect call. */
49458 if (MEM_P (addr)
49459 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49460 return false;
49461 else
49462 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49465 if (JUMP_P (insn) && !flag_cet_switch)
49467 rtx target = JUMP_LABEL (insn);
49468 if (target == NULL_RTX || ANY_RETURN_P (target))
49469 return false;
49471 /* Check whether the jump is a switch-table jump.  */
49472 rtx_insn *label = as_a<rtx_insn *> (target);
49473 rtx_insn *table = next_insn (label);
49474 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49475 return false;
49476 else
49477 return true;
49479 return false;
49482 /* Calculate integer abs() using only SSE2 instructions. */
49484 void
49485 ix86_expand_sse2_abs (rtx target, rtx input)
49487 machine_mode mode = GET_MODE (target);
49488 rtx tmp0, tmp1, x;
49490 switch (mode)
49492 /* For 32-bit signed integer X, the best way to calculate the absolute
49493 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
49494 case E_V4SImode:
49495 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49496 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49497 NULL, 0, OPTAB_DIRECT);
49498 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49499 NULL, 0, OPTAB_DIRECT);
49500 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49501 target, 0, OPTAB_DIRECT);
49502 break;
49504 /* For 16-bit signed integer X, the best way to calculate the absolute
49505 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49506 case E_V8HImode:
49507 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49509 x = expand_simple_binop (mode, SMAX, tmp0, input,
49510 target, 0, OPTAB_DIRECT);
49511 break;
49513 /* For 8-bit signed integer X, the best way to calculate the absolute
49514 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49515 as SSE2 provides the PMINUB insn. */
49516 case E_V16QImode:
49517 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49519 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49520 target, 0, OPTAB_DIRECT);
49521 break;
49523 default:
49524 gcc_unreachable ();
49527 if (x != target)
49528 emit_move_insn (target, x);
49531 /* Expand an extract from a vector register through pextr insn.
49532 Return true if successful. */
49534 bool
49535 ix86_expand_pextr (rtx *operands)
49537 rtx dst = operands[0];
49538 rtx src = operands[1];
49540 unsigned int size = INTVAL (operands[2]);
49541 unsigned int pos = INTVAL (operands[3]);
49543 if (SUBREG_P (dst))
49545 /* Reject non-lowpart subregs. */
49546 if (SUBREG_BYTE (dst) > 0)
49547 return false;
49548 dst = SUBREG_REG (dst);
49551 if (SUBREG_P (src))
49553 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49554 src = SUBREG_REG (src);
49557 switch (GET_MODE (src))
49559 case E_V16QImode:
49560 case E_V8HImode:
49561 case E_V4SImode:
49562 case E_V2DImode:
49563 case E_V1TImode:
49564 case E_TImode:
49566 machine_mode srcmode, dstmode;
49567 rtx d, pat;
49569 if (!int_mode_for_size (size, 0).exists (&dstmode))
49570 return false;
49572 switch (dstmode)
49574 case E_QImode:
49575 if (!TARGET_SSE4_1)
49576 return false;
49577 srcmode = V16QImode;
49578 break;
49580 case E_HImode:
49581 if (!TARGET_SSE2)
49582 return false;
49583 srcmode = V8HImode;
49584 break;
49586 case E_SImode:
49587 if (!TARGET_SSE4_1)
49588 return false;
49589 srcmode = V4SImode;
49590 break;
49592 case E_DImode:
49593 gcc_assert (TARGET_64BIT);
49594 if (!TARGET_SSE4_1)
49595 return false;
49596 srcmode = V2DImode;
49597 break;
49599 default:
49600 return false;
49603 /* Reject extractions from misaligned positions. */
49604 if (pos & (size-1))
49605 return false;
49607 if (GET_MODE (dst) == dstmode)
49608 d = dst;
49609 else
49610 d = gen_reg_rtx (dstmode);
49612 /* Construct insn pattern. */
49613 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49614 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49616 /* Let the rtl optimizers know about the zero extension performed. */
49617 if (dstmode == QImode || dstmode == HImode)
49619 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49620 d = gen_lowpart (SImode, d);
49623 emit_insn (gen_rtx_SET (d, pat));
49625 if (d != dst)
49626 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49627 return true;
49630 default:
49631 return false;
49635 /* Expand an insert into a vector register through pinsr insn.
49636 Return true if successful. */
49638 bool
49639 ix86_expand_pinsr (rtx *operands)
49641 rtx dst = operands[0];
49642 rtx src = operands[3];
49644 unsigned int size = INTVAL (operands[1]);
49645 unsigned int pos = INTVAL (operands[2]);
49647 if (SUBREG_P (dst))
49649 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49650 dst = SUBREG_REG (dst);
49653 switch (GET_MODE (dst))
49655 case E_V16QImode:
49656 case E_V8HImode:
49657 case E_V4SImode:
49658 case E_V2DImode:
49659 case E_V1TImode:
49660 case E_TImode:
49662 machine_mode srcmode, dstmode;
49663 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49664 rtx d;
49666 if (!int_mode_for_size (size, 0).exists (&srcmode))
49667 return false;
49669 switch (srcmode)
49671 case E_QImode:
49672 if (!TARGET_SSE4_1)
49673 return false;
49674 dstmode = V16QImode;
49675 pinsr = gen_sse4_1_pinsrb;
49676 break;
49678 case E_HImode:
49679 if (!TARGET_SSE2)
49680 return false;
49681 dstmode = V8HImode;
49682 pinsr = gen_sse2_pinsrw;
49683 break;
49685 case E_SImode:
49686 if (!TARGET_SSE4_1)
49687 return false;
49688 dstmode = V4SImode;
49689 pinsr = gen_sse4_1_pinsrd;
49690 break;
49692 case E_DImode:
49693 gcc_assert (TARGET_64BIT);
49694 if (!TARGET_SSE4_1)
49695 return false;
49696 dstmode = V2DImode;
49697 pinsr = gen_sse4_1_pinsrq;
49698 break;
49700 default:
49701 return false;
49704 /* Reject insertions to misaligned positions. */
49705 if (pos & (size-1))
49706 return false;
49708 if (SUBREG_P (src))
49710 unsigned int srcpos = SUBREG_BYTE (src);
49712 if (srcpos > 0)
49714 rtx extr_ops[4];
49716 extr_ops[0] = gen_reg_rtx (srcmode);
49717 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49718 extr_ops[2] = GEN_INT (size);
49719 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49721 if (!ix86_expand_pextr (extr_ops))
49722 return false;
49724 src = extr_ops[0];
49726 else
49727 src = gen_lowpart (srcmode, SUBREG_REG (src));
49730 if (GET_MODE (dst) == dstmode)
49731 d = dst;
49732 else
49733 d = gen_reg_rtx (dstmode);
49735 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49736 gen_lowpart (srcmode, src),
49737 GEN_INT (1 << (pos / size))));
49738 if (d != dst)
49739 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49740 return true;
49743 default:
49744 return false;
49748 /* This function returns the calling-ABI-specific va_list type node.
49749 It returns the FNDECL-specific va_list type.  */
49751 static tree
49752 ix86_fn_abi_va_list (tree fndecl)
49754 if (!TARGET_64BIT)
49755 return va_list_type_node;
49756 gcc_assert (fndecl != NULL_TREE);
49758 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49759 return ms_va_list_type_node;
49760 else
49761 return sysv_va_list_type_node;
49764 /* Returns the canonical va_list type specified by TYPE. If there
49765 is no valid TYPE provided, it returns NULL_TREE.  */
49767 static tree
49768 ix86_canonical_va_list_type (tree type)
49770 if (TARGET_64BIT)
49772 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49773 return ms_va_list_type_node;
49775 if ((TREE_CODE (type) == ARRAY_TYPE
49776 && integer_zerop (array_type_nelts (type)))
49777 || POINTER_TYPE_P (type))
49779 tree elem_type = TREE_TYPE (type);
49780 if (TREE_CODE (elem_type) == RECORD_TYPE
49781 && lookup_attribute ("sysv_abi va_list",
49782 TYPE_ATTRIBUTES (elem_type)))
49783 return sysv_va_list_type_node;
49786 return NULL_TREE;
49789 return std_canonical_va_list_type (type);
49792 /* Iterate through the target-specific builtin types for va_list.
49793 IDX denotes the iterator, *PTREE is set to the result type of
49794 the va_list builtin, and *PNAME to its internal type.
49795 Returns zero if there is no element for this index, otherwise
49796 IDX should be increased upon the next call.
49797 Note, do not iterate a base builtin's name like __builtin_va_list.
49798 Used from c_common_nodes_and_builtins. */
49800 static int
49801 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49803 if (TARGET_64BIT)
49805 switch (idx)
49807 default:
49808 break;
49810 case 0:
49811 *ptree = ms_va_list_type_node;
49812 *pname = "__builtin_ms_va_list";
49813 return 1;
49815 case 1:
49816 *ptree = sysv_va_list_type_node;
49817 *pname = "__builtin_sysv_va_list";
49818 return 1;
49822 return 0;
49825 #undef TARGET_SCHED_DISPATCH
49826 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
49827 #undef TARGET_SCHED_DISPATCH_DO
49828 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
49829 #undef TARGET_SCHED_REASSOCIATION_WIDTH
49830 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
49831 #undef TARGET_SCHED_REORDER
49832 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
49833 #undef TARGET_SCHED_ADJUST_PRIORITY
49834 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
49835 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
49836 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
49837 ix86_dependencies_evaluation_hook
49840 /* Implementation of reassociation_width target hook used by
49841 reassoc phase to identify parallelism level in reassociated
49842 tree.  The statement's tree_code is passed in OP.  The arguments'
49843 type is passed in MODE.  */
49845 static int
49846 ix86_reassociation_width (unsigned int op, machine_mode mode)
49848 int width = 1;
49849 /* Vector part. */
49850 if (VECTOR_MODE_P (mode))
49852 int div = 1;
49853 if (INTEGRAL_MODE_P (mode))
49854 width = ix86_cost->reassoc_vec_int;
49855 else if (FLOAT_MODE_P (mode))
49856 width = ix86_cost->reassoc_vec_fp;
49858 if (width == 1)
49859 return 1;
49861 /* Integer vector instructions execute in FP unit
49862 and can execute 3 additions and one multiplication per cycle. */
49863 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
49864 && op != PLUS && op != MINUS)
49865 return 1;
49867 /* Account for targets that split wide vectors into multiple parts.  */
49868 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
49869 div = GET_MODE_BITSIZE (mode) / 128;
49870 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
49871 div = GET_MODE_BITSIZE (mode) / 64;
49872 width = (width + div - 1) / div;
49874 /* Scalar part. */
49875 else if (INTEGRAL_MODE_P (mode))
49876 width = ix86_cost->reassoc_int;
49877 else if (FLOAT_MODE_P (mode))
49878 width = ix86_cost->reassoc_fp;
49880 /* Avoid using too many registers in 32-bit mode.  */
49881 if (!TARGET_64BIT && width > 2)
49882 width = 2;
49883 return width;
49886 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49887 place emms and femms instructions. */
49889 static machine_mode
49890 ix86_preferred_simd_mode (scalar_mode mode)
49892 if (!TARGET_SSE)
49893 return word_mode;
49895 switch (mode)
49897 case E_QImode:
49898 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
49899 return V64QImode;
49900 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49901 return V32QImode;
49902 else
49903 return V16QImode;
49905 case E_HImode:
49906 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
49907 return V32HImode;
49908 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49909 return V16HImode;
49910 else
49911 return V8HImode;
49913 case E_SImode:
49914 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49915 return V16SImode;
49916 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49917 return V8SImode;
49918 else
49919 return V4SImode;
49921 case E_DImode:
49922 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49923 return V8DImode;
49924 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49925 return V4DImode;
49926 else
49927 return V2DImode;
49929 case E_SFmode:
49930 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49931 return V16SFmode;
49932 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49933 return V8SFmode;
49934 else
49935 return V4SFmode;
49937 case E_DFmode:
49938 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49939 return V8DFmode;
49940 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49941 return V4DFmode;
49942 else if (TARGET_SSE2)
49943 return V2DFmode;
49944 /* FALLTHRU */
49946 default:
49947 return word_mode;
49951 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
49952 of the upper against the lower halves down to SSE register width.  */
49954 static machine_mode
49955 ix86_split_reduction (machine_mode mode)
49957 /* Reduce lowpart against highpart until we reach SSE reg width to
49958 avoid cross-lane operations. */
49959 switch (mode)
49961 case E_V8DImode:
49962 case E_V4DImode:
49963 return V2DImode;
49964 case E_V16SImode:
49965 case E_V8SImode:
49966 return V4SImode;
49967 case E_V32HImode:
49968 case E_V16HImode:
49969 return V8HImode;
49970 case E_V64QImode:
49971 case E_V32QImode:
49972 return V16QImode;
49973 case E_V16SFmode:
49974 case E_V8SFmode:
49975 return V4SFmode;
49976 case E_V8DFmode:
49977 case E_V4DFmode:
49978 return V2DFmode;
49979 default:
49980 return mode;
49984 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49985 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49986 256bit and 128bit vectors. */
49988 static void
49989 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
49991 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49993 sizes->safe_push (64);
49994 sizes->safe_push (32);
49995 sizes->safe_push (16);
49997 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49999 sizes->safe_push (32);
50000 sizes->safe_push (16);
50004 /* Implementation of targetm.vectorize.get_mask_mode. */
50006 static opt_machine_mode
50007 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50009 unsigned elem_size = vector_size / nunits;
50011 /* Scalar mask case. */
50012 if ((TARGET_AVX512F && vector_size == 64)
50013 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50015 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50016 return smallest_int_mode_for_size (nunits);
50019 scalar_int_mode elem_mode
50020 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50022 gcc_assert (elem_size * nunits == vector_size);
50024 return mode_for_vector (elem_mode, nunits);
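/* A standalone sketch of the choice above, with the TARGET_* tests
   replaced by plain flags: given the vector size in bytes and the
   number of elements, it reports whether an AVX-512 style bit mask
   (one bit per element, held in a mask register) would be used
   instead of a vector mask of the same width as the data.  */
static int
model_uses_scalar_mask_p (unsigned vector_size, unsigned nunits,
			  int avx512f, int avx512vl, int avx512bw)
{
  unsigned elem_size = vector_size / nunits;
  if ((avx512f && vector_size == 64)
      || (avx512vl && (vector_size == 32 || vector_size == 16)))
    return elem_size == 4 || elem_size == 8 || avx512bw;
  return 0;
}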
50029 /* Return the class of registers which could be used for a pseudo of MODE
50030 and of class RCLASS for spilling instead of memory. Return NO_REGS
50031 if it is not possible or not profitable. */
50033 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50035 static reg_class_t
50036 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50038 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50039 && TARGET_SSE2
50040 && TARGET_INTER_UNIT_MOVES_TO_VEC
50041 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50042 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50043 && INTEGER_CLASS_P (rclass))
50044 return ALL_SSE_REGS;
50045 return NO_REGS;
50048 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50049 but returns a lower bound. */
50051 static unsigned int
50052 ix86_max_noce_ifcvt_seq_cost (edge e)
50054 bool predictable_p = predictable_edge_p (e);
50056 enum compiler_param param
50057 = (predictable_p
50058 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50059 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50061 /* If we have a parameter set, use that, otherwise take a guess using
50062 BRANCH_COST. */
50063 if (global_options_set.x_param_values[param])
50064 return PARAM_VALUE (param);
50065 else
50066 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50069 /* Return true if SEQ is a good candidate as a replacement for the
50070 if-convertible sequence described in IF_INFO. */
50072 static bool
50073 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50075 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50077 int cmov_cnt = 0;
50078 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50079 Maybe we should allow even more conditional moves as long as they
50080 are used far enough not to stall the CPU, or also consider
50081 IF_INFO->TEST_BB succ edge probabilities. */
50082 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50084 rtx set = single_set (insn);
50085 if (!set)
50086 continue;
50087 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50088 continue;
50089 rtx src = SET_SRC (set);
50090 machine_mode mode = GET_MODE (src);
50091 if (GET_MODE_CLASS (mode) != MODE_INT
50092 && GET_MODE_CLASS (mode) != MODE_FLOAT)
50093 continue;
50094 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50095 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50096 continue;
50097 /* insn is CMOV or FCMOV. */
50098 if (++cmov_cnt > 1)
50099 return false;
50102 return default_noce_conversion_profitable_p (seq, if_info);
50105 /* Implement targetm.vectorize.init_cost. */
50107 static void *
50108 ix86_init_cost (struct loop *)
50110 unsigned *cost = XNEWVEC (unsigned, 3);
50111 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50112 return cost;
50115 /* Implement targetm.vectorize.add_stmt_cost. */
50117 static unsigned
50118 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50119 struct _stmt_vec_info *stmt_info, int misalign,
50120 enum vect_cost_model_location where)
50122 unsigned *cost = (unsigned *) data;
50123 unsigned retval = 0;
50125 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50126 int stmt_cost = -1;
50128 if ((kind == vector_stmt || kind == scalar_stmt)
50129 && stmt_info
50130 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50132 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50133 bool fp = false;
50134 machine_mode mode = TImode;
50136 if (vectype != NULL)
50138 fp = FLOAT_TYPE_P (vectype);
50139 mode = TYPE_MODE (vectype);
50141 /*machine_mode inner_mode = mode;
50142 if (VECTOR_MODE_P (mode))
50143 inner_mode = GET_MODE_INNER (mode);*/
50145 switch (subcode)
50147 case PLUS_EXPR:
50148 case POINTER_PLUS_EXPR:
50149 case MINUS_EXPR:
50150 if (kind == scalar_stmt)
50152 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50153 stmt_cost = ix86_cost->addss;
50154 else if (X87_FLOAT_MODE_P (mode))
50155 stmt_cost = ix86_cost->fadd;
50156 else
50157 stmt_cost = ix86_cost->add;
50159 else
50160 stmt_cost = ix86_vec_cost (mode,
50161 fp ? ix86_cost->addss
50162 : ix86_cost->sse_op,
50163 true);
50164 break;
50166 case MULT_EXPR:
50167 case WIDEN_MULT_EXPR:
50168 case MULT_HIGHPART_EXPR:
50169 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50170 break;
50171 case FMA_EXPR:
50172 stmt_cost = ix86_vec_cost (mode,
50173 mode == SFmode ? ix86_cost->fmass
50174 : ix86_cost->fmasd,
50175 true);
50176 break;
50177 case NEGATE_EXPR:
50178 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50179 stmt_cost = ix86_cost->sse_op;
50180 else if (X87_FLOAT_MODE_P (mode))
50181 stmt_cost = ix86_cost->fchs;
50182 else if (VECTOR_MODE_P (mode))
50183 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50184 else
50185 stmt_cost = ix86_cost->add;
50186 break;
50187 case TRUNC_DIV_EXPR:
50188 case CEIL_DIV_EXPR:
50189 case FLOOR_DIV_EXPR:
50190 case ROUND_DIV_EXPR:
50191 case TRUNC_MOD_EXPR:
50192 case CEIL_MOD_EXPR:
50193 case FLOOR_MOD_EXPR:
50194 case RDIV_EXPR:
50195 case ROUND_MOD_EXPR:
50196 case EXACT_DIV_EXPR:
50197 stmt_cost = ix86_division_cost (ix86_cost, mode);
50198 break;
50200 case RSHIFT_EXPR:
50201 case LSHIFT_EXPR:
50202 case LROTATE_EXPR:
50203 case RROTATE_EXPR:
50205 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50206 stmt_cost = ix86_shift_rotate_cost
50207 (ix86_cost, mode,
50208 TREE_CODE (op2) == INTEGER_CST,
50209 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50210 true, false, false, NULL, NULL);
50212 break;
50213 case NOP_EXPR:
50214 stmt_cost = 0;
50215 break;
50217 case BIT_IOR_EXPR:
50218 case ABS_EXPR:
50219 case MIN_EXPR:
50220 case MAX_EXPR:
50221 case BIT_XOR_EXPR:
50222 case BIT_AND_EXPR:
50223 case BIT_NOT_EXPR:
50224 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50225 stmt_cost = ix86_cost->sse_op;
50226 else if (VECTOR_MODE_P (mode))
50227 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50228 else
50229 stmt_cost = ix86_cost->add;
50230 break;
50231 default:
50232 break;
50235 if (stmt_cost == -1)
50236 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50238 /* Penalize DFmode vector operations for Bonnell. */
50239 if (TARGET_BONNELL && kind == vector_stmt
50240 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50241 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50243 /* Statements in an inner loop relative to the loop being
50244 vectorized are weighted more heavily. The value here is
50245 arbitrary and could potentially be improved with analysis. */
50246 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50247 count *= 50; /* FIXME. */
50249 retval = (unsigned) (count * stmt_cost);
50251 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
50252 for Silvermont, as it has an out-of-order integer pipeline and can execute
50253 two scalar instructions per tick, but has an in-order SIMD pipeline. */
50254 if ((TARGET_SILVERMONT || TARGET_INTEL)
50255 && stmt_info && stmt_info->stmt)
50257 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50258 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50259 retval = (retval * 17) / 10;
50262 cost[where] += retval;
50264 return retval;
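/* A standalone sketch of the final scaling above with made-up numbers:
   count == 2 and stmt_cost == 4 give retval == 8, and the Silvermont
   penalty for integer statements turns that into 8 * 17 / 10 == 13
   (integer arithmetic, so the 1.7 factor is approximate).  */
static unsigned
model_scaled_stmt_cost (unsigned count, unsigned stmt_cost,
			int silvermont_integer_lhs)
{
  unsigned retval = count * stmt_cost;
  if (silvermont_integer_lhs)
    retval = retval * 17 / 10;
  return retval;
}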
50267 /* Implement targetm.vectorize.finish_cost. */
50269 static void
50270 ix86_finish_cost (void *data, unsigned *prologue_cost,
50271 unsigned *body_cost, unsigned *epilogue_cost)
50273 unsigned *cost = (unsigned *) data;
50274 *prologue_cost = cost[vect_prologue];
50275 *body_cost = cost[vect_body];
50276 *epilogue_cost = cost[vect_epilogue];
50279 /* Implement targetm.vectorize.destroy_cost_data. */
50281 static void
50282 ix86_destroy_cost_data (void *data)
50284 free (data);
50287 /* Validate target specific memory model bits in VAL. */
50289 static unsigned HOST_WIDE_INT
50290 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50292 enum memmodel model = memmodel_from_int (val);
50293 bool strong;
50295 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50296 |MEMMODEL_MASK)
50297 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50299 warning (OPT_Winvalid_memory_model,
50300 "unknown architecture specific memory model");
50301 return MEMMODEL_SEQ_CST;
50303 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50304 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50306 warning (OPT_Winvalid_memory_model,
50307 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50308 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50310 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50312 warning (OPT_Winvalid_memory_model,
50313 "HLE_RELEASE not used with RELEASE or stronger memory model");
50314 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50316 return val;
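/* User-level sketch of the memory models this hook validates, following
   the lock-elision pattern documented for -mhle; the variable and
   function names here are only illustrative.  HLE_ACQUIRE must be paired
   with an ACQUIRE-or-stronger order and HLE_RELEASE with a
   RELEASE-or-stronger order, which is exactly what the warnings above
   enforce.  */
#include <immintrin.h>

static int hle_lock_var;

static void
hle_elided_lock (void)
{
  /* Start (or retry) an elided critical section.  */
  while (__atomic_exchange_n (&hle_lock_var, 1,
			      __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    _mm_pause ();
}

static void
hle_elided_unlock (void)
{
  /* End the elided critical section.  */
  __atomic_store_n (&hle_lock_var, 0,
		    __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}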
50319 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50320 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50321 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50322 or number of vecsize_mangle variants that should be emitted. */
50324 static int
50325 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50326 struct cgraph_simd_clone *clonei,
50327 tree base_type, int num)
50329 int ret = 1;
50331 if (clonei->simdlen
50332 && (clonei->simdlen < 2
50333 || clonei->simdlen > 1024
50334 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50336 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50337 "unsupported simdlen %d", clonei->simdlen);
50338 return 0;
50341 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50342 if (TREE_CODE (ret_type) != VOID_TYPE)
50343 switch (TYPE_MODE (ret_type))
50345 case E_QImode:
50346 case E_HImode:
50347 case E_SImode:
50348 case E_DImode:
50349 case E_SFmode:
50350 case E_DFmode:
50351 /* case E_SCmode: */
50352 /* case E_DCmode: */
50353 break;
50354 default:
50355 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50356 "unsupported return type %qT for simd", ret_type);
50357 return 0;
50360 tree t;
50361 int i;
50363 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50364 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50365 switch (TYPE_MODE (TREE_TYPE (t)))
50367 case E_QImode:
50368 case E_HImode:
50369 case E_SImode:
50370 case E_DImode:
50371 case E_SFmode:
50372 case E_DFmode:
50373 /* case E_SCmode: */
50374 /* case E_DCmode: */
50375 break;
50376 default:
50377 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50378 "unsupported argument type %qT for simd", TREE_TYPE (t));
50379 return 0;
50382 if (!TREE_PUBLIC (node->decl))
50384 /* If the function isn't exported, we can pick up just one ISA
50385 for the clones. */
50386 if (TARGET_AVX512F)
50387 clonei->vecsize_mangle = 'e';
50388 else if (TARGET_AVX2)
50389 clonei->vecsize_mangle = 'd';
50390 else if (TARGET_AVX)
50391 clonei->vecsize_mangle = 'c';
50392 else
50393 clonei->vecsize_mangle = 'b';
50394 ret = 1;
50396 else
50398 clonei->vecsize_mangle = "bcde"[num];
50399 ret = 4;
50401 clonei->mask_mode = VOIDmode;
50402 switch (clonei->vecsize_mangle)
50404 case 'b':
50405 clonei->vecsize_int = 128;
50406 clonei->vecsize_float = 128;
50407 break;
50408 case 'c':
50409 clonei->vecsize_int = 128;
50410 clonei->vecsize_float = 256;
50411 break;
50412 case 'd':
50413 clonei->vecsize_int = 256;
50414 clonei->vecsize_float = 256;
50415 break;
50416 case 'e':
50417 clonei->vecsize_int = 512;
50418 clonei->vecsize_float = 512;
50419 if (TYPE_MODE (base_type) == QImode)
50420 clonei->mask_mode = DImode;
50421 else
50422 clonei->mask_mode = SImode;
50423 break;
50425 if (clonei->simdlen == 0)
50427 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50428 clonei->simdlen = clonei->vecsize_int;
50429 else
50430 clonei->simdlen = clonei->vecsize_float;
50431 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50433 else if (clonei->simdlen > 16)
50435 /* For compatibility with ICC, use the same upper bounds
50436 for simdlen. In particular, for CTYPE below, use the return type,
50437 unless the function returns void, in which case use the characteristic
50438 type. If it is possible for the given SIMDLEN to pass the CTYPE value
50439 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50440 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50441 emit the corresponding clone. */
50442 tree ctype = ret_type;
50443 if (TREE_CODE (ret_type) == VOID_TYPE)
50444 ctype = base_type;
50445 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50446 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50447 cnt /= clonei->vecsize_int;
50448 else
50449 cnt /= clonei->vecsize_float;
50450 if (cnt > (TARGET_64BIT ? 16 : 8))
50452 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50453 "unsupported simdlen %d", clonei->simdlen);
50454 return 0;
50457 return ret;
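/* Example input for the hook above (hypothetical user code): a function
   marked with "#pragma omp declare simd".  With a float characteristic
   type the default simdlen follows the arithmetic above, i.e.
   128/32 == 4 for the 'b' (SSE2) clone, 256/32 == 8 for 'c' and 'd'
   (AVX/AVX2), and 512/32 == 16 for 'e' (AVX-512F).  */
#pragma omp declare simd
float
simd_scale_elem (float x, float factor)
{
  return x * factor;
}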
50460 /* Add target attribute to SIMD clone NODE if needed. */
50462 static void
50463 ix86_simd_clone_adjust (struct cgraph_node *node)
50465 const char *str = NULL;
50466 gcc_assert (node->decl == cfun->decl);
50467 switch (node->simdclone->vecsize_mangle)
50469 case 'b':
50470 if (!TARGET_SSE2)
50471 str = "sse2";
50472 break;
50473 case 'c':
50474 if (!TARGET_AVX)
50475 str = "avx";
50476 break;
50477 case 'd':
50478 if (!TARGET_AVX2)
50479 str = "avx2";
50480 break;
50481 case 'e':
50482 if (!TARGET_AVX512F)
50483 str = "avx512f";
50484 break;
50485 default:
50486 gcc_unreachable ();
50488 if (str == NULL)
50489 return;
50490 push_cfun (NULL);
50491 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50492 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50493 gcc_assert (ok);
50494 pop_cfun ();
50495 ix86_reset_previous_fndecl ();
50496 ix86_set_current_function (node->decl);
50499 /* If SIMD clone NODE can't be used in a vectorized loop
50500 in the current function, return -1, otherwise return the badness of using it
50501 (0 if it is most desirable from the vecsize_mangle point of view, 1
50502 slightly less desirable, etc.). */
50504 static int
50505 ix86_simd_clone_usable (struct cgraph_node *node)
50507 switch (node->simdclone->vecsize_mangle)
50509 case 'b':
50510 if (!TARGET_SSE2)
50511 return -1;
50512 if (!TARGET_AVX)
50513 return 0;
50514 return TARGET_AVX2 ? 2 : 1;
50515 case 'c':
50516 if (!TARGET_AVX)
50517 return -1;
50518 return TARGET_AVX2 ? 1 : 0;
50519 case 'd':
50520 if (!TARGET_AVX2)
50521 return -1;
50522 return 0;
50523 case 'e':
50524 if (!TARGET_AVX512F)
50525 return -1;
50526 return 0;
50527 default:
50528 gcc_unreachable ();
50532 /* This function adjusts the unroll factor based on
50533 the hardware capabilities. For example, bdver3 has
50534 a loop buffer which makes unrolling of smaller
50535 loops less important. This function decides the
50536 unroll factor using the number of memory references
50537 (the value 32 is used) as a heuristic. */
50539 static unsigned
50540 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50542 basic_block *bbs;
50543 rtx_insn *insn;
50544 unsigned i;
50545 unsigned mem_count = 0;
50547 if (!TARGET_ADJUST_UNROLL)
50548 return nunroll;
50550 /* Count the number of memory references within the loop body.
50551 This value determines the unrolling factor for bdver3 and bdver4
50552 architectures. */
50553 subrtx_iterator::array_type array;
50554 bbs = get_loop_body (loop);
50555 for (i = 0; i < loop->num_nodes; i++)
50556 FOR_BB_INSNS (bbs[i], insn)
50557 if (NONDEBUG_INSN_P (insn))
50558 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50559 if (const_rtx x = *iter)
50560 if (MEM_P (x))
50562 machine_mode mode = GET_MODE (x);
50563 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50564 if (n_words > 4)
50565 mem_count += 2;
50566 else
50567 mem_count += 1;
50569 free (bbs);
50571 if (mem_count && mem_count <= 32)
50572 return MIN (nunroll, 32 / mem_count);
50574 return nunroll;
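/* A standalone model of the cap above: with nunroll == 8 and
   mem_count == 10 the result is MIN (8, 32 / 10) == 3, while loops
   with more than 32 weighted memory references keep the original
   unroll factor.  */
static unsigned
model_unroll_cap (unsigned nunroll, unsigned mem_count)
{
  if (mem_count && mem_count <= 32)
    return nunroll < 32 / mem_count ? nunroll : 32 / mem_count;
  return nunroll;
}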
50578 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50580 static bool
50581 ix86_float_exceptions_rounding_supported_p (void)
50583 /* For x87 floating point with standard excess precision handling,
50584 there is no adddf3 pattern (since x87 floating point only has
50585 XFmode operations) so the default hook implementation gets this
50586 wrong. */
50587 return TARGET_80387 || TARGET_SSE_MATH;
50590 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50592 static void
50593 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50595 if (!TARGET_80387 && !TARGET_SSE_MATH)
50596 return;
50597 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50598 if (TARGET_80387)
50600 tree fenv_index_type = build_index_type (size_int (6));
50601 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50602 tree fenv_var = create_tmp_var_raw (fenv_type);
50603 TREE_ADDRESSABLE (fenv_var) = 1;
50604 tree fenv_ptr = build_pointer_type (fenv_type);
50605 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50606 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50607 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50608 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50609 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50610 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50611 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50612 tree hold_fnclex = build_call_expr (fnclex, 0);
50613 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50614 NULL_TREE, NULL_TREE);
50615 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50616 hold_fnclex);
50617 *clear = build_call_expr (fnclex, 0);
50618 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50619 tree fnstsw_call = build_call_expr (fnstsw, 0);
50620 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50621 sw_var, fnstsw_call);
50622 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50623 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50624 exceptions_var, exceptions_x87);
50625 *update = build2 (COMPOUND_EXPR, integer_type_node,
50626 sw_mod, update_mod);
50627 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50628 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50630 if (TARGET_SSE_MATH)
50632 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50633 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50634 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50635 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50636 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50637 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50638 mxcsr_orig_var, stmxcsr_hold_call);
50639 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50640 mxcsr_orig_var,
50641 build_int_cst (unsigned_type_node, 0x1f80));
50642 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50643 build_int_cst (unsigned_type_node, 0xffffffc0));
50644 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50645 mxcsr_mod_var, hold_mod_val);
50646 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50647 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50648 hold_assign_orig, hold_assign_mod);
50649 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50650 ldmxcsr_hold_call);
50651 if (*hold)
50652 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50653 else
50654 *hold = hold_all;
50655 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50656 if (*clear)
50657 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50658 ldmxcsr_clear_call);
50659 else
50660 *clear = ldmxcsr_clear_call;
50661 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50662 tree exceptions_sse = fold_convert (integer_type_node,
50663 stxmcsr_update_call);
50664 if (*update)
50666 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50667 exceptions_var, exceptions_sse);
50668 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50669 exceptions_var, exceptions_mod);
50670 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50671 exceptions_assign);
50673 else
50674 *update = build2 (MODIFY_EXPR, integer_type_node,
50675 exceptions_var, exceptions_sse);
50676 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50677 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50678 ldmxcsr_update_call);
50680 tree atomic_feraiseexcept
50681 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50682 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50683 1, exceptions_var);
50684 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50685 atomic_feraiseexcept_call);
50688 /* Return mode to be used for bounds or VOIDmode
50689 if bounds are not supported. */
50691 static machine_mode
50692 ix86_mpx_bound_mode ()
50694 /* Do not support pointer checker if MPX
50695 is not enabled. */
50696 if (!TARGET_MPX)
50698 if (flag_check_pointer_bounds)
50699 warning (0, "Pointer Checker requires MPX support on this target."
50700 " Use -mmpx options to enable MPX.");
50701 return VOIDmode;
50704 return BNDmode;
50707 /* Return constant used to statically initialize constant bounds.
50709 This function is used to create special bound values. For now
50710 only INIT bounds and NONE bounds are expected. More special
50711 values may be added later. */
50713 static tree
50714 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50716 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50717 : build_zero_cst (pointer_sized_int_node);
50718 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50719 : build_minus_one_cst (pointer_sized_int_node);
50721 /* This function is supposed to be used to create INIT and
50722 NONE bounds only. */
50723 gcc_assert ((lb == 0 && ub == -1)
50724 || (lb == -1 && ub == 0));
50726 return build_complex (NULL, low, high);
50729 /* Generate a list of statements STMTS to initialize pointer bounds
50730 variable VAR with bounds LB and UB. Return the number of generated
50731 statements. */
50733 static int
50734 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50736 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50737 tree lhs, modify, var_p;
50739 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50740 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50742 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50743 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50744 append_to_statement_list (modify, stmts);
50746 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50747 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50748 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50749 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50750 append_to_statement_list (modify, stmts);
50752 return 2;
50755 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50756 /* For i386, a common symbol is local only for non-PIE binaries. For
50757 x86-64, a common symbol is local only for non-PIE binaries, or when
50758 the linker supports copy relocations in PIE binaries. */
50760 static bool
50761 ix86_binds_local_p (const_tree exp)
50763 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50764 (!flag_pic
50765 || (TARGET_64BIT
50766 && HAVE_LD_PIE_COPYRELOC != 0)));
50768 #endif
50770 /* If MEM is in the form of [base+offset], extract the two parts
50771 of the address into BASE and OFFSET; otherwise return false. */
50773 static bool
50774 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50776 rtx addr;
50778 gcc_assert (MEM_P (mem));
50780 addr = XEXP (mem, 0);
50782 if (GET_CODE (addr) == CONST)
50783 addr = XEXP (addr, 0);
50785 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50787 *base = addr;
50788 *offset = const0_rtx;
50789 return true;
50792 if (GET_CODE (addr) == PLUS
50793 && (REG_P (XEXP (addr, 0))
50794 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50795 && CONST_INT_P (XEXP (addr, 1)))
50797 *base = XEXP (addr, 0);
50798 *offset = XEXP (addr, 1);
50799 return true;
50802 return false;
50805 /* Given OPERANDS of consecutive load/store instructions, check if we can
50806 merge them into a move-multiple. LOAD is true if they are load instructions.
50807 MODE is the mode of the memory operands. */
50809 bool
50810 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50811 machine_mode mode)
50813 HOST_WIDE_INT offval_1, offval_2, msize;
50814 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50816 if (load)
50818 mem_1 = operands[1];
50819 mem_2 = operands[3];
50820 reg_1 = operands[0];
50821 reg_2 = operands[2];
50823 else
50825 mem_1 = operands[0];
50826 mem_2 = operands[2];
50827 reg_1 = operands[1];
50828 reg_2 = operands[3];
50831 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50833 if (REGNO (reg_1) != REGNO (reg_2))
50834 return false;
50836 /* Check if the addresses are in the form of [base+offset]. */
50837 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50838 return false;
50839 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50840 return false;
50842 /* Check if the bases are the same. */
50843 if (!rtx_equal_p (base_1, base_2))
50844 return false;
50846 offval_1 = INTVAL (offset_1);
50847 offval_2 = INTVAL (offset_2);
50848 msize = GET_MODE_SIZE (mode);
50849 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50850 if (offval_1 + msize != offval_2)
50851 return false;
50853 return true;
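/* A standalone sketch of the adjacency test above, assuming both
   addresses have already been decomposed into a shared base plus the
   constant offsets OFF1 and OFF2: with an 8-byte mode, offsets 8 and
   16 qualify, while 8 and 24 do not.  */
static int
model_mems_adjacent_p (long off1, long off2, unsigned mode_size)
{
  /* mem_1 must end exactly where mem_2 begins.  */
  return off1 + (long) mode_size == off2;
}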
50856 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50858 static bool
50859 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50860 optimization_type opt_type)
50862 switch (op)
50864 case asin_optab:
50865 case acos_optab:
50866 case log1p_optab:
50867 case exp_optab:
50868 case exp10_optab:
50869 case exp2_optab:
50870 case expm1_optab:
50871 case ldexp_optab:
50872 case scalb_optab:
50873 case round_optab:
50874 return opt_type == OPTIMIZE_FOR_SPEED;
50876 case rint_optab:
50877 if (SSE_FLOAT_MODE_P (mode1)
50878 && TARGET_SSE_MATH
50879 && !flag_trapping_math
50880 && !TARGET_SSE4_1)
50881 return opt_type == OPTIMIZE_FOR_SPEED;
50882 return true;
50884 case floor_optab:
50885 case ceil_optab:
50886 case btrunc_optab:
50887 if (SSE_FLOAT_MODE_P (mode1)
50888 && TARGET_SSE_MATH
50889 && !flag_trapping_math
50890 && TARGET_SSE4_1)
50891 return true;
50892 return opt_type == OPTIMIZE_FOR_SPEED;
50894 case rsqrt_optab:
50895 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50897 default:
50898 return true;
50902 /* Address space support.
50904 This is not "far pointers" in the 16-bit sense, but an easy way
50905 to use %fs and %gs segment prefixes. Therefore:
50907 (a) All address spaces have the same modes,
50908 (b) All address spaces have the same address forms,
50909 (c) While %fs and %gs are technically subsets of the generic
50910 address space, they are probably not subsets of each other.
50911 (d) Since we have no access to the segment base register values
50912 without resorting to a system call, we cannot convert a
50913 non-default address space to a default address space.
50914 Therefore we do not claim %fs or %gs are subsets of generic.
50916 Therefore we can (mostly) use the default hooks. */
50918 /* All use of segmentation is assumed to make address 0 valid. */
50920 static bool
50921 ix86_addr_space_zero_address_valid (addr_space_t as)
50923 return as != ADDR_SPACE_GENERIC;
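/* User-level sketch of the non-generic address spaces described above
   (GCC exposes them as the __seg_fs and __seg_gs qualifiers); the
   offset 16 is an arbitrary example.  Address 0 is a valid offset
   within a segment, which is why the hook returns true for the
   non-generic spaces.  */
static unsigned int
read_gs_word_at_16 (void)
{
  const unsigned int __seg_gs *p = (const unsigned int __seg_gs *) 16;
  return *p;	/* Loads with a %gs: segment prefix.  */
}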
50926 static void
50927 ix86_init_libfuncs (void)
50929 if (TARGET_64BIT)
50931 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
50932 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
50934 else
50936 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
50937 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
50940 #if TARGET_MACHO
50941 darwin_rename_builtins ();
50942 #endif
50945 /* Generate a call to __divmoddi4 (or __divmodti4 for TImode). */
50947 static void
50948 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
50949 rtx op0, rtx op1,
50950 rtx *quot_p, rtx *rem_p)
50952 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
50954 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
50955 mode,
50956 op0, GET_MODE (op0),
50957 op1, GET_MODE (op1),
50958 XEXP (rem, 0), Pmode);
50959 *quot_p = quot;
50960 *rem_p = rem;
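/* User-level example of what the divmod libfunc serves on a 32-bit
   target (hypothetical code): when both the quotient and the remainder
   of a DImode division are needed, the middle end can combine the two
   operations into a single __divmoddi4 call, with the remainder coming
   back through the stack slot set up by this expander.  */
static long long
divmod_example (long long a, long long b, long long *rem)
{
  *rem = a % b;
  return a / b;
}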
50963 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
50964 FPU, assume that the fpcw is set to extended precision; when using
50965 only SSE, rounding is correct; when using both SSE and the FPU,
50966 the rounding precision is indeterminate, since either may be chosen
50967 apparently at random. */
50969 static enum flt_eval_method
50970 ix86_excess_precision (enum excess_precision_type type)
50972 switch (type)
50974 case EXCESS_PRECISION_TYPE_FAST:
50975 /* The fastest type to promote to will always be the native type,
50976 whether that occurs with implicit excess precision or
50977 otherwise. */
50978 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50979 case EXCESS_PRECISION_TYPE_STANDARD:
50980 case EXCESS_PRECISION_TYPE_IMPLICIT:
50981 /* Otherwise, the excess precision we want when we are
50982 in a standards compliant mode, and the implicit precision we
50983 provide would be identical were it not for the unpredictable
50984 cases. */
50985 if (!TARGET_80387)
50986 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50987 else if (!TARGET_MIX_SSE_I387)
50989 if (!TARGET_SSE_MATH)
50990 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
50991 else if (TARGET_SSE2)
50992 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50995 /* If we are in standards compliant mode, but we know we will
50996 calculate in unpredictable precision, return
50997 FLT_EVAL_METHOD_PROMOTE_TO_FLOAT. There is no reason to introduce
50998 explicit excess precision if the target can't guarantee it will honor
50999 it. */
51000 return (type == EXCESS_PRECISION_TYPE_STANDARD
51001 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51002 : FLT_EVAL_METHOD_UNPREDICTABLE);
51003 default:
51004 gcc_unreachable ();
51007 return FLT_EVAL_METHOD_UNPREDICTABLE;
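/* User-visible effect of the hook above (a sketch, not part of the
   target code): FLT_EVAL_METHOD in <float.h> reflects the chosen math
   unit, typically 2 (promote to long double) for -mfpmath=387 and 0
   for -mfpmath=sse with SSE2 available.  */
#include <float.h>

static int
current_flt_eval_method (void)
{
  return FLT_EVAL_METHOD;
}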
51010 /* Implement PUSH_ROUNDING. On the 386, we have a pushw instruction that
51011 decrements the stack pointer by exactly 2 no matter what the position was; there is no pushb.
51013 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51014 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51015 are a multiple of 4 for 32-bit targets and 8 for 64-bit targets. */
51017 poly_int64
51018 ix86_push_rounding (poly_int64 bytes)
51020 return ROUND_UP (bytes, UNITS_PER_WORD);
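/* The rounding above in plain arithmetic, assuming the word size is a
   power of two: pushing a 2-byte value still adjusts the stack pointer
   by a full word, giving 4 on 32-bit targets and 8 on 64-bit targets.  */
static long
model_push_rounding (long bytes, long word_size)
{
  return (bytes + word_size - 1) & -word_size;
}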
51023 /* Target-specific selftests. */
51025 #if CHECKING_P
51027 namespace selftest {
51029 /* Verify that hard regs are dumped as expected (in compact mode). */
51031 static void
51032 ix86_test_dumping_hard_regs ()
51034 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51035 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51038 /* Test dumping an insn with repeated references to the same SCRATCH,
51039 to verify the rtx_reuse code. */
51041 static void
51042 ix86_test_dumping_memory_blockage ()
51044 set_new_first_and_last_insn (NULL, NULL);
51046 rtx pat = gen_memory_blockage ();
51047 rtx_reuse_manager r;
51048 r.preprocess (pat);
51050 /* Verify that the repeated references to the SCRATCH show use of
51051 reuse IDs. The first should be prefixed with a reuse ID,
51052 and the second should be dumped as a "reuse_rtx" of that ID.
51053 The expected string assumes Pmode == DImode. */
51054 if (Pmode == DImode)
51055 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51056 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51057 " (unspec:BLK [\n"
51058 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51059 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51062 /* Verify loading an RTL dump; specifically a dump of copying
51063 a param on x86_64 from a hard reg into the frame.
51064 This test is target-specific since the dump contains target-specific
51065 hard reg names. */
51067 static void
51068 ix86_test_loading_dump_fragment_1 ()
51070 rtl_dump_test t (SELFTEST_LOCATION,
51071 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51073 rtx_insn *insn = get_insn_by_uid (1);
51075 /* The block structure and indentation here is purely for
51076 readability; it mirrors the structure of the rtx. */
51077 tree mem_expr;
51079 rtx pat = PATTERN (insn);
51080 ASSERT_EQ (SET, GET_CODE (pat));
51082 rtx dest = SET_DEST (pat);
51083 ASSERT_EQ (MEM, GET_CODE (dest));
51084 /* Verify the "/c" was parsed. */
51085 ASSERT_TRUE (RTX_FLAG (dest, call));
51086 ASSERT_EQ (SImode, GET_MODE (dest));
51088 rtx addr = XEXP (dest, 0);
51089 ASSERT_EQ (PLUS, GET_CODE (addr));
51090 ASSERT_EQ (DImode, GET_MODE (addr));
51092 rtx lhs = XEXP (addr, 0);
51093 /* Verify that the "frame" REG was consolidated. */
51094 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51097 rtx rhs = XEXP (addr, 1);
51098 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51099 ASSERT_EQ (-4, INTVAL (rhs));
51102 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51103 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51104 /* "i" should have been handled by synthesizing a global int
51105 variable named "i". */
51106 mem_expr = MEM_EXPR (dest);
51107 ASSERT_NE (mem_expr, NULL);
51108 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51109 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51110 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51111 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51112 /* "+0". */
51113 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51114 ASSERT_EQ (0, MEM_OFFSET (dest));
51115 /* "S4". */
51116 ASSERT_EQ (4, MEM_SIZE (dest));
51117 /* "A32". */
51118 ASSERT_EQ (32, MEM_ALIGN (dest));
51121 rtx src = SET_SRC (pat);
51122 ASSERT_EQ (REG, GET_CODE (src));
51123 ASSERT_EQ (SImode, GET_MODE (src));
51124 ASSERT_EQ (5, REGNO (src));
51125 tree reg_expr = REG_EXPR (src);
51126 /* "i" here should point to the same var as for the MEM_EXPR. */
51127 ASSERT_EQ (reg_expr, mem_expr);
51132 /* Verify that the RTL loader copes with a call_insn dump.
51133 This test is target-specific since the dump contains a target-specific
51134 hard reg name. */
51136 static void
51137 ix86_test_loading_call_insn ()
51139 /* The test dump includes register "xmm0", which requires TARGET_SSE
51140 to exist. */
51141 if (!TARGET_SSE)
51142 return;
51144 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51146 rtx_insn *insn = get_insns ();
51147 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51149 /* "/j". */
51150 ASSERT_TRUE (RTX_FLAG (insn, jump));
51152 rtx pat = PATTERN (insn);
51153 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51155 /* Verify REG_NOTES. */
51157 /* "(expr_list:REG_CALL_DECL". */
51158 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51159 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51160 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51162 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51163 rtx_expr_list *note1 = note0->next ();
51164 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51166 ASSERT_EQ (NULL, note1->next ());
51169 /* Verify CALL_INSN_FUNCTION_USAGE. */
51171 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51172 rtx_expr_list *usage
51173 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51174 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51175 ASSERT_EQ (DFmode, GET_MODE (usage));
51176 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51177 ASSERT_EQ (NULL, usage->next ());
51181 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51182 This test is target-specific since the dump contains target-specific
51183 hard reg names. */
51185 static void
51186 ix86_test_loading_full_dump ()
51188 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51190 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51192 rtx_insn *insn_1 = get_insn_by_uid (1);
51193 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51195 rtx_insn *insn_7 = get_insn_by_uid (7);
51196 ASSERT_EQ (INSN, GET_CODE (insn_7));
51197 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51199 rtx_insn *insn_15 = get_insn_by_uid (15);
51200 ASSERT_EQ (INSN, GET_CODE (insn_15));
51201 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51203 /* Verify crtl->return_rtx. */
51204 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51205 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51206 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51209 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51210 In particular, verify that it correctly loads the 2nd operand.
51211 This test is target-specific since these are machine-specific
51212 operands (and enums). */
51214 static void
51215 ix86_test_loading_unspec ()
51217 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51219 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51221 ASSERT_TRUE (cfun);
51223 /* Test of an UNSPEC. */
51224 rtx_insn *insn = get_insns ();
51225 ASSERT_EQ (INSN, GET_CODE (insn));
51226 rtx set = single_set (insn);
51227 ASSERT_NE (NULL, set);
51228 rtx dst = SET_DEST (set);
51229 ASSERT_EQ (MEM, GET_CODE (dst));
51230 rtx src = SET_SRC (set);
51231 ASSERT_EQ (UNSPEC, GET_CODE (src));
51232 ASSERT_EQ (BLKmode, GET_MODE (src));
51233 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51235 rtx v0 = XVECEXP (src, 0, 0);
51237 /* Verify that the two uses of the first SCRATCH have pointer
51238 equality. */
51239 rtx scratch_a = XEXP (dst, 0);
51240 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51242 rtx scratch_b = XEXP (v0, 0);
51243 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51245 ASSERT_EQ (scratch_a, scratch_b);
51247 /* Verify that the two mems are thus treated as equal. */
51248 ASSERT_TRUE (rtx_equal_p (dst, v0));
51250 /* Verify that the insn is recognized. */
51251 ASSERT_NE (-1, recog_memoized (insn));
51253 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51254 insn = NEXT_INSN (insn);
51255 ASSERT_EQ (INSN, GET_CODE (insn));
51257 set = single_set (insn);
51258 ASSERT_NE (NULL, set);
51260 src = SET_SRC (set);
51261 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51262 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51265 /* Run all target-specific selftests. */
51267 static void
51268 ix86_run_selftests (void)
51270 ix86_test_dumping_hard_regs ();
51271 ix86_test_dumping_memory_blockage ();
51273 /* Various tests of loading RTL dumps, here because they contain
51274 ix86-isms (e.g. names of hard regs). */
51275 ix86_test_loading_dump_fragment_1 ();
51276 ix86_test_loading_call_insn ();
51277 ix86_test_loading_full_dump ();
51278 ix86_test_loading_unspec ();
51281 } // namespace selftest
51283 #endif /* CHECKING_P */
51285 /* Initialize the GCC target structure. */
51286 #undef TARGET_RETURN_IN_MEMORY
51287 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51289 #undef TARGET_LEGITIMIZE_ADDRESS
51290 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51292 #undef TARGET_ATTRIBUTE_TABLE
51293 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51294 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51295 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51296 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51297 # undef TARGET_MERGE_DECL_ATTRIBUTES
51298 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51299 #endif
51301 #undef TARGET_COMP_TYPE_ATTRIBUTES
51302 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51304 #undef TARGET_INIT_BUILTINS
51305 #define TARGET_INIT_BUILTINS ix86_init_builtins
51306 #undef TARGET_BUILTIN_DECL
51307 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51308 #undef TARGET_EXPAND_BUILTIN
51309 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51311 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51312 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51313 ix86_builtin_vectorized_function
51315 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51316 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51318 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51319 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51321 #undef TARGET_BUILTIN_RECIPROCAL
51322 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51324 #undef TARGET_ASM_FUNCTION_EPILOGUE
51325 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51327 #undef TARGET_ENCODE_SECTION_INFO
51328 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51329 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51330 #else
51331 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51332 #endif
51334 #undef TARGET_ASM_OPEN_PAREN
51335 #define TARGET_ASM_OPEN_PAREN ""
51336 #undef TARGET_ASM_CLOSE_PAREN
51337 #define TARGET_ASM_CLOSE_PAREN ""
51339 #undef TARGET_ASM_BYTE_OP
51340 #define TARGET_ASM_BYTE_OP ASM_BYTE
51342 #undef TARGET_ASM_ALIGNED_HI_OP
51343 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51344 #undef TARGET_ASM_ALIGNED_SI_OP
51345 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51346 #ifdef ASM_QUAD
51347 #undef TARGET_ASM_ALIGNED_DI_OP
51348 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51349 #endif
51351 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51352 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51354 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51355 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51357 #undef TARGET_ASM_UNALIGNED_HI_OP
51358 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51359 #undef TARGET_ASM_UNALIGNED_SI_OP
51360 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51361 #undef TARGET_ASM_UNALIGNED_DI_OP
51362 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51364 #undef TARGET_PRINT_OPERAND
51365 #define TARGET_PRINT_OPERAND ix86_print_operand
51366 #undef TARGET_PRINT_OPERAND_ADDRESS
51367 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51368 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51369 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51370 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51371 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51373 #undef TARGET_SCHED_INIT_GLOBAL
51374 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51375 #undef TARGET_SCHED_ADJUST_COST
51376 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51377 #undef TARGET_SCHED_ISSUE_RATE
51378 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51379 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51380 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51381 ia32_multipass_dfa_lookahead
51382 #undef TARGET_SCHED_MACRO_FUSION_P
51383 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51384 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51385 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51387 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51388 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51390 #undef TARGET_MEMMODEL_CHECK
51391 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51393 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51394 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51396 #ifdef HAVE_AS_TLS
51397 #undef TARGET_HAVE_TLS
51398 #define TARGET_HAVE_TLS true
51399 #endif
51400 #undef TARGET_CANNOT_FORCE_CONST_MEM
51401 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51402 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51403 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51405 #undef TARGET_DELEGITIMIZE_ADDRESS
51406 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51408 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
51409 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
51411 #undef TARGET_MS_BITFIELD_LAYOUT_P
51412 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51414 #if TARGET_MACHO
51415 #undef TARGET_BINDS_LOCAL_P
51416 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51417 #else
51418 #undef TARGET_BINDS_LOCAL_P
51419 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51420 #endif
51421 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51422 #undef TARGET_BINDS_LOCAL_P
51423 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51424 #endif
51426 #undef TARGET_ASM_OUTPUT_MI_THUNK
51427 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51428 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51429 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51431 #undef TARGET_ASM_FILE_START
51432 #define TARGET_ASM_FILE_START x86_file_start
51434 #undef TARGET_OPTION_OVERRIDE
51435 #define TARGET_OPTION_OVERRIDE ix86_option_override
51437 #undef TARGET_REGISTER_MOVE_COST
51438 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51439 #undef TARGET_MEMORY_MOVE_COST
51440 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51441 #undef TARGET_RTX_COSTS
51442 #define TARGET_RTX_COSTS ix86_rtx_costs
51443 #undef TARGET_ADDRESS_COST
51444 #define TARGET_ADDRESS_COST ix86_address_cost
51446 #undef TARGET_FLAGS_REGNUM
51447 #define TARGET_FLAGS_REGNUM FLAGS_REG
51448 #undef TARGET_FIXED_CONDITION_CODE_REGS
51449 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51450 #undef TARGET_CC_MODES_COMPATIBLE
51451 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51453 #undef TARGET_MACHINE_DEPENDENT_REORG
51454 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51456 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51457 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51459 #undef TARGET_BUILD_BUILTIN_VA_LIST
51460 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51462 #undef TARGET_FOLD_BUILTIN
51463 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51465 #undef TARGET_GIMPLE_FOLD_BUILTIN
51466 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51468 #undef TARGET_COMPARE_VERSION_PRIORITY
51469 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51471 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51472 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51473 ix86_generate_version_dispatcher_body
51475 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51476 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51477 ix86_get_function_versions_dispatcher
51479 #undef TARGET_ENUM_VA_LIST_P
51480 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51482 #undef TARGET_FN_ABI_VA_LIST
51483 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51485 #undef TARGET_CANONICAL_VA_LIST_TYPE
51486 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51488 #undef TARGET_EXPAND_BUILTIN_VA_START
51489 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51491 #undef TARGET_MD_ASM_ADJUST
51492 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51494 #undef TARGET_C_EXCESS_PRECISION
51495 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51496 #undef TARGET_PROMOTE_PROTOTYPES
51497 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51498 #undef TARGET_SETUP_INCOMING_VARARGS
51499 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51500 #undef TARGET_MUST_PASS_IN_STACK
51501 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51502 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
51503 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
51504 #undef TARGET_FUNCTION_ARG_ADVANCE
51505 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51506 #undef TARGET_FUNCTION_ARG
51507 #define TARGET_FUNCTION_ARG ix86_function_arg
51508 #undef TARGET_INIT_PIC_REG
51509 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51510 #undef TARGET_USE_PSEUDO_PIC_REG
51511 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51512 #undef TARGET_FUNCTION_ARG_BOUNDARY
51513 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51514 #undef TARGET_PASS_BY_REFERENCE
51515 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51516 #undef TARGET_INTERNAL_ARG_POINTER
51517 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51518 #undef TARGET_UPDATE_STACK_BOUNDARY
51519 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51520 #undef TARGET_GET_DRAP_RTX
51521 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51522 #undef TARGET_STRICT_ARGUMENT_NAMING
51523 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51524 #undef TARGET_STATIC_CHAIN
51525 #define TARGET_STATIC_CHAIN ix86_static_chain
51526 #undef TARGET_TRAMPOLINE_INIT
51527 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51528 #undef TARGET_RETURN_POPS_ARGS
51529 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51531 #undef TARGET_WARN_FUNC_RETURN
51532 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
51534 #undef TARGET_LEGITIMATE_COMBINED_INSN
51535 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51537 #undef TARGET_ASAN_SHADOW_OFFSET
51538 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51540 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51541 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51543 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51544 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51546 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51547 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51549 #undef TARGET_C_MODE_FOR_SUFFIX
51550 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51552 #ifdef HAVE_AS_TLS
51553 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51554 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51555 #endif
51557 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51558 #undef TARGET_INSERT_ATTRIBUTES
51559 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51560 #endif
51562 #undef TARGET_MANGLE_TYPE
51563 #define TARGET_MANGLE_TYPE ix86_mangle_type
51565 #undef TARGET_STACK_PROTECT_GUARD
51566 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51568 #if !TARGET_MACHO
51569 #undef TARGET_STACK_PROTECT_FAIL
51570 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51571 #endif
51573 #undef TARGET_FUNCTION_VALUE
51574 #define TARGET_FUNCTION_VALUE ix86_function_value
51576 #undef TARGET_FUNCTION_VALUE_REGNO_P
51577 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51579 #undef TARGET_PROMOTE_FUNCTION_MODE
51580 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51582 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51583 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51585 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51586 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51588 #undef TARGET_INSTANTIATE_DECLS
51589 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51591 #undef TARGET_SECONDARY_RELOAD
51592 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51593 #undef TARGET_SECONDARY_MEMORY_NEEDED
51594 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
51595 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
51596 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
51598 #undef TARGET_CLASS_MAX_NREGS
51599 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51601 #undef TARGET_PREFERRED_RELOAD_CLASS
51602 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51603 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51604 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51605 #undef TARGET_CLASS_LIKELY_SPILLED_P
51606 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51608 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
51609 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
51610 ix86_builtin_vectorization_cost
51611 #undef TARGET_VECTORIZE_VEC_PERM_CONST
51612 #define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
51613 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
51614 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
51615 ix86_preferred_simd_mode
51616 #undef TARGET_VECTORIZE_SPLIT_REDUCTION
51617 #define TARGET_VECTORIZE_SPLIT_REDUCTION \
51618 ix86_split_reduction
51619 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
51620 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
51621 ix86_autovectorize_vector_sizes
51622 #undef TARGET_VECTORIZE_GET_MASK_MODE
51623 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
51624 #undef TARGET_VECTORIZE_INIT_COST
51625 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
51626 #undef TARGET_VECTORIZE_ADD_STMT_COST
51627 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
51628 #undef TARGET_VECTORIZE_FINISH_COST
51629 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
51630 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
51631 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
51633 #undef TARGET_SET_CURRENT_FUNCTION
51634 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
51636 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51637 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51639 #undef TARGET_OPTION_SAVE
51640 #define TARGET_OPTION_SAVE ix86_function_specific_save
51642 #undef TARGET_OPTION_RESTORE
51643 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51645 #undef TARGET_OPTION_POST_STREAM_IN
51646 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51648 #undef TARGET_OPTION_PRINT
51649 #define TARGET_OPTION_PRINT ix86_function_specific_print
51651 #undef TARGET_OPTION_FUNCTION_VERSIONS
51652 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
51654 #undef TARGET_CAN_INLINE_P
51655 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51657 #undef TARGET_LEGITIMATE_ADDRESS_P
51658 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51660 #undef TARGET_REGISTER_PRIORITY
51661 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51663 #undef TARGET_REGISTER_USAGE_LEVELING_P
51664 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51666 #undef TARGET_LEGITIMATE_CONSTANT_P
51667 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
51669 #undef TARGET_COMPUTE_FRAME_LAYOUT
51670 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
51672 #undef TARGET_FRAME_POINTER_REQUIRED
51673 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
51675 #undef TARGET_CAN_ELIMINATE
51676 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
51678 #undef TARGET_EXTRA_LIVE_ON_ENTRY
51679 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
51681 #undef TARGET_ASM_CODE_END
51682 #define TARGET_ASM_CODE_END ix86_code_end
51684 #undef TARGET_CONDITIONAL_REGISTER_USAGE
51685 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
51687 #undef TARGET_CANONICALIZE_COMPARISON
51688 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
51690 #undef TARGET_LOOP_UNROLL_ADJUST
51691 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
51693 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51694 #undef TARGET_SPILL_CLASS
51695 #define TARGET_SPILL_CLASS ix86_spill_class
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
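/* Optimize-mode-switching hooks.  On x86 these manage, e.g., the x87
   control word settings and the AVX upper-register state (vzeroupper
   insertion).  */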
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
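/* Pointer bounds checking (MPX, -fcheck-pointer-bounds) hooks.  */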
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */
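/* Build the target hook vector.  TARGET_INITIALIZER (from target-def.h)
   expands to an aggregate initializer that picks up every TARGET_* macro
   defined above; any hook not overridden here keeps the default supplied
   by target-def.h.  Target-independent code then reaches the i386
   implementations indirectly, e.g. via
   targetm.vectorize.preferred_simd_mode (mode).  */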
struct gcc_target targetm = TARGET_INITIALIZER;
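/* gt-i386.h is generated by gengtype and contains the garbage collector
   root tables for the GTY(()) data declared in this file.  */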
#include "gt-i386.h"