1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
93 #include "wide-int-bitmask.h"
95 /* This file should be included last. */
96 #include "target-def.h"
98 #include "x86-tune-costs.h"
100 static rtx legitimize_dllimport_symbol (rtx, bool);
101 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
102 static rtx legitimize_pe_coff_symbol (rtx, bool);
103 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
104 static bool ix86_save_reg (unsigned int, bool, bool);
105 static bool ix86_function_naked (const_tree);
106 static bool ix86_notrack_prefixed_insn_p (rtx);
107 static void ix86_emit_restore_reg_using_pop (rtx);
110 #ifndef CHECK_STACK_LIMIT
111 #define CHECK_STACK_LIMIT (-1)
112 #endif
114 /* Return index of given mode in mult and division cost tables. */
115 #define MODE_INDEX(mode) \
116 ((mode) == QImode ? 0 \
117 : (mode) == HImode ? 1 \
118 : (mode) == SImode ? 2 \
119 : (mode) == DImode ? 3 \
120 : 4)
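/* Illustrative note (editorial, not from the original source): the macro
   above selects the per-mode entry in cost tables such as mult_init and
   divide, e.g. the cost of a 32-bit division would be looked up as

     ix86_cost->divide[MODE_INDEX (SImode)]

   which uses index 2; any mode other than the four listed falls into the
   catch-all index 4.  */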
123 /* Set by -mtune. */
124 const struct processor_costs *ix86_tune_cost = NULL;
126 /* Set by -mtune or -Os. */
127 const struct processor_costs *ix86_cost = NULL;
129 /* Processor feature/optimization bitmasks. */
130 #define m_386 (1U<<PROCESSOR_I386)
131 #define m_486 (1U<<PROCESSOR_I486)
132 #define m_PENT (1U<<PROCESSOR_PENTIUM)
133 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
134 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
135 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
136 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
137 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
138 #define m_CORE2 (1U<<PROCESSOR_CORE2)
139 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
140 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
141 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
142 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
143 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
144 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
145 #define m_KNL (1U<<PROCESSOR_KNL)
146 #define m_KNM (1U<<PROCESSOR_KNM)
147 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
148 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
149 #define m_ICELAKE (1U<<PROCESSOR_ICELAKE)
150 #define m_INTEL (1U<<PROCESSOR_INTEL)
152 #define m_GEODE (1U<<PROCESSOR_GEODE)
153 #define m_K6 (1U<<PROCESSOR_K6)
154 #define m_K6_GEODE (m_K6 | m_GEODE)
155 #define m_K8 (1U<<PROCESSOR_K8)
156 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
157 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
158 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
159 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
160 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
161 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
162 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
163 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
164 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
165 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
166 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
167 #define m_BTVER (m_BTVER1 | m_BTVER2)
168 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
169 | m_ZNVER1)
171 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
173 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
174 #undef DEF_TUNE
175 #define DEF_TUNE(tune, name, selector) name,
176 #include "x86-tune.def"
177 #undef DEF_TUNE
180 /* Feature tests against the various tunings. */
181 unsigned char ix86_tune_features[X86_TUNE_LAST];
 183 /* Feature tests against the various tunings, used to create ix86_tune_features
184 based on the processor mask. */
185 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
186 #undef DEF_TUNE
187 #define DEF_TUNE(tune, name, selector) selector,
188 #include "x86-tune.def"
189 #undef DEF_TUNE
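/* Editorial sketch (paraphrased, not verbatim): the selector masks above are
   matched against the bit of the processor chosen by -mtune when
   ix86_tune_features is filled in during option override, roughly:

     unsigned ix86_tune_mask = 1u << ix86_tune;
     for (unsigned i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
*/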
192 /* Feature tests against the various architecture variations. */
193 unsigned char ix86_arch_features[X86_ARCH_LAST];
195 /* Feature tests against the various architecture variations, used to create
196 ix86_arch_features based on the processor mask. */
197 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
198 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
199 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
201 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
202 ~m_386,
204 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
205 ~(m_386 | m_486),
207 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
208 ~m_386,
210 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
211 ~m_386,
214 /* In case the average insn count for single function invocation is
215 lower than this constant, emit fast (but longer) prologue and
216 epilogue code. */
217 #define FAST_PROLOGUE_INSN_COUNT 20
 219 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
220 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
221 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
222 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
224 /* Array of the smallest class containing reg number REGNO, indexed by
225 REGNO. Used by REGNO_REG_CLASS in i386.h. */
227 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
229 /* ax, dx, cx, bx */
230 AREG, DREG, CREG, BREG,
231 /* si, di, bp, sp */
232 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
233 /* FP registers */
234 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
235 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
236 /* arg pointer */
237 NON_Q_REGS,
238 /* flags, fpsr, fpcr, frame */
239 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
240 /* SSE registers */
241 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
242 SSE_REGS, SSE_REGS,
243 /* MMX registers */
244 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
245 MMX_REGS, MMX_REGS,
246 /* REX registers */
247 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
248 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
249 /* SSE REX registers */
250 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
251 SSE_REGS, SSE_REGS,
252 /* AVX-512 SSE registers */
253 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
256 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
257 /* Mask registers. */
258 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
259 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
260 /* MPX bound registers */
261 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
264 /* The "default" register map used in 32bit mode. */
266 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
268 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
269 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
270 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
271 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
272 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
273 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
274 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
275 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
276 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
277 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
278 101, 102, 103, 104, /* bound registers */
281 /* The "default" register map used in 64bit mode. */
283 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
285 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
286 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
287 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
288 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
289 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
290 8,9,10,11,12,13,14,15, /* extended integer registers */
291 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
292 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
293 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
294 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
295 126, 127, 128, 129, /* bound registers */
298 /* Define the register numbers to be used in Dwarf debugging information.
299 The SVR4 reference port C compiler uses the following register numbers
300 in its Dwarf output code:
301 0 for %eax (gcc regno = 0)
302 1 for %ecx (gcc regno = 2)
303 2 for %edx (gcc regno = 1)
304 3 for %ebx (gcc regno = 3)
305 4 for %esp (gcc regno = 7)
306 5 for %ebp (gcc regno = 6)
307 6 for %esi (gcc regno = 4)
308 7 for %edi (gcc regno = 5)
309 The following three DWARF register numbers are never generated by
310 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
 311 believed these numbers to have these meanings.
312 8 for %eip (no gcc equivalent)
313 9 for %eflags (gcc regno = 17)
314 10 for %trapno (no gcc equivalent)
315 It is not at all clear how we should number the FP stack registers
316 for the x86 architecture. If the version of SDB on x86/svr4 were
317 a bit less brain dead with respect to floating-point then we would
318 have a precedent to follow with respect to DWARF register numbers
319 for x86 FP registers, but the SDB on x86/svr4 was so completely
320 broken with respect to FP registers that it is hardly worth thinking
321 of it as something to strive for compatibility with.
322 The version of x86/svr4 SDB I had does (partially)
323 seem to believe that DWARF register number 11 is associated with
324 the x86 register %st(0), but that's about all. Higher DWARF
325 register numbers don't seem to be associated with anything in
326 particular, and even for DWARF regno 11, SDB only seemed to under-
327 stand that it should say that a variable lives in %st(0) (when
328 asked via an `=' command) if we said it was in DWARF regno 11,
329 but SDB still printed garbage when asked for the value of the
330 variable in question (via a `/' command).
331 (Also note that the labels SDB printed for various FP stack regs
332 when doing an `x' command were all wrong.)
333 Note that these problems generally don't affect the native SVR4
334 C compiler because it doesn't allow the use of -O with -g and
335 because when it is *not* optimizing, it allocates a memory
336 location for each floating-point variable, and the memory
337 location is what gets described in the DWARF AT_location
338 attribute for the variable in question.
339 Regardless of the severe mental illness of the x86/svr4 SDB, we
340 do something sensible here and we use the following DWARF
341 register numbers. Note that these are all stack-top-relative
342 numbers.
343 11 for %st(0) (gcc regno = 8)
344 12 for %st(1) (gcc regno = 9)
345 13 for %st(2) (gcc regno = 10)
346 14 for %st(3) (gcc regno = 11)
347 15 for %st(4) (gcc regno = 12)
348 16 for %st(5) (gcc regno = 13)
349 17 for %st(6) (gcc regno = 14)
350 18 for %st(7) (gcc regno = 15)
352 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
354 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
355 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
356 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
357 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
358 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
359 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
361 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
362 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
363 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
364 101, 102, 103, 104, /* bound registers */
367 /* Define parameter passing and return registers. */
369 static int const x86_64_int_parameter_registers[6] =
371 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
374 static int const x86_64_ms_abi_int_parameter_registers[4] =
376 CX_REG, DX_REG, R8_REG, R9_REG
379 static int const x86_64_int_return_registers[4] =
381 AX_REG, DX_REG, DI_REG, SI_REG
384 /* Additional registers that are clobbered by SYSV calls. */
386 #define NUM_X86_64_MS_CLOBBERED_REGS 12
387 static int const x86_64_ms_sysv_extra_clobbered_registers
388 [NUM_X86_64_MS_CLOBBERED_REGS] =
390 SI_REG, DI_REG,
391 XMM6_REG, XMM7_REG,
392 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
393 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
396 enum xlogue_stub {
397 XLOGUE_STUB_SAVE,
398 XLOGUE_STUB_RESTORE,
399 XLOGUE_STUB_RESTORE_TAIL,
400 XLOGUE_STUB_SAVE_HFP,
401 XLOGUE_STUB_RESTORE_HFP,
402 XLOGUE_STUB_RESTORE_HFP_TAIL,
404 XLOGUE_STUB_COUNT
407 enum xlogue_stub_sets {
408 XLOGUE_SET_ALIGNED,
409 XLOGUE_SET_ALIGNED_PLUS_8,
410 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
411 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
413 XLOGUE_SET_COUNT
416 /* Register save/restore layout used by out-of-line stubs. */
417 class xlogue_layout {
418 public:
419 struct reginfo
421 unsigned regno;
422 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
423 rsi) to where each register is stored. */
426 unsigned get_nregs () const {return m_nregs;}
427 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
429 const reginfo &get_reginfo (unsigned reg) const
431 gcc_assert (reg < m_nregs);
432 return m_regs[reg];
435 static const char *get_stub_name (enum xlogue_stub stub,
436 unsigned n_extra_args);
438 /* Returns an rtx for the stub's symbol based upon
439 1.) the specified stub (save, restore or restore_ret) and
440 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
 441 3.) whether or not stack alignment is being performed. */
442 static rtx get_stub_rtx (enum xlogue_stub stub);
444 /* Returns the amount of stack space (including padding) that the stub
445 needs to store registers based upon data in the machine_function. */
446 HOST_WIDE_INT get_stack_space_used () const
448 const struct machine_function *m = cfun->machine;
449 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
451 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
452 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
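/* Rough worked example (editorial, derived from the REG_ORDER table below):
   with call_ms2sysv_extra_regs == 0 and no extra alignment padding, the 12
   minimum registers are 10 XMM registers (16 bytes each) plus rsi and rdi
   (8 bytes each), so m_regs[11].offset is 0xb0 - STUB_INDEX_OFFSET = 0x40
   and get_stack_space_used () returns 0x40 + 0x70 = 0xb0 (176) bytes.  */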
455 /* Returns the offset for the base pointer used by the stub. */
456 HOST_WIDE_INT get_stub_ptr_offset () const
458 return STUB_INDEX_OFFSET + m_stack_align_off_in;
461 static const struct xlogue_layout &get_instance ();
462 static unsigned count_stub_managed_regs ();
463 static bool is_stub_managed_reg (unsigned regno, unsigned count);
465 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
466 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
467 static const unsigned MAX_REGS = 18;
468 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
469 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
470 static const unsigned STUB_NAME_MAX_LEN = 20;
471 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
472 static const unsigned REG_ORDER[MAX_REGS];
473 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
475 private:
476 xlogue_layout ();
477 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
478 xlogue_layout (const xlogue_layout &);
480 /* True if hard frame pointer is used. */
481 bool m_hfp;
 483 /* Max number of registers this layout manages. */
484 unsigned m_nregs;
486 /* Incoming offset from 16-byte alignment. */
487 HOST_WIDE_INT m_stack_align_off_in;
489 /* Register order and offsets. */
490 struct reginfo m_regs[MAX_REGS];
492 /* Lazy-inited cache of symbol names for stubs. */
493 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
494 [STUB_NAME_MAX_LEN];
496 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
499 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
500 "savms64",
501 "resms64",
502 "resms64x",
503 "savms64f",
504 "resms64f",
505 "resms64fx"
508 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
 509 /* The offset values below give where each register is stored, for each layout,
 510 relative to the incoming stack pointer. The value of each m_regs[].offset will
511 be relative to the incoming base pointer (rax or rsi) used by the stub.
513 s_instances: 0 1 2 3
514 Offset: realigned or aligned + 8
515 Register aligned aligned + 8 aligned w/HFP w/HFP */
516 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
517 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
518 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
519 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
520 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
521 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
522 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
523 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
524 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
525 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
526 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
527 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
528 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
529 BP_REG, /* 0xc0 0xc8 N/A N/A */
530 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
531 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
532 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
533 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
536 /* Instantiate static const values. */
537 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
538 const unsigned xlogue_layout::MIN_REGS;
539 const unsigned xlogue_layout::MAX_REGS;
540 const unsigned xlogue_layout::MAX_EXTRA_REGS;
541 const unsigned xlogue_layout::VARIANT_COUNT;
542 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
544 /* Initialize xlogue_layout::s_stub_names to zero. */
545 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
546 [STUB_NAME_MAX_LEN];
548 /* Instantiates all xlogue_layout instances. */
549 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
550 xlogue_layout (0, false),
551 xlogue_layout (8, false),
552 xlogue_layout (0, true),
553 xlogue_layout (8, true)
556 /* Return an appropriate const instance of xlogue_layout based upon values
557 in cfun->machine and crtl. */
558 const struct xlogue_layout &
559 xlogue_layout::get_instance ()
561 enum xlogue_stub_sets stub_set;
562 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
564 if (stack_realign_fp)
565 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
566 else if (frame_pointer_needed)
567 stub_set = aligned_plus_8
568 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
569 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
570 else
571 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
573 return s_instances[stub_set];
576 /* Determine how many clobbered registers can be saved by the stub.
577 Returns the count of registers the stub will save and restore. */
578 unsigned
579 xlogue_layout::count_stub_managed_regs ()
581 bool hfp = frame_pointer_needed || stack_realign_fp;
582 unsigned i, count;
583 unsigned regno;
585 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
587 regno = REG_ORDER[i];
588 if (regno == BP_REG && hfp)
589 continue;
590 if (!ix86_save_reg (regno, false, false))
591 break;
592 ++count;
594 return count;
597 /* Determine if register REGNO is a stub managed register given the
598 total COUNT of stub managed registers. */
599 bool
600 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
602 bool hfp = frame_pointer_needed || stack_realign_fp;
603 unsigned i;
605 for (i = 0; i < count; ++i)
607 gcc_assert (i < MAX_REGS);
608 if (REG_ORDER[i] == BP_REG && hfp)
609 ++count;
610 else if (REG_ORDER[i] == regno)
611 return true;
613 return false;
616 /* Constructor for xlogue_layout. */
617 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
618 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
619 m_stack_align_off_in (stack_align_off_in)
621 HOST_WIDE_INT offset = stack_align_off_in;
622 unsigned i, j;
624 for (i = j = 0; i < MAX_REGS; ++i)
626 unsigned regno = REG_ORDER[i];
628 if (regno == BP_REG && hfp)
629 continue;
630 if (SSE_REGNO_P (regno))
632 offset += 16;
633 /* Verify that SSE regs are always aligned. */
634 gcc_assert (!((stack_align_off_in + offset) & 15));
636 else
637 offset += 8;
639 m_regs[j].regno = regno;
640 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
642 gcc_assert (j == m_nregs);
645 const char *
646 xlogue_layout::get_stub_name (enum xlogue_stub stub,
647 unsigned n_extra_regs)
649 const int have_avx = TARGET_AVX;
650 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
652 /* Lazy init */
653 if (!*name)
655 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
656 (have_avx ? "avx" : "sse"),
657 STUB_BASE_NAMES[stub],
658 MIN_REGS + n_extra_regs);
659 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
662 return name;
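/* For example (editorial note): with AVX enabled, stub XLOGUE_STUB_SAVE
   ("savms64") and n_extra_regs == 2, the name built above is
   "__avx_savms64_14" (MIN_REGS + 2 == 14); without AVX the prefix is
   "sse" instead.  */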
665 /* Return rtx of a symbol ref for the entry point (based upon
666 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
668 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
670 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
671 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
672 gcc_assert (stub < XLOGUE_STUB_COUNT);
673 gcc_assert (crtl->stack_realign_finalized);
675 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
678 /* Define the structure for the machine field in struct function. */
680 struct GTY(()) stack_local_entry {
681 unsigned short mode;
682 unsigned short n;
683 rtx rtl;
684 struct stack_local_entry *next;
687 /* Which cpu are we scheduling for. */
688 enum attr_cpu ix86_schedule;
690 /* Which cpu are we optimizing for. */
691 enum processor_type ix86_tune;
693 /* Which instruction set architecture to use. */
694 enum processor_type ix86_arch;
696 /* True if processor has SSE prefetch instruction. */
697 unsigned char x86_prefetch_sse;
699 /* -mstackrealign option */
700 static const char ix86_force_align_arg_pointer_string[]
701 = "force_align_arg_pointer";
703 static rtx (*ix86_gen_leave) (void);
704 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
707 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
708 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
709 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_clzero) (rtx);
711 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
713 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
715 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
716 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
718 /* Preferred alignment for stack boundary in bits. */
719 unsigned int ix86_preferred_stack_boundary;
721 /* Alignment for incoming stack boundary in bits specified at
722 command line. */
723 static unsigned int ix86_user_incoming_stack_boundary;
725 /* Default alignment for incoming stack boundary in bits. */
726 static unsigned int ix86_default_incoming_stack_boundary;
728 /* Alignment for incoming stack boundary in bits. */
729 unsigned int ix86_incoming_stack_boundary;
731 /* Calling abi specific va_list type nodes. */
732 static GTY(()) tree sysv_va_list_type_node;
733 static GTY(()) tree ms_va_list_type_node;
735 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
736 char internal_label_prefix[16];
737 int internal_label_prefix_len;
739 /* Fence to use after loop using movnt. */
740 tree x86_mfence;
742 /* Register class used for passing given 64bit part of the argument.
 743 These represent classes as documented by the psABI, with the exception
 744 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
 745 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
747 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
748 whenever possible (upper half does contain padding). */
749 enum x86_64_reg_class
751 X86_64_NO_CLASS,
752 X86_64_INTEGER_CLASS,
753 X86_64_INTEGERSI_CLASS,
754 X86_64_SSE_CLASS,
755 X86_64_SSESF_CLASS,
756 X86_64_SSEDF_CLASS,
757 X86_64_SSEUP_CLASS,
758 X86_64_X87_CLASS,
759 X86_64_X87UP_CLASS,
760 X86_64_COMPLEX_X87_CLASS,
761 X86_64_MEMORY_CLASS
764 #define MAX_CLASSES 8
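/* Editorial examples (assumptions based on the psABI, not text from this
   file): the classification code (classify_argument, later in this file)
   assigns one class per eightbyte, so a plain int gets
   X86_64_INTEGERSI_CLASS, a double gets X86_64_SSEDF_CLASS, and a __m128
   occupies two eightbytes classified X86_64_SSE_CLASS followed by
   X86_64_SSEUP_CLASS; MAX_CLASSES bounds the number of such eightbytes.  */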
766 /* Table of constants used by fldpi, fldln2, etc.... */
767 static REAL_VALUE_TYPE ext_80387_constants_table [5];
768 static bool ext_80387_constants_init;
771 static struct machine_function * ix86_init_machine_status (void);
772 static rtx ix86_function_value (const_tree, const_tree, bool);
773 static bool ix86_function_value_regno_p (const unsigned int);
774 static unsigned int ix86_function_arg_boundary (machine_mode,
775 const_tree);
776 static rtx ix86_static_chain (const_tree, bool);
777 static int ix86_function_regparm (const_tree, const_tree);
778 static void ix86_compute_frame_layout (void);
779 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
780 rtx, rtx, int);
781 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
782 static tree ix86_canonical_va_list_type (tree);
783 static void predict_jump (int);
784 static unsigned int split_stack_prologue_scratch_regno (void);
785 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
787 enum ix86_function_specific_strings
789 IX86_FUNCTION_SPECIFIC_ARCH,
790 IX86_FUNCTION_SPECIFIC_TUNE,
791 IX86_FUNCTION_SPECIFIC_MAX
794 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
795 const char *, const char *, enum fpmath_unit,
796 bool);
797 static void ix86_function_specific_save (struct cl_target_option *,
798 struct gcc_options *opts);
799 static void ix86_function_specific_restore (struct gcc_options *opts,
800 struct cl_target_option *);
801 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
802 static void ix86_function_specific_print (FILE *, int,
803 struct cl_target_option *);
804 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
805 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
806 struct gcc_options *,
807 struct gcc_options *,
808 struct gcc_options *);
809 static bool ix86_can_inline_p (tree, tree);
810 static void ix86_set_current_function (tree);
811 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
813 static enum calling_abi ix86_function_abi (const_tree);
816 #ifndef SUBTARGET32_DEFAULT_CPU
817 #define SUBTARGET32_DEFAULT_CPU "i386"
818 #endif
820 /* Whether -mtune= or -march= were specified */
821 static int ix86_tune_defaulted;
822 static int ix86_arch_specified;
824 /* Vectorization library interface and handlers. */
825 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
827 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
828 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
830 /* Processor target table, indexed by processor number */
831 struct ptt
833 const char *const name; /* processor name */
834 const struct processor_costs *cost; /* Processor costs */
835 const int align_loop; /* Default alignments. */
836 const int align_loop_max_skip;
837 const int align_jump;
838 const int align_jump_max_skip;
839 const int align_func;
842 /* This table must be in sync with enum processor_type in i386.h. */
843 static const struct ptt processor_target_table[PROCESSOR_max] =
845 {"generic", &generic_cost, 16, 10, 16, 10, 16},
846 {"i386", &i386_cost, 4, 3, 4, 3, 4},
847 {"i486", &i486_cost, 16, 15, 16, 15, 16},
848 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
849 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
850 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
851 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
852 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
853 {"core2", &core_cost, 16, 10, 16, 10, 16},
854 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
855 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
856 {"haswell", &core_cost, 16, 10, 16, 10, 16},
857 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
858 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
859 {"knl", &slm_cost, 16, 15, 16, 7, 16},
860 {"knm", &slm_cost, 16, 15, 16, 7, 16},
861 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
862 {"cannonlake", &skylake_cost, 16, 10, 16, 10, 16},
863 {"icelake", &skylake_cost, 16, 10, 16, 10, 16},
864 {"intel", &intel_cost, 16, 15, 16, 7, 16},
865 {"geode", &geode_cost, 0, 0, 0, 0, 0},
866 {"k6", &k6_cost, 32, 7, 32, 7, 32},
867 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
868 {"k8", &k8_cost, 16, 7, 16, 7, 16},
869 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
870 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
871 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
872 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
873 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
874 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
875 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
876 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
879 static unsigned int
880 rest_of_handle_insert_vzeroupper (void)
882 int i;
884 /* vzeroupper instructions are inserted immediately after reload to
885 account for possible spills from 256bit or 512bit registers. The pass
 886 reuses the mode switching infrastructure by re-running the mode insertion
887 pass, so disable entities that have already been processed. */
888 for (i = 0; i < MAX_386_ENTITIES; i++)
889 ix86_optimize_mode_switching[i] = 0;
891 ix86_optimize_mode_switching[AVX_U128] = 1;
893 /* Call optimize_mode_switching. */
894 g->get_passes ()->execute_pass_mode_switching ();
895 return 0;
 898 /* Return true if INSN uses or defines a hard register.
899 Hard register uses in a memory address are ignored.
900 Clobbers and flags definitions are ignored. */
902 static bool
903 has_non_address_hard_reg (rtx_insn *insn)
905 df_ref ref;
906 FOR_EACH_INSN_DEF (ref, insn)
907 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
908 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
909 && DF_REF_REGNO (ref) != FLAGS_REG)
910 return true;
912 FOR_EACH_INSN_USE (ref, insn)
913 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
914 return true;
916 return false;
919 /* Check if comparison INSN may be transformed
920 into vector comparison. Currently we transform
921 zero checks only which look like:
923 (set (reg:CCZ 17 flags)
924 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
925 (subreg:SI (reg:DI x) 0))
926 (const_int 0 [0]))) */
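/* Editorial example (an assumption about typical input, not from this file):
   on ia32 such a pattern arises from a 64-bit equality test against zero,
   e.g.

     int f (long long x) { return x == 0; }

   where the two SImode halves of X are IORed together and the result is
   compared with zero, which is exactly the shape checked below.  */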
928 static bool
929 convertible_comparison_p (rtx_insn *insn)
931 if (!TARGET_SSE4_1)
932 return false;
934 rtx def_set = single_set (insn);
936 gcc_assert (def_set);
938 rtx src = SET_SRC (def_set);
939 rtx dst = SET_DEST (def_set);
941 gcc_assert (GET_CODE (src) == COMPARE);
943 if (GET_CODE (dst) != REG
944 || REGNO (dst) != FLAGS_REG
945 || GET_MODE (dst) != CCZmode)
946 return false;
948 rtx op1 = XEXP (src, 0);
949 rtx op2 = XEXP (src, 1);
951 if (op2 != CONST0_RTX (GET_MODE (op2)))
952 return false;
954 if (GET_CODE (op1) != IOR)
955 return false;
957 op2 = XEXP (op1, 1);
958 op1 = XEXP (op1, 0);
960 if (!SUBREG_P (op1)
961 || !SUBREG_P (op2)
962 || GET_MODE (op1) != SImode
963 || GET_MODE (op2) != SImode
964 || ((SUBREG_BYTE (op1) != 0
965 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
966 && (SUBREG_BYTE (op2) != 0
967 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
968 return false;
970 op1 = SUBREG_REG (op1);
971 op2 = SUBREG_REG (op2);
973 if (op1 != op2
974 || !REG_P (op1)
975 || GET_MODE (op1) != DImode)
976 return false;
978 return true;
981 /* The DImode version of scalar_to_vector_candidate_p. */
983 static bool
984 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
986 rtx def_set = single_set (insn);
988 if (!def_set)
989 return false;
991 if (has_non_address_hard_reg (insn))
992 return false;
994 rtx src = SET_SRC (def_set);
995 rtx dst = SET_DEST (def_set);
997 if (GET_CODE (src) == COMPARE)
998 return convertible_comparison_p (insn);
1000 /* We are interested in DImode promotion only. */
1001 if ((GET_MODE (src) != DImode
1002 && !CONST_INT_P (src))
1003 || GET_MODE (dst) != DImode)
1004 return false;
1006 if (!REG_P (dst) && !MEM_P (dst))
1007 return false;
1009 switch (GET_CODE (src))
1011 case ASHIFTRT:
1012 if (!TARGET_AVX512VL)
1013 return false;
1014 /* FALLTHRU */
1016 case ASHIFT:
1017 case LSHIFTRT:
1018 if (!REG_P (XEXP (src, 1))
1019 && (!SUBREG_P (XEXP (src, 1))
1020 || SUBREG_BYTE (XEXP (src, 1)) != 0
1021 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1022 && (!CONST_INT_P (XEXP (src, 1))
1023 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1024 return false;
1026 if (GET_MODE (XEXP (src, 1)) != QImode
1027 && !CONST_INT_P (XEXP (src, 1)))
1028 return false;
1029 break;
1031 case PLUS:
1032 case MINUS:
1033 case IOR:
1034 case XOR:
1035 case AND:
1036 if (!REG_P (XEXP (src, 1))
1037 && !MEM_P (XEXP (src, 1))
1038 && !CONST_INT_P (XEXP (src, 1)))
1039 return false;
1041 if (GET_MODE (XEXP (src, 1)) != DImode
1042 && !CONST_INT_P (XEXP (src, 1)))
1043 return false;
1044 break;
1046 case NEG:
1047 case NOT:
1048 break;
1050 case REG:
1051 return true;
1053 case MEM:
1054 case CONST_INT:
1055 return REG_P (dst);
1057 default:
1058 return false;
1061 if (!REG_P (XEXP (src, 0))
1062 && !MEM_P (XEXP (src, 0))
1063 && !CONST_INT_P (XEXP (src, 0))
1064 /* Check for andnot case. */
1065 && (GET_CODE (src) != AND
1066 || GET_CODE (XEXP (src, 0)) != NOT
1067 || !REG_P (XEXP (XEXP (src, 0), 0))))
1068 return false;
1070 if (GET_MODE (XEXP (src, 0)) != DImode
1071 && !CONST_INT_P (XEXP (src, 0)))
1072 return false;
1074 return true;
1077 /* The TImode version of scalar_to_vector_candidate_p. */
1079 static bool
1080 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1082 rtx def_set = single_set (insn);
1084 if (!def_set)
1085 return false;
1087 if (has_non_address_hard_reg (insn))
1088 return false;
1090 rtx src = SET_SRC (def_set);
1091 rtx dst = SET_DEST (def_set);
1093 /* Only TImode load and store are allowed. */
1094 if (GET_MODE (dst) != TImode)
1095 return false;
1097 if (MEM_P (dst))
 1099 /* Check for store. The memory must be aligned, or unaligned stores
 1100 must be optimal on the target. Only support stores from a register, a
 1101 standard SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
1103 ??? Verify performance impact before enabling CONST_INT for
1104 __int128 store. */
1105 if (misaligned_operand (dst, TImode)
1106 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1107 return false;
1109 switch (GET_CODE (src))
1111 default:
1112 return false;
1114 case REG:
1115 case CONST_WIDE_INT:
1116 return true;
1118 case CONST_INT:
1119 return standard_sse_constant_p (src, TImode);
1122 else if (MEM_P (src))
 1124 /* Check for load. The memory must be aligned, or unaligned loads must
 1125 be optimal on the target. */
1126 return (REG_P (dst)
1127 && (!misaligned_operand (src, TImode)
1128 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1131 return false;
 1134 /* Return true if INSN may be converted into a vector
 1135 instruction. */
1137 static bool
1138 scalar_to_vector_candidate_p (rtx_insn *insn)
1140 if (TARGET_64BIT)
1141 return timode_scalar_to_vector_candidate_p (insn);
1142 else
1143 return dimode_scalar_to_vector_candidate_p (insn);
1146 /* The DImode version of remove_non_convertible_regs. */
1148 static void
1149 dimode_remove_non_convertible_regs (bitmap candidates)
1151 bitmap_iterator bi;
1152 unsigned id;
1153 bitmap regs = BITMAP_ALLOC (NULL);
1155 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1157 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1158 rtx reg = SET_DEST (def_set);
1160 if (!REG_P (reg)
1161 || bitmap_bit_p (regs, REGNO (reg))
1162 || HARD_REGISTER_P (reg))
1163 continue;
1165 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1166 def;
1167 def = DF_REF_NEXT_REG (def))
1169 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1171 if (dump_file)
1172 fprintf (dump_file,
1173 "r%d has non convertible definition in insn %d\n",
1174 REGNO (reg), DF_REF_INSN_UID (def));
1176 bitmap_set_bit (regs, REGNO (reg));
1177 break;
1182 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1184 for (df_ref def = DF_REG_DEF_CHAIN (id);
1185 def;
1186 def = DF_REF_NEXT_REG (def))
1187 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1189 if (dump_file)
1190 fprintf (dump_file, "Removing insn %d from candidates list\n",
1191 DF_REF_INSN_UID (def));
1193 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1197 BITMAP_FREE (regs);
1200 /* For a register REGNO, scan instructions for its defs and uses.
1201 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1203 static void
1204 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1205 unsigned int regno)
1207 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1208 def;
1209 def = DF_REF_NEXT_REG (def))
1211 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1213 if (dump_file)
1214 fprintf (dump_file,
1215 "r%d has non convertible def in insn %d\n",
1216 regno, DF_REF_INSN_UID (def));
1218 bitmap_set_bit (regs, regno);
1219 break;
1223 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1224 ref;
1225 ref = DF_REF_NEXT_REG (ref))
1227 /* Debug instructions are skipped. */
1228 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1229 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1231 if (dump_file)
1232 fprintf (dump_file,
1233 "r%d has non convertible use in insn %d\n",
1234 regno, DF_REF_INSN_UID (ref));
1236 bitmap_set_bit (regs, regno);
1237 break;
1242 /* The TImode version of remove_non_convertible_regs. */
1244 static void
1245 timode_remove_non_convertible_regs (bitmap candidates)
1247 bitmap_iterator bi;
1248 unsigned id;
1249 bitmap regs = BITMAP_ALLOC (NULL);
1251 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1253 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1254 rtx dest = SET_DEST (def_set);
1255 rtx src = SET_SRC (def_set);
1257 if ((!REG_P (dest)
1258 || bitmap_bit_p (regs, REGNO (dest))
1259 || HARD_REGISTER_P (dest))
1260 && (!REG_P (src)
1261 || bitmap_bit_p (regs, REGNO (src))
1262 || HARD_REGISTER_P (src)))
1263 continue;
1265 if (REG_P (dest))
1266 timode_check_non_convertible_regs (candidates, regs,
1267 REGNO (dest));
1269 if (REG_P (src))
1270 timode_check_non_convertible_regs (candidates, regs,
1271 REGNO (src));
1274 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1276 for (df_ref def = DF_REG_DEF_CHAIN (id);
1277 def;
1278 def = DF_REF_NEXT_REG (def))
1279 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1281 if (dump_file)
1282 fprintf (dump_file, "Removing insn %d from candidates list\n",
1283 DF_REF_INSN_UID (def));
1285 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1288 for (df_ref ref = DF_REG_USE_CHAIN (id);
1289 ref;
1290 ref = DF_REF_NEXT_REG (ref))
1291 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1293 if (dump_file)
1294 fprintf (dump_file, "Removing insn %d from candidates list\n",
1295 DF_REF_INSN_UID (ref));
1297 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1301 BITMAP_FREE (regs);
 1304 /* For a given bitmap of insn UIDs, scan all instructions and
 1305 remove an insn from CANDIDATES if it has both convertible
 1306 and non-convertible definitions.
1308 All insns in a bitmap are conversion candidates according to
1309 scalar_to_vector_candidate_p. Currently it implies all insns
1310 are single_set. */
1312 static void
1313 remove_non_convertible_regs (bitmap candidates)
1315 if (TARGET_64BIT)
1316 timode_remove_non_convertible_regs (candidates);
1317 else
1318 dimode_remove_non_convertible_regs (candidates);
1321 class scalar_chain
1323 public:
1324 scalar_chain ();
1325 virtual ~scalar_chain ();
1327 static unsigned max_id;
1329 /* ID of a chain. */
1330 unsigned int chain_id;
1331 /* A queue of instructions to be included into a chain. */
1332 bitmap queue;
1333 /* Instructions included into a chain. */
1334 bitmap insns;
1335 /* All registers defined by a chain. */
1336 bitmap defs;
 1337 /* Registers used in both vector and scalar modes. */
1338 bitmap defs_conv;
1340 void build (bitmap candidates, unsigned insn_uid);
1341 virtual int compute_convert_gain () = 0;
1342 int convert ();
1344 protected:
1345 void add_to_queue (unsigned insn_uid);
1346 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1348 private:
1349 void add_insn (bitmap candidates, unsigned insn_uid);
1350 void analyze_register_chain (bitmap candidates, df_ref ref);
1351 virtual void mark_dual_mode_def (df_ref def) = 0;
1352 virtual void convert_insn (rtx_insn *insn) = 0;
1353 virtual void convert_registers () = 0;
1356 class dimode_scalar_chain : public scalar_chain
1358 public:
1359 int compute_convert_gain ();
1360 private:
1361 void mark_dual_mode_def (df_ref def);
1362 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1363 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1364 void convert_insn (rtx_insn *insn);
1365 void convert_op (rtx *op, rtx_insn *insn);
1366 void convert_reg (unsigned regno);
1367 void make_vector_copies (unsigned regno);
1368 void convert_registers ();
1369 int vector_const_cost (rtx exp);
1372 class timode_scalar_chain : public scalar_chain
1374 public:
 1375 /* Converting from TImode to V1TImode is always faster. */
1376 int compute_convert_gain () { return 1; }
1378 private:
1379 void mark_dual_mode_def (df_ref def);
1380 void fix_debug_reg_uses (rtx reg);
1381 void convert_insn (rtx_insn *insn);
 1382 /* We don't convert registers to a different size. */
1383 void convert_registers () {}
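/* Editorial sketch of how the pass driver later in this file uses these
   classes (paraphrased, not verbatim): for each remaining candidate insn a
   chain is built and converted only when profitable, roughly:

     scalar_chain *chain;
     if (TARGET_64BIT)
       chain = new timode_scalar_chain;
     else
       chain = new dimode_scalar_chain;
     chain->build (candidates, uid);
     if (chain->compute_convert_gain () > 0)
       converted_insns += chain->convert ();
     delete chain;
*/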
1386 unsigned scalar_chain::max_id = 0;
1388 /* Initialize new chain. */
1390 scalar_chain::scalar_chain ()
1392 chain_id = ++max_id;
1394 if (dump_file)
1395 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1397 bitmap_obstack_initialize (NULL);
1398 insns = BITMAP_ALLOC (NULL);
1399 defs = BITMAP_ALLOC (NULL);
1400 defs_conv = BITMAP_ALLOC (NULL);
1401 queue = NULL;
1404 /* Free chain's data. */
1406 scalar_chain::~scalar_chain ()
1408 BITMAP_FREE (insns);
1409 BITMAP_FREE (defs);
1410 BITMAP_FREE (defs_conv);
1411 bitmap_obstack_release (NULL);
1414 /* Add instruction into chains' queue. */
1416 void
1417 scalar_chain::add_to_queue (unsigned insn_uid)
1419 if (bitmap_bit_p (insns, insn_uid)
1420 || bitmap_bit_p (queue, insn_uid))
1421 return;
1423 if (dump_file)
1424 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1425 insn_uid, chain_id);
1426 bitmap_set_bit (queue, insn_uid);
1429 /* For DImode conversion, mark register defined by DEF as requiring
1430 conversion. */
1432 void
1433 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1435 gcc_assert (DF_REF_REG_DEF_P (def));
1437 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1438 return;
1440 if (dump_file)
1441 fprintf (dump_file,
1442 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1443 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1445 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
 1450 /* For TImode conversion, this hook is not used. */
1450 void
1451 timode_scalar_chain::mark_dual_mode_def (df_ref)
1453 gcc_unreachable ();
1456 /* Check REF's chain to add new insns into a queue
1457 and find registers requiring conversion. */
1459 void
1460 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1462 df_link *chain;
1464 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1465 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1466 add_to_queue (DF_REF_INSN_UID (ref));
1468 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1470 unsigned uid = DF_REF_INSN_UID (chain->ref);
1472 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1473 continue;
1475 if (!DF_REF_REG_MEM_P (chain->ref))
1477 if (bitmap_bit_p (insns, uid))
1478 continue;
1480 if (bitmap_bit_p (candidates, uid))
1482 add_to_queue (uid);
1483 continue;
1487 if (DF_REF_REG_DEF_P (chain->ref))
1489 if (dump_file)
1490 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1491 DF_REF_REGNO (chain->ref), uid);
1492 mark_dual_mode_def (chain->ref);
1494 else
1496 if (dump_file)
1497 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1498 DF_REF_REGNO (chain->ref), uid);
1499 mark_dual_mode_def (ref);
1504 /* Add instruction into a chain. */
1506 void
1507 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1509 if (bitmap_bit_p (insns, insn_uid))
1510 return;
1512 if (dump_file)
1513 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1515 bitmap_set_bit (insns, insn_uid);
1517 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1518 rtx def_set = single_set (insn);
1519 if (def_set && REG_P (SET_DEST (def_set))
1520 && !HARD_REGISTER_P (SET_DEST (def_set)))
1521 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1523 df_ref ref;
1524 df_ref def;
1525 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1526 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1527 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1528 def;
1529 def = DF_REF_NEXT_REG (def))
1530 analyze_register_chain (candidates, def);
1531 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1532 if (!DF_REF_REG_MEM_P (ref))
1533 analyze_register_chain (candidates, ref);
1536 /* Build new chain starting from insn INSN_UID recursively
1537 adding all dependent uses and definitions. */
1539 void
1540 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1542 queue = BITMAP_ALLOC (NULL);
1543 bitmap_set_bit (queue, insn_uid);
1545 if (dump_file)
1546 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1548 while (!bitmap_empty_p (queue))
1550 insn_uid = bitmap_first_set_bit (queue);
1551 bitmap_clear_bit (queue, insn_uid);
1552 bitmap_clear_bit (candidates, insn_uid);
1553 add_insn (candidates, insn_uid);
1556 if (dump_file)
1558 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1559 fprintf (dump_file, " insns: ");
1560 dump_bitmap (dump_file, insns);
1561 if (!bitmap_empty_p (defs_conv))
1563 bitmap_iterator bi;
1564 unsigned id;
1565 const char *comma = "";
1566 fprintf (dump_file, " defs to convert: ");
1567 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1569 fprintf (dump_file, "%sr%d", comma, id);
1570 comma = ", ";
1572 fprintf (dump_file, "\n");
1576 BITMAP_FREE (queue);
 1579 /* Return the cost of building a vector constant
1580 instead of using a scalar one. */
1583 dimode_scalar_chain::vector_const_cost (rtx exp)
1585 gcc_assert (CONST_INT_P (exp));
1587 if (standard_sse_constant_p (exp, V2DImode))
1588 return COSTS_N_INSNS (1);
1589 return ix86_cost->sse_load[1];
1592 /* Compute a gain for chain conversion. */
1595 dimode_scalar_chain::compute_convert_gain ()
1597 bitmap_iterator bi;
1598 unsigned insn_uid;
1599 int gain = 0;
1600 int cost = 0;
1602 if (dump_file)
1603 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1605 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1607 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1608 rtx def_set = single_set (insn);
1609 rtx src = SET_SRC (def_set);
1610 rtx dst = SET_DEST (def_set);
1612 if (REG_P (src) && REG_P (dst))
1613 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1614 else if (REG_P (src) && MEM_P (dst))
1615 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1616 else if (MEM_P (src) && REG_P (dst))
1617 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1618 else if (GET_CODE (src) == ASHIFT
1619 || GET_CODE (src) == ASHIFTRT
1620 || GET_CODE (src) == LSHIFTRT)
1622 if (CONST_INT_P (XEXP (src, 0)))
1623 gain -= vector_const_cost (XEXP (src, 0));
1624 if (CONST_INT_P (XEXP (src, 1)))
1626 gain += ix86_cost->shift_const;
1627 if (INTVAL (XEXP (src, 1)) >= 32)
1628 gain -= COSTS_N_INSNS (1);
1630 else
1631 /* Additional gain for omitting two CMOVs. */
1632 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1634 else if (GET_CODE (src) == PLUS
1635 || GET_CODE (src) == MINUS
1636 || GET_CODE (src) == IOR
1637 || GET_CODE (src) == XOR
1638 || GET_CODE (src) == AND)
1640 gain += ix86_cost->add;
1641 /* Additional gain for andnot for targets without BMI. */
1642 if (GET_CODE (XEXP (src, 0)) == NOT
1643 && !TARGET_BMI)
1644 gain += 2 * ix86_cost->add;
1646 if (CONST_INT_P (XEXP (src, 0)))
1647 gain -= vector_const_cost (XEXP (src, 0));
1648 if (CONST_INT_P (XEXP (src, 1)))
1649 gain -= vector_const_cost (XEXP (src, 1));
1651 else if (GET_CODE (src) == NEG
1652 || GET_CODE (src) == NOT)
1653 gain += ix86_cost->add - COSTS_N_INSNS (1);
1654 else if (GET_CODE (src) == COMPARE)
1656 /* Assume comparison cost is the same. */
1658 else if (CONST_INT_P (src))
1660 if (REG_P (dst))
1661 gain += COSTS_N_INSNS (2);
1662 else if (MEM_P (dst))
1663 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1664 gain -= vector_const_cost (src);
1666 else
1667 gcc_unreachable ();
1670 if (dump_file)
1671 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1673 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1674 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1676 if (dump_file)
1677 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1679 gain -= cost;
1681 if (dump_file)
1682 fprintf (dump_file, " Total gain: %d\n", gain);
1684 return gain;
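/* Worked example in symbolic form (editorial; no values are taken from a
   real cost table): a chain of one memory-to-register DImode load plus one
   register-register addition gains
   (2 * int_load[2] - sse_load[1]) + add, and if the chain leaves one
   register live in scalar mode with a single definition,
   mmxsse_to_integer is subtracted once from that total.  */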
1687 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1690 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1692 if (x == reg)
1693 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1695 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1696 int i, j;
1697 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1699 if (fmt[i] == 'e')
1700 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1701 else if (fmt[i] == 'E')
1702 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1703 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1704 reg, new_reg);
1707 return x;
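/* Editorial example: if REG is (reg:DI 100) and NEW_REG is (reg:DI 110),
   an occurrence such as (ior:DI (reg:DI 100) (mem:DI ...)) is rewritten to
   (ior:DI (subreg:V2DI (reg:DI 110) 0) (mem:DI ...)); the mode of the
   enclosing operation is fixed up later in convert_insn.  */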
1710 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1712 void
1713 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1714 rtx reg, rtx new_reg)
1716 replace_with_subreg (single_set (insn), reg, new_reg);
1719 /* Insert generated conversion instruction sequence INSNS
 1720 after instruction AFTER. A new BB may be required if the
 1721 instruction has an EH region attached. */
1723 void
1724 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1726 if (!control_flow_insn_p (after))
1728 emit_insn_after (insns, after);
1729 return;
1732 basic_block bb = BLOCK_FOR_INSN (after);
1733 edge e = find_fallthru_edge (bb->succs);
1734 gcc_assert (e);
1736 basic_block new_bb = split_edge (e);
1737 emit_insn_after (insns, BB_HEAD (new_bb));
 1740 /* Make vector copies for all definitions of register REGNO
 1741 and replace its uses in the chain. */
1743 void
1744 dimode_scalar_chain::make_vector_copies (unsigned regno)
1746 rtx reg = regno_reg_rtx[regno];
1747 rtx vreg = gen_reg_rtx (DImode);
1748 bool count_reg = false;
1749 df_ref ref;
1751 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1752 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1754 df_ref use;
1756 /* Detect the count register of a shift instruction. */
1757 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1758 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1760 rtx_insn *insn = DF_REF_INSN (use);
1761 rtx def_set = single_set (insn);
1763 gcc_assert (def_set);
1765 rtx src = SET_SRC (def_set);
1767 if ((GET_CODE (src) == ASHIFT
1768 || GET_CODE (src) == ASHIFTRT
1769 || GET_CODE (src) == LSHIFTRT)
1770 && !CONST_INT_P (XEXP (src, 1))
1771 && reg_or_subregno (XEXP (src, 1)) == regno)
1772 count_reg = true;
1775 start_sequence ();
1776 if (count_reg)
1778 rtx qreg = gen_lowpart (QImode, reg);
1779 rtx tmp = gen_reg_rtx (SImode);
1781 if (TARGET_ZERO_EXTEND_WITH_AND
1782 && optimize_function_for_speed_p (cfun))
1784 emit_move_insn (tmp, const0_rtx);
1785 emit_insn (gen_movstrictqi
1786 (gen_lowpart (QImode, tmp), qreg));
1788 else
1789 emit_insn (gen_rtx_SET
1790 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1792 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1794 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1795 emit_move_insn (slot, tmp);
1796 tmp = copy_rtx (slot);
1799 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1801 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1803 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1804 emit_move_insn (adjust_address (tmp, SImode, 0),
1805 gen_rtx_SUBREG (SImode, reg, 0));
1806 emit_move_insn (adjust_address (tmp, SImode, 4),
1807 gen_rtx_SUBREG (SImode, reg, 4));
1808 emit_move_insn (vreg, tmp);
1810 else if (TARGET_SSE4_1)
1812 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 CONST0_RTX (V4SImode),
1814 gen_rtx_SUBREG (SImode, reg, 0)));
1815 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1816 gen_rtx_SUBREG (V4SImode, vreg, 0),
1817 gen_rtx_SUBREG (SImode, reg, 4),
1818 GEN_INT (2)));
1820 else
1822 rtx tmp = gen_reg_rtx (DImode);
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 0)));
1826 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1827 CONST0_RTX (V4SImode),
1828 gen_rtx_SUBREG (SImode, reg, 4)));
1829 emit_insn (gen_vec_interleave_lowv4si
1830 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1831 gen_rtx_SUBREG (V4SImode, vreg, 0),
1832 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1834 rtx_insn *seq = get_insns ();
1835 end_sequence ();
1836 rtx_insn *insn = DF_REF_INSN (ref);
1837 emit_conversion_insns (seq, insn);
1839 if (dump_file)
1840 fprintf (dump_file,
1841 " Copied r%d to a vector register r%d for insn %d\n",
1842 regno, REGNO (vreg), INSN_UID (insn));
1845 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1846 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1848 rtx_insn *insn = DF_REF_INSN (ref);
1849 if (count_reg)
1851 rtx def_set = single_set (insn);
1852 gcc_assert (def_set);
1854 rtx src = SET_SRC (def_set);
1856 if ((GET_CODE (src) == ASHIFT
1857 || GET_CODE (src) == ASHIFTRT
1858 || GET_CODE (src) == LSHIFTRT)
1859 && !CONST_INT_P (XEXP (src, 1))
1860 && reg_or_subregno (XEXP (src, 1)) == regno)
1861 XEXP (src, 1) = vreg;
1863 else
1864 replace_with_subreg_in_insn (insn, reg, vreg);
1866 if (dump_file)
1867 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1868 regno, REGNO (vreg), INSN_UID (insn));
1872 /* Convert all definitions of register REGNO
1873 and fix its uses. Scalar copies may be created
 1874 when the register is used in a non-convertible insn. */
1876 void
1877 dimode_scalar_chain::convert_reg (unsigned regno)
1879 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1880 rtx reg = regno_reg_rtx[regno];
1881 rtx scopy = NULL_RTX;
1882 df_ref ref;
1883 bitmap conv;
1885 conv = BITMAP_ALLOC (NULL);
1886 bitmap_copy (conv, insns);
1888 if (scalar_copy)
1889 scopy = gen_reg_rtx (DImode);
1891 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1893 rtx_insn *insn = DF_REF_INSN (ref);
1894 rtx def_set = single_set (insn);
1895 rtx src = SET_SRC (def_set);
1896 rtx reg = DF_REF_REG (ref);
1898 if (!MEM_P (src))
1900 replace_with_subreg_in_insn (insn, reg, reg);
1901 bitmap_clear_bit (conv, INSN_UID (insn));
1904 if (scalar_copy)
1906 start_sequence ();
1907 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1909 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1910 emit_move_insn (tmp, reg);
1911 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1912 adjust_address (tmp, SImode, 0));
1913 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1914 adjust_address (tmp, SImode, 4));
1916 else if (TARGET_SSE4_1)
1918 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1919 emit_insn
1920 (gen_rtx_SET
1921 (gen_rtx_SUBREG (SImode, scopy, 0),
1922 gen_rtx_VEC_SELECT (SImode,
1923 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1925 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1926 emit_insn
1927 (gen_rtx_SET
1928 (gen_rtx_SUBREG (SImode, scopy, 4),
1929 gen_rtx_VEC_SELECT (SImode,
1930 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1932 else
1934 rtx vcopy = gen_reg_rtx (V2DImode);
1935 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1936 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1937 gen_rtx_SUBREG (SImode, vcopy, 0));
1938 emit_move_insn (vcopy,
1939 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1940 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1941 gen_rtx_SUBREG (SImode, vcopy, 0));
1943 rtx_insn *seq = get_insns ();
1944 end_sequence ();
1945 emit_conversion_insns (seq, insn);
1947 if (dump_file)
1948 fprintf (dump_file,
1949 " Copied r%d to a scalar register r%d for insn %d\n",
1950 regno, REGNO (scopy), INSN_UID (insn));
1954 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1955 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1957 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1959 rtx_insn *insn = DF_REF_INSN (ref);
1961 rtx def_set = single_set (insn);
1962 gcc_assert (def_set);
1964 rtx src = SET_SRC (def_set);
1965 rtx dst = SET_DEST (def_set);
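/* When the converted register is used as a variable shift count, only its
   low 8 bits matter; zero-extend them into a vector register (PMOVZXBQ on
   SSE4.1, otherwise an AND with { 0xff, 0 }) and use that as the count.  */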
1967 if ((GET_CODE (src) == ASHIFT
1968 || GET_CODE (src) == ASHIFTRT
1969 || GET_CODE (src) == LSHIFTRT)
1970 && !CONST_INT_P (XEXP (src, 1))
1971 && reg_or_subregno (XEXP (src, 1)) == regno)
1973 rtx tmp2 = gen_reg_rtx (V2DImode);
1975 start_sequence ();
1977 if (TARGET_SSE4_1)
1978 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1979 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1980 else
1982 rtx vec_cst
1983 = gen_rtx_CONST_VECTOR (V2DImode,
1984 gen_rtvec (2, GEN_INT (0xff),
1985 const0_rtx));
1986 vec_cst
1987 = validize_mem (force_const_mem (V2DImode, vec_cst));
1989 emit_insn (gen_rtx_SET
1990 (tmp2,
1991 gen_rtx_AND (V2DImode,
1992 gen_rtx_SUBREG (V2DImode, reg, 0),
1993 vec_cst)));
1995 rtx_insn *seq = get_insns ();
1996 end_sequence ();
1998 emit_insn_before (seq, insn);
2000 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
2002 else if (!MEM_P (dst) || !REG_P (src))
2003 replace_with_subreg_in_insn (insn, reg, reg);
2005 bitmap_clear_bit (conv, INSN_UID (insn));
2008 /* Skip debug insns and uninitialized uses. */
2009 else if (DF_REF_CHAIN (ref)
2010 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2012 gcc_assert (scopy);
2013 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2014 df_insn_rescan (DF_REF_INSN (ref));
2017 BITMAP_FREE (conv);
2020 /* Convert operand OP in INSN. We should handle
2021 memory operands and uninitialized registers.
2022 All other register uses are converted during
2023    register conversion.  */
2025 void
2026 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2028 *op = copy_rtx_if_shared (*op);
2030 if (GET_CODE (*op) == NOT)
2032 convert_op (&XEXP (*op, 0), insn);
2033 PUT_MODE (*op, V2DImode);
2035 else if (MEM_P (*op))
2037 rtx tmp = gen_reg_rtx (DImode);
2039 emit_insn_before (gen_move_insn (tmp, *op), insn);
2040 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2042 if (dump_file)
2043 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2044 INSN_UID (insn), REGNO (tmp));
2046 else if (REG_P (*op))
2048 /* We may not have converted the register use in case
2049    this register has no definition.  Otherwise it
2050    should have been converted in convert_reg.  */
2051 df_ref ref;
2052 FOR_EACH_INSN_USE (ref, insn)
2053 if (DF_REF_REGNO (ref) == REGNO (*op))
2055 gcc_assert (!DF_REF_CHAIN (ref));
2056 break;
2058 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
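/* A constant operand becomes the V2DImode vector { *op, 0 } (all ones for
   -1); anything that is not a standard SSE constant is loaded from the
   constant pool.  */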
2060 else if (CONST_INT_P (*op))
2062 rtx vec_cst;
2063 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2065 /* Prefer all ones vector in case of -1. */
2066 if (constm1_operand (*op, GET_MODE (*op)))
2067 vec_cst = CONSTM1_RTX (V2DImode);
2068 else
2069 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2070 gen_rtvec (2, *op, const0_rtx));
2072 if (!standard_sse_constant_p (vec_cst, V2DImode))
2074 start_sequence ();
2075 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2076 rtx_insn *seq = get_insns ();
2077 end_sequence ();
2078 emit_insn_before (seq, insn);
2081 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2082 *op = tmp;
2084 else
2086 gcc_assert (SUBREG_P (*op));
2087 gcc_assert (GET_MODE (*op) == V2DImode);
2091 /* Convert INSN to vector mode. */
2093 void
2094 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2096 rtx def_set = single_set (insn);
2097 rtx src = SET_SRC (def_set);
2098 rtx dst = SET_DEST (def_set);
2099 rtx subreg;
2101 if (MEM_P (dst) && !REG_P (src))
2103 /* The converted instruction cannot both compute and store to memory,
2104    so a temporary register is required.  */
2105 rtx tmp = gen_reg_rtx (DImode);
2106 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2107 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2110 switch (GET_CODE (src))
2112 case ASHIFT:
2113 case ASHIFTRT:
2114 case LSHIFTRT:
2115 convert_op (&XEXP (src, 0), insn);
2116 PUT_MODE (src, V2DImode);
2117 break;
2119 case PLUS:
2120 case MINUS:
2121 case IOR:
2122 case XOR:
2123 case AND:
2124 convert_op (&XEXP (src, 0), insn);
2125 convert_op (&XEXP (src, 1), insn);
2126 PUT_MODE (src, V2DImode);
2127 break;
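/* There is no V2DImode negation; compute 0 - src instead.  */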
2129 case NEG:
2130 src = XEXP (src, 0);
2131 convert_op (&src, insn);
2132 subreg = gen_reg_rtx (V2DImode);
2133 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2134 src = gen_rtx_MINUS (V2DImode, subreg, src);
2135 break;
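/* There is no vector one's complement; compute src ^ -1 instead.  */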
2137 case NOT:
2138 src = XEXP (src, 0);
2139 convert_op (&src, insn);
2140 subreg = gen_reg_rtx (V2DImode);
2141 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2142 src = gen_rtx_XOR (V2DImode, src, subreg);
2143 break;
2145 case MEM:
2146 if (!REG_P (dst))
2147 convert_op (&src, insn);
2148 break;
2150 case REG:
2151 if (!MEM_P (dst))
2152 convert_op (&src, insn);
2153 break;
2155 case SUBREG:
2156 gcc_assert (GET_MODE (src) == V2DImode);
2157 break;
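/* A comparison against zero becomes a PTEST.  The value is interleaved
   with itself first so that both halves of the vector register hold it
   and the undefined upper half cannot affect the result.  */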
2159 case COMPARE:
2160 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2162 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2163 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2165 if (REG_P (src))
2166 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2167 else
2168 subreg = copy_rtx_if_shared (src);
2169 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2170 copy_rtx_if_shared (subreg),
2171 copy_rtx_if_shared (subreg)),
2172 insn);
2173 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2174 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2175 copy_rtx_if_shared (src)),
2176 UNSPEC_PTEST);
2177 break;
2179 case CONST_INT:
2180 convert_op (&src, insn);
2181 break;
2183 default:
2184 gcc_unreachable ();
2187 SET_SRC (def_set) = src;
2188 SET_DEST (def_set) = dst;
2190 /* Drop possible dead definitions. */
2191 PATTERN (insn) = def_set;
2193 INSN_CODE (insn) = -1;
2194 recog_memoized (insn);
2195 df_insn_rescan (insn);
2198 /* Fix uses of converted REG in debug insns. */
2200 void
2201 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2203 if (!flag_var_tracking)
2204 return;
2206 df_ref ref, next;
2207 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2209 rtx_insn *insn = DF_REF_INSN (ref);
2210 /* Make sure the next ref is for a different instruction,
2211 so that we're not affected by the rescan. */
2212 next = DF_REF_NEXT_REG (ref);
2213 while (next && DF_REF_INSN (next) == insn)
2214 next = DF_REF_NEXT_REG (next);
2216 if (DEBUG_INSN_P (insn))
2218 /* It may be a debug insn with a TImode variable in
2219    a register.  */
2220 bool changed = false;
2221 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2223 rtx *loc = DF_REF_LOC (ref);
2224 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2226 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2227 changed = true;
2230 if (changed)
2231 df_insn_rescan (insn);
2236 /* Convert INSN from TImode to V1TImode.  */
2238 void
2239 timode_scalar_chain::convert_insn (rtx_insn *insn)
2241 rtx def_set = single_set (insn);
2242 rtx src = SET_SRC (def_set);
2243 rtx dst = SET_DEST (def_set);
2245 switch (GET_CODE (dst))
2247 case REG:
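/* Switch the destination register to V1TImode and keep any
   REG_EQUAL/REG_EQUIV note consistent with the new mode.  */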
2249 rtx tmp = find_reg_equal_equiv_note (insn);
2250 if (tmp)
2251 PUT_MODE (XEXP (tmp, 0), V1TImode);
2252 PUT_MODE (dst, V1TImode);
2253 fix_debug_reg_uses (dst);
2255 break;
2256 case MEM:
2257 PUT_MODE (dst, V1TImode);
2258 break;
2260 default:
2261 gcc_unreachable ();
2264 switch (GET_CODE (src))
2266 case REG:
2267 PUT_MODE (src, V1TImode);
2268 /* Call fix_debug_reg_uses only if SRC is never defined. */
2269 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2270 fix_debug_reg_uses (src);
2271 break;
2273 case MEM:
2274 PUT_MODE (src, V1TImode);
2275 break;
2277 case CONST_WIDE_INT:
2278 if (NONDEBUG_INSN_P (insn))
2280 /* Since there are no instructions to store a 128-bit constant,
2281    a temporary register is required.  */
2282 rtx tmp = gen_reg_rtx (V1TImode);
2283 start_sequence ();
2284 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2285 src = validize_mem (force_const_mem (V1TImode, src));
2286 rtx_insn *seq = get_insns ();
2287 end_sequence ();
2288 if (seq)
2289 emit_insn_before (seq, insn);
2290 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2291 dst = tmp;
2293 break;
2295 case CONST_INT:
2296 switch (standard_sse_constant_p (src, TImode))
2298 case 1:
2299 src = CONST0_RTX (GET_MODE (dst));
2300 break;
2301 case 2:
2302 src = CONSTM1_RTX (GET_MODE (dst));
2303 break;
2304 default:
2305 gcc_unreachable ();
2307 if (NONDEBUG_INSN_P (insn))
2309 rtx tmp = gen_reg_rtx (V1TImode);
2310 /* Since there are no instructions to store a standard SSE
2311    constant, a temporary register is required.  */
2312 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2313 dst = tmp;
2315 break;
2317 default:
2318 gcc_unreachable ();
2321 SET_SRC (def_set) = src;
2322 SET_DEST (def_set) = dst;
2324 /* Drop possible dead definitions. */
2325 PATTERN (insn) = def_set;
2327 INSN_CODE (insn) = -1;
2328 recog_memoized (insn);
2329 df_insn_rescan (insn);
2332 void
2333 dimode_scalar_chain::convert_registers ()
2335 bitmap_iterator bi;
2336 unsigned id;
2338 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2339 convert_reg (id);
2341 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2342 make_vector_copies (id);
2345 /* Convert the whole chain, creating the required register
2346 conversions and copies. */
2349 scalar_chain::convert ()
2351 bitmap_iterator bi;
2352 unsigned id;
2353 int converted_insns = 0;
2355 if (!dbg_cnt (stv_conversion))
2356 return 0;
2358 if (dump_file)
2359 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2361 convert_registers ();
2363 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2365 convert_insn (DF_INSN_UID_GET (id)->insn);
2366 converted_insns++;
2369 return converted_insns;
2372 /* Main STV pass function. Find and convert scalar
2373 instructions into vector mode when profitable. */
2375 static unsigned int
2376 convert_scalars_to_vector ()
2378 basic_block bb;
2379 bitmap candidates;
2380 int converted_insns = 0;
2382 bitmap_obstack_initialize (NULL);
2383 candidates = BITMAP_ALLOC (NULL);
2385 calculate_dominance_info (CDI_DOMINATORS);
2386 df_set_flags (DF_DEFER_INSN_RESCAN);
2387 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2388 df_md_add_problem ();
2389 df_analyze ();
2391 /* Find all instructions we want to convert into vector mode. */
2392 if (dump_file)
2393 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2395 FOR_EACH_BB_FN (bb, cfun)
2397 rtx_insn *insn;
2398 FOR_BB_INSNS (bb, insn)
2399 if (scalar_to_vector_candidate_p (insn))
2401 if (dump_file)
2402 fprintf (dump_file, " insn %d is marked as a candidate\n",
2403 INSN_UID (insn));
2405 bitmap_set_bit (candidates, INSN_UID (insn));
2409 remove_non_convertible_regs (candidates);
2411 if (bitmap_empty_p (candidates))
2412 if (dump_file)
2413 fprintf (dump_file, "There are no candidates for optimization.\n");
2415 while (!bitmap_empty_p (candidates))
2417 unsigned uid = bitmap_first_set_bit (candidates);
2418 scalar_chain *chain;
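/* In 64-bit mode the pass converts TImode chains, in 32-bit mode
   DImode chains.  */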
2420 if (TARGET_64BIT)
2421 chain = new timode_scalar_chain;
2422 else
2423 chain = new dimode_scalar_chain;
2425 /* Find the instruction chain we want to convert to vector mode.
2426 Check all uses and definitions to estimate all required
2427 conversions. */
2428 chain->build (candidates, uid);
2430 if (chain->compute_convert_gain () > 0)
2431 converted_insns += chain->convert ();
2432 else
2433 if (dump_file)
2434 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2435 chain->chain_id);
2437 delete chain;
2440 if (dump_file)
2441 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2443 BITMAP_FREE (candidates);
2444 bitmap_obstack_release (NULL);
2445 df_process_deferred_rescans ();
2447 /* Conversion means we may have 128-bit register spills/fills
2448    which require an aligned stack.  */
2449 if (converted_insns)
2451 if (crtl->stack_alignment_needed < 128)
2452 crtl->stack_alignment_needed = 128;
2453 if (crtl->stack_alignment_estimated < 128)
2454 crtl->stack_alignment_estimated = 128;
2455 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2456 if (TARGET_64BIT)
2457 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2458 parm; parm = DECL_CHAIN (parm))
2460 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2461 continue;
2462 if (DECL_RTL_SET_P (parm)
2463 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2465 rtx r = DECL_RTL (parm);
2466 if (REG_P (r))
2467 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2469 if (DECL_INCOMING_RTL (parm)
2470 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2472 rtx r = DECL_INCOMING_RTL (parm);
2473 if (REG_P (r))
2474 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2479 return 0;
2482 namespace {
2484 const pass_data pass_data_insert_vzeroupper =
2486 RTL_PASS, /* type */
2487 "vzeroupper", /* name */
2488 OPTGROUP_NONE, /* optinfo_flags */
2489 TV_MACH_DEP, /* tv_id */
2490 0, /* properties_required */
2491 0, /* properties_provided */
2492 0, /* properties_destroyed */
2493 0, /* todo_flags_start */
2494 TODO_df_finish, /* todo_flags_finish */
2497 class pass_insert_vzeroupper : public rtl_opt_pass
2499 public:
2500 pass_insert_vzeroupper(gcc::context *ctxt)
2501 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2504 /* opt_pass methods: */
2505 virtual bool gate (function *)
2507 return TARGET_AVX
2508 && TARGET_VZEROUPPER && flag_expensive_optimizations
2509 && !optimize_size;
2512 virtual unsigned int execute (function *)
2514 return rest_of_handle_insert_vzeroupper ();
2517 }; // class pass_insert_vzeroupper
2519 const pass_data pass_data_stv =
2521 RTL_PASS, /* type */
2522 "stv", /* name */
2523 OPTGROUP_NONE, /* optinfo_flags */
2524 TV_MACH_DEP, /* tv_id */
2525 0, /* properties_required */
2526 0, /* properties_provided */
2527 0, /* properties_destroyed */
2528 0, /* todo_flags_start */
2529 TODO_df_finish, /* todo_flags_finish */
2532 class pass_stv : public rtl_opt_pass
2534 public:
2535 pass_stv (gcc::context *ctxt)
2536 : rtl_opt_pass (pass_data_stv, ctxt),
2537 timode_p (false)
2540 /* opt_pass methods: */
2541 virtual bool gate (function *)
2543 return (timode_p == !!TARGET_64BIT
2544 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2547 virtual unsigned int execute (function *)
2549 return convert_scalars_to_vector ();
2552 opt_pass *clone ()
2554 return new pass_stv (m_ctxt);
2557 void set_pass_param (unsigned int n, bool param)
2559 gcc_assert (n == 0);
2560 timode_p = param;
2563 private:
2564 bool timode_p;
2565 }; // class pass_stv
2567 } // anon namespace
2569 rtl_opt_pass *
2570 make_pass_insert_vzeroupper (gcc::context *ctxt)
2572 return new pass_insert_vzeroupper (ctxt);
2575 rtl_opt_pass *
2576 make_pass_stv (gcc::context *ctxt)
2578 return new pass_stv (ctxt);
2581 /* Inserting ENDBRANCH instructions. */
2583 static unsigned int
2584 rest_of_insert_endbranch (void)
2586 timevar_push (TV_MACH_DEP);
2588 rtx cet_eb;
2589 rtx_insn *insn;
2590 basic_block bb;
2592 /* Currently emit an EB if this is a tracked function, i.e. 'nocf_check' is
2593    absent among the function attributes.  Later an optimization will be
2594    introduced to analyze whether the address of a static function is
2595    taken.  A static function whose address is not taken will get a
2596    nocf_check attribute.  This will make it possible to reduce the number of EBs.  */
2598 if (!lookup_attribute ("nocf_check",
2599 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2600 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2602 cet_eb = gen_nop_endbr ();
2604 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2605 insn = BB_HEAD (bb);
2606 emit_insn_before (cet_eb, insn);
2609 bb = 0;
2610 FOR_EACH_BB_FN (bb, cfun)
2612 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2613 insn = NEXT_INSN (insn))
2615 if (CALL_P (insn))
2617 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2618 continue;
2619 /* Generate ENDBRANCH after a CALL that can return more than
2620    once, i.e. setjmp-like functions.  */
2622 cet_eb = gen_nop_endbr ();
2623 emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
2624 continue;
2627 if (JUMP_P (insn) && flag_cet_switch)
2629 rtx target = JUMP_LABEL (insn);
2630 if (target == NULL_RTX || ANY_RETURN_P (target))
2631 continue;
2633 /* Check whether the jump is a switch-table jump.  */
2634 rtx_insn *label = as_a<rtx_insn *> (target);
2635 rtx_insn *table = next_insn (label);
2636 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2637 continue;
2639 /* For the indirect jump, find all the places it can jump to and insert
2640    an ENDBRANCH there.  This is done under a special flag to
2641    control ENDBRANCH generation for switch statements.  */
2642 edge_iterator ei;
2643 edge e;
2644 basic_block dest_blk;
2646 FOR_EACH_EDGE (e, ei, bb->succs)
2648 rtx_insn *insn;
2650 dest_blk = e->dest;
2651 insn = BB_HEAD (dest_blk);
2652 gcc_assert (LABEL_P (insn));
2653 cet_eb = gen_nop_endbr ();
2654 emit_insn_after (cet_eb, insn);
2656 continue;
2659 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2660 || (NOTE_P (insn)
2661 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2662 /* TODO. Check /s bit also. */
2664 cet_eb = gen_nop_endbr ();
2665 emit_insn_after (cet_eb, insn);
2666 continue;
2671 timevar_pop (TV_MACH_DEP);
2672 return 0;
2675 namespace {
2677 const pass_data pass_data_insert_endbranch =
2679 RTL_PASS, /* type. */
2680 "cet", /* name. */
2681 OPTGROUP_NONE, /* optinfo_flags. */
2682 TV_MACH_DEP, /* tv_id. */
2683 0, /* properties_required. */
2684 0, /* properties_provided. */
2685 0, /* properties_destroyed. */
2686 0, /* todo_flags_start. */
2687 0, /* todo_flags_finish. */
2690 class pass_insert_endbranch : public rtl_opt_pass
2692 public:
2693 pass_insert_endbranch (gcc::context *ctxt)
2694 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2697 /* opt_pass methods: */
2698 virtual bool gate (function *)
2700 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2703 virtual unsigned int execute (function *)
2705 return rest_of_insert_endbranch ();
2708 }; // class pass_insert_endbranch
2710 } // anon namespace
2712 rtl_opt_pass *
2713 make_pass_insert_endbranch (gcc::context *ctxt)
2715 return new pass_insert_endbranch (ctxt);
2718 /* Return true if a red zone is in use.  We can't use the red zone when
2719    there are local indirect jumps, like "indirect_jump" or "tablejump",
2720    which jump to another place in the function, since "call" in the
2721    indirect thunk pushes the return address onto the stack, destroying
2722    the red zone.
2724    TODO: If we can reserve the first 2 WORDs of the red zone, one for PUSH
2725    and another for CALL, we can allow local indirect jumps with the
2726    indirect thunk.  */
2728 bool
2729 ix86_using_red_zone (void)
2731 return (TARGET_RED_ZONE
2732 && !TARGET_64BIT_MS_ABI
2733 && (!cfun->machine->has_local_indirect_jump
2734 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2737 /* Return a string that documents the current -m options. The caller is
2738 responsible for freeing the string. */
2740 static char *
2741 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2742 int flags, int flags2,
2743 const char *arch, const char *tune,
2744 enum fpmath_unit fpmath, bool add_nl_p)
2746 struct ix86_target_opts
2748 const char *option; /* option string */
2749 HOST_WIDE_INT mask; /* isa mask options */
2752 /* This table is ordered so that options like -msse4.2 that imply other
2753    ISAs come first.  The target string will be displayed in the same order.  */
2754 static struct ix86_target_opts isa2_opts[] =
2756 { "-mcx16", OPTION_MASK_ISA_CX16 },
2757 { "-mmpx", OPTION_MASK_ISA_MPX },
2758 { "-mvaes", OPTION_MASK_ISA_VAES },
2759 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2760 { "-mpconfig", OPTION_MASK_ISA_PCONFIG },
2761 { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD },
2762 { "-msgx", OPTION_MASK_ISA_SGX },
2763 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2764 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2765 { "-mibt", OPTION_MASK_ISA_IBT },
2766 { "-mhle", OPTION_MASK_ISA_HLE },
2767 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2768 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2769 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2771 static struct ix86_target_opts isa_opts[] =
2773 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2774 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2775 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2776 { "-mgfni", OPTION_MASK_ISA_GFNI },
2777 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2778 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2779 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2780 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2781 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2782 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2783 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2784 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2785 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2786 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2787 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2788 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2789 { "-mfma", OPTION_MASK_ISA_FMA },
2790 { "-mxop", OPTION_MASK_ISA_XOP },
2791 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2792 { "-mf16c", OPTION_MASK_ISA_F16C },
2793 { "-mavx", OPTION_MASK_ISA_AVX },
2794 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2795 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2796 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2797 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2798 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2799 { "-msse3", OPTION_MASK_ISA_SSE3 },
2800 { "-maes", OPTION_MASK_ISA_AES },
2801 { "-msha", OPTION_MASK_ISA_SHA },
2802 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2803 { "-msse2", OPTION_MASK_ISA_SSE2 },
2804 { "-msse", OPTION_MASK_ISA_SSE },
2805 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2806 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2807 { "-mmmx", OPTION_MASK_ISA_MMX },
2808 { "-mrtm", OPTION_MASK_ISA_RTM },
2809 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2810 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2811 { "-madx", OPTION_MASK_ISA_ADX },
2812 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2813 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2814 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2815 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2816 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2817 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2818 { "-mabm", OPTION_MASK_ISA_ABM },
2819 { "-mbmi", OPTION_MASK_ISA_BMI },
2820 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2821 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2822 { "-mtbm", OPTION_MASK_ISA_TBM },
2823 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2824 { "-msahf", OPTION_MASK_ISA_SAHF },
2825 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2826 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2827 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2828 { "-mpku", OPTION_MASK_ISA_PKU },
2829 { "-mlwp", OPTION_MASK_ISA_LWP },
2830 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2831 { "-mclwb", OPTION_MASK_ISA_CLWB },
2832 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2835 /* Flag options. */
2836 static struct ix86_target_opts flag_opts[] =
2838 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2839 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2840 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2841 { "-m80387", MASK_80387 },
2842 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2843 { "-malign-double", MASK_ALIGN_DOUBLE },
2844 { "-mcld", MASK_CLD },
2845 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2846 { "-mieee-fp", MASK_IEEE_FP },
2847 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2848 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2849 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2850 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2851 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2852 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2853 { "-mno-red-zone", MASK_NO_RED_ZONE },
2854 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2855 { "-mrecip", MASK_RECIP },
2856 { "-mrtd", MASK_RTD },
2857 { "-msseregparm", MASK_SSEREGPARM },
2858 { "-mstack-arg-probe", MASK_STACK_PROBE },
2859 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2860 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2861 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2862 { "-mvzeroupper", MASK_VZEROUPPER },
2863 { "-mstv", MASK_STV },
2864 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2865 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2866 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2869 /* Additional flag options. */
2870 static struct ix86_target_opts flag2_opts[] =
2872 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2875 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2876 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2878 char isa_other[40];
2879 char isa2_other[40];
2880 char flags_other[40];
2881 char flags2_other[40];
2882 unsigned num = 0;
2883 unsigned i, j;
2884 char *ret;
2885 char *ptr;
2886 size_t len;
2887 size_t line_len;
2888 size_t sep_len;
2889 const char *abi;
2891 memset (opts, '\0', sizeof (opts));
2893 /* Add -march= option. */
2894 if (arch)
2896 opts[num][0] = "-march=";
2897 opts[num++][1] = arch;
2900 /* Add -mtune= option. */
2901 if (tune)
2903 opts[num][0] = "-mtune=";
2904 opts[num++][1] = tune;
2907 /* Add -m32/-m64/-mx32. */
2908 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2910 if ((isa & OPTION_MASK_ABI_64) != 0)
2911 abi = "-m64";
2912 else
2913 abi = "-mx32";
2914 isa &= ~ (OPTION_MASK_ISA_64BIT
2915 | OPTION_MASK_ABI_64
2916 | OPTION_MASK_ABI_X32);
2918 else
2919 abi = "-m32";
2920 opts[num++][0] = abi;
2922 /* Pick out the options in isa2 options. */
2923 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2925 if ((isa2 & isa2_opts[i].mask) != 0)
2927 opts[num++][0] = isa2_opts[i].option;
2928 isa2 &= ~ isa2_opts[i].mask;
2932 if (isa2 && add_nl_p)
2934 opts[num++][0] = isa2_other;
2935 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2938 /* Pick out the options in isa options. */
2939 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2941 if ((isa & isa_opts[i].mask) != 0)
2943 opts[num++][0] = isa_opts[i].option;
2944 isa &= ~ isa_opts[i].mask;
2948 if (isa && add_nl_p)
2950 opts[num++][0] = isa_other;
2951 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2954 /* Add flag options. */
2955 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2957 if ((flags & flag_opts[i].mask) != 0)
2959 opts[num++][0] = flag_opts[i].option;
2960 flags &= ~ flag_opts[i].mask;
2964 if (flags && add_nl_p)
2966 opts[num++][0] = flags_other;
2967 sprintf (flags_other, "(other flags: %#x)", flags);
2970 /* Add additional flag options. */
2971 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2973 if ((flags2 & flag2_opts[i].mask) != 0)
2975 opts[num++][0] = flag2_opts[i].option;
2976 flags2 &= ~ flag2_opts[i].mask;
2980 if (flags2 && add_nl_p)
2982 opts[num++][0] = flags2_other;
2983 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2986 /* Add -fpmath= option. */
2987 if (fpmath)
2989 opts[num][0] = "-mfpmath=";
2990 switch ((int) fpmath)
2992 case FPMATH_387:
2993 opts[num++][1] = "387";
2994 break;
2996 case FPMATH_SSE:
2997 opts[num++][1] = "sse";
2998 break;
3000 case FPMATH_387 | FPMATH_SSE:
3001 opts[num++][1] = "sse+387";
3002 break;
3004 default:
3005 gcc_unreachable ();
3009 /* Any options? */
3010 if (num == 0)
3011 return NULL;
3013 gcc_assert (num < ARRAY_SIZE (opts));
3015 /* Size the string. */
3016 len = 0;
3017 sep_len = (add_nl_p) ? 3 : 1;
3018 for (i = 0; i < num; i++)
3020 len += sep_len;
3021 for (j = 0; j < 2; j++)
3022 if (opts[i][j])
3023 len += strlen (opts[i][j]);
3026 /* Build the string. */
3027 ret = ptr = (char *) xmalloc (len);
3028 line_len = 0;
3030 for (i = 0; i < num; i++)
3032 size_t len2[2];
3034 for (j = 0; j < 2; j++)
3035 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3037 if (i != 0)
3039 *ptr++ = ' ';
3040 line_len++;
3042 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3044 *ptr++ = '\\';
3045 *ptr++ = '\n';
3046 line_len = 0;
3050 for (j = 0; j < 2; j++)
3051 if (opts[i][j])
3053 memcpy (ptr, opts[i][j], len2[j]);
3054 ptr += len2[j];
3055 line_len += len2[j];
3059 *ptr = '\0';
3060 gcc_assert (ret + len >= ptr);
3062 return ret;
3065 /* Return true if profiling code should be emitted before the
3066    prologue, otherwise return false.
3067    Note: for x86 the "hotfix" case is rejected with a sorry () diagnostic.  */
3068 static bool
3069 ix86_profile_before_prologue (void)
3071 return flag_fentry != 0;
3074 /* Function that is callable from the debugger to print the current
3075 options. */
3076 void ATTRIBUTE_UNUSED
3077 ix86_debug_options (void)
3079 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3080 target_flags, ix86_target_flags,
3081 ix86_arch_string, ix86_tune_string,
3082 ix86_fpmath, true);
3084 if (opts)
3086 fprintf (stderr, "%s\n\n", opts);
3087 free (opts);
3089 else
3090 fputs ("<no options>\n\n", stderr);
3092 return;
3095 /* Return true if T is one of the bytes we should avoid with
3096 -mmitigate-rop. */
3098 static bool
3099 ix86_rop_should_change_byte_p (int t)
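/* 0xc3 and 0xc2 are the near RET opcodes (plain and with imm16);
   0xcb and 0xca are the corresponding far RET opcodes.  */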
3101 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3104 static const char *stringop_alg_names[] = {
3105 #define DEF_ENUM
3106 #define DEF_ALG(alg, name) #name,
3107 #include "stringop.def"
3108 #undef DEF_ENUM
3109 #undef DEF_ALG
3112 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3113    The string has the following form (or is a comma-separated list of such entries):
3115 strategy_alg:max_size:[align|noalign]
3117 where the full size range for the strategy is either [0, max_size] or
3118 [min_size, max_size], in which min_size is the max_size + 1 of the
3119 preceding range. The last size range must have max_size == -1.
3121 Examples:
3124 -mmemcpy-strategy=libcall:-1:noalign
3126 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3130 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3132    This tells the compiler to use the following strategy for memset:
3133 1) when the expected size is between [1, 16], use rep_8byte strategy;
3134 2) when the size is between [17, 2048], use vector_loop;
3135 3) when the size is > 2048, use libcall. */
3137 struct stringop_size_range
3139 int max;
3140 stringop_alg alg;
3141 bool noalign;
3144 static void
3145 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3147 const struct stringop_algs *default_algs;
3148 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3149 char *curr_range_str, *next_range_str;
3150 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3151 int i = 0, n = 0;
3153 if (is_memset)
3154 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3155 else
3156 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
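/* Index 1 selects the 64-bit default cost table, index 0 the 32-bit one.  */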
3158 curr_range_str = strategy_str;
3162 int maxs;
3163 char alg_name[128];
3164 char align[16];
3165 next_range_str = strchr (curr_range_str, ',');
3166 if (next_range_str)
3167 *next_range_str++ = '\0';
3169 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3170 align) != 3)
3172 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3173 return;
3176 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3178 error ("size ranges of option %qs should be increasing", opt);
3179 return;
3182 for (i = 0; i < last_alg; i++)
3183 if (!strcmp (alg_name, stringop_alg_names[i]))
3184 break;
3186 if (i == last_alg)
3188 error ("wrong strategy name %qs specified for option %qs",
3189 alg_name, opt);
3191 auto_vec <const char *> candidates;
3192 for (i = 0; i < last_alg; i++)
3193 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3194 candidates.safe_push (stringop_alg_names[i]);
3196 char *s;
3197 const char *hint
3198 = candidates_list_and_hint (alg_name, s, candidates);
3199 if (hint)
3200 inform (input_location,
3201 "valid arguments to %qs are: %s; did you mean %qs?",
3202 opt, s, hint);
3203 else
3204 inform (input_location, "valid arguments to %qs are: %s",
3205 opt, s);
3206 XDELETEVEC (s);
3207 return;
3210 if ((stringop_alg) i == rep_prefix_8_byte
3211 && !TARGET_64BIT)
3213 /* rep; movq isn't available in 32-bit code. */
3214 error ("strategy name %qs specified for option %qs "
3215 "not supported for 32-bit code", alg_name, opt);
3216 return;
3219 input_ranges[n].max = maxs;
3220 input_ranges[n].alg = (stringop_alg) i;
3221 if (!strcmp (align, "align"))
3222 input_ranges[n].noalign = false;
3223 else if (!strcmp (align, "noalign"))
3224 input_ranges[n].noalign = true;
3225 else
3227 error ("unknown alignment %qs specified for option %qs", align, opt);
3228 return;
3230 n++;
3231 curr_range_str = next_range_str;
3233 while (curr_range_str);
3235 if (input_ranges[n - 1].max != -1)
3237 error ("the max value for the last size range should be -1"
3238 " for option %qs", opt);
3239 return;
3242 if (n > MAX_STRINGOP_ALGS)
3244 error ("too many size ranges specified in option %qs", opt);
3245 return;
3248 /* Now override the default algs array. */
3249 for (i = 0; i < n; i++)
3251 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3252 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3253 = input_ranges[i].alg;
3254 *const_cast<int *>(&default_algs->size[i].noalign)
3255 = input_ranges[i].noalign;
3260 /* Parse the -mtune-ctrl= option.  When DUMP is true,
3261 print the features that are explicitly set. */
3263 static void
3264 parse_mtune_ctrl_str (bool dump)
3266 if (!ix86_tune_ctrl_string)
3267 return;
3269 char *next_feature_string = NULL;
3270 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3271 char *orig = curr_feature_string;
3272 int i;
3275 bool clear = false;
3277 next_feature_string = strchr (curr_feature_string, ',');
3278 if (next_feature_string)
3279 *next_feature_string++ = '\0';
3280 if (*curr_feature_string == '^')
3282 curr_feature_string++;
3283 clear = true;
3285 for (i = 0; i < X86_TUNE_LAST; i++)
3287 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3289 ix86_tune_features[i] = !clear;
3290 if (dump)
3291 fprintf (stderr, "Explicitly %s feature %s\n",
3292 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3293 break;
3296 if (i == X86_TUNE_LAST)
3297 error ("unknown parameter to option -mtune-ctrl: %s",
3298 clear ? curr_feature_string - 1 : curr_feature_string);
3299 curr_feature_string = next_feature_string;
3301 while (curr_feature_string);
3302 free (orig);
3305 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3306 processor type. */
3308 static void
3309 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3311 unsigned int ix86_tune_mask = 1u << ix86_tune;
3312 int i;
3314 for (i = 0; i < X86_TUNE_LAST; ++i)
3316 if (ix86_tune_no_default)
3317 ix86_tune_features[i] = 0;
3318 else
3319 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3322 if (dump)
3324 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3325 for (i = 0; i < X86_TUNE_LAST; i++)
3326 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3327 ix86_tune_features[i] ? "on" : "off");
3330 parse_mtune_ctrl_str (dump);
3334 /* Default align_* from the processor table. */
3336 static void
3337 ix86_default_align (struct gcc_options *opts)
3339 if (opts->x_align_loops == 0)
3341 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3342 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3344 if (opts->x_align_jumps == 0)
3346 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3347 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3349 if (opts->x_align_functions == 0)
3351 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3355 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3357 static void
3358 ix86_override_options_after_change (void)
3360 ix86_default_align (&global_options);
3363 /* Override various settings based on options. If MAIN_ARGS_P, the
3364 options are from the command line, otherwise they are from
3365    attributes.  Return true if there's an error related to the -march
3366 option. */
3368 static bool
3369 ix86_option_override_internal (bool main_args_p,
3370 struct gcc_options *opts,
3371 struct gcc_options *opts_set)
3373 int i;
3374 unsigned int ix86_arch_mask;
3375 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3377 const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3378 const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3379 const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3380 const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3381 const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3382 const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3383 const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3384 const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3385 const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3386 const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3387 const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3388 const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3389 const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3390 const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3391 const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3392 const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3393 const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3394 const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3395 const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3396 const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3397 const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3398 const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3399 const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3400 const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3401 const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3402 const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3403 const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3404 const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3405 const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3406 const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3407 const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3408 const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3409 const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3410 const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3411 const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3412 const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3413 const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3414 const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3415 const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3416 const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3417 const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3418 const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3419 const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3420 const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3421 const wide_int_bitmask PTA_MPX (HOST_WIDE_INT_1U << 44);
3422 const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3423 const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3424 const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3425 const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3426 const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3427 const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3428 const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3429 const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3430 const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3431 const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3432 const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3433 const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3434 const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3435 const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3436 const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3437 const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3438 const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3439 const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3440 const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
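/* PTA flags past bit 63 are kept in the second HOST_WIDE_INT of the
   wide_int_bitmask.  */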
3441 const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3442 const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3443 const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3444 const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3445 const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3446 const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3447 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
3449 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3450 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3451 const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3452 | PTA_POPCNT;
3453 const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3454 const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3455 | PTA_XSAVEOPT;
3456 const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3457 | PTA_RDRND | PTA_F16C;
3458 const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3459 | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3460 const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3461 | PTA_RDSEED;
3462 const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3463 | PTA_XSAVEC | PTA_XSAVES;
3464 const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3465 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3466 | PTA_CLWB;
3467 const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE | PTA_AVX512F
3468 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3469 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA;
3470 const wide_int_bitmask PTA_ICELAKE = PTA_CANNONLAKE | PTA_AVX512VNNI
3471 | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3472 | PTA_RDPID;
3473 const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3474 | PTA_AVX512F | PTA_AVX512CD;
3475 const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3476 const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3477 const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3478 | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3480 static struct pta
3482 const char *const name; /* processor name or nickname. */
3483 const enum processor_type processor;
3484 const enum attr_cpu schedule;
3485 const wide_int_bitmask flags;
3487 const processor_alias_table[] =
3489 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3490 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3491 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3492 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3493 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3494 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3495 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3496 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3497 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3498 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3499 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3500 PTA_MMX | PTA_SSE | PTA_FXSR},
3501 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3502 PTA_MMX | PTA_SSE | PTA_FXSR},
3503 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3504 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3505 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3506 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3507 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3508 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3509 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3510 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3511 PTA_MMX | PTA_SSE | PTA_FXSR},
3512 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3513 PTA_MMX | PTA_SSE | PTA_FXSR},
3514 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3515 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3516 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3517 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3518 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3519 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3520 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3521 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3522 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3523 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3524 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3525 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3526 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3527 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3528 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3529 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3530 PTA_SANDYBRIDGE},
3531 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3532 PTA_SANDYBRIDGE},
3533 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3534 PTA_IVYBRIDGE},
3535 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3536 PTA_IVYBRIDGE},
3537 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3538 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3539 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3540 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3541 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3542 PTA_SKYLAKE_AVX512},
3543 {"cannonlake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_CANNONLAKE},
3544 {"icelake", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL, PTA_ICELAKE},
3545 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3546 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3547 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3548 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3549 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3550 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3551 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3552 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3553 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3554 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3555 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3556 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3557 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3558 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3559 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3560 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3561 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3562 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3563 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3564 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3565 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3566 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3567 {"x86-64", PROCESSOR_K8, CPU_K8,
3568 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3569 {"eden-x2", PROCESSOR_K8, CPU_K8,
3570 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3571 {"nano", PROCESSOR_K8, CPU_K8,
3572 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3573 | PTA_SSSE3 | PTA_FXSR},
3574 {"nano-1000", PROCESSOR_K8, CPU_K8,
3575 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3576 | PTA_SSSE3 | PTA_FXSR},
3577 {"nano-2000", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3579 | PTA_SSSE3 | PTA_FXSR},
3580 {"nano-3000", PROCESSOR_K8, CPU_K8,
3581 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3582 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3583 {"nano-x2", PROCESSOR_K8, CPU_K8,
3584 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3585 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3586 {"eden-x4", PROCESSOR_K8, CPU_K8,
3587 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3588 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3589 {"nano-x4", PROCESSOR_K8, CPU_K8,
3590 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3591 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3592 {"k8", PROCESSOR_K8, CPU_K8,
3593 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3594 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3595 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3596 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3597 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3598 {"opteron", PROCESSOR_K8, CPU_K8,
3599 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3600 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3601 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3602 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3603 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3604 {"athlon64", PROCESSOR_K8, CPU_K8,
3605 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3606 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3607 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3608 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3609 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3610 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3611 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3612 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3613 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3614 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3615 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3616 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3617 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3618 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3619 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3620 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3621 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3622 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3623 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3624 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3625 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3626 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3627 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3628 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3629 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3630 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3631 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3632 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3633 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3634 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3635 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3636 | PTA_XSAVEOPT | PTA_FSGSBASE},
3637 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3638 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3639 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3640 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3641 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3642 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3643 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3644 | PTA_MOVBE | PTA_MWAITX},
3645 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3646 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3647 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3648 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3649 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3650 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3651 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3652 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3653 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3654 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3655 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3656 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3657 | PTA_FXSR | PTA_XSAVE},
3658 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3659 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3660 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3661 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3662 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3663 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3665 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3666 PTA_64BIT
3667 | PTA_HLE /* flags are only used for -march switch. */ },
3670 /* -mrecip options. */
3671 static struct
3673 const char *string; /* option name */
3674 unsigned int mask; /* mask bits to set */
3676 const recip_options[] =
3678 { "all", RECIP_MASK_ALL },
3679 { "none", RECIP_MASK_NONE },
3680 { "div", RECIP_MASK_DIV },
3681 { "sqrt", RECIP_MASK_SQRT },
3682 { "vec-div", RECIP_MASK_VEC_DIV },
3683 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3686 int const pta_size = ARRAY_SIZE (processor_alias_table);
3688 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3689 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3690 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3691 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3692 #ifdef TARGET_BI_ARCH
3693 else
3695 #if TARGET_BI_ARCH == 1
3696 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3697 is on and OPTION_MASK_ABI_X32 is off. We turn off
3698 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3699 -mx32. */
3700 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3701 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3702 #else
3703 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3704 on and OPTION_MASK_ABI_64 is off. We turn off
3705 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3706 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3707 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3708 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3709 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3710 #endif
3711 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3712 && TARGET_IAMCU_P (opts->x_target_flags))
3713 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3714 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3716 #endif
3718 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3720 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3721 OPTION_MASK_ABI_64 for TARGET_X32. */
3722 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3723 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3725 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3726 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3727 | OPTION_MASK_ABI_X32
3728 | OPTION_MASK_ABI_64);
3729 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3731 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3732 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3733 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3734 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3737 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3738 SUBTARGET_OVERRIDE_OPTIONS;
3739 #endif
3741 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3742 SUBSUBTARGET_OVERRIDE_OPTIONS;
3743 #endif
3745 /* -fPIC is the default for x86_64. */
3746 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3747 opts->x_flag_pic = 2;
3749 /* Need to check -mtune=generic first. */
3750 if (opts->x_ix86_tune_string)
3752 /* As special support for cross compilers we read -mtune=native
3753    as -mtune=generic.  With native compilers we won't see
3754    -mtune=native, as it has already been changed by the driver.  */
3755 if (!strcmp (opts->x_ix86_tune_string, "native"))
3757 opts->x_ix86_tune_string = "generic";
3759 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3760 warning (OPT_Wdeprecated,
3761 main_args_p
3762 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3763 "or %<-mtune=generic%> instead as appropriate")
3764 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3765 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3766 " instead as appropriate"));
3768 else
3770 if (opts->x_ix86_arch_string)
3771 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3772 if (!opts->x_ix86_tune_string)
3774 opts->x_ix86_tune_string
3775 = processor_target_table[TARGET_CPU_DEFAULT].name;
3776 ix86_tune_defaulted = 1;
3779 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3780 or defaulted. We need to use a sensible tune option. */
3781 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3783 opts->x_ix86_tune_string = "generic";
3787 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3788 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3790 /* rep; movq isn't available in 32-bit code. */
3791 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3792 opts->x_ix86_stringop_alg = no_stringop;
3795 if (!opts->x_ix86_arch_string)
3796 opts->x_ix86_arch_string
3797 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3798 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3799 else
3800 ix86_arch_specified = 1;
3802 if (opts_set->x_ix86_pmode)
3804 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3805 && opts->x_ix86_pmode == PMODE_SI)
3806 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3807 && opts->x_ix86_pmode == PMODE_DI))
3808 error ("address mode %qs not supported in the %s bit mode",
3809 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3810 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3812 else
3813 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3814 ? PMODE_DI : PMODE_SI;
3816 if (!opts_set->x_ix86_abi)
3817 opts->x_ix86_abi = DEFAULT_ABI;
3819 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3820 error ("-mabi=ms not supported with X32 ABI");
3821 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3823 /* For targets using the MS ABI enable ms-extensions, if not
3824 explicitly turned off. For non-MS ABIs we turn this
3825 option off. */
3826 if (!opts_set->x_flag_ms_extensions)
3827 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3829 if (opts_set->x_ix86_cmodel)
3831 switch (opts->x_ix86_cmodel)
3833 case CM_SMALL:
3834 case CM_SMALL_PIC:
3835 if (opts->x_flag_pic)
3836 opts->x_ix86_cmodel = CM_SMALL_PIC;
3837 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3838 error ("code model %qs not supported in the %s bit mode",
3839 "small", "32");
3840 break;
3842 case CM_MEDIUM:
3843 case CM_MEDIUM_PIC:
3844 if (opts->x_flag_pic)
3845 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3846 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3847 error ("code model %qs not supported in the %s bit mode",
3848 "medium", "32");
3849 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3850 error ("code model %qs not supported in x32 mode",
3851 "medium");
3852 break;
3854 case CM_LARGE:
3855 case CM_LARGE_PIC:
3856 if (opts->x_flag_pic)
3857 opts->x_ix86_cmodel = CM_LARGE_PIC;
3858 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3859 error ("code model %qs not supported in the %s bit mode",
3860 "large", "32");
3861 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3862 error ("code model %qs not supported in x32 mode",
3863 "large");
3864 break;
3866 case CM_32:
3867 if (opts->x_flag_pic)
3868 error ("code model %s does not support PIC mode", "32");
3869 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3870 error ("code model %qs not supported in the %s bit mode",
3871 "32", "64");
3872 break;
3874 case CM_KERNEL:
3875 if (opts->x_flag_pic)
3877 error ("code model %s does not support PIC mode", "kernel");
3878 opts->x_ix86_cmodel = CM_32;
3880 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3881 error ("code model %qs not supported in the %s bit mode",
3882 "kernel", "32");
3883 break;
3885 default:
3886 gcc_unreachable ();
3889 else
3891 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3892 use of rip-relative addressing. This eliminates fixups that
3893 would otherwise be needed if this object is to be placed in a
3894 DLL, and is essentially just as efficient as direct addressing. */
3895 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3896 && (TARGET_RDOS || TARGET_PECOFF))
3897 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3898 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3899 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3900 else
3901 opts->x_ix86_cmodel = CM_32;
3903 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3905 error ("-masm=intel not supported in this configuration");
3906 opts->x_ix86_asm_dialect = ASM_ATT;
3908 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3909 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3910 sorry ("%i-bit mode not compiled in",
3911 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3913 for (i = 0; i < pta_size; i++)
3914 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3916 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3918 error (main_args_p
3919 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3920 "switch")
3921 : G_("%<generic%> CPU can be used only for "
3922 "%<target(\"tune=\")%> attribute"));
3923 return false;
3925 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3927 error (main_args_p
3928 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3929 "switch")
3930 : G_("%<intel%> CPU can be used only for "
3931 "%<target(\"tune=\")%> attribute"));
3932 return false;
3935 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3936 && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3938 error ("CPU you selected does not support x86-64 "
3939 "instruction set");
3940 return false;
3943 ix86_schedule = processor_alias_table[i].schedule;
3944 ix86_arch = processor_alias_table[i].processor;
3945 /* Default cpu tuning to the architecture. */
3946 ix86_tune = ix86_arch;
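/* Propagate the ISA features implied by the selected -march= entry into
   the ISA flags, but only where the user has not set the corresponding
   option explicitly; e.g. -march=btver2 implies -mavx (PTA_AVX above)
   unless -mno-avx was also given.  */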
3948 if (((processor_alias_table[i].flags & PTA_MMX) != 0)
3949 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3950 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3951 if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
3952 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3953 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3954 if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
3955 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3956 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3957 if (((processor_alias_table[i].flags & PTA_SSE) != 0)
3958 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3959 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3960 if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
3961 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3963 if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
3964 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3965 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3966 if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
3967 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3968 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3969 if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
3970 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3971 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3972 if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
3973 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3974 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3975 if (((processor_alias_table[i].flags & PTA_AVX) != 0)
3976 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3977 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3978 if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
3979 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3980 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3981 if (((processor_alias_table[i].flags & PTA_FMA) != 0)
3982 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3983 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3984 if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
3985 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3986 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3987 if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
3988 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3989 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3990 if (((processor_alias_table[i].flags & PTA_XOP) != 0)
3991 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3992 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3993 if (((processor_alias_table[i].flags & PTA_LWP) != 0)
3994 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3995 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3996 if (((processor_alias_table[i].flags & PTA_ABM) != 0)
3997 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3998 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3999 if (((processor_alias_table[i].flags & PTA_BMI) != 0)
4000 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
4002 if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
4003 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4004 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4005 if (((processor_alias_table[i].flags & PTA_TBM) != 0)
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4008 if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
4009 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4010 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4011 if (((processor_alias_table[i].flags & PTA_CX16) != 0)
4012 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4013 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
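/* CX16 above, and MOVBE, HLE, CLZERO, MPX, SGX, VAES, RDPID and MWAITX
   below, are tracked in the second ISA word (x_ix86_isa_flags2) together
   with x_ix86_isa_flags2_explicit, but otherwise follow the same
   pattern.  */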
4014 if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
4015 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4016 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4017 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4018 && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4021 if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
4022 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4023 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4024 if (((processor_alias_table[i].flags & PTA_AES) != 0)
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4027 if (((processor_alias_table[i].flags & PTA_SHA) != 0)
4028 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4029 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4030 if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
4031 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4032 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4033 if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
4034 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4035 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4036 if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
4037 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4038 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4039 if (((processor_alias_table[i].flags & PTA_F16C) != 0)
4040 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4041 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4042 if (((processor_alias_table[i].flags & PTA_RTM) != 0)
4043 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4044 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4045 if (((processor_alias_table[i].flags & PTA_HLE) != 0)
4046 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4047 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4048 if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
4049 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4050 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4051 if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
4052 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4053 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4054 if (((processor_alias_table[i].flags & PTA_ADX) != 0)
4055 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4056 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4057 if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
4058 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4059 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4060 if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
4061 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4062 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4063 if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
4064 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4065 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4066 if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
4067 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4068 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4069 if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
4070 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4071 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4072 if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
4073 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4074 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4075 if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
4076 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4078 if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
4079 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4080 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4081 if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
4082 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4083 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4084 if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4085 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4087 if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4088 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4089 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4090 if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4091 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4092 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4093 if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4094 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4095 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4096 if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4097 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4098 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4099 if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4100 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4101 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4102 if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4103 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4104 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4105 if (((processor_alias_table[i].flags & PTA_MPX) != 0)
4106 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4107 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4108 if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4109 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4110 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4111 if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4112 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4113 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4114 if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4115 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4116 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4117 if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4118 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4119 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4120 if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4121 && !(opts->x_ix86_isa_flags_explicit
4122 & OPTION_MASK_ISA_AVX512VBMI2))
4123 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4124 if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4125 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4126 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4127 if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4128 && !(opts->x_ix86_isa_flags_explicit
4129 & OPTION_MASK_ISA_AVX512BITALG))
4130 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4132 if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4133 && !(opts->x_ix86_isa_flags2_explicit
4134 & OPTION_MASK_ISA_AVX5124VNNIW))
4135 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4136 if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4137 && !(opts->x_ix86_isa_flags2_explicit
4138 & OPTION_MASK_ISA_AVX5124FMAPS))
4139 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4140 if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4141 && !(opts->x_ix86_isa_flags_explicit
4142 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4143 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4144 if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4145 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4146 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4147 if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4148 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4149 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4150 if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4151 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4152 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4154 if ((processor_alias_table[i].flags
4155 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4156 x86_prefetch_sse = true;
4157 if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4158 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4159 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4160 if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4161 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4162 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4164 /* Don't enable x87 instructions if only
4165 general registers are allowed. */
4166 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4167 && !(opts_set->x_target_flags & MASK_80387))
4169 if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4170 opts->x_target_flags &= ~MASK_80387;
4171 else
4172 opts->x_target_flags |= MASK_80387;
4174 break;
4177 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4178 error ("Intel MPX does not support x32");
4183 if (i == pta_size)
4185 error (main_args_p
4186 ? G_("bad value (%qs) for %<-march=%> switch")
4187 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4188 opts->x_ix86_arch_string);
4190 auto_vec <const char *> candidates;
4191 for (i = 0; i < pta_size; i++)
4192 if (strcmp (processor_alias_table[i].name, "generic")
4193 && strcmp (processor_alias_table[i].name, "intel")
4194 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4195 || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4196 candidates.safe_push (processor_alias_table[i].name);
4198 #ifdef HAVE_LOCAL_CPU_DETECT
4199 /* Add also "native" as possible value. */
4200 candidates.safe_push ("native");
4201 #endif
4203 char *s;
4204 const char *hint
4205 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4206 if (hint)
4207 inform (input_location,
4208 main_args_p
4209 ? G_("valid arguments to %<-march=%> switch are: "
4210 "%s; did you mean %qs?")
4211 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4212 "%s; did you mean %qs?"), s, hint);
4213 else
4214 inform (input_location,
4215 main_args_p
4216 ? G_("valid arguments to %<-march=%> switch are: %s")
4217 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4218 "are: %s"), s);
4219 XDELETEVEC (s);
4222 ix86_arch_mask = 1u << ix86_arch;
4223 for (i = 0; i < X86_ARCH_LAST; ++i)
4224 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
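/* Each entry of initial_ix86_arch_features is a bitmask indexed by
   processor type; testing the bit for the selected arch above yields the
   per-arch boolean cached in ix86_arch_features.  */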
4226 for (i = 0; i < pta_size; i++)
4227 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4229 ix86_schedule = processor_alias_table[i].schedule;
4230 ix86_tune = processor_alias_table[i].processor;
4231 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4233 if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4235 if (ix86_tune_defaulted)
4237 opts->x_ix86_tune_string = "x86-64";
4238 for (i = 0; i < pta_size; i++)
4239 if (! strcmp (opts->x_ix86_tune_string,
4240 processor_alias_table[i].name))
4241 break;
4242 ix86_schedule = processor_alias_table[i].schedule;
4243 ix86_tune = processor_alias_table[i].processor;
4245 else
4246 error ("CPU you selected does not support x86-64 "
4247 "instruction set");
4250 /* Intel CPUs have always interpreted SSE prefetch instructions as
4251 NOPs; so, we can enable SSE prefetch instructions even when
4252 -mtune (rather than -march) points us to a processor that has them.
4253 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4254 higher processors. */
4255 if (TARGET_CMOV
4256 && ((processor_alias_table[i].flags
4257 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4258 x86_prefetch_sse = true;
4259 break;
4262 if (ix86_tune_specified && i == pta_size)
4264 error (main_args_p
4265 ? G_("bad value (%qs) for %<-mtune=%> switch")
4266 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4267 opts->x_ix86_tune_string);
4269 auto_vec <const char *> candidates;
4270 for (i = 0; i < pta_size; i++)
4271 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4272 || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4273 candidates.safe_push (processor_alias_table[i].name);
4275 #ifdef HAVE_LOCAL_CPU_DETECT
4276 /* Add also "native" as possible value. */
4277 candidates.safe_push ("native");
4278 #endif
4280 char *s;
4281 const char *hint
4282 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4283 if (hint)
4284 inform (input_location,
4285 main_args_p
4286 ? G_("valid arguments to %<-mtune=%> switch are: "
4287 "%s; did you mean %qs?")
4288 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4289 "%s; did you mean %qs?"), s, hint);
4290 else
4291 inform (input_location,
4292 main_args_p
4293 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4294 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4295 "are: %s"), s);
4296 XDELETEVEC (s);
4299 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4301 #ifndef USE_IX86_FRAME_POINTER
4302 #define USE_IX86_FRAME_POINTER 0
4303 #endif
4305 #ifndef USE_X86_64_FRAME_POINTER
4306 #define USE_X86_64_FRAME_POINTER 0
4307 #endif
4309 /* Set the default values for switches whose default depends on TARGET_64BIT
4310 in case they weren't overwritten by command line options. */
4311 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4313 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4314 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4315 if (opts->x_flag_asynchronous_unwind_tables
4316 && !opts_set->x_flag_unwind_tables
4317 && TARGET_64BIT_MS_ABI)
4318 opts->x_flag_unwind_tables = 1;
4319 if (opts->x_flag_asynchronous_unwind_tables == 2)
4320 opts->x_flag_unwind_tables
4321 = opts->x_flag_asynchronous_unwind_tables = 1;
4322 if (opts->x_flag_pcc_struct_return == 2)
4323 opts->x_flag_pcc_struct_return = 0;
4325 else
4327 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4328 opts->x_flag_omit_frame_pointer
4329 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4330 if (opts->x_flag_asynchronous_unwind_tables == 2)
4331 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4332 if (opts->x_flag_pcc_struct_return == 2)
4334 /* Intel MCU psABI specifies that -freg-struct-return should
4335 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4336 we check -miamcu so that -freg-struct-return is always
4337 turned on if -miamcu is used. */
4338 if (TARGET_IAMCU_P (opts->x_target_flags))
4339 opts->x_flag_pcc_struct_return = 0;
4340 else
4341 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4345 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4346 /* TODO: ix86_cost should be chosen at instruction or function granularity,
4347 so that for cold code we could use size_cost even in !optimize_size compilation. */
4348 if (opts->x_optimize_size)
4349 ix86_cost = &ix86_size_cost;
4350 else
4351 ix86_cost = ix86_tune_cost;
4353 /* Arrange to set up i386_stack_locals for all functions. */
4354 init_machine_status = ix86_init_machine_status;
4356 /* Validate -mregparm= value. */
4357 if (opts_set->x_ix86_regparm)
4359 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4360 warning (0, "-mregparm is ignored in 64-bit mode");
4361 else if (TARGET_IAMCU_P (opts->x_target_flags))
4362 warning (0, "-mregparm is ignored for Intel MCU psABI");
4363 if (opts->x_ix86_regparm > REGPARM_MAX)
4365 error ("-mregparm=%d is not between 0 and %d",
4366 opts->x_ix86_regparm, REGPARM_MAX);
4367 opts->x_ix86_regparm = 0;
4370 if (TARGET_IAMCU_P (opts->x_target_flags)
4371 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4372 opts->x_ix86_regparm = REGPARM_MAX;
4374 /* Default align_* from the processor table. */
4375 ix86_default_align (opts);
4377 /* Provide default for -mbranch-cost= value. */
4378 if (!opts_set->x_ix86_branch_cost)
4379 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4381 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4383 opts->x_target_flags
4384 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4386 /* Enable by default the SSE and MMX builtins. Do allow the user to
4387 explicitly disable any of these. In particular, disabling SSE and
4388 MMX for kernel code is extremely useful. */
4389 if (!ix86_arch_specified)
4390 opts->x_ix86_isa_flags
4391 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4392 | TARGET_SUBTARGET64_ISA_DEFAULT)
4393 & ~opts->x_ix86_isa_flags_explicit);
4395 if (TARGET_RTD_P (opts->x_target_flags))
4396 warning (0,
4397 main_args_p
4398 ? G_("%<-mrtd%> is ignored in 64bit mode")
4399 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4401 else
4403 opts->x_target_flags
4404 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4406 if (!ix86_arch_specified)
4407 opts->x_ix86_isa_flags
4408 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4410 /* The i386 ABI does not specify a red zone. It still makes sense to use
4411 one when the programmer takes care to keep the stack from being destroyed. */
4412 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4413 opts->x_target_flags |= MASK_NO_RED_ZONE;
4416 /* Keep nonleaf frame pointers. */
4417 if (opts->x_flag_omit_frame_pointer)
4418 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4419 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4420 opts->x_flag_omit_frame_pointer = 1;
4422 /* If we're doing fast math, we don't care about comparison order
4423 wrt NaNs. This lets us use a shorter comparison sequence. */
4424 if (opts->x_flag_finite_math_only)
4425 opts->x_target_flags &= ~MASK_IEEE_FP;
4427 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4428 since the insns won't need emulation. */
4429 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4430 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4432 /* Likewise, if the target doesn't have a 387, or we've specified
4433 software floating point, don't use 387 inline intrinsics. */
4434 if (!TARGET_80387_P (opts->x_target_flags))
4435 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4437 /* Turn on MMX builtins for -msse. */
4438 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4439 opts->x_ix86_isa_flags
4440 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
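/* The "& ~opts->x_ix86_isa_flags_explicit" idiom used here and below only
   turns an implied ISA on when the user did not set that ISA explicitly,
   so e.g. "-msse -mno-mmx" still leaves MMX disabled.  */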
4442 /* Enable SSE prefetch. */
4443 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4444 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4445 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4446 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4447 x86_prefetch_sse = true;
4449 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4450 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4451 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4452 opts->x_ix86_isa_flags
4453 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4455 /* Enable lzcnt instruction for -mabm. */
4456 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
4457 opts->x_ix86_isa_flags
4458 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4460 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4461 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
4462 opts->x_ix86_isa_flags
4463 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4464 & ~opts->x_ix86_isa_flags_explicit);
4466 /* Validate -mpreferred-stack-boundary= value or default it to
4467 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4468 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
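/* The argument is the log2 of the boundary in bytes; for example,
   -mpreferred-stack-boundary=4 requests (1 << 4) * BITS_PER_UNIT
   = 128 bits, i.e. a 16-byte boundary.  */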
4469 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4471 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4472 int max = TARGET_SEH ? 4 : 12;
4474 if (opts->x_ix86_preferred_stack_boundary_arg < min
4475 || opts->x_ix86_preferred_stack_boundary_arg > max)
4477 if (min == max)
4478 error ("-mpreferred-stack-boundary is not supported "
4479 "for this target");
4480 else
4481 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4482 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4484 else
4485 ix86_preferred_stack_boundary
4486 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
4489 /* Set the default value for -mstackrealign. */
4490 if (!opts_set->x_ix86_force_align_arg_pointer)
4491 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4493 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4495 /* Validate -mincoming-stack-boundary= value or default it to
4496 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4497 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4498 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4500 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4502 if (opts->x_ix86_incoming_stack_boundary_arg < min
4503 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4504 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4505 opts->x_ix86_incoming_stack_boundary_arg, min);
4506 else
4508 ix86_user_incoming_stack_boundary
4509 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4510 ix86_incoming_stack_boundary
4511 = ix86_user_incoming_stack_boundary;
4515 #ifndef NO_PROFILE_COUNTERS
4516 if (flag_nop_mcount)
4517 error ("-mnop-mcount is not compatible with this target");
4518 #endif
4519 if (flag_nop_mcount && flag_pic)
4520 error ("-mnop-mcount is not implemented for -fPIC");
4522 /* Accept -msseregparm only if at least SSE support is enabled. */
4523 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4524 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4525 error (main_args_p
4526 ? G_("%<-msseregparm%> used without SSE enabled")
4527 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4529 if (opts_set->x_ix86_fpmath)
4531 if (opts->x_ix86_fpmath & FPMATH_SSE)
4533 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4535 if (TARGET_80387_P (opts->x_target_flags))
4537 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4538 opts->x_ix86_fpmath = FPMATH_387;
4541 else if ((opts->x_ix86_fpmath & FPMATH_387)
4542 && !TARGET_80387_P (opts->x_target_flags))
4544 warning (0, "387 instruction set disabled, using SSE arithmetics");
4545 opts->x_ix86_fpmath = FPMATH_SSE;
4549 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4550 -mfpmath=387. The latter is nevertheless the default on many targets,
4551 since the extra 80-bit precision of temporaries is considered part of
4552 the ABI. Overwrite the default at least for -ffast-math.
4553 TODO: -mfpmath=both seems to produce code of the same performance with
4554 slightly smaller binaries. It is however not clear whether register
4555 allocation is ready for this setting.
4556 Also, -mfpmath=387 is overall considerably more compact (about 4-5%)
4557 than SSE codegen. We may switch to 387 with -ffast-math for
4558 size-optimized functions. */
4559 else if (fast_math_flags_set_p (&global_options)
4560 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4561 opts->x_ix86_fpmath = FPMATH_SSE;
4562 else
4563 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4565 /* Use external vectorized library in vectorizing intrinsics. */
4566 if (opts_set->x_ix86_veclibabi_type)
4567 switch (opts->x_ix86_veclibabi_type)
4569 case ix86_veclibabi_type_svml:
4570 ix86_veclib_handler = ix86_veclibabi_svml;
4571 break;
4573 case ix86_veclibabi_type_acml:
4574 ix86_veclib_handler = ix86_veclibabi_acml;
4575 break;
4577 default:
4578 gcc_unreachable ();
4581 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4582 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4583 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4585 /* If stack probes are required, the space used for large function
4586 arguments on the stack must also be probed, so enable
4587 -maccumulate-outgoing-args so this happens in the prologue. */
4588 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4589 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4591 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4592 warning (0,
4593 main_args_p
4594 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4595 "for correctness")
4596 : G_("stack probing requires "
4597 "%<target(\"accumulate-outgoing-args\")%> for "
4598 "correctness"));
4599 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4602 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4603 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4604 if (fixed_regs[BP_REG]
4605 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4607 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4608 warning (0,
4609 main_args_p
4610 ? G_("fixed ebp register requires "
4611 "%<-maccumulate-outgoing-args%>")
4612 : G_("fixed ebp register requires "
4613 "%<target(\"accumulate-outgoing-args\")%>"));
4614 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4617 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4619 char *p;
4620 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4621 p = strchr (internal_label_prefix, 'X');
4622 internal_label_prefix_len = p - internal_label_prefix;
4623 *p = '\0';
4626 /* When no scheduling description is available, disable the scheduler pass
4627 so it won't slow down compilation and won't make x87 code slower. */
4628 if (!TARGET_SCHEDULE)
4629 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4631 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4632 ix86_tune_cost->simultaneous_prefetches,
4633 opts->x_param_values,
4634 opts_set->x_param_values);
4635 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4636 ix86_tune_cost->prefetch_block,
4637 opts->x_param_values,
4638 opts_set->x_param_values);
4639 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4640 ix86_tune_cost->l1_cache_size,
4641 opts->x_param_values,
4642 opts_set->x_param_values);
4643 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4644 ix86_tune_cost->l2_cache_size,
4645 opts->x_param_values,
4646 opts_set->x_param_values);
4648 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4649 if (opts->x_flag_prefetch_loop_arrays < 0
4650 && HAVE_prefetch
4651 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4652 && !opts->x_optimize_size
4653 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4654 opts->x_flag_prefetch_loop_arrays = 1;
4656 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4657 can be optimized to ap = __builtin_next_arg (0). */
4658 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4659 targetm.expand_builtin_va_start = NULL;
4661 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4663 ix86_gen_leave = gen_leave_rex64;
4664 if (Pmode == DImode)
4666 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4667 ix86_gen_tls_local_dynamic_base_64
4668 = gen_tls_local_dynamic_base_64_di;
4670 else
4672 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4673 ix86_gen_tls_local_dynamic_base_64
4674 = gen_tls_local_dynamic_base_64_si;
4677 else
4678 ix86_gen_leave = gen_leave;
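/* Select DImode or SImode generator functions once, based on Pmode, so
   that later code can emit pointer-sized operations without re-checking
   the mode at every use.  */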
4680 if (Pmode == DImode)
4682 ix86_gen_add3 = gen_adddi3;
4683 ix86_gen_sub3 = gen_subdi3;
4684 ix86_gen_sub3_carry = gen_subdi3_carry;
4685 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4686 ix86_gen_andsp = gen_anddi3;
4687 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4688 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4689 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4690 ix86_gen_monitor = gen_sse3_monitor_di;
4691 ix86_gen_monitorx = gen_monitorx_di;
4692 ix86_gen_clzero = gen_clzero_di;
4694 else
4696 ix86_gen_add3 = gen_addsi3;
4697 ix86_gen_sub3 = gen_subsi3;
4698 ix86_gen_sub3_carry = gen_subsi3_carry;
4699 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4700 ix86_gen_andsp = gen_andsi3;
4701 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4702 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4703 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4704 ix86_gen_monitor = gen_sse3_monitor_si;
4705 ix86_gen_monitorx = gen_monitorx_si;
4706 ix86_gen_clzero = gen_clzero_si;
4709 #ifdef USE_IX86_CLD
4710 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4711 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4712 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4713 #endif
4715 /* Set the default value for -mfentry. */
4716 if (!opts_set->x_flag_fentry)
4717 opts->x_flag_fentry = TARGET_SEH;
4718 else
4720 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4721 && opts->x_flag_fentry)
4722 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4723 "with -fpic");
4724 else if (TARGET_SEH && !opts->x_flag_fentry)
4725 sorry ("-mno-fentry isn%'t compatible with SEH");
4728 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4729 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4731 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4732 && TARGET_EMIT_VZEROUPPER)
4733 opts->x_target_flags |= MASK_VZEROUPPER;
4734 if (!(opts_set->x_target_flags & MASK_STV))
4735 opts->x_target_flags |= MASK_STV;
4736 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4737 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the
4738 required stack realignment would be an extra cost the pass does not
4739 take into account, and the pass cannot realign the stack itself. */
4740 if (ix86_preferred_stack_boundary < 128
4741 || ix86_incoming_stack_boundary < 128
4742 || opts->x_ix86_force_align_arg_pointer)
4743 opts->x_target_flags &= ~MASK_STV;
4744 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4745 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4746 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4747 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4748 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4749 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4751 /* Enable 128-bit AVX instruction generation
4752 for the auto-vectorizer. */
4753 if (TARGET_AVX128_OPTIMAL
4754 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4755 opts->x_prefer_vector_width_type = PVW_AVX128;
4757 /* Use 256-bit AVX instruction generation
4758 in the auto-vectorizer. */
4759 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4760 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4761 opts->x_prefer_vector_width_type = PVW_AVX256;
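/* Parse the -mrecip= option string: a comma-separated list of the
   recip_options names defined above, each optionally prefixed with '!'
   to clear the corresponding bits.  For example, -mrecip=all,!sqrt
   enables every reciprocal estimate except the scalar square root.  */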
4763 if (opts->x_ix86_recip_name)
4765 char *p = ASTRDUP (opts->x_ix86_recip_name);
4766 char *q;
4767 unsigned int mask, i;
4768 bool invert;
4770 while ((q = strtok (p, ",")) != NULL)
4772 p = NULL;
4773 if (*q == '!')
4775 invert = true;
4776 q++;
4778 else
4779 invert = false;
4781 if (!strcmp (q, "default"))
4782 mask = RECIP_MASK_ALL;
4783 else
4785 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4786 if (!strcmp (q, recip_options[i].string))
4788 mask = recip_options[i].mask;
4789 break;
4792 if (i == ARRAY_SIZE (recip_options))
4794 error ("unknown option for -mrecip=%s", q);
4795 invert = false;
4796 mask = RECIP_MASK_NONE;
4800 opts->x_recip_mask_explicit |= mask;
4801 if (invert)
4802 opts->x_recip_mask &= ~mask;
4803 else
4804 opts->x_recip_mask |= mask;
4808 if (TARGET_RECIP_P (opts->x_target_flags))
4809 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4810 else if (opts_set->x_target_flags & MASK_RECIP)
4811 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4813 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4814 for 64-bit Bionic. Also default long double to 64-bit for Intel
4815 MCU psABI. */
4816 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4817 && !(opts_set->x_target_flags
4818 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4819 opts->x_target_flags |= (TARGET_64BIT
4820 ? MASK_LONG_DOUBLE_128
4821 : MASK_LONG_DOUBLE_64);
4823 /* Only one of them can be active. */
4824 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4825 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4827 /* Handle stack protector */
4828 if (!opts_set->x_ix86_stack_protector_guard)
4829 opts->x_ix86_stack_protector_guard
4830 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4832 #ifdef TARGET_THREAD_SSP_OFFSET
4833 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4834 #endif
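/* The offset string is parsed with strtol/strtoll using base 0, so
   decimal, hexadecimal (0x...) and octal forms are all accepted; the
   value must also fit in a signed 32-bit displacement.  */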
4836 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4838 char *endp;
4839 const char *str = ix86_stack_protector_guard_offset_str;
4841 errno = 0;
4842 int64_t offset;
4844 #if defined(INT64_T_IS_LONG)
4845 offset = strtol (str, &endp, 0);
4846 #else
4847 offset = strtoll (str, &endp, 0);
4848 #endif
4850 if (!*str || *endp || errno)
4851 error ("%qs is not a valid number "
4852 "in -mstack-protector-guard-offset=", str);
4854 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4855 HOST_WIDE_INT_C (0x7fffffff)))
4856 error ("%qs is not a valid offset "
4857 "in -mstack-protector-guard-offset=", str);
4859 ix86_stack_protector_guard_offset = offset;
4862 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4864 /* The kernel uses a different segment register for performance
4865 reasons; a system call would not have to trash the userspace
4866 segment register, which would be expensive. */
4867 if (ix86_cmodel == CM_KERNEL)
4868 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
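/* -mstack-protector-guard-reg= accepts "fs" or "gs", optionally with a
   leading '%'; anything else is rejected below.  */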
4870 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4872 const char *str = ix86_stack_protector_guard_reg_str;
4873 addr_space_t seg = ADDR_SPACE_GENERIC;
4875 /* Discard optional register prefix. */
4876 if (str[0] == '%')
4877 str++;
4879 if (strlen (str) == 2 && str[1] == 's')
4881 if (str[0] == 'f')
4882 seg = ADDR_SPACE_SEG_FS;
4883 else if (str[0] == 'g')
4884 seg = ADDR_SPACE_SEG_GS;
4887 if (seg == ADDR_SPACE_GENERIC)
4888 error ("%qs is not a valid base register "
4889 "in -mstack-protector-guard-reg=",
4890 ix86_stack_protector_guard_reg_str);
4892 ix86_stack_protector_guard_reg = seg;
4895 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
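/* The value is a list of alg:max_size:dest_align triplets, e.g. something
   like "vector_loop:256:align,libcall:-1:noalign"; the exact grammar is
   validated by ix86_parse_stringop_strategy_string.  */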
4896 if (opts->x_ix86_tune_memcpy_strategy)
4898 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4899 ix86_parse_stringop_strategy_string (str, false);
4900 free (str);
4903 if (opts->x_ix86_tune_memset_strategy)
4905 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4906 ix86_parse_stringop_strategy_string (str, true);
4907 free (str);
4910 /* Save the initial options in case the user does function specific
4911 options. */
4912 if (main_args_p)
4913 target_option_default_node = target_option_current_node
4914 = build_target_option_node (opts);
4916 /* Do not support control flow instrumentation if CET is not enabled. */
4917 cf_protection_level cf_protection
4918 = (cf_protection_level) (opts->x_flag_cf_protection & ~CF_SET);
4919 if (cf_protection != CF_NONE)
4921 switch (cf_protection)
4923 case CF_BRANCH:
4924 if (! TARGET_IBT_P (opts->x_ix86_isa_flags2))
4926 error ("%<-fcf-protection=branch%> requires Intel CET "
4927 "support. Use -mcet or -mibt option to enable CET");
4928 flag_cf_protection = CF_NONE;
4929 return false;
4931 break;
4932 case CF_RETURN:
4933 if (! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4935 error ("%<-fcf-protection=return%> requires Intel CET "
4936 "support. Use -mcet or -mshstk option to enable CET");
4937 flag_cf_protection = CF_NONE;
4938 return false;
4940 break;
4941 case CF_FULL:
4942 if ( ! TARGET_IBT_P (opts->x_ix86_isa_flags2)
4943 || ! TARGET_SHSTK_P (opts->x_ix86_isa_flags))
4945 error ("%<-fcf-protection=full%> requires Intel CET "
4946 "support. Use -mcet or both of -mibt and "
4947 "-mshstk options to enable CET");
4948 flag_cf_protection = CF_NONE;
4949 return false;
4951 break;
4952 default:
4953 gcc_unreachable ();
4956 opts->x_flag_cf_protection =
4957 (cf_protection_level) (cf_protection | CF_SET);
4960 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4961 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4962 opts->x_param_values,
4963 opts_set->x_param_values);
4965 return true;
4968 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4970 static void
4971 ix86_option_override (void)
4973 ix86_option_override_internal (true, &global_options, &global_options_set);
4976 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4977 static char *
4978 ix86_offload_options (void)
4980 if (TARGET_LP64)
4981 return xstrdup ("-foffload-abi=lp64");
4982 return xstrdup ("-foffload-abi=ilp32");
4985 /* Update register usage after having seen the compiler flags. */
4987 static void
4988 ix86_conditional_register_usage (void)
4990 int i, c_mask;
4992 /* If there are no caller-saved registers, preserve all registers,
4993 except fixed_regs and registers used for the function return value,
4994 since aggregate_value_p checks call_used_regs[regno] on the return
4995 value. */
4996 if (cfun && cfun->machine->no_caller_saved_registers)
4997 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4998 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4999 call_used_regs[i] = 0;
5001 /* For 32-bit targets, squash the REX registers. */
5002 if (! TARGET_64BIT)
5004 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
5005 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5006 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
5007 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5008 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5009 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5012 /* See the definition of CALL_USED_REGISTERS in i386.h. */
5013 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
5015 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
5017 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5019 /* Set/reset conditionally defined registers from
5020 CALL_USED_REGISTERS initializer. */
5021 if (call_used_regs[i] > 1)
5022 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
5024 /* Calculate registers of CLOBBERED_REGS register set
5025 as call used registers from GENERAL_REGS register set. */
5026 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
5027 && call_used_regs[i])
5028 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
5031 /* If MMX is disabled, squash the registers. */
5032 if (! TARGET_MMX)
5033 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5034 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
5035 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5037 /* If SSE is disabled, squash the registers. */
5038 if (! TARGET_SSE)
5039 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5040 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
5041 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5043 /* If the FPU is disabled, squash the registers. */
5044 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
5045 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5046 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5047 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5049 /* If AVX512F is disabled, squash the registers. */
5050 if (! TARGET_AVX512F)
5052 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5053 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5055 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5056 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5059 /* If MPX is disabled, squash the registers. */
5060 if (! TARGET_MPX)
5061 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5062 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5065 /* Canonicalize a comparison from one we don't have to one we do have. */
5067 static void
5068 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5069 bool op0_preserve_value)
5071 /* The order of operands in x87 ficom compare is forced by combine in
5072 simplify_comparison () function. Float operator is treated as RTX_OBJ
5073 with a precedence over other operators and is always put in the first
5074 place. Swap condition and operands to match ficom instruction. */
5075 if (!op0_preserve_value
5076 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5078 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5080 /* We are called only for compares that are split to SAHF instruction.
5081 Ensure that we have setcc/jcc insn for the swapped condition. */
5082 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5084 std::swap (*op0, *op1);
5085 *code = (int) scode;
5090 /* Save the current options */
5092 static void
5093 ix86_function_specific_save (struct cl_target_option *ptr,
5094 struct gcc_options *opts)
5096 ptr->arch = ix86_arch;
5097 ptr->schedule = ix86_schedule;
5098 ptr->prefetch_sse = x86_prefetch_sse;
5099 ptr->tune = ix86_tune;
5100 ptr->branch_cost = ix86_branch_cost;
5101 ptr->tune_defaulted = ix86_tune_defaulted;
5102 ptr->arch_specified = ix86_arch_specified;
5103 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5104 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5105 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5106 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5107 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5108 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5109 ptr->x_ix86_abi = opts->x_ix86_abi;
5110 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5111 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5112 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5113 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5114 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5115 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5116 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5117 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5118 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5119 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5120 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5121 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5122 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5123 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5124 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5125 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5126 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5127 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5128 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5129 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5131 /* The fields are char but the variables are not; make sure the
5132 values fit in the fields. */
5133 gcc_assert (ptr->arch == ix86_arch);
5134 gcc_assert (ptr->schedule == ix86_schedule);
5135 gcc_assert (ptr->tune == ix86_tune);
5136 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5139 /* Restore the current options */
5141 static void
5142 ix86_function_specific_restore (struct gcc_options *opts,
5143 struct cl_target_option *ptr)
5145 enum processor_type old_tune = ix86_tune;
5146 enum processor_type old_arch = ix86_arch;
5147 unsigned int ix86_arch_mask;
5148 int i;
5150 /* We don't change -fPIC. */
5151 opts->x_flag_pic = flag_pic;
5153 ix86_arch = (enum processor_type) ptr->arch;
5154 ix86_schedule = (enum attr_cpu) ptr->schedule;
5155 ix86_tune = (enum processor_type) ptr->tune;
5156 x86_prefetch_sse = ptr->prefetch_sse;
5157 opts->x_ix86_branch_cost = ptr->branch_cost;
5158 ix86_tune_defaulted = ptr->tune_defaulted;
5159 ix86_arch_specified = ptr->arch_specified;
5160 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5161 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5162 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5163 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5164 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5165 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5166 opts->x_ix86_abi = ptr->x_ix86_abi;
5167 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5168 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5169 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5170 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5171 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5172 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5173 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5174 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5175 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5176 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5177 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5178 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5179 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5180 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5181 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5182 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5183 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5184 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5185 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5186 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5187 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5188 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5189 so that for cold code we could use size_cost even in !optimize_size compilation. */
5190 if (opts->x_optimize_size)
5191 ix86_cost = &ix86_size_cost;
5192 else
5193 ix86_cost = ix86_tune_cost;
5195 /* Recreate the arch feature tests if the arch changed */
5196 if (old_arch != ix86_arch)
5198 ix86_arch_mask = 1u << ix86_arch;
5199 for (i = 0; i < X86_ARCH_LAST; ++i)
5200 ix86_arch_features[i]
5201 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5204 /* Recreate the tune optimization tests */
5205 if (old_tune != ix86_tune)
5206 set_ix86_tune_features (ix86_tune, false);
5209 /* Adjust target options after streaming them in. This is mainly about
5210 reconciling them with global options. */
5212 static void
5213 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5215 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5216 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5217 for PIC, or error out. */
5218 if (flag_pic)
5219 switch (ptr->x_ix86_cmodel)
5221 case CM_SMALL:
5222 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5223 break;
5225 case CM_MEDIUM:
5226 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5227 break;
5229 case CM_LARGE:
5230 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5231 break;
5233 case CM_KERNEL:
5234 error ("code model %s does not support PIC mode", "kernel");
5235 break;
5237 default:
5238 break;
5240 else
5241 switch (ptr->x_ix86_cmodel)
5243 case CM_SMALL_PIC:
5244 ptr->x_ix86_cmodel = CM_SMALL;
5245 break;
5247 case CM_MEDIUM_PIC:
5248 ptr->x_ix86_cmodel = CM_MEDIUM;
5249 break;
5251 case CM_LARGE_PIC:
5252 ptr->x_ix86_cmodel = CM_LARGE;
5253 break;
5255 default:
5256 break;
5260 /* Print the current options */
5262 static void
5263 ix86_function_specific_print (FILE *file, int indent,
5264 struct cl_target_option *ptr)
5266 char *target_string
5267 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5268 ptr->x_target_flags, ptr->x_ix86_target_flags,
5269 NULL, NULL, ptr->x_ix86_fpmath, false);
5271 gcc_assert (ptr->arch < PROCESSOR_max);
5272 fprintf (file, "%*sarch = %d (%s)\n",
5273 indent, "",
5274 ptr->arch, processor_target_table[ptr->arch].name);
5276 gcc_assert (ptr->tune < PROCESSOR_max);
5277 fprintf (file, "%*stune = %d (%s)\n",
5278 indent, "",
5279 ptr->tune, processor_target_table[ptr->tune].name);
5281 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5283 if (target_string)
5285 fprintf (file, "%*s%s\n", indent, "", target_string);
5286 free (target_string);
5291 /* Inner function to process the attribute((target(...))): take an argument
5292 and set the current options from it. If we have a list, recursively go
5293 over the list. */
5295 static bool
5296 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5297 struct gcc_options *opts,
5298 struct gcc_options *opts_set,
5299 struct gcc_options *enum_opts_set)
5301 char *next_optstr;
5302 bool ret = true;
5304 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5305 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5306 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5307 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5308 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5310 enum ix86_opt_type
5312 ix86_opt_unknown,
5313 ix86_opt_yes,
5314 ix86_opt_no,
5315 ix86_opt_str,
5316 ix86_opt_enum,
5317 ix86_opt_isa
5320 static const struct
5322 const char *string;
5323 size_t len;
5324 enum ix86_opt_type type;
5325 int opt;
5326 int mask;
5327 } attrs[] = {
5328 /* isa options */
5329 IX86_ATTR_ISA ("pconfig", OPT_mpconfig),
5330 IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd),
5331 IX86_ATTR_ISA ("sgx", OPT_msgx),
5332 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5333 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5334 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5335 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5336 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5337 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5339 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5340 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5341 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5342 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5343 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5344 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5345 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5346 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5347 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5348 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5349 IX86_ATTR_ISA ("fma", OPT_mfma),
5350 IX86_ATTR_ISA ("xop", OPT_mxop),
5351 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5352 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5353 IX86_ATTR_ISA ("avx", OPT_mavx),
5354 IX86_ATTR_ISA ("sse4", OPT_msse4),
5355 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5356 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5357 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5358 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5359 IX86_ATTR_ISA ("sse3", OPT_msse3),
5360 IX86_ATTR_ISA ("aes", OPT_maes),
5361 IX86_ATTR_ISA ("sha", OPT_msha),
5362 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5363 IX86_ATTR_ISA ("sse2", OPT_msse2),
5364 IX86_ATTR_ISA ("sse", OPT_msse),
5365 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5366 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5367 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5368 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5369 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5370 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5371 IX86_ATTR_ISA ("adx", OPT_madx),
5372 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5373 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5374 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5375 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5376 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5377 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5378 IX86_ATTR_ISA ("abm", OPT_mabm),
5379 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5380 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5381 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5382 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5383 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5384 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5385 IX86_ATTR_ISA ("sahf", OPT_msahf),
5386 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5387 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5388 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5389 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5390 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5391 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5392 IX86_ATTR_ISA ("pku", OPT_mpku),
5393 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5394 IX86_ATTR_ISA ("hle", OPT_mhle),
5395 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5396 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5397 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5398 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5399 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5400 IX86_ATTR_ISA ("ibt", OPT_mibt),
5401 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5402 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5403 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5405 /* enum options */
5406 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5408 /* string options */
5409 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5410 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5412 /* flag options */
5413 IX86_ATTR_YES ("cld",
5414 OPT_mcld,
5415 MASK_CLD),
5417 IX86_ATTR_NO ("fancy-math-387",
5418 OPT_mfancy_math_387,
5419 MASK_NO_FANCY_MATH_387),
5421 IX86_ATTR_YES ("ieee-fp",
5422 OPT_mieee_fp,
5423 MASK_IEEE_FP),
5425 IX86_ATTR_YES ("inline-all-stringops",
5426 OPT_minline_all_stringops,
5427 MASK_INLINE_ALL_STRINGOPS),
5429 IX86_ATTR_YES ("inline-stringops-dynamically",
5430 OPT_minline_stringops_dynamically,
5431 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5433 IX86_ATTR_NO ("align-stringops",
5434 OPT_mno_align_stringops,
5435 MASK_NO_ALIGN_STRINGOPS),
5437 IX86_ATTR_YES ("recip",
5438 OPT_mrecip,
5439 MASK_RECIP),
5443 /* If this is a list, recurse to get the options. */
5444 if (TREE_CODE (args) == TREE_LIST)
5446 bool ret = true;
5448 for (; args; args = TREE_CHAIN (args))
5449 if (TREE_VALUE (args)
5450 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5451 p_strings, opts, opts_set,
5452 enum_opts_set))
5453 ret = false;
5455 return ret;
5458 else if (TREE_CODE (args) != STRING_CST)
5460 error ("attribute %<target%> argument not a string");
5461 return false;
5464 /* Handle multiple arguments separated by commas. */
5465 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5467 while (next_optstr && *next_optstr != '\0')
5469 char *p = next_optstr;
5470 char *orig_p = p;
5471 char *comma = strchr (next_optstr, ',');
5472 const char *opt_string;
5473 size_t len, opt_len;
5474 int opt;
5475 bool opt_set_p;
5476 char ch;
5477 unsigned i;
5478 enum ix86_opt_type type = ix86_opt_unknown;
5479 int mask = 0;
5481 if (comma)
5483 *comma = '\0';
5484 len = comma - next_optstr;
5485 next_optstr = comma + 1;
5487 else
5489 len = strlen (p);
5490 next_optstr = NULL;
5493 /* Recognize no-xxx. */
5494 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5496 opt_set_p = false;
5497 p += 3;
5498 len -= 3;
5500 else
5501 opt_set_p = true;
5503 /* Find the option. */
5504 ch = *p;
5505 opt = N_OPTS;
5506 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5508 type = attrs[i].type;
5509 opt_len = attrs[i].len;
5510 if (ch == attrs[i].string[0]
5511 && ((type != ix86_opt_str && type != ix86_opt_enum)
5512 ? len == opt_len
5513 : len > opt_len)
5514 && memcmp (p, attrs[i].string, opt_len) == 0)
5516 opt = attrs[i].opt;
5517 mask = attrs[i].mask;
5518 opt_string = attrs[i].string;
5519 break;
5523 /* Process the option. */
5524 if (opt == N_OPTS)
5526 error ("attribute(target(\"%s\")) is unknown", orig_p);
5527 ret = false;
5530 else if (type == ix86_opt_isa)
5532 struct cl_decoded_option decoded;
5534 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5535 ix86_handle_option (opts, opts_set,
5536 &decoded, input_location);
5539 else if (type == ix86_opt_yes || type == ix86_opt_no)
5541 if (type == ix86_opt_no)
5542 opt_set_p = !opt_set_p;
5544 if (opt_set_p)
5545 opts->x_target_flags |= mask;
5546 else
5547 opts->x_target_flags &= ~mask;
5550 else if (type == ix86_opt_str)
5552 if (p_strings[opt])
5554 error ("option(\"%s\") was already specified", opt_string);
5555 ret = false;
5557 else
5558 p_strings[opt] = xstrdup (p + opt_len);
5561 else if (type == ix86_opt_enum)
5563 bool arg_ok;
5564 int value;
5566 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5567 if (arg_ok)
5568 set_option (opts, enum_opts_set, opt, value,
5569 p + opt_len, DK_UNSPECIFIED, input_location,
5570 global_dc);
5571 else
5573 error ("attribute(target(\"%s\")) is unknown", orig_p);
5574 ret = false;
5578 else
5579 gcc_unreachable ();
5582 return ret;
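/* Illustrative usage (a sketch, not part of GCC itself; function names are
   hypothetical): the strings parsed above come from declarations such as

     __attribute__ ((target ("avx2,bmi2")))
     int fast_path (int *p, int n);       // ISA options, routed through
                                          // ix86_handle_option
     __attribute__ ((target ("no-sse4,arch=core2,fpmath=sse")))
     int tuned_path (int *p, int n);      // "no-" clears an ISA option;
                                          // arch=/tune= are string options,
                                          // fpmath= is an enum option

   Each comma-separated item is matched against the attrs[] table; anything
   unrecognized produces the "is unknown" error above.  */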
5585 /* Release allocated strings. */
5586 static void
5587 release_options_strings (char **option_strings)
5589 /* Free up memory allocated to hold the strings */
5590 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5591 free (option_strings[i]);
5594 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5596 tree
5597 ix86_valid_target_attribute_tree (tree args,
5598 struct gcc_options *opts,
5599 struct gcc_options *opts_set)
5601 const char *orig_arch_string = opts->x_ix86_arch_string;
5602 const char *orig_tune_string = opts->x_ix86_tune_string;
5603 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5604 int orig_tune_defaulted = ix86_tune_defaulted;
5605 int orig_arch_specified = ix86_arch_specified;
5606 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5607 tree t = NULL_TREE;
5608 struct cl_target_option *def
5609 = TREE_TARGET_OPTION (target_option_default_node);
5610 struct gcc_options enum_opts_set;
5612 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5614 /* Process each of the options on the chain. */
5615 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5616 opts_set, &enum_opts_set))
5617 return error_mark_node;
5619 /* If the changed options are different from the default, rerun
5620 ix86_option_override_internal, and then save the options away.
5621 The string options are attribute options, and will be undone
5622 when we copy the save structure. */
5623 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5624 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5625 || opts->x_target_flags != def->x_target_flags
5626 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5627 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5628 || enum_opts_set.x_ix86_fpmath)
5630 /* If we are using the default tune= or arch=, undo the string assigned,
5631 and use the default. */
5632 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5634 opts->x_ix86_arch_string
5635 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5637 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5638 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5639 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5640 | OPTION_MASK_ABI_64
5641 | OPTION_MASK_ABI_X32
5642 | OPTION_MASK_CODE16);
5643 opts->x_ix86_isa_flags2 = 0;
5645 else if (!orig_arch_specified)
5646 opts->x_ix86_arch_string = NULL;
5648 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5649 opts->x_ix86_tune_string
5650 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5651 else if (orig_tune_defaulted)
5652 opts->x_ix86_tune_string = NULL;
5654 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5655 if (enum_opts_set.x_ix86_fpmath)
5656 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5658 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5659 bool r = ix86_option_override_internal (false, opts, opts_set);
5660 if (!r)
5662 release_options_strings (option_strings);
5663 return error_mark_node;
5666 /* Add any builtin functions with the new isa if any. */
5667 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5669 /* Save the current options unless we are validating options for
5670 #pragma. */
5671 t = build_target_option_node (opts);
5673 opts->x_ix86_arch_string = orig_arch_string;
5674 opts->x_ix86_tune_string = orig_tune_string;
5675 opts_set->x_ix86_fpmath = orig_fpmath_set;
5677 release_options_strings (option_strings);
5680 return t;
5683 /* Hook to validate attribute((target("string"))). */
5685 static bool
5686 ix86_valid_target_attribute_p (tree fndecl,
5687 tree ARG_UNUSED (name),
5688 tree args,
5689 int ARG_UNUSED (flags))
5691 struct gcc_options func_options;
5692 tree new_target, new_optimize;
5693 bool ret = true;
5695 /* attribute((target("default"))) does nothing, beyond
5696 affecting multi-versioning. */
5697 if (TREE_VALUE (args)
5698 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5699 && TREE_CHAIN (args) == NULL_TREE
5700 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5701 return true;
5703 tree old_optimize = build_optimization_node (&global_options);
5705 /* Get the optimization options of the current function. */
5706 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5708 if (!func_optimize)
5709 func_optimize = old_optimize;
5711 /* Init func_options. */
5712 memset (&func_options, 0, sizeof (func_options));
5713 init_options_struct (&func_options, NULL);
5714 lang_hooks.init_options_struct (&func_options);
5716 cl_optimization_restore (&func_options,
5717 TREE_OPTIMIZATION (func_optimize));
5719 /* Initialize func_options to the default before its target options can
5720 be set. */
5721 cl_target_option_restore (&func_options,
5722 TREE_TARGET_OPTION (target_option_default_node));
5724 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5725 &global_options_set);
5727 new_optimize = build_optimization_node (&func_options);
5729 if (new_target == error_mark_node)
5730 ret = false;
5732 else if (fndecl && new_target)
5734 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5736 if (old_optimize != new_optimize)
5737 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5740 finalize_options_struct (&func_options);
5742 return ret;
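/* For reference (illustrative, C++ function multi-versioning): the "default"
   string is special because it only marks the fallback version, e.g.

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("avx2")))    int foo (void) { return 1; }

   Only the non-default variants go through the option processing above.  */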
5746 /* Hook to determine if one function can safely inline another. */
5748 static bool
5749 ix86_can_inline_p (tree caller, tree callee)
5751 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5752 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5753 if (!callee_tree)
5754 callee_tree = target_option_default_node;
5755 if (!caller_tree)
5756 caller_tree = target_option_default_node;
5757 if (callee_tree == caller_tree)
5758 return true;
5760 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5761 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5762 bool ret = false;
6764 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
6765 function can inline an SSE2 function but an SSE2 function can't inline
6766 an SSE4 function. */
5767 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5768 != callee_opts->x_ix86_isa_flags)
5769 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5770 != callee_opts->x_ix86_isa_flags2))
5771 ret = false;
5773 /* See if we have the same non-isa options. */
5774 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5775 ret = false;
5777 /* See if arch, tune, etc. are the same. */
5778 else if (caller_opts->arch != callee_opts->arch)
5779 ret = false;
5781 else if (caller_opts->tune != callee_opts->tune)
5782 ret = false;
5784 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
6785 /* If the callee doesn't use FP expressions, differences in
6786 ix86_fpmath can be ignored. We are called from FEs
6787 for multi-versioning call optimization, so beware that
6788 ipa_fn_summaries may not be available. */
5789 && (! ipa_fn_summaries
5790 || ipa_fn_summaries->get
5791 (cgraph_node::get (callee))->fp_expressions))
5792 ret = false;
5794 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5795 ret = false;
5797 else
5798 ret = true;
5800 return ret;
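/* Illustrative consequence of the ISA-subset rule above (hypothetical code):

     __attribute__ ((target ("sse2")))   static inline int f (int x) { return x; }
     __attribute__ ((target ("sse4.2"))) int g (int x) { return f (x); }

   g may inline f because f's ISA flags are a subset of g's; inlining an
   SSE4.2 callee into a plain SSE2 caller would be rejected.  */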
5804 /* Remember the last target of ix86_set_current_function. */
5805 static GTY(()) tree ix86_previous_fndecl;
5807 /* Set targets globals to the default (or current #pragma GCC target
5808 if active). Invalidate ix86_previous_fndecl cache. */
5810 void
5811 ix86_reset_previous_fndecl (void)
5813 tree new_tree = target_option_current_node;
5814 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5815 if (TREE_TARGET_GLOBALS (new_tree))
5816 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5817 else if (new_tree == target_option_default_node)
5818 restore_target_globals (&default_target_globals);
5819 else
5820 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5821 ix86_previous_fndecl = NULL_TREE;
5824 /* Set the func_type field from the function FNDECL. */
5826 static void
5827 ix86_set_func_type (tree fndecl)
5829 if (cfun->machine->func_type == TYPE_UNKNOWN)
5831 if (lookup_attribute ("interrupt",
5832 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5834 if (ix86_function_naked (fndecl))
5835 error_at (DECL_SOURCE_LOCATION (fndecl),
5836 "interrupt and naked attributes are not compatible");
5838 int nargs = 0;
5839 for (tree arg = DECL_ARGUMENTS (fndecl);
5840 arg;
5841 arg = TREE_CHAIN (arg))
5842 nargs++;
5843 cfun->machine->no_caller_saved_registers = true;
5844 cfun->machine->func_type
5845 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5847 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5849 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5850 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5851 sorry ("Only DWARF debug format is supported for interrupt "
5852 "service routine.");
5854 else
5856 cfun->machine->func_type = TYPE_NORMAL;
5857 if (lookup_attribute ("no_caller_saved_registers",
5858 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5859 cfun->machine->no_caller_saved_registers = true;
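/* Illustrative use of the attributes handled above (a sketch; uword_t stands
   for an unsigned integer of register width, a hypothetical typedef here):

     struct interrupt_frame;

     __attribute__ ((interrupt))
     void isr (struct interrupt_frame *frame);                 // 1 argument
                                                               // -> TYPE_INTERRUPT
     __attribute__ ((interrupt))
     void fault (struct interrupt_frame *frame, uword_t code); // 2 arguments
                                                               // -> TYPE_EXCEPTION

   Both imply no_caller_saved_registers and require DWARF debug info, as
   diagnosed above.  */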
5864 /* Set the indirect_branch_type field from the function FNDECL. */
5866 static void
5867 ix86_set_indirect_branch_type (tree fndecl)
5869 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5871 tree attr = lookup_attribute ("indirect_branch",
5872 DECL_ATTRIBUTES (fndecl));
5873 if (attr != NULL)
5875 tree args = TREE_VALUE (attr);
5876 if (args == NULL)
5877 gcc_unreachable ();
5878 tree cst = TREE_VALUE (args);
5879 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5880 cfun->machine->indirect_branch_type = indirect_branch_keep;
5881 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5882 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5883 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5884 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5885 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5886 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5887 else
5888 gcc_unreachable ();
5890 else
5891 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5893 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5894 nor -mindirect-branch=thunk-extern. */
5895 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5896 && ((cfun->machine->indirect_branch_type
5897 == indirect_branch_thunk_extern)
5898 || (cfun->machine->indirect_branch_type
5899 == indirect_branch_thunk)))
5900 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5901 "compatible",
5902 ((cfun->machine->indirect_branch_type
5903 == indirect_branch_thunk_extern)
5904 ? "thunk-extern" : "thunk"));
5906 /* -mindirect-branch=thunk-extern, -fcf-protection=branch and
5907 -fcheck-pointer-bounds are not compatible. */
5908 if ((cfun->machine->indirect_branch_type
5909 == indirect_branch_thunk_extern)
5910 && flag_check_pointer_bounds
5911 && (flag_cf_protection & CF_BRANCH) != 0)
5912 error ("%<-mindirect-branch=thunk-extern%>, "
5913 "%<-fcf-protection=branch%> and "
5914 "%<-fcheck-pointer-bounds%> are not compatible");
5917 if (cfun->machine->function_return_type == indirect_branch_unset)
5919 tree attr = lookup_attribute ("function_return",
5920 DECL_ATTRIBUTES (fndecl));
5921 if (attr != NULL)
5923 tree args = TREE_VALUE (attr);
5924 if (args == NULL)
5925 gcc_unreachable ();
5926 tree cst = TREE_VALUE (args);
5927 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5928 cfun->machine->function_return_type = indirect_branch_keep;
5929 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5930 cfun->machine->function_return_type = indirect_branch_thunk;
5931 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5932 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5933 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5934 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5935 else
5936 gcc_unreachable ();
5938 else
5939 cfun->machine->function_return_type = ix86_function_return;
5941 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5942 nor -mfunction-return=thunk-extern. */
5943 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5944 && ((cfun->machine->function_return_type
5945 == indirect_branch_thunk_extern)
5946 || (cfun->machine->function_return_type
5947 == indirect_branch_thunk)))
5948 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5949 "compatible",
5950 ((cfun->machine->function_return_type
5951 == indirect_branch_thunk_extern)
5952 ? "thunk-extern" : "thunk"));
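/* Illustrative spelling of the attributes decoded above (hypothetical
   declarations):

     __attribute__ ((indirect_branch ("thunk")))        void a (void (*fn) (void));
     __attribute__ ((indirect_branch ("thunk-extern"))) void b (void (*fn) (void));
     __attribute__ ((function_return ("thunk-inline"))) void c (void);

   Accepted strings are "keep", "thunk", "thunk-inline" and "thunk-extern";
   without an attribute the -mindirect-branch= / -mfunction-return= defaults
   apply, subject to the -mcmodel=large restrictions diagnosed above.  */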
5956 /* Establish appropriate back-end context for processing the function
5957 FNDECL. The argument might be NULL to indicate processing at top
5958 level, outside of any function scope. */
5959 static void
5960 ix86_set_current_function (tree fndecl)
5962 /* Only change the context if the function changes. This hook is called
5963 several times in the course of compiling a function, and we don't want to
5964 slow things down too much or call target_reinit when it isn't safe. */
5965 if (fndecl == ix86_previous_fndecl)
5967 /* There may be 2 function bodies for the same function FNDECL,
5968 one is extern inline and one isn't. Call ix86_set_func_type
5969 to set the func_type field. */
5970 if (fndecl != NULL_TREE)
5972 ix86_set_func_type (fndecl);
5973 ix86_set_indirect_branch_type (fndecl);
5975 return;
5978 tree old_tree;
5979 if (ix86_previous_fndecl == NULL_TREE)
5980 old_tree = target_option_current_node;
5981 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5982 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5983 else
5984 old_tree = target_option_default_node;
5986 if (fndecl == NULL_TREE)
5988 if (old_tree != target_option_current_node)
5989 ix86_reset_previous_fndecl ();
5990 return;
5993 ix86_set_func_type (fndecl);
5994 ix86_set_indirect_branch_type (fndecl);
5996 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5997 if (new_tree == NULL_TREE)
5998 new_tree = target_option_default_node;
6000 if (old_tree != new_tree)
6002 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6003 if (TREE_TARGET_GLOBALS (new_tree))
6004 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6005 else if (new_tree == target_option_default_node)
6006 restore_target_globals (&default_target_globals);
6007 else
6008 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6010 ix86_previous_fndecl = fndecl;
6012 static bool prev_no_caller_saved_registers;
6014 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
6015 Avoid expensive re-initialization of init_regs each time we switch
6016 function context. */
6017 if (TARGET_64BIT
6018 && (call_used_regs[SI_REG]
6019 == (cfun->machine->call_abi == MS_ABI)))
6020 reinit_regs ();
6021 /* Need to re-initialize init_regs if caller-saved registers are
6022 changed. */
6023 else if (prev_no_caller_saved_registers
6024 != cfun->machine->no_caller_saved_registers)
6025 reinit_regs ();
6027 if (cfun->machine->func_type != TYPE_NORMAL
6028 || cfun->machine->no_caller_saved_registers)
6030 /* Don't allow MPX, SSE, MMX or x87 instructions since they
6031 may change processor state. */
6032 const char *isa;
6033 if (TARGET_MPX)
6034 isa = "MPX";
6035 else if (TARGET_SSE)
6036 isa = "SSE";
6037 else if (TARGET_MMX)
6038 isa = "MMX/3Dnow";
6039 else if (TARGET_80387)
6040 isa = "80387";
6041 else
6042 isa = NULL;
6043 if (isa != NULL)
6045 if (cfun->machine->func_type != TYPE_NORMAL)
6046 sorry ("%s instructions aren't allowed in %s service routine",
6047 isa, (cfun->machine->func_type == TYPE_EXCEPTION
6048 ? "exception" : "interrupt"));
6049 else
6050 sorry ("%s instructions aren't allowed in function with "
6051 "no_caller_saved_registers attribute", isa);
6052 /* Don't issue the same error twice. */
6053 cfun->machine->func_type = TYPE_NORMAL;
6054 cfun->machine->no_caller_saved_registers = false;
6058 prev_no_caller_saved_registers
6059 = cfun->machine->no_caller_saved_registers;
6063 /* Return true if this goes in large data/bss. */
6065 static bool
6066 ix86_in_large_data_p (tree exp)
6068 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6069 return false;
6071 if (exp == NULL_TREE)
6072 return false;
6074 /* Functions are never large data. */
6075 if (TREE_CODE (exp) == FUNCTION_DECL)
6076 return false;
6078 /* Automatic variables are never large data. */
6079 if (VAR_P (exp) && !is_global_var (exp))
6080 return false;
6082 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6084 const char *section = DECL_SECTION_NAME (exp);
6085 if (strcmp (section, ".ldata") == 0
6086 || strcmp (section, ".lbss") == 0)
6087 return true;
6088 return false;
6090 else
6092 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6094 /* If this is an incomplete type with size 0, then we can't put it
6095 in data because it might be too big when completed. Also,
6096 int_size_in_bytes returns -1 if the size can vary or is larger than
6097 an integer, in which case it is also safer to assume that it goes in
6098 large data. */
6099 if (size <= 0 || size > ix86_section_threshold)
6100 return true;
6103 return false;
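/* Example of what the predicate above classifies as large data (illustrative;
   the threshold is the -mlarge-data-threshold value held in
   ix86_section_threshold):

     // compiled with -mcmodel=medium
     static char big_table[1 << 20];  // above the threshold -> .lbss
     static int small_counter;        // below the threshold -> regular .bss
*/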
6106 /* i386-specific section flag to mark large sections. */
6107 #define SECTION_LARGE SECTION_MACH_DEP
6109 /* Switch to the appropriate section for output of DECL.
6110 DECL is either a `VAR_DECL' node or a constant of some sort.
6111 RELOC indicates whether forming the initial value of DECL requires
6112 link-time relocations. */
6114 ATTRIBUTE_UNUSED static section *
6115 x86_64_elf_select_section (tree decl, int reloc,
6116 unsigned HOST_WIDE_INT align)
6118 if (ix86_in_large_data_p (decl))
6120 const char *sname = NULL;
6121 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6122 switch (categorize_decl_for_section (decl, reloc))
6124 case SECCAT_DATA:
6125 sname = ".ldata";
6126 break;
6127 case SECCAT_DATA_REL:
6128 sname = ".ldata.rel";
6129 break;
6130 case SECCAT_DATA_REL_LOCAL:
6131 sname = ".ldata.rel.local";
6132 break;
6133 case SECCAT_DATA_REL_RO:
6134 sname = ".ldata.rel.ro";
6135 break;
6136 case SECCAT_DATA_REL_RO_LOCAL:
6137 sname = ".ldata.rel.ro.local";
6138 break;
6139 case SECCAT_BSS:
6140 sname = ".lbss";
6141 flags |= SECTION_BSS;
6142 break;
6143 case SECCAT_RODATA:
6144 case SECCAT_RODATA_MERGE_STR:
6145 case SECCAT_RODATA_MERGE_STR_INIT:
6146 case SECCAT_RODATA_MERGE_CONST:
6147 sname = ".lrodata";
6148 flags &= ~SECTION_WRITE;
6149 break;
6150 case SECCAT_SRODATA:
6151 case SECCAT_SDATA:
6152 case SECCAT_SBSS:
6153 gcc_unreachable ();
6154 case SECCAT_TEXT:
6155 case SECCAT_TDATA:
6156 case SECCAT_TBSS:
6157 /* We don't split these for the medium model. Place them into
6158 default sections and hope for the best. */
6159 break;
6161 if (sname)
6163 /* We might get called with string constants, but get_named_section
6164 doesn't like them as they are not DECLs. Also, we need to set
6165 flags in that case. */
6166 if (!DECL_P (decl))
6167 return get_section (sname, flags, NULL);
6168 return get_named_section (decl, sname, reloc);
6171 return default_elf_select_section (decl, reloc, align);
6174 /* Select a set of attributes for section NAME based on the properties
6175 of DECL and whether or not RELOC indicates that DECL's initializer
6176 might contain runtime relocations. */
6178 static unsigned int ATTRIBUTE_UNUSED
6179 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6181 unsigned int flags = default_section_type_flags (decl, name, reloc);
6183 if (ix86_in_large_data_p (decl))
6184 flags |= SECTION_LARGE;
6186 if (decl == NULL_TREE
6187 && (strcmp (name, ".ldata.rel.ro") == 0
6188 || strcmp (name, ".ldata.rel.ro.local") == 0))
6189 flags |= SECTION_RELRO;
6191 if (strcmp (name, ".lbss") == 0
6192 || strncmp (name, ".lbss.", 6) == 0
6193 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6194 flags |= SECTION_BSS;
6196 return flags;
6199 /* Build up a unique section name, expressed as a
6200 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6201 RELOC indicates whether the initial value of EXP requires
6202 link-time relocations. */
6204 static void ATTRIBUTE_UNUSED
6205 x86_64_elf_unique_section (tree decl, int reloc)
6207 if (ix86_in_large_data_p (decl))
6209 const char *prefix = NULL;
6210 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6211 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6213 switch (categorize_decl_for_section (decl, reloc))
6215 case SECCAT_DATA:
6216 case SECCAT_DATA_REL:
6217 case SECCAT_DATA_REL_LOCAL:
6218 case SECCAT_DATA_REL_RO:
6219 case SECCAT_DATA_REL_RO_LOCAL:
6220 prefix = one_only ? ".ld" : ".ldata";
6221 break;
6222 case SECCAT_BSS:
6223 prefix = one_only ? ".lb" : ".lbss";
6224 break;
6225 case SECCAT_RODATA:
6226 case SECCAT_RODATA_MERGE_STR:
6227 case SECCAT_RODATA_MERGE_STR_INIT:
6228 case SECCAT_RODATA_MERGE_CONST:
6229 prefix = one_only ? ".lr" : ".lrodata";
6230 break;
6231 case SECCAT_SRODATA:
6232 case SECCAT_SDATA:
6233 case SECCAT_SBSS:
6234 gcc_unreachable ();
6235 case SECCAT_TEXT:
6236 case SECCAT_TDATA:
6237 case SECCAT_TBSS:
6238 /* We don't split these for the medium model. Place them into
6239 default sections and hope for the best. */
6240 break;
6242 if (prefix)
6244 const char *name, *linkonce;
6245 char *string;
6247 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6248 name = targetm.strip_name_encoding (name);
6250 /* If we're using one_only, then there needs to be a .gnu.linkonce
6251 prefix to the section name. */
6252 linkonce = one_only ? ".gnu.linkonce" : "";
6254 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6256 set_decl_section_name (decl, string);
6257 return;
6260 default_unique_section (decl, reloc);
6263 #ifdef COMMON_ASM_OP
6265 #ifndef LARGECOMM_SECTION_ASM_OP
6266 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6267 #endif
6269 /* This says how to output assembler code to declare an
6270 uninitialized external linkage data object.
6272 For the medium code model on x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6273 directive for large objects. */
6274 void
6275 x86_elf_aligned_decl_common (FILE *file, tree decl,
6276 const char *name, unsigned HOST_WIDE_INT size,
6277 int align)
6279 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6280 && size > (unsigned int)ix86_section_threshold)
6282 switch_to_section (get_named_section (decl, ".lbss", 0));
6283 fputs (LARGECOMM_SECTION_ASM_OP, file);
6285 else
6286 fputs (COMMON_ASM_OP, file);
6287 assemble_name (file, name);
6288 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6289 size, align / BITS_PER_UNIT);
6291 #endif
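/* For instance (illustrative), with -mcmodel=medium a common symbol larger
   than the section threshold is emitted via the directive defined above:

       .largecomm  big_buf,1048576,32      // name,size,alignment in bytes

   while smaller objects keep using the ordinary COMMON_ASM_OP (".comm").  */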
6293 /* Utility function for targets to use in implementing
6294 ASM_OUTPUT_ALIGNED_BSS. */
6296 void
6297 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6298 unsigned HOST_WIDE_INT size, int align)
6300 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6301 && size > (unsigned int)ix86_section_threshold)
6302 switch_to_section (get_named_section (decl, ".lbss", 0));
6303 else
6304 switch_to_section (bss_section);
6305 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6306 #ifdef ASM_DECLARE_OBJECT_NAME
6307 last_assemble_variable_decl = decl;
6308 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6309 #else
6310 /* Standard thing is just output label for the object. */
6311 ASM_OUTPUT_LABEL (file, name);
6312 #endif /* ASM_DECLARE_OBJECT_NAME */
6313 ASM_OUTPUT_SKIP (file, size ? size : 1);
6316 /* Decide whether we must probe the stack before any space allocation
6317 on this target. It's essentially TARGET_STACK_PROBE except when
6318 -fstack-check causes the stack to be already probed differently. */
6320 bool
6321 ix86_target_stack_probe (void)
6323 /* Do not probe the stack twice if static stack checking is enabled. */
6324 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6325 return false;
6327 return TARGET_STACK_PROBE;
6330 /* Decide whether we can make a sibling call to a function. DECL is the
6331 declaration of the function being targeted by the call and EXP is the
6332 CALL_EXPR representing the call. */
6334 static bool
6335 ix86_function_ok_for_sibcall (tree decl, tree exp)
6337 tree type, decl_or_type;
6338 rtx a, b;
6339 bool bind_global = decl && !targetm.binds_local_p (decl);
6341 if (ix86_function_naked (current_function_decl))
6342 return false;
6344 /* Sibling call isn't OK if there are no caller-saved registers
6345 since all registers must be preserved before return. */
6346 if (cfun->machine->no_caller_saved_registers)
6347 return false;
6349 /* If we are generating position-independent code, we cannot sibcall
6350 optimize direct calls to global functions, as the PLT requires
6351 %ebx be live. (Darwin does not have a PLT.) */
6352 if (!TARGET_MACHO
6353 && !TARGET_64BIT
6354 && flag_pic
6355 && flag_plt
6356 && bind_global)
6357 return false;
6359 /* If we need to align the outgoing stack, then sibcalling would
6360 unalign the stack, which may break the called function. */
6361 if (ix86_minimum_incoming_stack_boundary (true)
6362 < PREFERRED_STACK_BOUNDARY)
6363 return false;
6365 if (decl)
6367 decl_or_type = decl;
6368 type = TREE_TYPE (decl);
6370 else
6372 /* We're looking at the CALL_EXPR, we need the type of the function. */
6373 type = CALL_EXPR_FN (exp); /* pointer expression */
6374 type = TREE_TYPE (type); /* pointer type */
6375 type = TREE_TYPE (type); /* function type */
6376 decl_or_type = type;
6379 /* Check that the return value locations are the same. Like
6380 if we are returning floats on the 80387 register stack, we cannot
6381 make a sibcall from a function that doesn't return a float to a
6382 function that does or, conversely, from a function that does return
6383 a float to a function that doesn't; the necessary stack adjustment
6384 would not be executed. This is also the place we notice
6385 differences in the return value ABI. Note that it is ok for one
6386 of the functions to have void return type as long as the return
6387 value of the other is passed in a register. */
6388 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6389 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6390 cfun->decl, false);
6391 if (STACK_REG_P (a) || STACK_REG_P (b))
6393 if (!rtx_equal_p (a, b))
6394 return false;
6396 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6398 else if (!rtx_equal_p (a, b))
6399 return false;
6401 if (TARGET_64BIT)
6403 /* The SYSV ABI has more call-clobbered registers;
6404 disallow sibcalls from MS to SYSV. */
6405 if (cfun->machine->call_abi == MS_ABI
6406 && ix86_function_type_abi (type) == SYSV_ABI)
6407 return false;
6409 else
6411 /* If this call is indirect, we'll need to be able to use a
6412 call-clobbered register for the address of the target function.
6413 Make sure that all such registers are not used for passing
6414 parameters. Note that DLLIMPORT functions and call to global
6415 function via GOT slot are indirect. */
6416 if (!decl
6417 || (bind_global && flag_pic && !flag_plt)
6418 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
6419 || flag_force_indirect_call)
6421 /* Check if regparm >= 3 since arg_reg_available is set to
6422 false if regparm == 0. If regparm is 1 or 2, there is
6423 always a call-clobbered register available.
6425 ??? The symbol indirect call doesn't need a call-clobbered
6426 register. But we don't know if this is a symbol indirect
6427 call or not here. */
6428 if (ix86_function_regparm (type, decl) >= 3
6429 && !cfun->machine->arg_reg_available)
6430 return false;
6434 /* Otherwise okay. That also includes certain types of indirect calls. */
6435 return true;
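/* Illustrative case for the checks above (hypothetical code):

     extern int tail (int);
     int wrapper (int x) { return tail (x + 1); }   // may become "jmp tail"

   The tail call is only emitted when none of the tests above rejects it;
   e.g. 32-bit PIC through the PLT for a global callee, mismatched
   return-value registers, or a 64-bit MS-ABI caller targeting a SYSV-ABI
   callee all disable the sibcall.  */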
6438 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6439 and "sseregparm" calling convention attributes;
6440 arguments as in struct attribute_spec.handler. */
6442 static tree
6443 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6444 bool *no_add_attrs)
6446 if (TREE_CODE (*node) != FUNCTION_TYPE
6447 && TREE_CODE (*node) != METHOD_TYPE
6448 && TREE_CODE (*node) != FIELD_DECL
6449 && TREE_CODE (*node) != TYPE_DECL)
6451 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6452 name);
6453 *no_add_attrs = true;
6454 return NULL_TREE;
6457 /* Can combine regparm with all attributes but fastcall and thiscall. */
6458 if (is_attribute_p ("regparm", name))
6460 tree cst;
6462 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6464 error ("fastcall and regparm attributes are not compatible");
6467 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6469 error ("regparm and thiscall attributes are not compatible");
6472 cst = TREE_VALUE (args);
6473 if (TREE_CODE (cst) != INTEGER_CST)
6475 warning (OPT_Wattributes,
6476 "%qE attribute requires an integer constant argument",
6477 name);
6478 *no_add_attrs = true;
6480 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6482 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6483 name, REGPARM_MAX);
6484 *no_add_attrs = true;
6487 return NULL_TREE;
6490 if (TARGET_64BIT)
6492 /* Do not warn when emulating the MS ABI. */
6493 if ((TREE_CODE (*node) != FUNCTION_TYPE
6494 && TREE_CODE (*node) != METHOD_TYPE)
6495 || ix86_function_type_abi (*node) != MS_ABI)
6496 warning (OPT_Wattributes, "%qE attribute ignored",
6497 name);
6498 *no_add_attrs = true;
6499 return NULL_TREE;
6502 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6503 if (is_attribute_p ("fastcall", name))
6505 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6507 error ("fastcall and cdecl attributes are not compatible");
6509 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6511 error ("fastcall and stdcall attributes are not compatible");
6513 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6515 error ("fastcall and regparm attributes are not compatible");
6517 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6519 error ("fastcall and thiscall attributes are not compatible");
6523 /* Can combine stdcall with fastcall (redundant), regparm and
6524 sseregparm. */
6525 else if (is_attribute_p ("stdcall", name))
6527 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6529 error ("stdcall and cdecl attributes are not compatible");
6531 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6533 error ("stdcall and fastcall attributes are not compatible");
6535 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6537 error ("stdcall and thiscall attributes are not compatible");
6541 /* Can combine cdecl with regparm and sseregparm. */
6542 else if (is_attribute_p ("cdecl", name))
6544 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6546 error ("stdcall and cdecl attributes are not compatible");
6548 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6550 error ("fastcall and cdecl attributes are not compatible");
6552 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6554 error ("cdecl and thiscall attributes are not compatible");
6557 else if (is_attribute_p ("thiscall", name))
6559 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6560 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6561 name);
6562 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6564 error ("stdcall and thiscall attributes are not compatible");
6566 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6568 error ("fastcall and thiscall attributes are not compatible");
6570 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6572 error ("cdecl and thiscall attributes are not compatible");
6576 /* Can combine sseregparm with all attributes. */
6578 return NULL_TREE;
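/* Illustrative declarations using the calling-convention attributes checked
   above (hypothetical prototypes, 32-bit only):

     __attribute__ ((regparm (3))) int f (int a, int b, int c); // args in registers
     __attribute__ ((fastcall))    int g (int a, int b);        // ECX, EDX
     __attribute__ ((stdcall))     int h (int a, int b);        // callee pops args
     __attribute__ ((fastcall, regparm (2))) int k (int a);     // rejected above

   The last one triggers the "fastcall and regparm attributes are not
   compatible" error.  */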
6581 /* The transactional memory builtins are implicitly regparm or fastcall
6582 depending on the ABI. Override the generic do-nothing attribute that
6583 these builtins were declared with, and replace it with one of the two
6584 attributes that we expect elsewhere. */
6586 static tree
6587 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6588 int flags, bool *no_add_attrs)
6590 tree alt;
6592 /* In no case do we want to add the placeholder attribute. */
6593 *no_add_attrs = true;
6595 /* The 64-bit ABI is unchanged for transactional memory. */
6596 if (TARGET_64BIT)
6597 return NULL_TREE;
6599 /* ??? Is there a better way to validate 32-bit windows? We have
6600 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6601 if (CHECK_STACK_LIMIT > 0)
6602 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6603 else
6605 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6606 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6608 decl_attributes (node, alt, flags);
6610 return NULL_TREE;
6613 /* This function determines from TYPE the calling-convention. */
6615 unsigned int
6616 ix86_get_callcvt (const_tree type)
6618 unsigned int ret = 0;
6619 bool is_stdarg;
6620 tree attrs;
6622 if (TARGET_64BIT)
6623 return IX86_CALLCVT_CDECL;
6625 attrs = TYPE_ATTRIBUTES (type);
6626 if (attrs != NULL_TREE)
6628 if (lookup_attribute ("cdecl", attrs))
6629 ret |= IX86_CALLCVT_CDECL;
6630 else if (lookup_attribute ("stdcall", attrs))
6631 ret |= IX86_CALLCVT_STDCALL;
6632 else if (lookup_attribute ("fastcall", attrs))
6633 ret |= IX86_CALLCVT_FASTCALL;
6634 else if (lookup_attribute ("thiscall", attrs))
6635 ret |= IX86_CALLCVT_THISCALL;
6637 /* Regparm isn't allowed for thiscall and fastcall. */
6638 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6640 if (lookup_attribute ("regparm", attrs))
6641 ret |= IX86_CALLCVT_REGPARM;
6642 if (lookup_attribute ("sseregparm", attrs))
6643 ret |= IX86_CALLCVT_SSEREGPARM;
6646 if (IX86_BASE_CALLCVT(ret) != 0)
6647 return ret;
6650 is_stdarg = stdarg_p (type);
6651 if (TARGET_RTD && !is_stdarg)
6652 return IX86_CALLCVT_STDCALL | ret;
6654 if (ret != 0
6655 || is_stdarg
6656 || TREE_CODE (type) != METHOD_TYPE
6657 || ix86_function_type_abi (type) != MS_ABI)
6658 return IX86_CALLCVT_CDECL | ret;
6660 return IX86_CALLCVT_THISCALL;
6663 /* Return 0 if the attributes for two types are incompatible, 1 if they
6664 are compatible, and 2 if they are nearly compatible (which causes a
6665 warning to be generated). */
6667 static int
6668 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6670 unsigned int ccvt1, ccvt2;
6672 if (TREE_CODE (type1) != FUNCTION_TYPE
6673 && TREE_CODE (type1) != METHOD_TYPE)
6674 return 1;
6676 ccvt1 = ix86_get_callcvt (type1);
6677 ccvt2 = ix86_get_callcvt (type2);
6678 if (ccvt1 != ccvt2)
6679 return 0;
6680 if (ix86_function_regparm (type1, NULL)
6681 != ix86_function_regparm (type2, NULL))
6682 return 0;
6684 return 1;
6687 /* Return the regparm value for a function with the indicated TYPE and DECL.
6688 DECL may be NULL when calling a function indirectly
6689 or considering a libcall. */
6691 static int
6692 ix86_function_regparm (const_tree type, const_tree decl)
6694 tree attr;
6695 int regparm;
6696 unsigned int ccvt;
6698 if (TARGET_64BIT)
6699 return (ix86_function_type_abi (type) == SYSV_ABI
6700 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6701 ccvt = ix86_get_callcvt (type);
6702 regparm = ix86_regparm;
6704 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6706 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6707 if (attr)
6709 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6710 return regparm;
6713 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6714 return 2;
6715 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6716 return 1;
6718 /* Use register calling convention for local functions when possible. */
6719 if (decl
6720 && TREE_CODE (decl) == FUNCTION_DECL)
6722 cgraph_node *target = cgraph_node::get (decl);
6723 if (target)
6724 target = target->function_symbol ();
6726 /* Caller and callee must agree on the calling convention, so
6727 checking just the optimize attribute here would mean that with
6728 __attribute__((optimize (...))) the caller could use the regparm convention
6729 and the callee not, or vice versa. Instead look at whether the callee
6730 is optimized or not. */
6731 if (target && opt_for_fn (target->decl, optimize)
6732 && !(profile_flag && !flag_fentry))
6734 cgraph_local_info *i = &target->local;
6735 if (i && i->local && i->can_change_signature)
6737 int local_regparm, globals = 0, regno;
6739 /* Make sure no regparm register is taken by a
6740 fixed register variable. */
6741 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6742 local_regparm++)
6743 if (fixed_regs[local_regparm])
6744 break;
6746 /* We don't want to use regparm(3) for nested functions as
6747 these use a static chain pointer in the third argument. */
6748 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6749 local_regparm = 2;
6751 /* Save a register for the split stack. */
6752 if (flag_split_stack)
6754 if (local_regparm == 3)
6755 local_regparm = 2;
6756 else if (local_regparm == 2
6757 && DECL_STATIC_CHAIN (target->decl))
6758 local_regparm = 1;
6761 /* Each fixed register usage increases register pressure,
6762 so fewer registers should be used for argument passing.
6763 This functionality can be overridden by an explicit
6764 regparm value. */
6765 for (regno = AX_REG; regno <= DI_REG; regno++)
6766 if (fixed_regs[regno])
6767 globals++;
6769 local_regparm
6770 = globals < local_regparm ? local_regparm - globals : 0;
6772 if (local_regparm > regparm)
6773 regparm = local_regparm;
6778 return regparm;
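/* For reference (an assumption from the i386 conventions, not restated in
   this function): regparm arguments go to EAX, EDX and ECX in that order, e.g.

     __attribute__ ((regparm (3))) int add3 (int a, int b, int c);

   receives a in %eax, b in %edx and c in %ecx; this is why fixed/global
   register variables among the argument registers shrink the local_regparm
   value computed above.  */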
6781 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6782 DFmode (2) arguments in SSE registers for a function with the
6783 indicated TYPE and DECL. DECL may be NULL when calling a function
6784 indirectly or considering a libcall. Return -1 if any FP parameter
6785 should be rejected by error. This is used in situations where we imply the
6786 SSE calling convention but the function is called from another function with
6787 SSE disabled. Otherwise return 0. */
6789 static int
6790 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6792 gcc_assert (!TARGET_64BIT);
6794 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6795 by the sseregparm attribute. */
6796 if (TARGET_SSEREGPARM
6797 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6799 if (!TARGET_SSE)
6801 if (warn)
6803 if (decl)
6804 error ("calling %qD with attribute sseregparm without "
6805 "SSE/SSE2 enabled", decl);
6806 else
6807 error ("calling %qT with attribute sseregparm without "
6808 "SSE/SSE2 enabled", type);
6810 return 0;
6813 return 2;
6816 if (!decl)
6817 return 0;
6819 cgraph_node *target = cgraph_node::get (decl);
6820 if (target)
6821 target = target->function_symbol ();
6823 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6824 (and DFmode for SSE2) arguments in SSE registers. */
6825 if (target
6826 /* TARGET_SSE_MATH */
6827 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6828 && opt_for_fn (target->decl, optimize)
6829 && !(profile_flag && !flag_fentry))
6831 cgraph_local_info *i = &target->local;
6832 if (i && i->local && i->can_change_signature)
6834 /* Refuse to produce wrong code when a local function with SSE enabled
6835 is called from an SSE-disabled function.
6836 FIXME: We need a way to detect these cases across ltrans partitions
6837 and avoid using SSE calling conventions on local functions called
6838 from functions with SSE disabled. For now at least delay the
6839 warning until we know we are going to produce wrong code.
6840 See PR66047. */
6841 if (!TARGET_SSE && warn)
6842 return -1;
6843 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6844 ->x_ix86_isa_flags) ? 2 : 1;
6848 return 0;
6851 /* Return true if EAX is live at the start of the function. Used by
6852 ix86_expand_prologue to determine if we need special help before
6853 calling allocate_stack_worker. */
6855 static bool
6856 ix86_eax_live_at_start_p (void)
6858 /* Cheat. Don't bother working forward from ix86_function_regparm
6859 to the function type to whether an actual argument is located in
6860 eax. Instead just look at cfg info, which is still close enough
6861 to correct at this point. This gives false positives for broken
6862 functions that might use uninitialized data that happens to be
6863 allocated in eax, but who cares? */
6864 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6867 static bool
6868 ix86_keep_aggregate_return_pointer (tree fntype)
6870 tree attr;
6872 if (!TARGET_64BIT)
6874 attr = lookup_attribute ("callee_pop_aggregate_return",
6875 TYPE_ATTRIBUTES (fntype));
6876 if (attr)
6877 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6879 /* For 32-bit MS-ABI the default is to keep aggregate
6880 return pointer. */
6881 if (ix86_function_type_abi (fntype) == MS_ABI)
6882 return true;
6884 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6887 /* Value is the number of bytes of arguments automatically
6888 popped when returning from a subroutine call.
6889 FUNDECL is the declaration node of the function (as a tree),
6890 FUNTYPE is the data type of the function (as a tree),
6891 or for a library call it is an identifier node for the subroutine name.
6892 SIZE is the number of bytes of arguments passed on the stack.
6894 On the 80386, the RTD insn may be used to pop them if the number
6895 of args is fixed, but if the number is variable then the caller
6896 must pop them all. RTD can't be used for library calls now
6897 because the library is compiled with the Unix compiler.
6898 Use of RTD is a selectable option, since it is incompatible with
6899 standard Unix calling sequences. If the option is not selected,
6900 the caller must always pop the args.
6902 The attribute stdcall is equivalent to RTD on a per module basis. */
6904 static poly_int64
6905 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6907 unsigned int ccvt;
6909 /* None of the 64-bit ABIs pop arguments. */
6910 if (TARGET_64BIT)
6911 return 0;
6913 ccvt = ix86_get_callcvt (funtype);
6915 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6916 | IX86_CALLCVT_THISCALL)) != 0
6917 && ! stdarg_p (funtype))
6918 return size;
6920 /* Lose any fake structure return argument if it is passed on the stack. */
6921 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6922 && !ix86_keep_aggregate_return_pointer (funtype))
6924 int nregs = ix86_function_regparm (funtype, fundecl);
6925 if (nregs == 0)
6926 return GET_MODE_SIZE (Pmode);
6929 return 0;
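/* Worked example for the rule above (illustrative):

     __attribute__ ((stdcall)) int f (int a, int b);   // 8 bytes of stack args

   f returns with "ret $8", i.e. ix86_return_pops_args yields SIZE (8 here),
   whereas a cdecl function returns with a plain "ret" and the caller pops
   the 8 bytes itself.  */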
6932 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6934 static bool
6935 ix86_legitimate_combined_insn (rtx_insn *insn)
6937 int i;
6939 /* Check operand constraints in case hard registers were propagated
6940 into insn pattern. This check prevents combine pass from
6941 generating insn patterns with invalid hard register operands.
6942 These invalid insns can eventually confuse reload to error out
6943 with a spill failure. See also PRs 46829 and 46843. */
6945 gcc_assert (INSN_CODE (insn) >= 0);
6947 extract_insn (insn);
6948 preprocess_constraints (insn);
6950 int n_operands = recog_data.n_operands;
6951 int n_alternatives = recog_data.n_alternatives;
6952 for (i = 0; i < n_operands; i++)
6954 rtx op = recog_data.operand[i];
6955 machine_mode mode = GET_MODE (op);
6956 const operand_alternative *op_alt;
6957 int offset = 0;
6958 bool win;
6959 int j;
6961 /* A unary operator may be accepted by the predicate, but it
6962 is irrelevant for matching constraints. */
6963 if (UNARY_P (op))
6964 op = XEXP (op, 0);
6966 if (SUBREG_P (op))
6968 if (REG_P (SUBREG_REG (op))
6969 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6970 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6971 GET_MODE (SUBREG_REG (op)),
6972 SUBREG_BYTE (op),
6973 GET_MODE (op));
6974 op = SUBREG_REG (op);
6977 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6978 continue;
6980 op_alt = recog_op_alt;
6982 /* Operand has no constraints, anything is OK. */
6983 win = !n_alternatives;
6985 alternative_mask preferred = get_preferred_alternatives (insn);
6986 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6988 if (!TEST_BIT (preferred, j))
6989 continue;
6990 if (op_alt[i].anything_ok
6991 || (op_alt[i].matches != -1
6992 && operands_match_p
6993 (recog_data.operand[i],
6994 recog_data.operand[op_alt[i].matches]))
6995 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6997 win = true;
6998 break;
7002 if (!win)
7003 return false;
7006 return true;
7009 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
7011 static unsigned HOST_WIDE_INT
7012 ix86_asan_shadow_offset (void)
7014 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
7015 : HOST_WIDE_INT_C (0x7fff8000))
7016 : (HOST_WIDE_INT_1 << 29);
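/* Background for the constants above (standard AddressSanitizer mapping):

     shadow_address = (address >> 3) + shadow_offset

   so the offset returned here is 0x7fff8000 for LP64 ELF targets,
   1 << 44 for LP64 Mach-O, and 1 << 29 otherwise (32-bit and x32).  */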
7019 /* Argument support functions. */
7021 /* Return true when register may be used to pass function parameters. */
7022 bool
7023 ix86_function_arg_regno_p (int regno)
7025 int i;
7026 enum calling_abi call_abi;
7027 const int *parm_regs;
7029 if (TARGET_MPX && BND_REGNO_P (regno))
7030 return true;
7032 if (!TARGET_64BIT)
7034 if (TARGET_MACHO)
7035 return (regno < REGPARM_MAX
7036 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
7037 else
7038 return (regno < REGPARM_MAX
7039 || (TARGET_MMX && MMX_REGNO_P (regno)
7040 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
7041 || (TARGET_SSE && SSE_REGNO_P (regno)
7042 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
7045 if (TARGET_SSE && SSE_REGNO_P (regno)
7046 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
7047 return true;
7049 /* TODO: The function should depend on current function ABI but
7050 builtins.c would need updating then. Therefore we use the
7051 default ABI. */
7052 call_abi = ix86_cfun_abi ();
7054 /* RAX is used as hidden argument to va_arg functions. */
7055 if (call_abi == SYSV_ABI && regno == AX_REG)
7056 return true;
7058 if (call_abi == MS_ABI)
7059 parm_regs = x86_64_ms_abi_int_parameter_registers;
7060 else
7061 parm_regs = x86_64_int_parameter_registers;
7063 for (i = 0; i < (call_abi == MS_ABI
7064 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7065 if (regno == parm_regs[i])
7066 return true;
7067 return false;
7070 /* Return if we do not know how to pass TYPE solely in registers. */
7072 static bool
7073 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7075 if (must_pass_in_stack_var_size_or_pad (mode, type))
7076 return true;
7078 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7079 The layout_type routine is crafty and tries to trick us into passing
7080 currently unsupported vector types on the stack by using TImode. */
7081 return (!TARGET_64BIT && mode == TImode
7082 && type && TREE_CODE (type) != VECTOR_TYPE);
7085 /* Return the size, in bytes, of the area reserved for arguments passed
7086 in registers for the function represented by FNDECL, depending on the
7087 ABI used. */
7088 int
7089 ix86_reg_parm_stack_space (const_tree fndecl)
7091 enum calling_abi call_abi = SYSV_ABI;
7092 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7093 call_abi = ix86_function_abi (fndecl);
7094 else
7095 call_abi = ix86_function_type_abi (fndecl);
7096 if (TARGET_64BIT && call_abi == MS_ABI)
7097 return 32;
7098 return 0;
7101 /* We add this as a workaround in order to use libc_has_function
7102 hook in i386.md. */
7103 bool
7104 ix86_libc_has_function (enum function_class fn_class)
7106 return targetm.libc_has_function (fn_class);
7109 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
7110 specifying the call ABI used. */
7111 enum calling_abi
7112 ix86_function_type_abi (const_tree fntype)
7114 enum calling_abi abi = ix86_abi;
7116 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7117 return abi;
7119 if (abi == SYSV_ABI
7120 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7122 static int warned;
7123 if (TARGET_X32 && !warned)
7125 error ("X32 does not support ms_abi attribute");
7126 warned = 1;
7129 abi = MS_ABI;
7131 else if (abi == MS_ABI
7132 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7133 abi = SYSV_ABI;
7135 return abi;
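/* A sketch of the attribute handling above: with the default SysV ABI,

     int h (int) __attribute__ ((ms_abi));

   makes ix86_function_type_abi return MS_ABI for the type of H, while
   under -mabi=ms an explicit sysv_abi attribute switches the result
   back to SYSV_ABI.  */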
7138 static enum calling_abi
7139 ix86_function_abi (const_tree fndecl)
7141 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7144 /* Return SYSV_ABI or MS_ABI, depending on cfun,
7145    specifying the call ABI used.  */
7146 enum calling_abi
7147 ix86_cfun_abi (void)
7149 return cfun ? cfun->machine->call_abi : ix86_abi;
7152 static bool
7153 ix86_function_ms_hook_prologue (const_tree fn)
7155 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7157 if (decl_function_context (fn) != NULL_TREE)
7158 error_at (DECL_SOURCE_LOCATION (fn),
7159 "ms_hook_prologue is not compatible with nested function");
7160 else
7161 return true;
7163 return false;
7166 static bool
7167 ix86_function_naked (const_tree fn)
7169 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7170 return true;
7172 return false;
7175 /* Write the extra assembler code needed to declare a function properly. */
7177 void
7178 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7179 tree decl)
7181 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7183 if (is_ms_hook)
7185 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7186 unsigned int filler_cc = 0xcccccccc;
7188 for (i = 0; i < filler_count; i += 4)
7189 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7192 #ifdef SUBTARGET_ASM_UNWIND_INIT
7193 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7194 #endif
7196 ASM_OUTPUT_LABEL (asm_out_file, fname);
7198 /* Output magic byte marker, if hot-patch attribute is set. */
7199 if (is_ms_hook)
7201 if (TARGET_64BIT)
7203 /* leaq [%rsp + 0], %rsp */
7204 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7205 asm_out_file);
7207 else
7209 /* movl.s %edi, %edi
7210 push %ebp
7211 movl.s %esp, %ebp */
7212 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
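/* A sketch of the assembly produced above for a 32-bit function with the
   ms_hook_prologue attribute (assuming the default ASM_LONG and ASM_BYTE
   spellings):

	.long 0xcccccccc	# 16 bytes of 0xcc filler before the label
	.long 0xcccccccc
	.long 0xcccccccc
	.long 0xcccccccc
     fn:
	.byte 0x8b, 0xff, 0x55, 0x8b, 0xec

   The two-byte movl.s at the label can later be overwritten by a hot
   patcher with a short backwards jump into the filler area.  */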
7217 /* Implementation of the call ABI switching target hook.  The call
7218    register sets specific to FNDECL are selected.  See also
7219    ix86_conditional_register_usage for more details.  */
7220 void
7221 ix86_call_abi_override (const_tree fndecl)
7223 cfun->machine->call_abi = ix86_function_abi (fndecl);
7226 /* Return true if a pseudo register should be created and used to hold
7227    the GOT address for PIC code.  */
7228 bool
7229 ix86_use_pseudo_pic_reg (void)
7231 if ((TARGET_64BIT
7232 && (ix86_cmodel == CM_SMALL_PIC
7233 || TARGET_PECOFF))
7234 || !flag_pic)
7235 return false;
7236 return true;
7239 /* Initialize large model PIC register. */
7241 static void
7242 ix86_init_large_pic_reg (unsigned int tmp_regno)
7244 rtx_code_label *label;
7245 rtx tmp_reg;
7247 gcc_assert (Pmode == DImode);
7248 label = gen_label_rtx ();
7249 emit_label (label);
7250 LABEL_PRESERVE_P (label) = 1;
7251 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7252 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7253 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7254 label));
7255 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7256 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7257 pic_offset_table_rtx, tmp_reg));
7258 const char *name = LABEL_NAME (label);
7259 PUT_CODE (label, NOTE);
7260 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7261 NOTE_DELETED_LABEL_NAME (label) = name;
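/* The sequence emitted above corresponds roughly to the following
   assembly for -mcmodel=large -fpic (a sketch; the register actually
   holding the GOT pointer is chosen by the register allocator, and
   TMP_REGNO is %r11 as used by the caller below):

     .L1:
	leaq	.L1(%rip), %rbx
	movabsq	$_GLOBAL_OFFSET_TABLE_-.L1, %r11
	addq	%r11, %rbx  */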
7264 /* Create and initialize PIC register if required. */
7265 static void
7266 ix86_init_pic_reg (void)
7268 edge entry_edge;
7269 rtx_insn *seq;
7271 if (!ix86_use_pseudo_pic_reg ())
7272 return;
7274 start_sequence ();
7276 if (TARGET_64BIT)
7278 if (ix86_cmodel == CM_LARGE_PIC)
7279 ix86_init_large_pic_reg (R11_REG);
7280 else
7281 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7283 else
7285 /* If there will be an mcount call in the function, it is more profitable
7286    to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM.  */
7287 rtx reg = crtl->profile
7288 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7289 : pic_offset_table_rtx;
7290 rtx_insn *insn = emit_insn (gen_set_got (reg));
7291 RTX_FRAME_RELATED_P (insn) = 1;
7292 if (crtl->profile)
7293 emit_move_insn (pic_offset_table_rtx, reg);
7294 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7297 seq = get_insns ();
7298 end_sequence ();
7300 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7301 insert_insn_on_edge (seq, entry_edge);
7302 commit_one_edge_insertion (entry_edge);
7305 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7306 for a call to a function whose data type is FNTYPE.
7307 For a library call, FNTYPE is 0. */
7309 void
7310 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7311 tree fntype, /* tree ptr for function decl */
7312 rtx libname, /* SYMBOL_REF of library name or 0 */
7313 tree fndecl,
7314 int caller)
7316 struct cgraph_local_info *i = NULL;
7317 struct cgraph_node *target = NULL;
7319 memset (cum, 0, sizeof (*cum));
7321 if (fndecl)
7323 target = cgraph_node::get (fndecl);
7324 if (target)
7326 target = target->function_symbol ();
7327 i = cgraph_node::local_info (target->decl);
7328 cum->call_abi = ix86_function_abi (target->decl);
7330 else
7331 cum->call_abi = ix86_function_abi (fndecl);
7333 else
7334 cum->call_abi = ix86_function_type_abi (fntype);
7336 cum->caller = caller;
7338 /* Set up the number of registers to use for passing arguments. */
7339 cum->nregs = ix86_regparm;
7340 if (TARGET_64BIT)
7342 cum->nregs = (cum->call_abi == SYSV_ABI
7343 ? X86_64_REGPARM_MAX
7344 : X86_64_MS_REGPARM_MAX);
7346 if (TARGET_SSE)
7348 cum->sse_nregs = SSE_REGPARM_MAX;
7349 if (TARGET_64BIT)
7351 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7352 ? X86_64_SSE_REGPARM_MAX
7353 : X86_64_MS_SSE_REGPARM_MAX);
7356 if (TARGET_MMX)
7357 cum->mmx_nregs = MMX_REGPARM_MAX;
7358 cum->warn_avx512f = true;
7359 cum->warn_avx = true;
7360 cum->warn_sse = true;
7361 cum->warn_mmx = true;
7363 /* Because types might mismatch between caller and callee, we need to
7364    use the actual type of the function for local calls.
7365    FIXME: cgraph_analyze can be told to actually record if a function uses
7366    va_start, so for local functions maybe_vaarg can be made more aggressive,
7367    helping K&R code.
7368    FIXME: once the type system is fixed, we won't need this code anymore.  */
7369 if (i && i->local && i->can_change_signature)
7370 fntype = TREE_TYPE (target->decl);
7371 cum->stdarg = stdarg_p (fntype);
7372 cum->maybe_vaarg = (fntype
7373 ? (!prototype_p (fntype) || stdarg_p (fntype))
7374 : !libname);
7376 cum->bnd_regno = FIRST_BND_REG;
7377 cum->bnds_in_bt = 0;
7378 cum->force_bnd_pass = 0;
7379 cum->decl = fndecl;
7381 cum->warn_empty = !warn_abi || cum->stdarg;
7382 if (!cum->warn_empty && fntype)
7384 function_args_iterator iter;
7385 tree argtype;
7386 bool seen_empty_type = false;
7387 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7389 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7390 break;
7391 if (TYPE_EMPTY_P (argtype))
7392 seen_empty_type = true;
7393 else if (seen_empty_type)
7395 cum->warn_empty = true;
7396 break;
7401 if (!TARGET_64BIT)
7403 /* If there are variable arguments, then we won't pass anything
7404 in registers in 32-bit mode. */
7405 if (stdarg_p (fntype))
7407 cum->nregs = 0;
7408 /* Since in 32-bit mode variable arguments are always passed on the
7409    stack, there is a scratch register available for an indirect
7410    sibcall.  */
7411 cfun->machine->arg_reg_available = true;
7412 cum->sse_nregs = 0;
7413 cum->mmx_nregs = 0;
7414 cum->warn_avx512f = false;
7415 cum->warn_avx = false;
7416 cum->warn_sse = false;
7417 cum->warn_mmx = false;
7418 return;
7421 /* Use ecx and edx registers if function has fastcall attribute,
7422 else look for regparm information. */
7423 if (fntype)
7425 unsigned int ccvt = ix86_get_callcvt (fntype);
7426 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7428 cum->nregs = 1;
7429 cum->fastcall = 1; /* Same first register as in fastcall. */
7431 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7433 cum->nregs = 2;
7434 cum->fastcall = 1;
7436 else
7437 cum->nregs = ix86_function_regparm (fntype, fndecl);
7440 /* Set up the number of SSE registers used for passing SFmode
7441 and DFmode arguments. Warn for mismatching ABI. */
7442 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7445 cfun->machine->arg_reg_available = (cum->nregs > 0);
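/* A 32-bit sketch of the calling-convention setup above: for

     void __attribute__ ((fastcall)) f (int a, int b, int c);

   cum->nregs is set to 2 and cum->fastcall to 1, so A and B are later
   assigned to %ecx and %edx while C goes on the stack; with the thiscall
   convention only the first (this pointer) argument gets %ecx.  */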
7448 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7449 But in the case of vector types, it is some vector mode.
7451 When we have only some of our vector isa extensions enabled, then there
7452 are some modes for which vector_mode_supported_p is false. For these
7453 modes, the generic vector support in gcc will choose some non-vector mode
7454 in order to implement the type. By computing the natural mode, we'll
7455 select the proper ABI location for the operand and not depend on whatever
7456 the middle-end decides to do with these vector types.
7458    The middle-end can't deal with vector types wider than 16 bytes.  In this
7459    case, we return the original mode and warn about the ABI change if CUM
7460    isn't NULL.
7462    If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7463    available for the function return value.  */
7465 static machine_mode
7466 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7467 bool in_return)
7469 machine_mode mode = TYPE_MODE (type);
7471 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7473 HOST_WIDE_INT size = int_size_in_bytes (type);
7474 if ((size == 8 || size == 16 || size == 32 || size == 64)
7475 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7476 && TYPE_VECTOR_SUBPARTS (type) > 1)
7478 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7480 /* There are no XFmode vector modes. */
7481 if (innermode == XFmode)
7482 return mode;
7484 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7485 mode = MIN_MODE_VECTOR_FLOAT;
7486 else
7487 mode = MIN_MODE_VECTOR_INT;
7489 /* Get the mode which has this inner mode and number of units. */
7490 FOR_EACH_MODE_FROM (mode, mode)
7491 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7492 && GET_MODE_INNER (mode) == innermode)
7494 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7496 static bool warnedavx512f;
7497 static bool warnedavx512f_ret;
7499 if (cum && cum->warn_avx512f && !warnedavx512f)
7501 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7502 "without AVX512F enabled changes the ABI"))
7503 warnedavx512f = true;
7505 else if (in_return && !warnedavx512f_ret)
7507 if (warning (OPT_Wpsabi, "AVX512F vector return "
7508 "without AVX512F enabled changes the ABI"))
7509 warnedavx512f_ret = true;
7512 return TYPE_MODE (type);
7514 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7516 static bool warnedavx;
7517 static bool warnedavx_ret;
7519 if (cum && cum->warn_avx && !warnedavx)
7521 if (warning (OPT_Wpsabi, "AVX vector argument "
7522 "without AVX enabled changes the ABI"))
7523 warnedavx = true;
7525 else if (in_return && !warnedavx_ret)
7527 if (warning (OPT_Wpsabi, "AVX vector return "
7528 "without AVX enabled changes the ABI"))
7529 warnedavx_ret = true;
7532 return TYPE_MODE (type);
7534 else if (((size == 8 && TARGET_64BIT) || size == 16)
7535 && !TARGET_SSE
7536 && !TARGET_IAMCU)
7538 static bool warnedsse;
7539 static bool warnedsse_ret;
7541 if (cum && cum->warn_sse && !warnedsse)
7543 if (warning (OPT_Wpsabi, "SSE vector argument "
7544 "without SSE enabled changes the ABI"))
7545 warnedsse = true;
7547 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7549 if (warning (OPT_Wpsabi, "SSE vector return "
7550 "without SSE enabled changes the ABI"))
7551 warnedsse_ret = true;
7554 else if ((size == 8 && !TARGET_64BIT)
7555 && (!cfun
7556 || cfun->machine->func_type == TYPE_NORMAL)
7557 && !TARGET_MMX
7558 && !TARGET_IAMCU)
7560 static bool warnedmmx;
7561 static bool warnedmmx_ret;
7563 if (cum && cum->warn_mmx && !warnedmmx)
7565 if (warning (OPT_Wpsabi, "MMX vector argument "
7566 "without MMX enabled changes the ABI"))
7567 warnedmmx = true;
7569 else if (in_return && !warnedmmx_ret)
7571 if (warning (OPT_Wpsabi, "MMX vector return "
7572 "without MMX enabled changes the ABI"))
7573 warnedmmx_ret = true;
7576 return mode;
7579 gcc_unreachable ();
7583 return mode;
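/* For example (a sketch, assuming a 64-bit compilation):

     typedef float v8sf __attribute__ ((vector_size (32)));

   yields V8SFmode here when AVX is enabled.  Without AVX the generic
   code did not give the type a vector mode, so the function warns about
   the -Wpsabi ABI change and falls back to TYPE_MODE.  */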
7586 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7587 this may not agree with the mode that the type system has chosen for the
7588 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7589 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7591 static rtx
7592 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7593 unsigned int regno)
7595 rtx tmp;
7597 if (orig_mode != BLKmode)
7598 tmp = gen_rtx_REG (orig_mode, regno);
7599 else
7601 tmp = gen_rtx_REG (mode, regno);
7602 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7603 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7606 return tmp;
7609 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7610 of this code is to classify each 8bytes of incoming argument by the register
7611 class and assign registers accordingly. */
7613 /* Return the union class of CLASS1 and CLASS2.
7614 See the x86-64 PS ABI for details. */
7616 static enum x86_64_reg_class
7617 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7619 /* Rule #1: If both classes are equal, this is the resulting class. */
7620 if (class1 == class2)
7621 return class1;
7623 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7624 the other class. */
7625 if (class1 == X86_64_NO_CLASS)
7626 return class2;
7627 if (class2 == X86_64_NO_CLASS)
7628 return class1;
7630 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7631 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7632 return X86_64_MEMORY_CLASS;
7634 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7635 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7636 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7637 return X86_64_INTEGERSI_CLASS;
7638 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7639 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7640 return X86_64_INTEGER_CLASS;
7642 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7643 MEMORY is used. */
7644 if (class1 == X86_64_X87_CLASS
7645 || class1 == X86_64_X87UP_CLASS
7646 || class1 == X86_64_COMPLEX_X87_CLASS
7647 || class2 == X86_64_X87_CLASS
7648 || class2 == X86_64_X87UP_CLASS
7649 || class2 == X86_64_COMPLEX_X87_CLASS)
7650 return X86_64_MEMORY_CLASS;
7652 /* Rule #6: Otherwise class SSE is used. */
7653 return X86_64_SSE_CLASS;
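/* Worked example of the merge rules above: for

     struct s { int i; float f; };

   the single eightbyte covering both fields merges an integer class
   (from I) with an SSE class (from F), and by rule #4 the result is
   X86_64_INTEGER_CLASS, so the whole struct travels in one
   general-purpose register under the 64-bit SysV ABI.  */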
7656 /* Classify the argument of type TYPE and mode MODE.
7657 CLASSES will be filled by the register class used to pass each word
7658 of the operand. The number of words is returned. In case the parameter
7659 should be passed in memory, 0 is returned. As a special case for zero
7660 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7662    BIT_OFFSET is used internally for handling records and specifies the
7663    offset in bits modulo 512 to avoid overflow cases.
7665    See the x86-64 PS ABI for details.  */
7668 static int
7669 classify_argument (machine_mode mode, const_tree type,
7670 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7672 HOST_WIDE_INT bytes =
7673 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7674 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7676 /* Variable sized entities are always passed/returned in memory. */
7677 if (bytes < 0)
7678 return 0;
7680 if (mode != VOIDmode
7681 && targetm.calls.must_pass_in_stack (mode, type))
7682 return 0;
7684 if (type && AGGREGATE_TYPE_P (type))
7686 int i;
7687 tree field;
7688 enum x86_64_reg_class subclasses[MAX_CLASSES];
7690 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7691 if (bytes > 64)
7692 return 0;
7694 for (i = 0; i < words; i++)
7695 classes[i] = X86_64_NO_CLASS;
7697 /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
7698    signal the memory class, so handle it as a special case.  */
7699 if (!words)
7701 classes[0] = X86_64_NO_CLASS;
7702 return 1;
7705 /* Classify each field of record and merge classes. */
7706 switch (TREE_CODE (type))
7708 case RECORD_TYPE:
7709 /* And now merge the fields of structure. */
7710 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7712 if (TREE_CODE (field) == FIELD_DECL)
7714 int num;
7716 if (TREE_TYPE (field) == error_mark_node)
7717 continue;
7719 /* Bitfields are always classified as integer. Handle them
7720 early, since later code would consider them to be
7721 misaligned integers. */
7722 if (DECL_BIT_FIELD (field))
7724 for (i = (int_bit_position (field)
7725 + (bit_offset % 64)) / 8 / 8;
7726 i < ((int_bit_position (field) + (bit_offset % 64))
7727 + tree_to_shwi (DECL_SIZE (field))
7728 + 63) / 8 / 8; i++)
7729 classes[i] =
7730 merge_classes (X86_64_INTEGER_CLASS,
7731 classes[i]);
7733 else
7735 int pos;
7737 type = TREE_TYPE (field);
7739 /* Flexible array member is ignored. */
7740 if (TYPE_MODE (type) == BLKmode
7741 && TREE_CODE (type) == ARRAY_TYPE
7742 && TYPE_SIZE (type) == NULL_TREE
7743 && TYPE_DOMAIN (type) != NULL_TREE
7744 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7745 == NULL_TREE))
7747 static bool warned;
7749 if (!warned && warn_psabi)
7751 warned = true;
7752 inform (input_location,
7753 "the ABI of passing struct with"
7754 " a flexible array member has"
7755 " changed in GCC 4.4");
7757 continue;
7759 num = classify_argument (TYPE_MODE (type), type,
7760 subclasses,
7761 (int_bit_position (field)
7762 + bit_offset) % 512);
7763 if (!num)
7764 return 0;
7765 pos = (int_bit_position (field)
7766 + (bit_offset % 64)) / 8 / 8;
7767 for (i = 0; i < num && (i + pos) < words; i++)
7768 classes[i + pos] =
7769 merge_classes (subclasses[i], classes[i + pos]);
7773 break;
7775 case ARRAY_TYPE:
7776 /* Arrays are handled as small records. */
7778 int num;
7779 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7780 TREE_TYPE (type), subclasses, bit_offset);
7781 if (!num)
7782 return 0;
7784 /* The partial classes are now full classes. */
7785 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7786 subclasses[0] = X86_64_SSE_CLASS;
7787 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7788 && !((bit_offset % 64) == 0 && bytes == 4))
7789 subclasses[0] = X86_64_INTEGER_CLASS;
7791 for (i = 0; i < words; i++)
7792 classes[i] = subclasses[i % num];
7794 break;
7796 case UNION_TYPE:
7797 case QUAL_UNION_TYPE:
7798 /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
7800 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7802 if (TREE_CODE (field) == FIELD_DECL)
7804 int num;
7806 if (TREE_TYPE (field) == error_mark_node)
7807 continue;
7809 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7810 TREE_TYPE (field), subclasses,
7811 bit_offset);
7812 if (!num)
7813 return 0;
7814 for (i = 0; i < num && i < words; i++)
7815 classes[i] = merge_classes (subclasses[i], classes[i]);
7818 break;
7820 default:
7821 gcc_unreachable ();
7824 if (words > 2)
7826 /* When the size exceeds 16 bytes, if the first class isn't
7827    X86_64_SSE_CLASS or any of the remaining classes aren't
7828    X86_64_SSEUP_CLASS, everything should be passed in
7829    memory.  */
7830 if (classes[0] != X86_64_SSE_CLASS)
7831 return 0;
7833 for (i = 1; i < words; i++)
7834 if (classes[i] != X86_64_SSEUP_CLASS)
7835 return 0;
7838 /* Final merger cleanup. */
7839 for (i = 0; i < words; i++)
7841 /* If one class is MEMORY, everything should be passed in
7842 memory. */
7843 if (classes[i] == X86_64_MEMORY_CLASS)
7844 return 0;
7846 /* X86_64_SSEUP_CLASS should always be preceded by
7847    X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
7848 if (classes[i] == X86_64_SSEUP_CLASS
7849 && classes[i - 1] != X86_64_SSE_CLASS
7850 && classes[i - 1] != X86_64_SSEUP_CLASS)
7852 /* The first one should never be X86_64_SSEUP_CLASS. */
7853 gcc_assert (i != 0);
7854 classes[i] = X86_64_SSE_CLASS;
7857 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7858 everything should be passed in memory. */
7859 if (classes[i] == X86_64_X87UP_CLASS
7860 && (classes[i - 1] != X86_64_X87_CLASS))
7862 static bool warned;
7864 /* The first one should never be X86_64_X87UP_CLASS. */
7865 gcc_assert (i != 0);
7866 if (!warned && warn_psabi)
7868 warned = true;
7869 inform (input_location,
7870 "the ABI of passing union with long double"
7871 " has changed in GCC 4.4");
7873 return 0;
7876 return words;
7879 /* Compute the alignment needed.  We align all types to their natural boundaries,
7880    with the exception of XFmode, which is aligned to 64 bits.  */
7881 if (mode != VOIDmode && mode != BLKmode)
7883 int mode_alignment = GET_MODE_BITSIZE (mode);
7885 if (mode == XFmode)
7886 mode_alignment = 128;
7887 else if (mode == XCmode)
7888 mode_alignment = 256;
7889 if (COMPLEX_MODE_P (mode))
7890 mode_alignment /= 2;
7891 /* Misaligned fields are always returned in memory. */
7892 if (bit_offset % mode_alignment)
7893 return 0;
7896 /* For V1xx modes, just use the base mode.  */
7897 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7898 && GET_MODE_UNIT_SIZE (mode) == bytes)
7899 mode = GET_MODE_INNER (mode);
7901 /* Classification of atomic types. */
7902 switch (mode)
7904 case E_SDmode:
7905 case E_DDmode:
7906 classes[0] = X86_64_SSE_CLASS;
7907 return 1;
7908 case E_TDmode:
7909 classes[0] = X86_64_SSE_CLASS;
7910 classes[1] = X86_64_SSEUP_CLASS;
7911 return 2;
7912 case E_DImode:
7913 case E_SImode:
7914 case E_HImode:
7915 case E_QImode:
7916 case E_CSImode:
7917 case E_CHImode:
7918 case E_CQImode:
7920 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7922 /* Analyze last 128 bits only. */
7923 size = (size - 1) & 0x7f;
7925 if (size < 32)
7927 classes[0] = X86_64_INTEGERSI_CLASS;
7928 return 1;
7930 else if (size < 64)
7932 classes[0] = X86_64_INTEGER_CLASS;
7933 return 1;
7935 else if (size < 64+32)
7937 classes[0] = X86_64_INTEGER_CLASS;
7938 classes[1] = X86_64_INTEGERSI_CLASS;
7939 return 2;
7941 else if (size < 64+64)
7943 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7944 return 2;
7946 else
7947 gcc_unreachable ();
7949 case E_CDImode:
7950 case E_TImode:
7951 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7952 return 2;
7953 case E_COImode:
7954 case E_OImode:
7955 /* OImode shouldn't be used directly. */
7956 gcc_unreachable ();
7957 case E_CTImode:
7958 return 0;
7959 case E_SFmode:
7960 if (!(bit_offset % 64))
7961 classes[0] = X86_64_SSESF_CLASS;
7962 else
7963 classes[0] = X86_64_SSE_CLASS;
7964 return 1;
7965 case E_DFmode:
7966 classes[0] = X86_64_SSEDF_CLASS;
7967 return 1;
7968 case E_XFmode:
7969 classes[0] = X86_64_X87_CLASS;
7970 classes[1] = X86_64_X87UP_CLASS;
7971 return 2;
7972 case E_TFmode:
7973 classes[0] = X86_64_SSE_CLASS;
7974 classes[1] = X86_64_SSEUP_CLASS;
7975 return 2;
7976 case E_SCmode:
7977 classes[0] = X86_64_SSE_CLASS;
7978 if (!(bit_offset % 64))
7979 return 1;
7980 else
7982 static bool warned;
7984 if (!warned && warn_psabi)
7986 warned = true;
7987 inform (input_location,
7988 "the ABI of passing structure with complex float"
7989 " member has changed in GCC 4.4");
7991 classes[1] = X86_64_SSESF_CLASS;
7992 return 2;
7994 case E_DCmode:
7995 classes[0] = X86_64_SSEDF_CLASS;
7996 classes[1] = X86_64_SSEDF_CLASS;
7997 return 2;
7998 case E_XCmode:
7999 classes[0] = X86_64_COMPLEX_X87_CLASS;
8000 return 1;
8001 case E_TCmode:
8002 /* This mode is larger than 16 bytes.  */
8003 return 0;
8004 case E_V8SFmode:
8005 case E_V8SImode:
8006 case E_V32QImode:
8007 case E_V16HImode:
8008 case E_V4DFmode:
8009 case E_V4DImode:
8010 classes[0] = X86_64_SSE_CLASS;
8011 classes[1] = X86_64_SSEUP_CLASS;
8012 classes[2] = X86_64_SSEUP_CLASS;
8013 classes[3] = X86_64_SSEUP_CLASS;
8014 return 4;
8015 case E_V8DFmode:
8016 case E_V16SFmode:
8017 case E_V8DImode:
8018 case E_V16SImode:
8019 case E_V32HImode:
8020 case E_V64QImode:
8021 classes[0] = X86_64_SSE_CLASS;
8022 classes[1] = X86_64_SSEUP_CLASS;
8023 classes[2] = X86_64_SSEUP_CLASS;
8024 classes[3] = X86_64_SSEUP_CLASS;
8025 classes[4] = X86_64_SSEUP_CLASS;
8026 classes[5] = X86_64_SSEUP_CLASS;
8027 classes[6] = X86_64_SSEUP_CLASS;
8028 classes[7] = X86_64_SSEUP_CLASS;
8029 return 8;
8030 case E_V4SFmode:
8031 case E_V4SImode:
8032 case E_V16QImode:
8033 case E_V8HImode:
8034 case E_V2DFmode:
8035 case E_V2DImode:
8036 classes[0] = X86_64_SSE_CLASS;
8037 classes[1] = X86_64_SSEUP_CLASS;
8038 return 2;
8039 case E_V1TImode:
8040 case E_V1DImode:
8041 case E_V2SFmode:
8042 case E_V2SImode:
8043 case E_V4HImode:
8044 case E_V8QImode:
8045 classes[0] = X86_64_SSE_CLASS;
8046 return 1;
8047 case E_BLKmode:
8048 case E_VOIDmode:
8049 return 0;
8050 default:
8051 gcc_assert (VECTOR_MODE_P (mode));
8053 if (bytes > 16)
8054 return 0;
8056 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
8058 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8059 classes[0] = X86_64_INTEGERSI_CLASS;
8060 else
8061 classes[0] = X86_64_INTEGER_CLASS;
8062 classes[1] = X86_64_INTEGER_CLASS;
8063 return 1 + (bytes > 8);
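/* A few illustrative classifications produced by the routine above for
   the 64-bit SysV ABI (a sketch):

     struct a { long l; double d; };	-> { INTEGER, SSEDF }
     struct b { double x, y; };		-> { SSEDF, SSEDF }
     struct c { long double ld; };	-> { X87, X87UP }
     struct d { char buf[32]; };	-> 0, i.e. passed in memory
					   (larger than 16 bytes and not
					   a single SSE vector)  */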
8067 /* Examine the argument and set the number of registers required in each
8068    class.  Return true iff the parameter should be passed in memory.  */
8070 static bool
8071 examine_argument (machine_mode mode, const_tree type, int in_return,
8072 int *int_nregs, int *sse_nregs)
8074 enum x86_64_reg_class regclass[MAX_CLASSES];
8075 int n = classify_argument (mode, type, regclass, 0);
8077 *int_nregs = 0;
8078 *sse_nregs = 0;
8080 if (!n)
8081 return true;
8082 for (n--; n >= 0; n--)
8083 switch (regclass[n])
8085 case X86_64_INTEGER_CLASS:
8086 case X86_64_INTEGERSI_CLASS:
8087 (*int_nregs)++;
8088 break;
8089 case X86_64_SSE_CLASS:
8090 case X86_64_SSESF_CLASS:
8091 case X86_64_SSEDF_CLASS:
8092 (*sse_nregs)++;
8093 break;
8094 case X86_64_NO_CLASS:
8095 case X86_64_SSEUP_CLASS:
8096 break;
8097 case X86_64_X87_CLASS:
8098 case X86_64_X87UP_CLASS:
8099 case X86_64_COMPLEX_X87_CLASS:
8100 if (!in_return)
8101 return true;
8102 break;
8103 case X86_64_MEMORY_CLASS:
8104 gcc_unreachable ();
8107 return false;
8110 /* Construct container for the argument used by GCC interface. See
8111 FUNCTION_ARG for the detailed description. */
8113 static rtx
8114 construct_container (machine_mode mode, machine_mode orig_mode,
8115 const_tree type, int in_return, int nintregs, int nsseregs,
8116 const int *intreg, int sse_regno)
8118 /* The following variables hold the static issued_error state. */
8119 static bool issued_sse_arg_error;
8120 static bool issued_sse_ret_error;
8121 static bool issued_x87_ret_error;
8123 machine_mode tmpmode;
8124 int bytes =
8125 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8126 enum x86_64_reg_class regclass[MAX_CLASSES];
8127 int n;
8128 int i;
8129 int nexps = 0;
8130 int needed_sseregs, needed_intregs;
8131 rtx exp[MAX_CLASSES];
8132 rtx ret;
8134 n = classify_argument (mode, type, regclass, 0);
8135 if (!n)
8136 return NULL;
8137 if (examine_argument (mode, type, in_return, &needed_intregs,
8138 &needed_sseregs))
8139 return NULL;
8140 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8141 return NULL;
8143 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8144 some less clueful developer tries to use floating-point anyway. */
8145 if (needed_sseregs && !TARGET_SSE)
8147 if (in_return)
8149 if (!issued_sse_ret_error)
8151 error ("SSE register return with SSE disabled");
8152 issued_sse_ret_error = true;
8155 else if (!issued_sse_arg_error)
8157 error ("SSE register argument with SSE disabled");
8158 issued_sse_arg_error = true;
8160 return NULL;
8163 /* Likewise, error if the ABI requires us to return values in the
8164 x87 registers and the user specified -mno-80387. */
8165 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8166 for (i = 0; i < n; i++)
8167 if (regclass[i] == X86_64_X87_CLASS
8168 || regclass[i] == X86_64_X87UP_CLASS
8169 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8171 if (!issued_x87_ret_error)
8173 error ("x87 register return with x87 disabled");
8174 issued_x87_ret_error = true;
8176 return NULL;
8179 /* First construct the simple cases.  Avoid SCmode, since we want to use a
8180    single register to pass this type.  */
8181 if (n == 1 && mode != SCmode)
8182 switch (regclass[0])
8184 case X86_64_INTEGER_CLASS:
8185 case X86_64_INTEGERSI_CLASS:
8186 return gen_rtx_REG (mode, intreg[0]);
8187 case X86_64_SSE_CLASS:
8188 case X86_64_SSESF_CLASS:
8189 case X86_64_SSEDF_CLASS:
8190 if (mode != BLKmode)
8191 return gen_reg_or_parallel (mode, orig_mode,
8192 SSE_REGNO (sse_regno));
8193 break;
8194 case X86_64_X87_CLASS:
8195 case X86_64_COMPLEX_X87_CLASS:
8196 return gen_rtx_REG (mode, FIRST_STACK_REG);
8197 case X86_64_NO_CLASS:
8198 /* Zero sized array, struct or class. */
8199 return NULL;
8200 default:
8201 gcc_unreachable ();
8203 if (n == 2
8204 && regclass[0] == X86_64_SSE_CLASS
8205 && regclass[1] == X86_64_SSEUP_CLASS
8206 && mode != BLKmode)
8207 return gen_reg_or_parallel (mode, orig_mode,
8208 SSE_REGNO (sse_regno));
8209 if (n == 4
8210 && regclass[0] == X86_64_SSE_CLASS
8211 && regclass[1] == X86_64_SSEUP_CLASS
8212 && regclass[2] == X86_64_SSEUP_CLASS
8213 && regclass[3] == X86_64_SSEUP_CLASS
8214 && mode != BLKmode)
8215 return gen_reg_or_parallel (mode, orig_mode,
8216 SSE_REGNO (sse_regno));
8217 if (n == 8
8218 && regclass[0] == X86_64_SSE_CLASS
8219 && regclass[1] == X86_64_SSEUP_CLASS
8220 && regclass[2] == X86_64_SSEUP_CLASS
8221 && regclass[3] == X86_64_SSEUP_CLASS
8222 && regclass[4] == X86_64_SSEUP_CLASS
8223 && regclass[5] == X86_64_SSEUP_CLASS
8224 && regclass[6] == X86_64_SSEUP_CLASS
8225 && regclass[7] == X86_64_SSEUP_CLASS
8226 && mode != BLKmode)
8227 return gen_reg_or_parallel (mode, orig_mode,
8228 SSE_REGNO (sse_regno));
8229 if (n == 2
8230 && regclass[0] == X86_64_X87_CLASS
8231 && regclass[1] == X86_64_X87UP_CLASS)
8232 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8234 if (n == 2
8235 && regclass[0] == X86_64_INTEGER_CLASS
8236 && regclass[1] == X86_64_INTEGER_CLASS
8237 && (mode == CDImode || mode == TImode)
8238 && intreg[0] + 1 == intreg[1])
8239 return gen_rtx_REG (mode, intreg[0]);
8241 /* Otherwise figure out the entries of the PARALLEL. */
8242 for (i = 0; i < n; i++)
8244 int pos;
8246 switch (regclass[i])
8248 case X86_64_NO_CLASS:
8249 break;
8250 case X86_64_INTEGER_CLASS:
8251 case X86_64_INTEGERSI_CLASS:
8252 /* Merge TImodes on aligned occasions here too. */
8253 if (i * 8 + 8 > bytes)
8255 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8256 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8257 /* We've requested 24 bytes for which we
8258    don't have a mode.  Use DImode.  */
8259 tmpmode = DImode;
8261 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8262 tmpmode = SImode;
8263 else
8264 tmpmode = DImode;
8265 exp [nexps++]
8266 = gen_rtx_EXPR_LIST (VOIDmode,
8267 gen_rtx_REG (tmpmode, *intreg),
8268 GEN_INT (i*8));
8269 intreg++;
8270 break;
8271 case X86_64_SSESF_CLASS:
8272 exp [nexps++]
8273 = gen_rtx_EXPR_LIST (VOIDmode,
8274 gen_rtx_REG (SFmode,
8275 SSE_REGNO (sse_regno)),
8276 GEN_INT (i*8));
8277 sse_regno++;
8278 break;
8279 case X86_64_SSEDF_CLASS:
8280 exp [nexps++]
8281 = gen_rtx_EXPR_LIST (VOIDmode,
8282 gen_rtx_REG (DFmode,
8283 SSE_REGNO (sse_regno)),
8284 GEN_INT (i*8));
8285 sse_regno++;
8286 break;
8287 case X86_64_SSE_CLASS:
8288 pos = i;
8289 switch (n)
8291 case 1:
8292 tmpmode = DImode;
8293 break;
8294 case 2:
8295 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8297 tmpmode = TImode;
8298 i++;
8300 else
8301 tmpmode = DImode;
8302 break;
8303 case 4:
8304 gcc_assert (i == 0
8305 && regclass[1] == X86_64_SSEUP_CLASS
8306 && regclass[2] == X86_64_SSEUP_CLASS
8307 && regclass[3] == X86_64_SSEUP_CLASS);
8308 tmpmode = OImode;
8309 i += 3;
8310 break;
8311 case 8:
8312 gcc_assert (i == 0
8313 && regclass[1] == X86_64_SSEUP_CLASS
8314 && regclass[2] == X86_64_SSEUP_CLASS
8315 && regclass[3] == X86_64_SSEUP_CLASS
8316 && regclass[4] == X86_64_SSEUP_CLASS
8317 && regclass[5] == X86_64_SSEUP_CLASS
8318 && regclass[6] == X86_64_SSEUP_CLASS
8319 && regclass[7] == X86_64_SSEUP_CLASS);
8320 tmpmode = XImode;
8321 i += 7;
8322 break;
8323 default:
8324 gcc_unreachable ();
8326 exp [nexps++]
8327 = gen_rtx_EXPR_LIST (VOIDmode,
8328 gen_rtx_REG (tmpmode,
8329 SSE_REGNO (sse_regno)),
8330 GEN_INT (pos*8));
8331 sse_regno++;
8332 break;
8333 default:
8334 gcc_unreachable ();
8338 /* Empty aligned struct, union or class. */
8339 if (nexps == 0)
8340 return NULL;
8342 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8343 for (i = 0; i < nexps; i++)
8344 XVECEXP (ret, 0, i) = exp [i];
8345 return ret;
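/* For instance, when passing

     struct a { long l; double d; };

   under the SysV ABI, the routine above builds roughly

     (parallel [(expr_list (reg:DI di) (const_int 0))
		(expr_list (reg:DF xmm0) (const_int 8))])

   placing the first eightbyte in the next free integer register and the
   second one in the next free SSE register (a sketch; the actual
   registers depend on how many are already in use).  */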
8348 /* Update the data in CUM to advance over an argument of mode MODE
8349 and data type TYPE. (TYPE is null for libcalls where that information
8350 may not be available.)
8352    Return the number of integer registers advanced over.  */
8354 static int
8355 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8356 const_tree type, HOST_WIDE_INT bytes,
8357 HOST_WIDE_INT words)
8359 int res = 0;
8360 bool error_p = false;
8362 if (TARGET_IAMCU)
8364 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8365 bytes in registers. */
8366 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8367 goto pass_in_reg;
8368 return res;
8371 switch (mode)
8373 default:
8374 break;
8376 case E_BLKmode:
8377 if (bytes < 0)
8378 break;
8379 /* FALLTHRU */
8381 case E_DImode:
8382 case E_SImode:
8383 case E_HImode:
8384 case E_QImode:
8385 pass_in_reg:
8386 cum->words += words;
8387 cum->nregs -= words;
8388 cum->regno += words;
8389 if (cum->nregs >= 0)
8390 res = words;
8391 if (cum->nregs <= 0)
8393 cum->nregs = 0;
8394 cfun->machine->arg_reg_available = false;
8395 cum->regno = 0;
8397 break;
8399 case E_OImode:
8400 /* OImode shouldn't be used directly. */
8401 gcc_unreachable ();
8403 case E_DFmode:
8404 if (cum->float_in_sse == -1)
8405 error_p = true;
8406 if (cum->float_in_sse < 2)
8407 break;
8408 /* FALLTHRU */
8409 case E_SFmode:
8410 if (cum->float_in_sse == -1)
8411 error_p = true;
8412 if (cum->float_in_sse < 1)
8413 break;
8414 /* FALLTHRU */
8416 case E_V8SFmode:
8417 case E_V8SImode:
8418 case E_V64QImode:
8419 case E_V32HImode:
8420 case E_V16SImode:
8421 case E_V8DImode:
8422 case E_V16SFmode:
8423 case E_V8DFmode:
8424 case E_V32QImode:
8425 case E_V16HImode:
8426 case E_V4DFmode:
8427 case E_V4DImode:
8428 case E_TImode:
8429 case E_V16QImode:
8430 case E_V8HImode:
8431 case E_V4SImode:
8432 case E_V2DImode:
8433 case E_V4SFmode:
8434 case E_V2DFmode:
8435 if (!type || !AGGREGATE_TYPE_P (type))
8437 cum->sse_words += words;
8438 cum->sse_nregs -= 1;
8439 cum->sse_regno += 1;
8440 if (cum->sse_nregs <= 0)
8442 cum->sse_nregs = 0;
8443 cum->sse_regno = 0;
8446 break;
8448 case E_V8QImode:
8449 case E_V4HImode:
8450 case E_V2SImode:
8451 case E_V2SFmode:
8452 case E_V1TImode:
8453 case E_V1DImode:
8454 if (!type || !AGGREGATE_TYPE_P (type))
8456 cum->mmx_words += words;
8457 cum->mmx_nregs -= 1;
8458 cum->mmx_regno += 1;
8459 if (cum->mmx_nregs <= 0)
8461 cum->mmx_nregs = 0;
8462 cum->mmx_regno = 0;
8465 break;
8467 if (error_p)
8469 cum->float_in_sse = 0;
8470 error ("calling %qD with SSE calling convention without "
8471 "SSE/SSE2 enabled", cum->decl);
8472 sorry ("this is a GCC bug that can be worked around by adding "
8473 "attribute used to function called");
8476 return res;
8479 static int
8480 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8481 const_tree type, HOST_WIDE_INT words, bool named)
8483 int int_nregs, sse_nregs;
8485 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack.  */
8486 if (!named && (VALID_AVX512F_REG_MODE (mode)
8487 || VALID_AVX256_REG_MODE (mode)))
8488 return 0;
8490 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8491 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8493 cum->nregs -= int_nregs;
8494 cum->sse_nregs -= sse_nregs;
8495 cum->regno += int_nregs;
8496 cum->sse_regno += sse_nregs;
8497 return int_nregs;
8499 else
8501 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8502 cum->words = ROUND_UP (cum->words, align);
8503 cum->words += words;
8504 return 0;
8508 static int
8509 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8510 HOST_WIDE_INT words)
8512 /* Otherwise, this should be passed indirectly.  */
8513 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8515 cum->words += words;
8516 if (cum->nregs > 0)
8518 cum->nregs -= 1;
8519 cum->regno += 1;
8520 return 1;
8522 return 0;
8525 /* Update the data in CUM to advance over an argument of mode MODE and
8526 data type TYPE. (TYPE is null for libcalls where that information
8527 may not be available.) */
8529 static void
8530 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8531 const_tree type, bool named)
8533 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8534 HOST_WIDE_INT bytes, words;
8535 int nregs;
8537 /* The argument of an interrupt handler is a special case and is
8538    handled in ix86_function_arg.  */
8539 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8540 return;
8542 if (mode == BLKmode)
8543 bytes = int_size_in_bytes (type);
8544 else
8545 bytes = GET_MODE_SIZE (mode);
8546 words = CEIL (bytes, UNITS_PER_WORD);
8548 if (type)
8549 mode = type_natural_mode (type, NULL, false);
8551 if ((type && POINTER_BOUNDS_TYPE_P (type))
8552 || POINTER_BOUNDS_MODE_P (mode))
8554 /* If we pass bounds in the BT then just update the remaining bounds count.  */
8555 if (cum->bnds_in_bt)
8557 cum->bnds_in_bt--;
8558 return;
8561 /* Update the remaining number of bounds to force.  */
8562 if (cum->force_bnd_pass)
8563 cum->force_bnd_pass--;
8565 cum->bnd_regno++;
8567 return;
8570 /* The first arg not going to Bounds Tables resets this counter. */
8571 cum->bnds_in_bt = 0;
8572 /* For unnamed args we always pass bounds to avoid a bounds mess when
8573    the passed and received types do not match.  If bounds do not follow an
8574    unnamed arg, still pretend the required number of bounds was passed.  */
8575 if (cum->force_bnd_pass)
8577 cum->bnd_regno += cum->force_bnd_pass;
8578 cum->force_bnd_pass = 0;
8581 if (TARGET_64BIT)
8583 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8585 if (call_abi == MS_ABI)
8586 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8587 else
8588 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8590 else
8591 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8593 /* For stdarg we expect bounds to be passed for each value passed
8594    in a register.  */
8595 if (cum->stdarg)
8596 cum->force_bnd_pass = nregs;
8597 /* For pointers passed in memory we expect bounds to be passed in the
8598    Bounds Table.  */
8599 if (!nregs)
8601 /* Track if there are outgoing arguments on stack. */
8602 if (cum->caller)
8603 cfun->machine->outgoing_args_on_stack = true;
8605 cum->bnds_in_bt = chkp_type_bounds_count (type);
8609 /* Define where to put the arguments to a function.
8610 Value is zero to push the argument on the stack,
8611 or a hard register in which to store the argument.
8613 MODE is the argument's machine mode.
8614 TYPE is the data type of the argument (as a tree).
8615 This is null for libcalls where that information may
8616 not be available.
8617 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8618 the preceding args and about the function being called.
8619 NAMED is nonzero if this argument is a named parameter
8620 (otherwise it is an extra parameter matching an ellipsis). */
8622 static rtx
8623 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8624 machine_mode orig_mode, const_tree type,
8625 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8627 bool error_p = false;
8629 /* Avoid the AL settings for the Unix64 ABI. */
8630 if (mode == VOIDmode)
8631 return constm1_rtx;
8633 if (TARGET_IAMCU)
8635 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8636 bytes in registers. */
8637 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8638 goto pass_in_reg;
8639 return NULL_RTX;
8642 switch (mode)
8644 default:
8645 break;
8647 case E_BLKmode:
8648 if (bytes < 0)
8649 break;
8650 /* FALLTHRU */
8651 case E_DImode:
8652 case E_SImode:
8653 case E_HImode:
8654 case E_QImode:
8655 pass_in_reg:
8656 if (words <= cum->nregs)
8658 int regno = cum->regno;
8660 /* Fastcall allocates the first two DWORD (SImode) or
8661    smaller arguments to ECX and EDX if the argument isn't an
8662    aggregate type.  */
8663 if (cum->fastcall)
8665 if (mode == BLKmode
8666 || mode == DImode
8667 || (type && AGGREGATE_TYPE_P (type)))
8668 break;
8670 /* ECX, not EAX, is the first allocated register.  */
8671 if (regno == AX_REG)
8672 regno = CX_REG;
8674 return gen_rtx_REG (mode, regno);
8676 break;
8678 case E_DFmode:
8679 if (cum->float_in_sse == -1)
8680 error_p = true;
8681 if (cum->float_in_sse < 2)
8682 break;
8683 /* FALLTHRU */
8684 case E_SFmode:
8685 if (cum->float_in_sse == -1)
8686 error_p = true;
8687 if (cum->float_in_sse < 1)
8688 break;
8689 /* FALLTHRU */
8690 case E_TImode:
8691 /* In 32bit, we pass TImode in xmm registers. */
8692 case E_V16QImode:
8693 case E_V8HImode:
8694 case E_V4SImode:
8695 case E_V2DImode:
8696 case E_V4SFmode:
8697 case E_V2DFmode:
8698 if (!type || !AGGREGATE_TYPE_P (type))
8700 if (cum->sse_nregs)
8701 return gen_reg_or_parallel (mode, orig_mode,
8702 cum->sse_regno + FIRST_SSE_REG);
8704 break;
8706 case E_OImode:
8707 case E_XImode:
8708 /* OImode and XImode shouldn't be used directly. */
8709 gcc_unreachable ();
8711 case E_V64QImode:
8712 case E_V32HImode:
8713 case E_V16SImode:
8714 case E_V8DImode:
8715 case E_V16SFmode:
8716 case E_V8DFmode:
8717 case E_V8SFmode:
8718 case E_V8SImode:
8719 case E_V32QImode:
8720 case E_V16HImode:
8721 case E_V4DFmode:
8722 case E_V4DImode:
8723 if (!type || !AGGREGATE_TYPE_P (type))
8725 if (cum->sse_nregs)
8726 return gen_reg_or_parallel (mode, orig_mode,
8727 cum->sse_regno + FIRST_SSE_REG);
8729 break;
8731 case E_V8QImode:
8732 case E_V4HImode:
8733 case E_V2SImode:
8734 case E_V2SFmode:
8735 case E_V1TImode:
8736 case E_V1DImode:
8737 if (!type || !AGGREGATE_TYPE_P (type))
8739 if (cum->mmx_nregs)
8740 return gen_reg_or_parallel (mode, orig_mode,
8741 cum->mmx_regno + FIRST_MMX_REG);
8743 break;
8745 if (error_p)
8747 cum->float_in_sse = 0;
8748 error ("calling %qD with SSE calling convention without "
8749 "SSE/SSE2 enabled", cum->decl);
8750 sorry ("this is a GCC bug that can be worked around by adding "
8751 "attribute used to function called");
8754 return NULL_RTX;
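/* A 32-bit sketch of the register cases above: for

     __attribute__ ((regparm (2))) int f (int a, int b, int c);

   the pass_in_reg path hands back %eax for A and %edx for B while
   cum->nregs lasts, and NULL_RTX for C, which is therefore pushed on
   the stack; with fastcall the first register is %ecx instead.  */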
8757 static rtx
8758 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8759 machine_mode orig_mode, const_tree type, bool named)
8761 /* Handle a hidden AL argument containing number of registers
8762 for varargs x86-64 functions. */
8763 if (mode == VOIDmode)
8764 return GEN_INT (cum->maybe_vaarg
8765 ? (cum->sse_nregs < 0
8766 ? X86_64_SSE_REGPARM_MAX
8767 : cum->sse_regno)
8768 : -1);
8770 switch (mode)
8772 default:
8773 break;
8775 case E_V8SFmode:
8776 case E_V8SImode:
8777 case E_V32QImode:
8778 case E_V16HImode:
8779 case E_V4DFmode:
8780 case E_V4DImode:
8781 case E_V16SFmode:
8782 case E_V16SImode:
8783 case E_V64QImode:
8784 case E_V32HImode:
8785 case E_V8DFmode:
8786 case E_V8DImode:
8787 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack.  */
8788 if (!named)
8789 return NULL;
8790 break;
8793 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8794 cum->sse_nregs,
8795 &x86_64_int_parameter_registers [cum->regno],
8796 cum->sse_regno);
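/* The VOIDmode case above implements the SysV rule that a varargs call
   must load %al with an upper bound on the number of SSE registers
   actually used, e.g. (a sketch)

     printf ("%f\n", 1.0);

   is preceded by  movl $1, %eax  because one vector register (%xmm0)
   carries an argument.  */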
8799 static rtx
8800 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8801 machine_mode orig_mode, bool named,
8802 HOST_WIDE_INT bytes)
8804 unsigned int regno;
8806 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8807    We use the value -2 to specify that the current function call is MSABI.  */
8808 if (mode == VOIDmode)
8809 return GEN_INT (-2);
8811 /* If we've run out of registers, it goes on the stack. */
8812 if (cum->nregs == 0)
8813 return NULL_RTX;
8815 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8817 /* Only floating point modes are passed in anything but integer regs. */
8818 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8820 if (named)
8821 regno = cum->regno + FIRST_SSE_REG;
8822 else
8824 rtx t1, t2;
8826 /* Unnamed floating parameters are passed in both the
8827 SSE and integer registers. */
8828 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8829 t2 = gen_rtx_REG (mode, regno);
8830 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8831 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8832 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8835 /* Handle aggregate types passed in registers.  */
8836 if (orig_mode == BLKmode)
8838 if (bytes > 0 && bytes <= 8)
8839 mode = (bytes > 4 ? DImode : SImode);
8840 if (mode == BLKmode)
8841 mode = DImode;
8844 return gen_reg_or_parallel (mode, orig_mode, regno);
8847 /* Return where to put the arguments to a function.
8848 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8850 MODE is the argument's machine mode. TYPE is the data type of the
8851 argument. It is null for libcalls where that information may not be
8852 available. CUM gives information about the preceding args and about
8853 the function being called. NAMED is nonzero if this argument is a
8854 named parameter (otherwise it is an extra parameter matching an
8855 ellipsis). */
8857 static rtx
8858 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8859 const_tree type, bool named)
8861 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8862 machine_mode mode = omode;
8863 HOST_WIDE_INT bytes, words;
8864 rtx arg;
8866 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8868 gcc_assert (type != NULL_TREE);
8869 if (POINTER_TYPE_P (type))
8871 /* This is the pointer argument. */
8872 gcc_assert (TYPE_MODE (type) == Pmode);
8873 /* It is at -WORD(AP) in the current frame in interrupt and
8874 exception handlers. */
8875 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8877 else
8879 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8880 && TREE_CODE (type) == INTEGER_TYPE
8881 && TYPE_MODE (type) == word_mode);
8882 /* The error code is the word-mode integer argument at
8883 -2 * WORD(AP) in the current frame of the exception
8884 handler. */
8885 arg = gen_rtx_MEM (word_mode,
8886 plus_constant (Pmode,
8887 arg_pointer_rtx,
8888 -2 * UNITS_PER_WORD));
8890 return arg;
8893 /* All pointer bounds arguments are handled separately here. */
8894 if ((type && POINTER_BOUNDS_TYPE_P (type))
8895 || POINTER_BOUNDS_MODE_P (mode))
8897 /* Return NULL if bounds are forced to go in Bounds Table. */
8898 if (cum->bnds_in_bt)
8899 arg = NULL;
8900 /* Return the next available bound reg if any. */
8901 else if (cum->bnd_regno <= LAST_BND_REG)
8902 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8903 /* Return the next special slot number otherwise. */
8904 else
8905 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8907 return arg;
8910 if (mode == BLKmode)
8911 bytes = int_size_in_bytes (type);
8912 else
8913 bytes = GET_MODE_SIZE (mode);
8914 words = CEIL (bytes, UNITS_PER_WORD);
8916 /* To simplify the code below, represent vector types with a vector mode
8917 even if MMX/SSE are not active. */
8918 if (type && TREE_CODE (type) == VECTOR_TYPE)
8919 mode = type_natural_mode (type, cum, false);
8921 if (TARGET_64BIT)
8923 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8925 if (call_abi == MS_ABI)
8926 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8927 else
8928 arg = function_arg_64 (cum, mode, omode, type, named);
8930 else
8931 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8933 /* Track if there are outgoing arguments on stack. */
8934 if (arg == NULL_RTX && cum->caller)
8935 cfun->machine->outgoing_args_on_stack = true;
8937 return arg;
8940 /* A C expression that indicates when an argument must be passed by
8941 reference. If nonzero for an argument, a copy of that argument is
8942 made in memory and a pointer to the argument is passed instead of
8943 the argument itself. The pointer is passed in whatever way is
8944 appropriate for passing a pointer to that type. */
8946 static bool
8947 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8948 const_tree type, bool)
8950 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8952 /* Bounds are never passed by reference. */
8953 if ((type && POINTER_BOUNDS_TYPE_P (type))
8954 || POINTER_BOUNDS_MODE_P (mode))
8955 return false;
8957 if (TARGET_64BIT)
8959 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8961 /* See Windows x64 Software Convention. */
8962 if (call_abi == MS_ABI)
8964 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8966 if (type)
8968 /* Arrays are passed by reference. */
8969 if (TREE_CODE (type) == ARRAY_TYPE)
8970 return true;
8972 if (RECORD_OR_UNION_TYPE_P (type))
8974 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8975 are passed by reference. */
8976 msize = int_size_in_bytes (type);
8980 /* __m128 is passed by reference. */
8981 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8983 else if (type && int_size_in_bytes (type) == -1)
8984 return true;
8987 return false;
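/* Examples of the MS ABI rule above (assuming -m64 with the MS calling
   convention):

     struct s8  { long long x;    };	8 bytes:  passed by value
     struct s24 { long long x[3]; };	24 bytes: passed by reference
     __m128 v;				16 bytes: passed by reference

   whereas under the SysV ABI none of these are passed by reference.  */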
8990 /* Return true when TYPE should be 128bit aligned for 32bit argument
8991 passing ABI. XXX: This function is obsolete and is only used for
8992 checking psABI compatibility with previous versions of GCC. */
8994 static bool
8995 ix86_compat_aligned_value_p (const_tree type)
8997 machine_mode mode = TYPE_MODE (type);
8998 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8999 || mode == TDmode
9000 || mode == TFmode
9001 || mode == TCmode)
9002 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
9003 return true;
9004 if (TYPE_ALIGN (type) < 128)
9005 return false;
9007 if (AGGREGATE_TYPE_P (type))
9009 /* Walk the aggregates recursively. */
9010 switch (TREE_CODE (type))
9012 case RECORD_TYPE:
9013 case UNION_TYPE:
9014 case QUAL_UNION_TYPE:
9016 tree field;
9018 /* Walk all the structure fields. */
9019 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9021 if (TREE_CODE (field) == FIELD_DECL
9022 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
9023 return true;
9025 break;
9028 case ARRAY_TYPE:
9029 /* Just for use if some languages pass arrays by value.  */
9030 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
9031 return true;
9032 break;
9034 default:
9035 gcc_unreachable ();
9038 return false;
9041 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
9042 XXX: This function is obsolete and is only used for checking psABI
9043 compatibility with previous versions of GCC. */
9045 static unsigned int
9046 ix86_compat_function_arg_boundary (machine_mode mode,
9047 const_tree type, unsigned int align)
9049 /* In 32bit, only _Decimal128 and __float128 are aligned to their
9050 natural boundaries. */
9051 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
9053 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
9054 make an exception for SSE modes since these require 128bit
9055 alignment.
9057 The handling here differs from field_alignment. ICC aligns MMX
9058 arguments to 4 byte boundaries, while structure fields are aligned
9059 to 8 byte boundaries. */
9060 if (!type)
9062 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9063 align = PARM_BOUNDARY;
9065 else
9067 if (!ix86_compat_aligned_value_p (type))
9068 align = PARM_BOUNDARY;
9071 if (align > BIGGEST_ALIGNMENT)
9072 align = BIGGEST_ALIGNMENT;
9073 return align;
9076 /* Return true when TYPE should be 128bit aligned for 32bit argument
9077 passing ABI. */
9079 static bool
9080 ix86_contains_aligned_value_p (const_tree type)
9082 machine_mode mode = TYPE_MODE (type);
9084 if (mode == XFmode || mode == XCmode)
9085 return false;
9087 if (TYPE_ALIGN (type) < 128)
9088 return false;
9090 if (AGGREGATE_TYPE_P (type))
9092 /* Walk the aggregates recursively. */
9093 switch (TREE_CODE (type))
9095 case RECORD_TYPE:
9096 case UNION_TYPE:
9097 case QUAL_UNION_TYPE:
9099 tree field;
9101 /* Walk all the structure fields. */
9102 for (field = TYPE_FIELDS (type);
9103 field;
9104 field = DECL_CHAIN (field))
9106 if (TREE_CODE (field) == FIELD_DECL
9107 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9108 return true;
9110 break;
9113 case ARRAY_TYPE:
9114 /* Just for use if some languages pass arrays by value.  */
9115 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9116 return true;
9117 break;
9119 default:
9120 gcc_unreachable ();
9123 else
9124 return TYPE_ALIGN (type) >= 128;
9126 return false;
9129 /* Gives the alignment boundary, in bits, of an argument with the
9130 specified mode and type. */
9132 static unsigned int
9133 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9135 unsigned int align;
9136 if (type)
9138 /* Since the main variant type is used for the call, convert TYPE to
9139    its main variant type.  */
9140 type = TYPE_MAIN_VARIANT (type);
9141 align = TYPE_ALIGN (type);
9142 if (TYPE_EMPTY_P (type))
9143 return PARM_BOUNDARY;
9145 else
9146 align = GET_MODE_ALIGNMENT (mode);
9147 if (align < PARM_BOUNDARY)
9148 align = PARM_BOUNDARY;
9149 else
9151 static bool warned;
9152 unsigned int saved_align = align;
9154 if (!TARGET_64BIT)
9156 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9157 if (!type)
9159 if (mode == XFmode || mode == XCmode)
9160 align = PARM_BOUNDARY;
9162 else if (!ix86_contains_aligned_value_p (type))
9163 align = PARM_BOUNDARY;
9165 if (align < 128)
9166 align = PARM_BOUNDARY;
9169 if (warn_psabi
9170 && !warned
9171 && align != ix86_compat_function_arg_boundary (mode, type,
9172 saved_align))
9174 warned = true;
9175 inform (input_location,
9176 "The ABI for passing parameters with %d-byte"
9177 " alignment has changed in GCC 4.6",
9178 align / BITS_PER_UNIT);
9182 return align;
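/* For example, in 32-bit mode a prototype such as

     void f (int i, __m128 v);

   gives V a 128-bit argument boundary from the code above, because its
   type contains a 16-byte-aligned value, while the plain int I keeps
   the default 32-bit PARM_BOUNDARY (a sketch of the common case).  */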
9185 /* Return true if N is a possible register number of function value. */
9187 static bool
9188 ix86_function_value_regno_p (const unsigned int regno)
9190 switch (regno)
9192 case AX_REG:
9193 return true;
9194 case DX_REG:
9195 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9196 case DI_REG:
9197 case SI_REG:
9198 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9200 case BND0_REG:
9201 case BND1_REG:
9202 return chkp_function_instrumented_p (current_function_decl);
9204 /* Complex values are returned in %st(0)/%st(1) pair. */
9205 case ST0_REG:
9206 case ST1_REG:
9207 /* TODO: The function should depend on current function ABI but
9208 builtins.c would need updating then. Therefore we use the
9209 default ABI. */
9210 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9211 return false;
9212 return TARGET_FLOAT_RETURNS_IN_80387;
9214 /* Complex values are returned in %xmm0/%xmm1 pair. */
9215 case XMM0_REG:
9216 case XMM1_REG:
9217 return TARGET_SSE;
9219 case MM0_REG:
9220 if (TARGET_MACHO || TARGET_64BIT)
9221 return false;
9222 return TARGET_MMX;
9225 return false;
9228 /* Define how to find the value returned by a function.
9229 VALTYPE is the data type of the value (as a tree).
9230 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9231 otherwise, FUNC is 0. */
9233 static rtx
9234 function_value_32 (machine_mode orig_mode, machine_mode mode,
9235 const_tree fntype, const_tree fn)
9237 unsigned int regno;
9239 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9240 we normally prevent this case when mmx is not available. However
9241 some ABIs may require the result to be returned like DImode. */
9242 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9243 regno = FIRST_MMX_REG;
9245 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9246 we prevent this case when sse is not available. However some ABIs
9247 may require the result to be returned like integer TImode. */
9248 else if (mode == TImode
9249 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9250 regno = FIRST_SSE_REG;
9252 /* 32-byte vector modes in %ymm0. */
9253 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9254 regno = FIRST_SSE_REG;
9256 /* 64-byte vector modes in %zmm0. */
9257 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9258 regno = FIRST_SSE_REG;
9260 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9261 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9262 regno = FIRST_FLOAT_REG;
9263 else
9264 /* Most things go in %eax. */
9265 regno = AX_REG;
9267 /* Override FP return register with %xmm0 for local functions when
9268 SSE math is enabled or for functions with sseregparm attribute. */
9269 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9271 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9272 if (sse_level == -1)
9274 error ("calling %qD with SSE calling convention without "
9275 "SSE/SSE2 enabled", fn);
9276 sorry ("this is a GCC bug that can be worked around by adding "
9277 "attribute %<used%> to the called function");
9279 else if ((sse_level >= 1 && mode == SFmode)
9280 || (sse_level == 2 && mode == DFmode))
9281 regno = FIRST_SSE_REG;
9284 /* OImode shouldn't be used directly. */
9285 gcc_assert (mode != OImode);
9287 return gen_rtx_REG (orig_mode, regno);
9290 static rtx
9291 function_value_64 (machine_mode orig_mode, machine_mode mode,
9292 const_tree valtype)
9294 rtx ret;
9296 /* Handle libcalls, which don't provide a type node. */
9297 if (valtype == NULL)
9299 unsigned int regno;
9301 switch (mode)
9303 case E_SFmode:
9304 case E_SCmode:
9305 case E_DFmode:
9306 case E_DCmode:
9307 case E_TFmode:
9308 case E_SDmode:
9309 case E_DDmode:
9310 case E_TDmode:
9311 regno = FIRST_SSE_REG;
9312 break;
9313 case E_XFmode:
9314 case E_XCmode:
9315 regno = FIRST_FLOAT_REG;
9316 break;
9317 case E_TCmode:
9318 return NULL;
9319 default:
9320 regno = AX_REG;
9323 return gen_rtx_REG (mode, regno);
9325 else if (POINTER_TYPE_P (valtype))
9327 /* Pointers are always returned in word_mode. */
9328 mode = word_mode;
9331 ret = construct_container (mode, orig_mode, valtype, 1,
9332 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9333 x86_64_int_return_registers, 0);
9335 /* For zero sized structures, construct_container returns NULL, but we
9336 need to keep rest of compiler happy by returning meaningful value. */
9337 if (!ret)
9338 ret = gen_rtx_REG (orig_mode, AX_REG);
9340 return ret;
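/* A sketch of what the classification above yields for the SysV 64-bit
   ABI (default options):

     long f (void);                              %rax
     double g (void);                            %xmm0
     long double h (void);                       %st(0)
     struct { long a; double b; } i (void);      PARALLEL of %rax at
                                                 offset 0 and %xmm0 at
                                                 offset 8

   Libcalls with no type node go through the switch above instead.  */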
9343 static rtx
9344 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9345 const_tree valtype)
9347 unsigned int regno = AX_REG;
9349 if (TARGET_SSE)
9351 switch (GET_MODE_SIZE (mode))
9353 case 16:
9354 if (valtype != NULL_TREE
9355 && !VECTOR_INTEGER_TYPE_P (valtype)
9357 && !INTEGRAL_TYPE_P (valtype)
9358 && !VECTOR_FLOAT_TYPE_P (valtype))
9359 break;
9360 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9361 && !COMPLEX_MODE_P (mode))
9362 regno = FIRST_SSE_REG;
9363 break;
9364 case 8:
9365 case 4:
9366 if (mode == SFmode || mode == DFmode)
9367 regno = FIRST_SSE_REG;
9368 break;
9369 default:
9370 break;
9373 return gen_rtx_REG (orig_mode, regno);
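/* Under the MS_ABI rules above, for example (a sketch):

     __m128 f (void);      %xmm0   (16-byte non-complex vector)
     double g (void);      %xmm0   (with SSE enabled)
     long long h (void);   %rax    (the default)  */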
9376 static rtx
9377 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9378 machine_mode orig_mode, machine_mode mode)
9380 const_tree fn, fntype;
9382 fn = NULL_TREE;
9383 if (fntype_or_decl && DECL_P (fntype_or_decl))
9384 fn = fntype_or_decl;
9385 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9387 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9388 || POINTER_BOUNDS_MODE_P (mode))
9389 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9390 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9391 return function_value_ms_64 (orig_mode, mode, valtype);
9392 else if (TARGET_64BIT)
9393 return function_value_64 (orig_mode, mode, valtype);
9394 else
9395 return function_value_32 (orig_mode, mode, fntype, fn);
9398 static rtx
9399 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9401 machine_mode mode, orig_mode;
9403 orig_mode = TYPE_MODE (valtype);
9404 mode = type_natural_mode (valtype, NULL, true);
9405 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9408 /* Return an RTX representing a place where a function returns
9409 or receives pointer bounds or NULL if no bounds are returned.
9411 VALTYPE is a data type of a value returned by the function.
9413 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9414 or FUNCTION_TYPE of the function.
9416 If OUTGOING is false, return a place in which the caller will
9417 see the return value. Otherwise, return a place where a
9418 function returns a value. */
9420 static rtx
9421 ix86_function_value_bounds (const_tree valtype,
9422 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9423 bool outgoing ATTRIBUTE_UNUSED)
9425 rtx res = NULL_RTX;
9427 if (BOUNDED_TYPE_P (valtype))
9428 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9429 else if (chkp_type_has_pointer (valtype))
9431 bitmap slots;
9432 rtx bounds[2];
9433 bitmap_iterator bi;
9434 unsigned i, bnd_no = 0;
9436 bitmap_obstack_initialize (NULL);
9437 slots = BITMAP_ALLOC (NULL);
9438 chkp_find_bound_slots (valtype, slots);
9440 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9442 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9443 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9444 gcc_assert (bnd_no < 2);
9445 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9448 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9450 BITMAP_FREE (slots);
9451 bitmap_obstack_release (NULL);
9453 else
9454 res = NULL_RTX;
9456 return res;
9459 /* Pointer function arguments and return values are promoted to
9460 word_mode for normal functions. */
9462 static machine_mode
9463 ix86_promote_function_mode (const_tree type, machine_mode mode,
9464 int *punsignedp, const_tree fntype,
9465 int for_return)
9467 if (cfun->machine->func_type == TYPE_NORMAL
9468 && type != NULL_TREE
9469 && POINTER_TYPE_P (type))
9471 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9472 return word_mode;
9474 return default_promote_function_mode (type, mode, punsignedp, fntype,
9475 for_return);
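/* The interesting case is x32, where ptr_mode is SImode but word_mode is
   DImode: a pointer argument or return value of a normal function is
   then zero-extended (POINTERS_EXTEND_UNSIGNED) to a 64-bit register,
   e.g.

     void *f (void *p);    p and the result both live in DImode registers

   Interrupt and exception handlers keep the default promotion.  */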
9478 /* Return true if a structure, union or array with MODE containing FIELD
9479 should be accessed using BLKmode. */
9481 static bool
9482 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9484 /* Union with XFmode must be in BLKmode. */
9485 return (mode == XFmode
9486 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9487 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9491 ix86_libcall_value (machine_mode mode)
9493 return ix86_function_value_1 (NULL, NULL, mode, mode);
9496 /* Return true iff type is returned in memory. */
9498 static bool
9499 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9501 #ifdef SUBTARGET_RETURN_IN_MEMORY
9502 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9503 #else
9504 const machine_mode mode = type_natural_mode (type, NULL, true);
9505 HOST_WIDE_INT size;
9507 if (POINTER_BOUNDS_TYPE_P (type))
9508 return false;
9510 if (TARGET_64BIT)
9512 if (ix86_function_type_abi (fntype) == MS_ABI)
9514 size = int_size_in_bytes (type);
9516 /* __m128 is returned in xmm0. */
9517 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9518 || INTEGRAL_TYPE_P (type)
9519 || VECTOR_FLOAT_TYPE_P (type))
9520 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9521 && !COMPLEX_MODE_P (mode)
9522 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9523 return false;
9525 /* Otherwise, the size must be exactly in [1248]. */
9526 return size != 1 && size != 2 && size != 4 && size != 8;
9528 else
9530 int needed_intregs, needed_sseregs;
9532 return examine_argument (mode, type, 1,
9533 &needed_intregs, &needed_sseregs);
9536 else
9538 size = int_size_in_bytes (type);
9540 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9541 bytes in registers. */
9542 if (TARGET_IAMCU)
9543 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9545 if (mode == BLKmode)
9546 return true;
9548 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9549 return false;
9551 if (VECTOR_MODE_P (mode) || mode == TImode)
9553 /* User-created vectors small enough to fit in EAX. */
9554 if (size < 8)
9555 return false;
9557 /* Unless the ABI prescribes otherwise,
9558 MMX/3dNow values are returned in MM0 if available. */
9560 if (size == 8)
9561 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9563 /* SSE values are returned in XMM0 if available. */
9564 if (size == 16)
9565 return !TARGET_SSE;
9567 /* AVX values are returned in YMM0 if available. */
9568 if (size == 32)
9569 return !TARGET_AVX;
9571 /* AVX512F values are returned in ZMM0 if available. */
9572 if (size == 64)
9573 return !TARGET_AVX512F;
9576 if (mode == XFmode)
9577 return false;
9579 if (size > 12)
9580 return true;
9582 /* OImode shouldn't be used directly. */
9583 gcc_assert (mode != OImode);
9585 return false;
9587 #endif
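/* A few concrete cases of the logic above (a sketch, default options):

     32-bit:  _Complex double f (void);   16 bytes, > 12       -> memory
              long double g (void);       XFmode, %st(0)       -> registers
              __m128 h (void);            registers only with -msse
              any BLKmode aggregate                            -> memory
     64-bit SysV:  decided by examine_argument; e.g. a 24-byte struct of
              integers is classified MEMORY and returned via a hidden
              pointer.  */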
9591 /* Create the va_list data type. */
9593 static tree
9594 ix86_build_builtin_va_list_64 (void)
9596 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9598 record = lang_hooks.types.make_type (RECORD_TYPE);
9599 type_decl = build_decl (BUILTINS_LOCATION,
9600 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9602 f_gpr = build_decl (BUILTINS_LOCATION,
9603 FIELD_DECL, get_identifier ("gp_offset"),
9604 unsigned_type_node);
9605 f_fpr = build_decl (BUILTINS_LOCATION,
9606 FIELD_DECL, get_identifier ("fp_offset"),
9607 unsigned_type_node);
9608 f_ovf = build_decl (BUILTINS_LOCATION,
9609 FIELD_DECL, get_identifier ("overflow_arg_area"),
9610 ptr_type_node);
9611 f_sav = build_decl (BUILTINS_LOCATION,
9612 FIELD_DECL, get_identifier ("reg_save_area"),
9613 ptr_type_node);
9615 va_list_gpr_counter_field = f_gpr;
9616 va_list_fpr_counter_field = f_fpr;
9618 DECL_FIELD_CONTEXT (f_gpr) = record;
9619 DECL_FIELD_CONTEXT (f_fpr) = record;
9620 DECL_FIELD_CONTEXT (f_ovf) = record;
9621 DECL_FIELD_CONTEXT (f_sav) = record;
9623 TYPE_STUB_DECL (record) = type_decl;
9624 TYPE_NAME (record) = type_decl;
9625 TYPE_FIELDS (record) = f_gpr;
9626 DECL_CHAIN (f_gpr) = f_fpr;
9627 DECL_CHAIN (f_fpr) = f_ovf;
9628 DECL_CHAIN (f_ovf) = f_sav;
9630 layout_type (record);
9632 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9633 NULL_TREE, TYPE_ATTRIBUTES (record));
9635 /* The correct type is an array type of one element. */
9636 return build_array_type (record, build_index_type (size_zero_node));
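/* For illustration, the record built above matches the SysV va_list
   layout, i.e. roughly this C declaration:

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
*/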
9639 /* Setup the builtin va_list data type and for 64-bit the additional
9640 calling convention specific va_list data types. */
9642 static tree
9643 ix86_build_builtin_va_list (void)
9645 if (TARGET_64BIT)
9647 /* Initialize ABI specific va_list builtin types.
9649 In lto1, we can encounter two va_list types:
9650 - one as a result of the type-merge across TUs, and
9651 - the one constructed here.
9652 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9653 a type identity check in canonical_va_list_type based on
9654 TYPE_MAIN_VARIANT (which we used to have) will not work.
9655 Instead, we tag each va_list_type_node with its unique attribute, and
9656 look for the attribute in the type identity check in
9657 canonical_va_list_type.
9659 Tagging sysv_va_list_type_node directly with the attribute is
9660 problematic since it's an array of one record, which will degrade into a
9661 pointer to record when used as parameter (see build_va_arg comments for
9662 an example), dropping the attribute in the process. So we tag the
9663 record instead. */
9665 /* For SYSV_ABI we use an array of one record. */
9666 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9668 /* For MS_ABI we use plain pointer to argument area. */
9669 tree char_ptr_type = build_pointer_type (char_type_node);
9670 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9671 TYPE_ATTRIBUTES (char_ptr_type));
9672 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9674 return ((ix86_abi == MS_ABI)
9675 ? ms_va_list_type_node
9676 : sysv_va_list_type_node);
9678 else
9680 /* For i386 we use plain pointer to argument area. */
9681 return build_pointer_type (char_type_node);
9685 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9687 static void
9688 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9690 rtx save_area, mem;
9691 alias_set_type set;
9692 int i, max;
9694 /* GPR size of varargs save area. */
9695 if (cfun->va_list_gpr_size)
9696 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9697 else
9698 ix86_varargs_gpr_size = 0;
9700 /* FPR size of varargs save area. We don't need it if we don't pass
9701 anything in SSE registers. */
9702 if (TARGET_SSE && cfun->va_list_fpr_size)
9703 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9704 else
9705 ix86_varargs_fpr_size = 0;
9707 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9708 return;
9710 save_area = frame_pointer_rtx;
9711 set = get_varargs_alias_set ();
9713 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9714 if (max > X86_64_REGPARM_MAX)
9715 max = X86_64_REGPARM_MAX;
9717 for (i = cum->regno; i < max; i++)
9719 mem = gen_rtx_MEM (word_mode,
9720 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9721 MEM_NOTRAP_P (mem) = 1;
9722 set_mem_alias_set (mem, set);
9723 emit_move_insn (mem,
9724 gen_rtx_REG (word_mode,
9725 x86_64_int_parameter_registers[i]));
9728 if (ix86_varargs_fpr_size)
9730 machine_mode smode;
9731 rtx_code_label *label;
9732 rtx test;
9734 /* Now emit code to save SSE registers. The AX parameter contains number
9735 of SSE parameter registers used to call this function, though all we
9736 actually check here is the zero/non-zero status. */
9738 label = gen_label_rtx ();
9739 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9740 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9741 label));
9743 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9744 we used movdqa (i.e. TImode) instead? Perhaps even better would
9745 be if we could determine the real mode of the data, via a hook
9746 into pass_stdarg. Ignore all that for now. */
9747 smode = V4SFmode;
9748 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9749 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9751 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9752 if (max > X86_64_SSE_REGPARM_MAX)
9753 max = X86_64_SSE_REGPARM_MAX;
9755 for (i = cum->sse_regno; i < max; ++i)
9757 mem = plus_constant (Pmode, save_area,
9758 i * 16 + ix86_varargs_gpr_size);
9759 mem = gen_rtx_MEM (smode, mem);
9760 MEM_NOTRAP_P (mem) = 1;
9761 set_mem_alias_set (mem, set);
9762 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9764 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9767 emit_label (label);
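/* The resulting register save area looks like this (a sketch; offsets in
   bytes from reg_save_area, assuming the full complement is saved):

      0    8    16   24   32   40   48              176
     | di | si | dx | cx | r8 | r9 | xmm0 ... xmm7 |

   gp_offset indexes the first 48 bytes in steps of 8, fp_offset the SSE
   part in steps of 16.  */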
9771 static void
9772 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9774 alias_set_type set = get_varargs_alias_set ();
9775 int i;
9777 /* Reset to zero, as there might be a SysV va_arg used
9778 before. */
9779 ix86_varargs_gpr_size = 0;
9780 ix86_varargs_fpr_size = 0;
9782 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9784 rtx reg, mem;
9786 mem = gen_rtx_MEM (Pmode,
9787 plus_constant (Pmode, virtual_incoming_args_rtx,
9788 i * UNITS_PER_WORD));
9789 MEM_NOTRAP_P (mem) = 1;
9790 set_mem_alias_set (mem, set);
9792 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9793 emit_move_insn (mem, reg);
9797 static void
9798 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9799 tree type, int *, int no_rtl)
9801 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9802 CUMULATIVE_ARGS next_cum;
9803 tree fntype;
9805 /* This argument doesn't appear to be used anymore. Which is good,
9806 because the old code here didn't suppress rtl generation. */
9807 gcc_assert (!no_rtl);
9809 if (!TARGET_64BIT)
9810 return;
9812 fntype = TREE_TYPE (current_function_decl);
9814 /* For varargs, we do not want to skip the dummy va_dcl argument.
9815 For stdargs, we do want to skip the last named argument. */
9816 next_cum = *cum;
9817 if (stdarg_p (fntype))
9818 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9819 true);
9821 if (cum->call_abi == MS_ABI)
9822 setup_incoming_varargs_ms_64 (&next_cum);
9823 else
9824 setup_incoming_varargs_64 (&next_cum);
9827 static void
9828 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9829 machine_mode mode,
9830 tree type,
9831 int *pretend_size ATTRIBUTE_UNUSED,
9832 int no_rtl)
9834 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9835 CUMULATIVE_ARGS next_cum;
9836 tree fntype;
9837 rtx save_area;
9838 int bnd_reg, i, max;
9840 gcc_assert (!no_rtl);
9842 /* Do nothing if we use plain pointer to argument area. */
9843 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9844 return;
9846 fntype = TREE_TYPE (current_function_decl);
9848 /* For varargs, we do not want to skip the dummy va_dcl argument.
9849 For stdargs, we do want to skip the last named argument. */
9850 next_cum = *cum;
9851 if (stdarg_p (fntype))
9852 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9853 true);
9854 save_area = frame_pointer_rtx;
9856 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9857 if (max > X86_64_REGPARM_MAX)
9858 max = X86_64_REGPARM_MAX;
9860 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9861 if (chkp_function_instrumented_p (current_function_decl))
9862 for (i = cum->regno; i < max; i++)
9864 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9865 rtx ptr = gen_rtx_REG (Pmode,
9866 x86_64_int_parameter_registers[i]);
9867 rtx bounds;
9869 if (bnd_reg <= LAST_BND_REG)
9870 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9871 else
9873 rtx ldx_addr =
9874 plus_constant (Pmode, arg_pointer_rtx,
9875 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9876 bounds = gen_reg_rtx (BNDmode);
9877 emit_insn (BNDmode == BND64mode
9878 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9879 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9882 emit_insn (BNDmode == BND64mode
9883 ? gen_bnd64_stx (addr, ptr, bounds)
9884 : gen_bnd32_stx (addr, ptr, bounds));
9886 bnd_reg++;
9891 /* Checks if TYPE is of kind va_list char *. */
9893 static bool
9894 is_va_list_char_pointer (tree type)
9896 tree canonic;
9898 /* For 32-bit it is always true. */
9899 if (!TARGET_64BIT)
9900 return true;
9901 canonic = ix86_canonical_va_list_type (type);
9902 return (canonic == ms_va_list_type_node
9903 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9906 /* Implement va_start. */
9908 static void
9909 ix86_va_start (tree valist, rtx nextarg)
9911 HOST_WIDE_INT words, n_gpr, n_fpr;
9912 tree f_gpr, f_fpr, f_ovf, f_sav;
9913 tree gpr, fpr, ovf, sav, t;
9914 tree type;
9915 rtx ovf_rtx;
9917 if (flag_split_stack
9918 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9920 unsigned int scratch_regno;
9922 /* When we are splitting the stack, we can't refer to the stack
9923 arguments using internal_arg_pointer, because they may be on
9924 the old stack. The split stack prologue will arrange to
9925 leave a pointer to the old stack arguments in a scratch
9926 register, which we here copy to a pseudo-register. The split
9927 stack prologue can't set the pseudo-register directly because
9928 it (the prologue) runs before any registers have been saved. */
9930 scratch_regno = split_stack_prologue_scratch_regno ();
9931 if (scratch_regno != INVALID_REGNUM)
9933 rtx reg;
9934 rtx_insn *seq;
9936 reg = gen_reg_rtx (Pmode);
9937 cfun->machine->split_stack_varargs_pointer = reg;
9939 start_sequence ();
9940 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9941 seq = get_insns ();
9942 end_sequence ();
9944 push_topmost_sequence ();
9945 emit_insn_after (seq, entry_of_function ());
9946 pop_topmost_sequence ();
9950 /* Only 64bit target needs something special. */
9951 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9953 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9954 std_expand_builtin_va_start (valist, nextarg);
9955 else
9957 rtx va_r, next;
9959 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9960 next = expand_binop (ptr_mode, add_optab,
9961 cfun->machine->split_stack_varargs_pointer,
9962 crtl->args.arg_offset_rtx,
9963 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9964 convert_move (va_r, next, 0);
9966 /* Store zero bounds for va_list. */
9967 if (chkp_function_instrumented_p (current_function_decl))
9968 chkp_expand_bounds_reset_for_mem (valist,
9969 make_tree (TREE_TYPE (valist),
9970 next));
9973 return;
9976 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9977 f_fpr = DECL_CHAIN (f_gpr);
9978 f_ovf = DECL_CHAIN (f_fpr);
9979 f_sav = DECL_CHAIN (f_ovf);
9981 valist = build_simple_mem_ref (valist);
9982 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9983 /* The following should be folded into the MEM_REF offset. */
9984 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9985 f_gpr, NULL_TREE);
9986 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9987 f_fpr, NULL_TREE);
9988 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9989 f_ovf, NULL_TREE);
9990 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9991 f_sav, NULL_TREE);
9993 /* Count number of gp and fp argument registers used. */
9994 words = crtl->args.info.words;
9995 n_gpr = crtl->args.info.regno;
9996 n_fpr = crtl->args.info.sse_regno;
9998 if (cfun->va_list_gpr_size)
10000 type = TREE_TYPE (gpr);
10001 t = build2 (MODIFY_EXPR, type,
10002 gpr, build_int_cst (type, n_gpr * 8));
10003 TREE_SIDE_EFFECTS (t) = 1;
10004 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10007 if (TARGET_SSE && cfun->va_list_fpr_size)
10009 type = TREE_TYPE (fpr);
10010 t = build2 (MODIFY_EXPR, type, fpr,
10011 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
10012 TREE_SIDE_EFFECTS (t) = 1;
10013 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10016 /* Find the overflow area. */
10017 type = TREE_TYPE (ovf);
10018 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10019 ovf_rtx = crtl->args.internal_arg_pointer;
10020 else
10021 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
10022 t = make_tree (type, ovf_rtx);
10023 if (words != 0)
10024 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
10026 /* Store zero bounds for overflow area pointer. */
10027 if (chkp_function_instrumented_p (current_function_decl))
10028 chkp_expand_bounds_reset_for_mem (ovf, t);
10030 t = build2 (MODIFY_EXPR, type, ovf, t);
10031 TREE_SIDE_EFFECTS (t) = 1;
10032 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10034 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
10036 /* Find the register save area.
10037 The function prologue saves it right above the stack frame. */
10038 type = TREE_TYPE (sav);
10039 t = make_tree (type, frame_pointer_rtx);
10040 if (!ix86_varargs_gpr_size)
10041 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
10043 /* Store zero bounds for save area pointer. */
10044 if (chkp_function_instrumented_p (current_function_decl))
10045 chkp_expand_bounds_reset_for_mem (sav, t);
10047 t = build2 (MODIFY_EXPR, type, sav, t);
10048 TREE_SIDE_EFFECTS (t) = 1;
10049 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
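/* In effect, for a function such as int f (const char *fmt, ...) the
   expansion of va_start (ap, fmt) amounts to roughly (a sketch):

     ap->gp_offset = 8 * <named GPRs used>;        8 here, fmt is in %rdi
     ap->fp_offset = 48 + 16 * <named SSE used>;   48 here
     ap->overflow_arg_area = <address of the first stack argument>;
     ap->reg_save_area = <start of the save area set up above>;
*/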
10053 /* Implement va_arg. */
10055 static tree
10056 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
10057 gimple_seq *post_p)
10059 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
10060 tree f_gpr, f_fpr, f_ovf, f_sav;
10061 tree gpr, fpr, ovf, sav, t;
10062 int size, rsize;
10063 tree lab_false, lab_over = NULL_TREE;
10064 tree addr, t2;
10065 rtx container;
10066 int indirect_p = 0;
10067 tree ptrtype;
10068 machine_mode nat_mode;
10069 unsigned int arg_boundary;
10071 /* Only 64bit target needs something special. */
10072 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10073 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10075 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10076 f_fpr = DECL_CHAIN (f_gpr);
10077 f_ovf = DECL_CHAIN (f_fpr);
10078 f_sav = DECL_CHAIN (f_ovf);
10080 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10081 valist, f_gpr, NULL_TREE);
10083 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10084 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10085 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10087 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10088 if (indirect_p)
10089 type = build_pointer_type (type);
10090 size = arg_int_size_in_bytes (type);
10091 rsize = CEIL (size, UNITS_PER_WORD);
10093 nat_mode = type_natural_mode (type, NULL, false);
10094 switch (nat_mode)
10096 case E_V8SFmode:
10097 case E_V8SImode:
10098 case E_V32QImode:
10099 case E_V16HImode:
10100 case E_V4DFmode:
10101 case E_V4DImode:
10102 case E_V16SFmode:
10103 case E_V16SImode:
10104 case E_V64QImode:
10105 case E_V32HImode:
10106 case E_V8DFmode:
10107 case E_V8DImode:
10108 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
10109 if (!TARGET_64BIT_MS_ABI)
10111 container = NULL;
10112 break;
10114 /* FALLTHRU */
10116 default:
10117 container = construct_container (nat_mode, TYPE_MODE (type),
10118 type, 0, X86_64_REGPARM_MAX,
10119 X86_64_SSE_REGPARM_MAX, intreg,
10121 break;
10124 /* Pull the value out of the saved registers. */
10126 addr = create_tmp_var (ptr_type_node, "addr");
10128 if (container)
10130 int needed_intregs, needed_sseregs;
10131 bool need_temp;
10132 tree int_addr, sse_addr;
10134 lab_false = create_artificial_label (UNKNOWN_LOCATION);
10135 lab_over = create_artificial_label (UNKNOWN_LOCATION);
10137 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10139 need_temp = (!REG_P (container)
10140 && ((needed_intregs && TYPE_ALIGN (type) > 64)
10141 || TYPE_ALIGN (type) > 128));
10143 /* In case we are passing a structure, verify that it is a consecutive
10144 block on the register save area. If not, we need to do moves. */
10145 if (!need_temp && !REG_P (container))
10147 /* Verify that all registers are strictly consecutive */
10148 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10150 int i;
10152 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10154 rtx slot = XVECEXP (container, 0, i);
10155 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10156 || INTVAL (XEXP (slot, 1)) != i * 16)
10157 need_temp = true;
10160 else
10162 int i;
10164 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10166 rtx slot = XVECEXP (container, 0, i);
10167 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10168 || INTVAL (XEXP (slot, 1)) != i * 8)
10169 need_temp = true;
10173 if (!need_temp)
10175 int_addr = addr;
10176 sse_addr = addr;
10178 else
10180 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10181 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10184 /* First ensure that we fit completely in registers. */
10185 if (needed_intregs)
10187 t = build_int_cst (TREE_TYPE (gpr),
10188 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10189 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10190 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10191 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10192 gimplify_and_add (t, pre_p);
10194 if (needed_sseregs)
10196 t = build_int_cst (TREE_TYPE (fpr),
10197 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10198 + X86_64_REGPARM_MAX * 8);
10199 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10200 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10201 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10202 gimplify_and_add (t, pre_p);
10205 /* Compute index to start of area used for integer regs. */
10206 if (needed_intregs)
10208 /* int_addr = gpr + sav; */
10209 t = fold_build_pointer_plus (sav, gpr);
10210 gimplify_assign (int_addr, t, pre_p);
10212 if (needed_sseregs)
10214 /* sse_addr = fpr + sav; */
10215 t = fold_build_pointer_plus (sav, fpr);
10216 gimplify_assign (sse_addr, t, pre_p);
10218 if (need_temp)
10220 int i, prev_size = 0;
10221 tree temp = create_tmp_var (type, "va_arg_tmp");
10223 /* addr = &temp; */
10224 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10225 gimplify_assign (addr, t, pre_p);
10227 for (i = 0; i < XVECLEN (container, 0); i++)
10229 rtx slot = XVECEXP (container, 0, i);
10230 rtx reg = XEXP (slot, 0);
10231 machine_mode mode = GET_MODE (reg);
10232 tree piece_type;
10233 tree addr_type;
10234 tree daddr_type;
10235 tree src_addr, src;
10236 int src_offset;
10237 tree dest_addr, dest;
10238 int cur_size = GET_MODE_SIZE (mode);
10240 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10241 prev_size = INTVAL (XEXP (slot, 1));
10242 if (prev_size + cur_size > size)
10244 cur_size = size - prev_size;
10245 unsigned int nbits = cur_size * BITS_PER_UNIT;
10246 if (!int_mode_for_size (nbits, 1).exists (&mode))
10247 mode = QImode;
10249 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10250 if (mode == GET_MODE (reg))
10251 addr_type = build_pointer_type (piece_type);
10252 else
10253 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10254 true);
10255 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10256 true);
10258 if (SSE_REGNO_P (REGNO (reg)))
10260 src_addr = sse_addr;
10261 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10263 else
10265 src_addr = int_addr;
10266 src_offset = REGNO (reg) * 8;
10268 src_addr = fold_convert (addr_type, src_addr);
10269 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10271 dest_addr = fold_convert (daddr_type, addr);
10272 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10273 if (cur_size == GET_MODE_SIZE (mode))
10275 src = build_va_arg_indirect_ref (src_addr);
10276 dest = build_va_arg_indirect_ref (dest_addr);
10278 gimplify_assign (dest, src, pre_p);
10280 else
10282 tree copy
10283 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10284 3, dest_addr, src_addr,
10285 size_int (cur_size));
10286 gimplify_and_add (copy, pre_p);
10288 prev_size += cur_size;
10292 if (needed_intregs)
10294 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10295 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10296 gimplify_assign (gpr, t, pre_p);
10299 if (needed_sseregs)
10301 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10302 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10303 gimplify_assign (unshare_expr (fpr), t, pre_p);
10306 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10308 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10311 /* ... otherwise out of the overflow area. */
10313 /* When we align a parameter on the stack for the caller, if its
10314 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10315 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
10316 with the caller. */
10317 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10318 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10319 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10321 /* Care for on-stack alignment if needed. */
10322 if (arg_boundary <= 64 || size == 0)
10323 t = ovf;
10324 else
10326 HOST_WIDE_INT align = arg_boundary / 8;
10327 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10328 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10329 build_int_cst (TREE_TYPE (t), -align));
10332 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10333 gimplify_assign (addr, t, pre_p);
10335 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10336 gimplify_assign (unshare_expr (ovf), t, pre_p);
10338 if (container)
10339 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10341 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10342 addr = fold_convert (ptrtype, addr);
10344 if (indirect_p)
10345 addr = build_va_arg_indirect_ref (addr);
10346 return build_va_arg_indirect_ref (addr);
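/* The gimplified result for, say, va_arg (ap, int) is therefore roughly
   (a sketch):

     if (ap->gp_offset >= 48) goto overflow;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = ap->overflow_arg_area;
     ap->overflow_arg_area = addr + 8;
   done:
     result = *(int *) addr;

   For SSE arguments the 48/8 constants become 176/16, and over-aligned
   types get the extra rounding of the overflow pointer shown above.  */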
10349 /* Return true if OPNUM's MEM should be matched
10350 in movabs* patterns. */
10352 bool
10353 ix86_check_movabs (rtx insn, int opnum)
10355 rtx set, mem;
10357 set = PATTERN (insn);
10358 if (GET_CODE (set) == PARALLEL)
10359 set = XVECEXP (set, 0, 0);
10360 gcc_assert (GET_CODE (set) == SET);
10361 mem = XEXP (set, opnum);
10362 while (SUBREG_P (mem))
10363 mem = SUBREG_REG (mem);
10364 gcc_assert (MEM_P (mem));
10365 return volatile_ok || !MEM_VOLATILE_P (mem);
10368 /* Return false if INSN contains a MEM with a non-default address space. */
10369 bool
10370 ix86_check_no_addr_space (rtx insn)
10372 subrtx_var_iterator::array_type array;
10373 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10375 rtx x = *iter;
10376 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10377 return false;
10379 return true;
10382 /* Initialize the table of extra 80387 mathematical constants. */
10384 static void
10385 init_ext_80387_constants (void)
10387 static const char * cst[5] =
10389 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10390 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10391 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10392 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10393 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10395 int i;
10397 for (i = 0; i < 5; i++)
10399 real_from_string (&ext_80387_constants_table[i], cst[i]);
10400 /* Ensure each constant is rounded to XFmode precision. */
10401 real_convert (&ext_80387_constants_table[i],
10402 XFmode, &ext_80387_constants_table[i]);
10405 ext_80387_constants_init = 1;
10408 /* Return non-zero if the constant is something that
10409 can be loaded with a special instruction. */
10412 standard_80387_constant_p (rtx x)
10414 machine_mode mode = GET_MODE (x);
10416 const REAL_VALUE_TYPE *r;
10418 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10419 return -1;
10421 if (x == CONST0_RTX (mode))
10422 return 1;
10423 if (x == CONST1_RTX (mode))
10424 return 2;
10426 r = CONST_DOUBLE_REAL_VALUE (x);
10428 /* For XFmode constants, try to find a special 80387 instruction when
10429 optimizing for size or on those CPUs that benefit from them. */
10430 if (mode == XFmode
10431 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10433 int i;
10435 if (! ext_80387_constants_init)
10436 init_ext_80387_constants ();
10438 for (i = 0; i < 5; i++)
10439 if (real_identical (r, &ext_80387_constants_table[i]))
10440 return i + 3;
10443 /* Load of the constant -0.0 or -1.0 will be split as
10444 fldz;fchs or fld1;fchs sequence. */
10445 if (real_isnegzero (r))
10446 return 8;
10447 if (real_identical (r, &dconstm1))
10448 return 9;
10450 return 0;
10453 /* Return the opcode of the special instruction to be used to load
10454 the constant X. */
10456 const char *
10457 standard_80387_constant_opcode (rtx x)
10459 switch (standard_80387_constant_p (x))
10461 case 1:
10462 return "fldz";
10463 case 2:
10464 return "fld1";
10465 case 3:
10466 return "fldlg2";
10467 case 4:
10468 return "fldln2";
10469 case 5:
10470 return "fldl2e";
10471 case 6:
10472 return "fldl2t";
10473 case 7:
10474 return "fldpi";
10475 case 8:
10476 case 9:
10477 return "#";
10478 default:
10479 gcc_unreachable ();
10483 /* Return the CONST_DOUBLE representing the 80387 constant that is
10484 loaded by the specified special instruction. The argument IDX
10485 matches the return value from standard_80387_constant_p. */
10488 standard_80387_constant_rtx (int idx)
10490 int i;
10492 if (! ext_80387_constants_init)
10493 init_ext_80387_constants ();
10495 switch (idx)
10497 case 3:
10498 case 4:
10499 case 5:
10500 case 6:
10501 case 7:
10502 i = idx - 3;
10503 break;
10505 default:
10506 gcc_unreachable ();
10509 return const_double_from_real_value (ext_80387_constants_table[i],
10510 XFmode);
10513 /* Return 1 if X is all zero bits and 2 if X is all one bits
10514 in a supported SSE/AVX vector mode. */
10517 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10519 machine_mode mode;
10521 if (!TARGET_SSE)
10522 return 0;
10524 mode = GET_MODE (x);
10526 if (x == const0_rtx || const0_operand (x, mode))
10527 return 1;
10529 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10531 /* VOIDmode integer constant, get mode from the predicate. */
10532 if (mode == VOIDmode)
10533 mode = pred_mode;
10535 switch (GET_MODE_SIZE (mode))
10537 case 64:
10538 if (TARGET_AVX512F)
10539 return 2;
10540 break;
10541 case 32:
10542 if (TARGET_AVX2)
10543 return 2;
10544 break;
10545 case 16:
10546 if (TARGET_SSE2)
10547 return 2;
10548 break;
10549 case 0:
10550 /* VOIDmode */
10551 gcc_unreachable ();
10552 default:
10553 break;
10557 return 0;
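/* Example uses (a sketch; each assumes the needed ISA is enabled):

     standard_sse_constant_p (CONST0_RTX (V4SFmode), V4SFmode) == 1
     standard_sse_constant_p (constm1_rtx, V8SImode) == 2   with -mavx2
     standard_sse_constant_p (constm1_rtx, V8SImode) == 0   with plain -msse2

   The return value selects the opcode in standard_sse_constant_opcode
   below.  */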
10560 /* Return the opcode of the special instruction to be used to load
10561 the constant operands[1] into operands[0]. */
10563 const char *
10564 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10566 machine_mode mode;
10567 rtx x = operands[1];
10569 gcc_assert (TARGET_SSE);
10571 mode = GET_MODE (x);
10573 if (x == const0_rtx || const0_operand (x, mode))
10575 switch (get_attr_mode (insn))
10577 case MODE_TI:
10578 if (!EXT_REX_SSE_REG_P (operands[0]))
10579 return "%vpxor\t%0, %d0";
10580 /* FALLTHRU */
10581 case MODE_XI:
10582 case MODE_OI:
10583 if (EXT_REX_SSE_REG_P (operands[0]))
10584 return (TARGET_AVX512VL
10585 ? "vpxord\t%x0, %x0, %x0"
10586 : "vpxord\t%g0, %g0, %g0");
10587 return "vpxor\t%x0, %x0, %x0";
10589 case MODE_V2DF:
10590 if (!EXT_REX_SSE_REG_P (operands[0]))
10591 return "%vxorpd\t%0, %d0";
10592 /* FALLTHRU */
10593 case MODE_V8DF:
10594 case MODE_V4DF:
10595 if (!EXT_REX_SSE_REG_P (operands[0]))
10596 return "vxorpd\t%x0, %x0, %x0";
10597 else if (TARGET_AVX512DQ)
10598 return (TARGET_AVX512VL
10599 ? "vxorpd\t%x0, %x0, %x0"
10600 : "vxorpd\t%g0, %g0, %g0");
10601 else
10602 return (TARGET_AVX512VL
10603 ? "vpxorq\t%x0, %x0, %x0"
10604 : "vpxorq\t%g0, %g0, %g0");
10606 case MODE_V4SF:
10607 if (!EXT_REX_SSE_REG_P (operands[0]))
10608 return "%vxorps\t%0, %d0";
10609 /* FALLTHRU */
10610 case MODE_V16SF:
10611 case MODE_V8SF:
10612 if (!EXT_REX_SSE_REG_P (operands[0]))
10613 return "vxorps\t%x0, %x0, %x0";
10614 else if (TARGET_AVX512DQ)
10615 return (TARGET_AVX512VL
10616 ? "vxorps\t%x0, %x0, %x0"
10617 : "vxorps\t%g0, %g0, %g0");
10618 else
10619 return (TARGET_AVX512VL
10620 ? "vpxord\t%x0, %x0, %x0"
10621 : "vpxord\t%g0, %g0, %g0");
10623 default:
10624 gcc_unreachable ();
10627 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10629 enum attr_mode insn_mode = get_attr_mode (insn);
10631 switch (insn_mode)
10633 case MODE_XI:
10634 case MODE_V8DF:
10635 case MODE_V16SF:
10636 gcc_assert (TARGET_AVX512F);
10637 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10639 case MODE_OI:
10640 case MODE_V4DF:
10641 case MODE_V8SF:
10642 gcc_assert (TARGET_AVX2);
10643 /* FALLTHRU */
10644 case MODE_TI:
10645 case MODE_V2DF:
10646 case MODE_V4SF:
10647 gcc_assert (TARGET_SSE2);
10648 if (!EXT_REX_SSE_REG_P (operands[0]))
10649 return (TARGET_AVX
10650 ? "vpcmpeqd\t%0, %0, %0"
10651 : "pcmpeqd\t%0, %0");
10652 else if (TARGET_AVX512VL)
10653 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10654 else
10655 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10657 default:
10658 gcc_unreachable ();
10662 gcc_unreachable ();
10665 /* Returns true if INSN can be transformed from a memory load
10666 to a supported FP constant load. */
10668 bool
10669 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10671 rtx src = find_constant_src (insn);
10673 gcc_assert (REG_P (dst));
10675 if (src == NULL
10676 || (SSE_REGNO_P (REGNO (dst))
10677 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10678 || (STACK_REGNO_P (REGNO (dst))
10679 && standard_80387_constant_p (src) < 1))
10680 return false;
10682 return true;
10685 /* Returns true if OP contains a symbol reference */
10687 bool
10688 symbolic_reference_mentioned_p (rtx op)
10690 const char *fmt;
10691 int i;
10693 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10694 return true;
10696 fmt = GET_RTX_FORMAT (GET_CODE (op));
10697 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10699 if (fmt[i] == 'E')
10701 int j;
10703 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10704 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10705 return true;
10708 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10709 return true;
10712 return false;
10715 /* Return true if it is appropriate to emit `ret' instructions in the
10716 body of a function. Do this only if the epilogue is simple, needing a
10717 couple of insns. Prior to reloading, we can't tell how many registers
10718 must be saved, so return false then. Return false if there is no frame
10719 marker to de-allocate. */
10721 bool
10722 ix86_can_use_return_insn_p (void)
10724 if (ix86_function_naked (current_function_decl))
10725 return false;
10727 /* Don't use `ret' instruction in interrupt handler. */
10728 if (! reload_completed
10729 || frame_pointer_needed
10730 || cfun->machine->func_type != TYPE_NORMAL)
10731 return 0;
10733 /* Don't allow more than 32k pop, since that's all we can do
10734 with one instruction. */
10735 if (crtl->args.pops_args && crtl->args.size >= 32768)
10736 return 0;
10738 struct ix86_frame &frame = cfun->machine->frame;
10739 return (frame.stack_pointer_offset == UNITS_PER_WORD
10740 && (frame.nregs + frame.nsseregs) == 0);
10743 /* Value should be nonzero if functions must have frame pointers.
10744 Zero means the frame pointer need not be set up (and parms may
10745 be accessed via the stack pointer) in functions that seem suitable. */
10747 static bool
10748 ix86_frame_pointer_required (void)
10750 /* If we accessed previous frames, then the generated code expects
10751 to be able to access the saved ebp value in our frame. */
10752 if (cfun->machine->accesses_prev_frame)
10753 return true;
10755 /* Several x86 os'es need a frame pointer for other reasons,
10756 usually pertaining to setjmp. */
10757 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10758 return true;
10760 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10761 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10762 return true;
10764 /* With Win64 SEH, very large frames need a frame pointer, as the
10765 maximum stack allocation is 4GB. */
10766 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10767 return true;
10769 /* SSE saves require frame-pointer when stack is misaligned. */
10770 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10771 return true;
10773 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10774 turns off the frame pointer by default. Turn it back on now if
10775 we've not got a leaf function. */
10776 if (TARGET_OMIT_LEAF_FRAME_POINTER
10777 && (!crtl->is_leaf
10778 || ix86_current_function_calls_tls_descriptor))
10779 return true;
10781 if (crtl->profile && !flag_fentry)
10782 return true;
10784 return false;
10787 /* Record that the current function accesses previous call frames. */
10789 void
10790 ix86_setup_frame_addresses (void)
10792 cfun->machine->accesses_prev_frame = 1;
10795 #ifndef USE_HIDDEN_LINKONCE
10796 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10797 # define USE_HIDDEN_LINKONCE 1
10798 # else
10799 # define USE_HIDDEN_LINKONCE 0
10800 # endif
10801 #endif
10803 /* Label count for call and return thunks. It is used to make unique
10804 labels in call and return thunks. */
10805 static int indirectlabelno;
10807 /* True if call and return thunk functions are needed. */
10808 static bool indirect_thunk_needed = false;
10809 /* True if call and return thunk functions with the BND prefix are
10810 needed. */
10811 static bool indirect_thunk_bnd_needed = false;
10813 /* Bit masks of integer registers, which contain the branch target, used
10814 by call and return thunk functions. */
10815 static int indirect_thunks_used;
10816 /* Bit masks of integer registers, which contain the branch target, used
10817 by call and return thunk functions with the BND prefix. */
10818 static int indirect_thunks_bnd_used;
10820 /* True if return thunk function via CX is needed. */
10821 static bool indirect_return_via_cx;
10822 /* True if return thunk function via CX with the BND prefix is
10823 needed. */
10824 static bool indirect_return_via_cx_bnd;
10826 #ifndef INDIRECT_LABEL
10827 # define INDIRECT_LABEL "LIND"
10828 #endif
10830 /* Indicate what prefix is needed for an indirect branch. */
10831 enum indirect_thunk_prefix
10833 indirect_thunk_prefix_none,
10834 indirect_thunk_prefix_bnd,
10835 indirect_thunk_prefix_nt
10838 /* Return the prefix needed for an indirect branch INSN. */
10840 enum indirect_thunk_prefix
10841 indirect_thunk_need_prefix (rtx_insn *insn)
10843 enum indirect_thunk_prefix need_prefix;
10844 if (ix86_bnd_prefixed_insn_p (insn))
10845 need_prefix = indirect_thunk_prefix_bnd;
10846 else if ((cfun->machine->indirect_branch_type
10847 == indirect_branch_thunk_extern)
10848 && ix86_notrack_prefixed_insn_p (insn))
10850 /* NOTRACK prefix is only used with external thunk so that it
10851 can be properly updated to support CET at run-time. */
10852 need_prefix = indirect_thunk_prefix_nt;
10854 else
10855 need_prefix = indirect_thunk_prefix_none;
10856 return need_prefix;
10859 /* Fills in the label name that should be used for the indirect thunk. */
10861 static void
10862 indirect_thunk_name (char name[32], unsigned int regno,
10863 enum indirect_thunk_prefix need_prefix,
10864 bool ret_p)
10866 if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
10867 gcc_unreachable ();
10869 if (USE_HIDDEN_LINKONCE)
10871 const char *prefix;
10873 if (need_prefix == indirect_thunk_prefix_bnd)
10874 prefix = "_bnd";
10875 else if (need_prefix == indirect_thunk_prefix_nt
10876 && regno != INVALID_REGNUM)
10878 /* NOTRACK prefix is only used with external thunk via
10879 register so that NOTRACK prefix can be added to indirect
10880 branch via register to support CET at run-time. */
10881 prefix = "_nt";
10883 else
10884 prefix = "";
10886 const char *ret = ret_p ? "return" : "indirect";
10888 if (regno != INVALID_REGNUM)
10890 const char *reg_prefix;
10891 if (LEGACY_INT_REGNO_P (regno))
10892 reg_prefix = TARGET_64BIT ? "r" : "e";
10893 else
10894 reg_prefix = "";
10895 sprintf (name, "__x86_%s_thunk%s_%s%s",
10896 ret, prefix, reg_prefix, reg_names[regno]);
10898 else
10899 sprintf (name, "__x86_%s_thunk%s", ret, prefix);
10901 else
10903 if (regno != INVALID_REGNUM)
10905 if (need_prefix == indirect_thunk_prefix_bnd)
10906 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10907 else
10908 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10910 else
10912 if (ret_p)
10914 if (need_prefix == indirect_thunk_prefix_bnd)
10915 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10916 else
10917 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10919 else
10921 if (need_prefix == indirect_thunk_prefix_bnd)
10922 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10923 else
10924 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
10930 /* Output a call and return thunk for indirect branch. If BND_P is
10931 true, the BND prefix is needed. If REGNO != -1, the function
10932 address is in REGNO and the call and return thunk looks like:
10934 call L2
10935 L1:
10936 pause
10937 lfence
10938 jmp L1
10939 L2:
10940 mov %REG, (%sp)
10941 ret
10943 Otherwise, the function address is on the top of stack and the
10944 call and return thunk looks like:
10946 call L2
10947 L1:
10948 pause
10949 lfence
10950 jmp L1
10951 L2:
10952 lea WORD_SIZE(%sp), %sp
10953 ret
10954 */
10956 static void
10957 output_indirect_thunk (enum indirect_thunk_prefix need_prefix,
10958 unsigned int regno)
10960 char indirectlabel1[32];
10961 char indirectlabel2[32];
10963 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10964 indirectlabelno++);
10965 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10966 indirectlabelno++);
10968 /* Call */
10969 if (need_prefix == indirect_thunk_prefix_bnd)
10970 fputs ("\tbnd call\t", asm_out_file);
10971 else
10972 fputs ("\tcall\t", asm_out_file);
10973 assemble_name_raw (asm_out_file, indirectlabel2);
10974 fputc ('\n', asm_out_file);
10976 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10978 /* AMD and Intel CPUs each prefer a different instruction as a loop filler.
10979 Using both pause + lfence is a compromise solution. */
10980 fprintf (asm_out_file, "\tpause\n\tlfence\n");
10982 /* Jump. */
10983 fputs ("\tjmp\t", asm_out_file);
10984 assemble_name_raw (asm_out_file, indirectlabel1);
10985 fputc ('\n', asm_out_file);
10987 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
10989 if (regno != INVALID_REGNUM)
10991 /* MOV. */
10992 rtx xops[2];
10993 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
10994 xops[1] = gen_rtx_REG (word_mode, regno);
10995 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
10997 else
10999 /* LEA. */
11000 rtx xops[2];
11001 xops[0] = stack_pointer_rtx;
11002 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11003 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
11006 if (need_prefix == indirect_thunk_prefix_bnd)
11007 fputs ("\tbnd ret\n", asm_out_file);
11008 else
11009 fputs ("\tret\n", asm_out_file);
11012 /* Output a function with a call and return thunk for indirect branch.
11013 If BND_P is true, the BND prefix is needed. If REGNO != INVALID_REGNUM,
11014 the function address is in REGNO. Otherwise, the function address is
11015 on the top of stack. */
11017 static void
11018 output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
11019 unsigned int regno)
11021 char name[32];
11022 tree decl;
11024 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
11025 indirect_thunk_name (name, regno, need_prefix, false);
11026 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11027 get_identifier (name),
11028 build_function_type_list (void_type_node, NULL_TREE));
11029 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11030 NULL_TREE, void_type_node);
11031 TREE_PUBLIC (decl) = 1;
11032 TREE_STATIC (decl) = 1;
11033 DECL_IGNORED_P (decl) = 1;
11035 #if TARGET_MACHO
11036 if (TARGET_MACHO)
11038 switch_to_section (darwin_sections[picbase_thunk_section]);
11039 fputs ("\t.weak_definition\t", asm_out_file);
11040 assemble_name (asm_out_file, name);
11041 fputs ("\n\t.private_extern\t", asm_out_file);
11042 assemble_name (asm_out_file, name);
11043 putc ('\n', asm_out_file);
11044 ASM_OUTPUT_LABEL (asm_out_file, name);
11045 DECL_WEAK (decl) = 1;
11047 else
11048 #endif
11049 if (USE_HIDDEN_LINKONCE)
11051 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11053 targetm.asm_out.unique_section (decl, 0);
11054 switch_to_section (get_named_section (decl, NULL, 0));
11056 targetm.asm_out.globalize_label (asm_out_file, name);
11057 fputs ("\t.hidden\t", asm_out_file);
11058 assemble_name (asm_out_file, name);
11059 putc ('\n', asm_out_file);
11060 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11062 else
11064 switch_to_section (text_section);
11065 ASM_OUTPUT_LABEL (asm_out_file, name);
11068 /* Create alias for __x86_return_thunk/__x86_return_thunk_bnd or
11069 __x86_return_thunk_ecx/__x86_return_thunk_ecx_bnd. */
11070 bool need_alias;
11071 if (regno == INVALID_REGNUM)
11072 need_alias = true;
11073 else if (regno == CX_REG)
11075 if (need_prefix == indirect_thunk_prefix_bnd)
11076 need_alias = indirect_return_via_cx_bnd;
11077 else
11078 need_alias = indirect_return_via_cx;
11080 else
11081 need_alias = false;
11083 if (need_alias)
11085 char alias[32];
11087 indirect_thunk_name (alias, regno, need_prefix, true);
11088 #if TARGET_MACHO
11089 if (TARGET_MACHO)
11091 fputs ("\t.weak_definition\t", asm_out_file);
11092 assemble_name (asm_out_file, alias);
11093 fputs ("\n\t.private_extern\t", asm_out_file);
11094 assemble_name (asm_out_file, alias);
11095 putc ('\n', asm_out_file);
11096 ASM_OUTPUT_LABEL (asm_out_file, alias);
11098 #else
11099 ASM_OUTPUT_DEF (asm_out_file, alias, name);
11100 if (USE_HIDDEN_LINKONCE)
11102 fputs ("\t.globl\t", asm_out_file);
11103 assemble_name (asm_out_file, alias);
11104 putc ('\n', asm_out_file);
11105 fputs ("\t.hidden\t", asm_out_file);
11106 assemble_name (asm_out_file, alias);
11107 putc ('\n', asm_out_file);
11109 #endif
11112 DECL_INITIAL (decl) = make_node (BLOCK);
11113 current_function_decl = decl;
11114 allocate_struct_function (decl, false);
11115 init_function_start (decl);
11116 /* We're about to hide the function body from callees of final_* by
11117 emitting it directly; tell them we're a thunk, if they care. */
11118 cfun->is_thunk = true;
11119 first_function_block_is_cold = false;
11120 /* Make sure unwind info is emitted for the thunk if needed. */
11121 final_start_function (emit_barrier (), asm_out_file, 1);
11123 output_indirect_thunk (need_prefix, regno);
11125 final_end_function ();
11126 init_insn_lengths ();
11127 free_after_compilation (cfun);
11128 set_cfun (NULL);
11129 current_function_decl = NULL;
11132 static int pic_labels_used;
11134 /* Fills in the label name that should be used for a pc thunk for
11135 the given register. */
11137 static void
11138 get_pc_thunk_name (char name[32], unsigned int regno)
11140 gcc_assert (!TARGET_64BIT);
11142 if (USE_HIDDEN_LINKONCE)
11143 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11144 else
11145 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11149 /* This function generates code for -fpic that loads %ebx with
11150 the return address of the caller and then returns. */
11152 static void
11153 ix86_code_end (void)
11155 rtx xops[2];
11156 unsigned int regno;
11158 if (indirect_thunk_needed)
11159 output_indirect_thunk_function (indirect_thunk_prefix_none,
11160 INVALID_REGNUM);
11161 if (indirect_thunk_bnd_needed)
11162 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11163 INVALID_REGNUM);
11165 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11167 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11168 if ((indirect_thunks_used & (1 << i)))
11169 output_indirect_thunk_function (indirect_thunk_prefix_none,
11170 regno);
11172 if ((indirect_thunks_bnd_used & (1 << i)))
11173 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11174 regno);
11177 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11179 char name[32];
11180 tree decl;
11182 if ((indirect_thunks_used & (1 << regno)))
11183 output_indirect_thunk_function (indirect_thunk_prefix_none,
11184 regno);
11186 if ((indirect_thunks_bnd_used & (1 << regno)))
11187 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11188 regno);
11190 if (!(pic_labels_used & (1 << regno)))
11191 continue;
11193 get_pc_thunk_name (name, regno);
11195 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11196 get_identifier (name),
11197 build_function_type_list (void_type_node, NULL_TREE));
11198 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11199 NULL_TREE, void_type_node);
11200 TREE_PUBLIC (decl) = 1;
11201 TREE_STATIC (decl) = 1;
11202 DECL_IGNORED_P (decl) = 1;
11204 #if TARGET_MACHO
11205 if (TARGET_MACHO)
11207 switch_to_section (darwin_sections[picbase_thunk_section]);
11208 fputs ("\t.weak_definition\t", asm_out_file);
11209 assemble_name (asm_out_file, name);
11210 fputs ("\n\t.private_extern\t", asm_out_file);
11211 assemble_name (asm_out_file, name);
11212 putc ('\n', asm_out_file);
11213 ASM_OUTPUT_LABEL (asm_out_file, name);
11214 DECL_WEAK (decl) = 1;
11216 else
11217 #endif
11218 if (USE_HIDDEN_LINKONCE)
11220 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11222 targetm.asm_out.unique_section (decl, 0);
11223 switch_to_section (get_named_section (decl, NULL, 0));
11225 targetm.asm_out.globalize_label (asm_out_file, name);
11226 fputs ("\t.hidden\t", asm_out_file);
11227 assemble_name (asm_out_file, name);
11228 putc ('\n', asm_out_file);
11229 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11231 else
11233 switch_to_section (text_section);
11234 ASM_OUTPUT_LABEL (asm_out_file, name);
11237 DECL_INITIAL (decl) = make_node (BLOCK);
11238 current_function_decl = decl;
11239 allocate_struct_function (decl, false);
11240 init_function_start (decl);
11241 /* We're about to hide the function body from callees of final_* by
11242 emitting it directly; tell them we're a thunk, if they care. */
11243 cfun->is_thunk = true;
11244 first_function_block_is_cold = false;
11245 /* Make sure unwind info is emitted for the thunk if needed. */
11246 final_start_function (emit_barrier (), asm_out_file, 1);
11248 /* Pad stack IP move with 4 instructions (two NOPs count
11249 as one instruction). */
11250 if (TARGET_PAD_SHORT_FUNCTION)
11252 int i = 8;
11254 while (i--)
11255 fputs ("\tnop\n", asm_out_file);
11258 xops[0] = gen_rtx_REG (Pmode, regno);
11259 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11260 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11261 output_asm_insn ("%!ret", NULL);
11262 final_end_function ();
11263 init_insn_lengths ();
11264 free_after_compilation (cfun);
11265 set_cfun (NULL);
11266 current_function_decl = NULL;
11269 if (flag_split_stack)
11270 file_end_indicate_split_stack ();
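/* As a concrete illustration (assuming a typical 32-bit ELF target), the
   pc-thunk body emitted above for %ebx looks roughly like:

	__x86.get_pc_thunk.bx:
		mov	(%esp), %ebx
		ret

   i.e. it copies its own return address (the address of the instruction
   following the call) into the destination register and returns.  */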
11273 /* Emit code for the SET_GOT patterns. */
11275 const char *
11276 output_set_got (rtx dest, rtx label)
11278 rtx xops[3];
11280 xops[0] = dest;
11282 if (TARGET_VXWORKS_RTP && flag_pic)
11284 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11285 xops[2] = gen_rtx_MEM (Pmode,
11286 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11287 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11289 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11290 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11291 an unadorned address. */
11292 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11293 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11294 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11295 return "";
11298 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11300 if (flag_pic)
11302 char name[32];
11303 get_pc_thunk_name (name, REGNO (dest));
11304 pic_labels_used |= 1 << REGNO (dest);
11306 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11307 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11308 output_asm_insn ("%!call\t%X2", xops);
11310 #if TARGET_MACHO
11311 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11312 This is what will be referenced by the Mach-O PIC subsystem. */
11313 if (machopic_should_output_picbase_label () || !label)
11314 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11316 /* When we are restoring the pic base at the site of a nonlocal label,
11317 and we decided to emit the pic base above, we will still output a
11318 local label used for calculating the correction offset (even though
11319 the offset will be 0 in that case). */
11320 if (label)
11321 targetm.asm_out.internal_label (asm_out_file, "L",
11322 CODE_LABEL_NUMBER (label));
11323 #endif
11325 else
11327 if (TARGET_MACHO)
11328 /* We don't need a pic base, we're not producing pic. */
11329 gcc_unreachable ();
11331 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11332 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11333 targetm.asm_out.internal_label (asm_out_file, "L",
11334 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11337 if (!TARGET_MACHO)
11338 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11340 return "";
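/* As a rough sketch (non-Darwin ELF, -fpic, destination %ebx), the sequence
   printed by output_set_got is approximately:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   The add of GOT_SYMBOL_NAME is resolved by the assembler and linker
   relative to the address loaded by the thunk.  */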
11343 /* Generate a "push" pattern for input ARG. */
11345 static rtx
11346 gen_push (rtx arg)
11348 struct machine_function *m = cfun->machine;
11350 if (m->fs.cfa_reg == stack_pointer_rtx)
11351 m->fs.cfa_offset += UNITS_PER_WORD;
11352 m->fs.sp_offset += UNITS_PER_WORD;
11354 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11355 arg = gen_rtx_REG (word_mode, REGNO (arg));
11357 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11358 gen_rtx_PRE_DEC (Pmode,
11359 stack_pointer_rtx)),
11360 arg);
11363 /* Generate a "pop" pattern for input ARG. */
11365 static rtx
11366 gen_pop (rtx arg)
11368 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11369 arg = gen_rtx_REG (word_mode, REGNO (arg));
11371 return gen_rtx_SET (arg,
11372 gen_rtx_MEM (word_mode,
11373 gen_rtx_POST_INC (Pmode,
11374 stack_pointer_rtx)));
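/* For reference, both helpers above build plain SET rtl; on a 64-bit
   target gen_push (reg) produces roughly

	(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...))

   and gen_pop (reg) produces

	(set (reg:DI ...) (mem:DI (post_inc:DI (reg:DI sp))))

   with word_mode and Pmode substituted appropriately on 32-bit targets.  */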
11377 /* Return >= 0 if there is an unused call-clobbered register available
11378 for the entire function. */
11380 static unsigned int
11381 ix86_select_alt_pic_regnum (void)
11383 if (ix86_use_pseudo_pic_reg ())
11384 return INVALID_REGNUM;
11386 if (crtl->is_leaf
11387 && !crtl->profile
11388 && !ix86_current_function_calls_tls_descriptor)
11390 int i, drap;
11391 /* Can't use the same register for both PIC and DRAP. */
11392 if (crtl->drap_reg)
11393 drap = REGNO (crtl->drap_reg);
11394 else
11395 drap = -1;
11396 for (i = 2; i >= 0; --i)
11397 if (i != drap && !df_regs_ever_live_p (i))
11398 return i;
11401 return INVALID_REGNUM;
11404 /* Return true if REGNO is used by the epilogue. */
11406 bool
11407 ix86_epilogue_uses (int regno)
11409 /* If there are no caller-saved registers, we preserve all registers,
11410 except for MMX and x87 registers which aren't supported when saving
11411 and restoring registers. Don't explicitly save SP register since
11412 it is always preserved. */
11413 return (epilogue_completed
11414 && cfun->machine->no_caller_saved_registers
11415 && !fixed_regs[regno]
11416 && !STACK_REGNO_P (regno)
11417 && !MMX_REGNO_P (regno));
11420 /* Return nonzero if register REGNO can be used as a scratch register
11421 in peephole2. */
11423 static bool
11424 ix86_hard_regno_scratch_ok (unsigned int regno)
11426 /* If there are no caller-saved registers, we can't use any register
11427 as a scratch register after epilogue and use REGNO as scratch
11428 register only if it has been used before to avoid saving and
11429 restoring it. */
11430 return (!cfun->machine->no_caller_saved_registers
11431 || (!epilogue_completed
11432 && df_regs_ever_live_p (regno)));
11435 /* Return true if register class CL should be an additional allocno
11436 class. */
11438 static bool
11439 ix86_additional_allocno_class_p (reg_class_t cl)
11441 return cl == MOD4_SSE_REGS;
11444 /* Return TRUE if we need to save REGNO. */
11446 static bool
11447 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11449 /* If there are no caller-saved registers, we preserve all registers,
11450 except for MMX and x87 registers which aren't supported when saving
11451 and restoring registers. Don't explicitly save SP register since
11452 it is always preserved. */
11453 if (cfun->machine->no_caller_saved_registers)
11455 /* Don't preserve registers used for function return value. */
11456 rtx reg = crtl->return_rtx;
11457 if (reg)
11459 unsigned int i = REGNO (reg);
11460 unsigned int nregs = REG_NREGS (reg);
11461 while (nregs-- > 0)
11462 if ((i + nregs) == regno)
11463 return false;
11465 reg = crtl->return_bnd;
11466 if (reg)
11468 i = REGNO (reg);
11469 nregs = REG_NREGS (reg);
11470 while (nregs-- > 0)
11471 if ((i + nregs) == regno)
11472 return false;
11476 return (df_regs_ever_live_p (regno)
11477 && !fixed_regs[regno]
11478 && !STACK_REGNO_P (regno)
11479 && !MMX_REGNO_P (regno)
11480 && (regno != HARD_FRAME_POINTER_REGNUM
11481 || !frame_pointer_needed));
11484 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11485 && pic_offset_table_rtx)
11487 if (ix86_use_pseudo_pic_reg ())
11489 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11490 _mcount in prologue. */
11491 if (!TARGET_64BIT && flag_pic && crtl->profile)
11492 return true;
11494 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11495 || crtl->profile
11496 || crtl->calls_eh_return
11497 || crtl->uses_const_pool
11498 || cfun->has_nonlocal_label)
11499 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11502 if (crtl->calls_eh_return && maybe_eh_return)
11504 unsigned i;
11505 for (i = 0; ; i++)
11507 unsigned test = EH_RETURN_DATA_REGNO (i);
11508 if (test == INVALID_REGNUM)
11509 break;
11510 if (test == regno)
11511 return true;
11515 if (ignore_outlined && cfun->machine->call_ms2sysv)
11517 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11518 + xlogue_layout::MIN_REGS;
11519 if (xlogue_layout::is_stub_managed_reg (regno, count))
11520 return false;
11523 if (crtl->drap_reg
11524 && regno == REGNO (crtl->drap_reg)
11525 && !cfun->machine->no_drap_save_restore)
11526 return true;
11528 return (df_regs_ever_live_p (regno)
11529 && !call_used_regs[regno]
11530 && !fixed_regs[regno]
11531 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11534 /* Return number of saved general purpose registers. */
11536 static int
11537 ix86_nsaved_regs (void)
11539 int nregs = 0;
11540 int regno;
11542 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11543 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11544 nregs ++;
11545 return nregs;
11548 /* Return number of saved SSE registers. */
11550 static int
11551 ix86_nsaved_sseregs (void)
11553 int nregs = 0;
11554 int regno;
11556 if (!TARGET_64BIT_MS_ABI)
11557 return 0;
11558 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11559 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11560 nregs ++;
11561 return nregs;
11564 /* Given FROM and TO register numbers, say whether this elimination is
11565 allowed. If stack alignment is needed, we can only replace argument
11566 pointer with hard frame pointer, or replace frame pointer with stack
11567 pointer. Otherwise, frame pointer elimination is automatically
11568 handled and all other eliminations are valid. */
11570 static bool
11571 ix86_can_eliminate (const int from, const int to)
11573 if (stack_realign_fp)
11574 return ((from == ARG_POINTER_REGNUM
11575 && to == HARD_FRAME_POINTER_REGNUM)
11576 || (from == FRAME_POINTER_REGNUM
11577 && to == STACK_POINTER_REGNUM));
11578 else
11579 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11582 /* Return the offset between two registers, one to be eliminated, and the other
11583 its replacement, at the start of a routine. */
11585 HOST_WIDE_INT
11586 ix86_initial_elimination_offset (int from, int to)
11588 struct ix86_frame &frame = cfun->machine->frame;
11590 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11591 return frame.hard_frame_pointer_offset;
11592 else if (from == FRAME_POINTER_REGNUM
11593 && to == HARD_FRAME_POINTER_REGNUM)
11594 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11595 else
11597 gcc_assert (to == STACK_POINTER_REGNUM);
11599 if (from == ARG_POINTER_REGNUM)
11600 return frame.stack_pointer_offset;
11602 gcc_assert (from == FRAME_POINTER_REGNUM);
11603 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11607 /* In a dynamically-aligned function, we can't know the offset from
11608 stack pointer to frame pointer, so we must ensure that setjmp
11609 eliminates fp against the hard fp (%ebp) rather than trying to
11610 index from %esp up to the top of the frame across a gap that is
11611 of unknown (at compile-time) size. */
11612 static rtx
11613 ix86_builtin_setjmp_frame_value (void)
11615 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11618 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11619 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11621 static bool warned_once = false;
11622 if (!warned_once)
11624 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11625 feature);
11626 warned_once = true;
11630 /* Return the probing interval for -fstack-clash-protection. */
11632 static HOST_WIDE_INT
11633 get_probe_interval (void)
11635 if (flag_stack_clash_protection)
11636 return (HOST_WIDE_INT_1U
11637 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11638 else
11639 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
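/* For example, with the default probe-interval parameter (believed to be
   12) and -fstack-clash-protection enabled, the interval is
   1 << 12 == 4096 bytes; otherwise it is derived from
   STACK_CHECK_PROBE_INTERVAL_EXP.  */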
11642 /* When using -fsplit-stack, the allocation routines set a field in
11643 the TCB to the bottom of the stack plus this much space, measured
11644 in bytes. */
11646 #define SPLIT_STACK_AVAILABLE 256
11648 /* Fill structure ix86_frame with the frame layout of the function being compiled. */
11650 static void
11651 ix86_compute_frame_layout (void)
11653 struct ix86_frame *frame = &cfun->machine->frame;
11654 struct machine_function *m = cfun->machine;
11655 unsigned HOST_WIDE_INT stack_alignment_needed;
11656 HOST_WIDE_INT offset;
11657 unsigned HOST_WIDE_INT preferred_alignment;
11658 HOST_WIDE_INT size = get_frame_size ();
11659 HOST_WIDE_INT to_allocate;
11661 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11662 * ms_abi functions that call a sysv function. We now need to prune away
11663 * cases where it should be disabled. */
11664 if (TARGET_64BIT && m->call_ms2sysv)
11666 gcc_assert (TARGET_64BIT_MS_ABI);
11667 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11668 gcc_assert (!TARGET_SEH);
11669 gcc_assert (TARGET_SSE);
11670 gcc_assert (!ix86_using_red_zone ());
11672 if (crtl->calls_eh_return)
11674 gcc_assert (!reload_completed);
11675 m->call_ms2sysv = false;
11676 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11679 else if (ix86_static_chain_on_stack)
11681 gcc_assert (!reload_completed);
11682 m->call_ms2sysv = false;
11683 warn_once_call_ms2sysv_xlogues ("static call chains");
11686 /* Finally, compute which registers the stub will manage. */
11687 else
11689 unsigned count = xlogue_layout::count_stub_managed_regs ();
11690 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11691 m->call_ms2sysv_pad_in = 0;
11695 frame->nregs = ix86_nsaved_regs ();
11696 frame->nsseregs = ix86_nsaved_sseregs ();
11698 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11699 except for function prologues, leaf functions and when the default
11700 incoming stack boundary is overridden at the command line or via the
11701 force_align_arg_pointer attribute. */
11702 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11703 && (!crtl->is_leaf || cfun->calls_alloca != 0
11704 || ix86_current_function_calls_tls_descriptor
11705 || ix86_incoming_stack_boundary < 128))
11707 crtl->preferred_stack_boundary = 128;
11708 crtl->stack_alignment_needed = 128;
11711 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11712 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11714 gcc_assert (!size || stack_alignment_needed);
11715 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11716 gcc_assert (preferred_alignment <= stack_alignment_needed);
11718 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11719 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11720 if (TARGET_64BIT && m->call_ms2sysv)
11722 gcc_assert (stack_alignment_needed >= 16);
11723 gcc_assert (!frame->nsseregs);
11726 /* For SEH we have to limit the amount of code movement into the prologue.
11727 At present we do this via a BLOCKAGE, at which point there's very little
11728 scheduling that can be done, which means that there's very little point
11729 in doing anything except PUSHs. */
11730 if (TARGET_SEH)
11731 m->use_fast_prologue_epilogue = false;
11732 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11734 int count = frame->nregs;
11735 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11737 /* The fast prologue uses move instead of push to save registers. This
11738 is significantly longer, but also executes faster as modern hardware
11739 can execute the moves in parallel, but can't do that for push/pop.
11741 Be careful about choosing which prologue to emit: when the function
11742 takes many instructions to execute, we may as well use the slow
11743 version, and likewise when the function is known to be outside a hot
11744 spot (known only with feedback). Weight the size of the function by
11745 the number of registers to save, as it is cheap to use one or two push
11746 instructions but very slow to use many of them. */
11747 if (count)
11748 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11749 if (node->frequency < NODE_FREQUENCY_NORMAL
11750 || (flag_branch_probabilities
11751 && node->frequency < NODE_FREQUENCY_HOT))
11752 m->use_fast_prologue_epilogue = false;
11753 else
11754 m->use_fast_prologue_epilogue
11755 = !expensive_function_p (count);
11758 frame->save_regs_using_mov
11759 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11760 /* If static stack checking is enabled and done with probes,
11761 the registers need to be saved before allocating the frame. */
11762 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11764 /* Skip return address and error code in exception handler. */
11765 offset = INCOMING_FRAME_SP_OFFSET;
11767 /* Skip pushed static chain. */
11768 if (ix86_static_chain_on_stack)
11769 offset += UNITS_PER_WORD;
11771 /* Skip saved base pointer. */
11772 if (frame_pointer_needed)
11773 offset += UNITS_PER_WORD;
11774 frame->hfp_save_offset = offset;
11776 /* The traditional frame pointer location is at the top of the frame. */
11777 frame->hard_frame_pointer_offset = offset;
11779 /* Register save area */
11780 offset += frame->nregs * UNITS_PER_WORD;
11781 frame->reg_save_offset = offset;
11783 /* On SEH target, registers are pushed just before the frame pointer
11784 location. */
11785 if (TARGET_SEH)
11786 frame->hard_frame_pointer_offset = offset;
11788 /* Calculate the size of the va-arg area (not including padding, if any). */
11789 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11791 /* Also adjust stack_realign_offset for the largest alignment of
11792 stack slot actually used. */
11793 if (stack_realign_fp
11794 || (cfun->machine->max_used_stack_alignment != 0
11795 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11797 /* We may need a 16-byte aligned stack for the remainder of the
11798 register save area, but the stack frame for the local function
11799 may require a greater alignment if using AVX/2/512. In order
11800 to avoid wasting space, we first calculate the space needed for
11801 the rest of the register saves, add that to the stack pointer,
11802 and then realign the stack to the boundary of the start of the
11803 frame for the local function. */
11804 HOST_WIDE_INT space_needed = 0;
11805 HOST_WIDE_INT sse_reg_space_needed = 0;
11807 if (TARGET_64BIT)
11809 if (m->call_ms2sysv)
11811 m->call_ms2sysv_pad_in = 0;
11812 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11815 else if (frame->nsseregs)
11816 /* The only ABI that has saved SSE registers (Win64) also has a
11817 16-byte aligned default stack. However, many programs violate
11818 the ABI, and Wine64 forces stack realignment to compensate. */
11819 space_needed = frame->nsseregs * 16;
11821 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11823 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11824 round up anyway to be pedantic. */
11825 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11827 else
11828 space_needed = frame->va_arg_size;
11830 /* Record the allocation size required prior to the realignment AND. */
11831 frame->stack_realign_allocate = space_needed;
11833 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11834 before this point are not directly comparable with values below
11835 this point. Use sp_valid_at to determine if the stack pointer is
11836 valid for a given offset, fp_valid_at for the frame pointer, or
11837 choose_baseaddr to have a base register chosen for you.
11839 Note that the result of (frame->stack_realign_offset
11840 & (stack_alignment_needed - 1)) may not equal zero. */
11841 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11842 frame->stack_realign_offset = offset - space_needed;
11843 frame->sse_reg_save_offset = frame->stack_realign_offset
11844 + sse_reg_space_needed;
11846 else
11848 frame->stack_realign_offset = offset;
11850 if (TARGET_64BIT && m->call_ms2sysv)
11852 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11853 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11856 /* Align and set SSE register save area. */
11857 else if (frame->nsseregs)
11859 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11860 required and the DRAP re-alignment boundary is at least 16 bytes,
11861 then we want the SSE register save area properly aligned. */
11862 if (ix86_incoming_stack_boundary >= 128
11863 || (stack_realign_drap && stack_alignment_needed >= 16))
11864 offset = ROUND_UP (offset, 16);
11865 offset += frame->nsseregs * 16;
11867 frame->sse_reg_save_offset = offset;
11868 offset += frame->va_arg_size;
11871 /* Align start of frame for local function. When a function call
11872 is removed, it may become a leaf function. But if arguments may
11873 be passed on the stack, we need to align the stack when there is no
11874 tail call. */
11875 if (m->call_ms2sysv
11876 || frame->va_arg_size != 0
11877 || size != 0
11878 || !crtl->is_leaf
11879 || (!crtl->tail_call_emit
11880 && cfun->machine->outgoing_args_on_stack)
11881 || cfun->calls_alloca
11882 || ix86_current_function_calls_tls_descriptor)
11883 offset = ROUND_UP (offset, stack_alignment_needed);
11885 /* Frame pointer points here. */
11886 frame->frame_pointer_offset = offset;
11888 offset += size;
11890 /* Add outgoing arguments area. Can be skipped if we eliminated
11891 all the function calls as dead code.
11892 Skipping is however impossible when function calls alloca. Alloca
11893 expander assumes that last crtl->outgoing_args_size
11894 of stack frame are unused. */
11895 if (ACCUMULATE_OUTGOING_ARGS
11896 && (!crtl->is_leaf || cfun->calls_alloca
11897 || ix86_current_function_calls_tls_descriptor))
11899 offset += crtl->outgoing_args_size;
11900 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11902 else
11903 frame->outgoing_arguments_size = 0;
11905 /* Align stack boundary. Only needed if we're calling another function
11906 or using alloca. */
11907 if (!crtl->is_leaf || cfun->calls_alloca
11908 || ix86_current_function_calls_tls_descriptor)
11909 offset = ROUND_UP (offset, preferred_alignment);
11911 /* We've reached end of stack frame. */
11912 frame->stack_pointer_offset = offset;
11914 /* Size prologue needs to allocate. */
11915 to_allocate = offset - frame->sse_reg_save_offset;
11917 if ((!to_allocate && frame->nregs <= 1)
11918 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11919 /* If stack clash probing needs a loop, then it needs a
11920 scratch register. But the returned register is only guaranteed
11921 to be safe to use after register saves are complete. So if
11922 stack clash protections are enabled and the allocated frame is
11923 larger than the probe interval, then use pushes to save
11924 callee saved registers. */
11925 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11926 frame->save_regs_using_mov = false;
11928 if (ix86_using_red_zone ()
11929 && crtl->sp_is_unchanging
11930 && crtl->is_leaf
11931 && !ix86_pc_thunk_call_expanded
11932 && !ix86_current_function_calls_tls_descriptor)
11934 frame->red_zone_size = to_allocate;
11935 if (frame->save_regs_using_mov)
11936 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11937 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11938 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11940 else
11941 frame->red_zone_size = 0;
11942 frame->stack_pointer_offset -= frame->red_zone_size;
11944 /* The SEH frame pointer location is near the bottom of the frame.
11945 This is enforced by the fact that the difference between the
11946 stack pointer and the frame pointer is limited to 240 bytes in
11947 the unwind data structure. */
11948 if (TARGET_SEH)
11950 HOST_WIDE_INT diff;
11952 /* If we can leave the frame pointer where it is, do so; this also returns
11953 the establisher frame for __builtin_frame_address (0). */
11954 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11955 if (diff <= SEH_MAX_FRAME_SIZE
11956 && (diff > 240 || (diff & 15) != 0)
11957 && !crtl->accesses_prior_frames)
11959 /* Ideally we'd determine what portion of the local stack frame
11960 (within the constraint of the lowest 240) is most heavily used.
11961 But without that complication, simply bias the frame pointer
11962 by 128 bytes so as to maximize the amount of the local stack
11963 frame that is addressable with 8-bit offsets. */
11964 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
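/* To summarize the computation above, offsets grow away from the CFA
   roughly in this order (areas that do not apply are simply empty):

	return address (plus error code for exception handlers)
	pushed static chain
	saved frame pointer		<- hard_frame_pointer_offset (non-SEH)
	GP register save area		<- reg_save_offset
	ms2sysv stub or SSE save area	<- sse_reg_save_offset
	va_arg register save area
	local variables			<- frame_pointer_offset
	outgoing arguments		<- stack_pointer_offset

   This is only a sketch; the exact rounding, stack realignment and the
   SEH frame pointer placement follow the code above.  */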
11969 /* This is semi-inlined memory_address_length, but simplified
11970 since we know that we're always dealing with reg+offset, and
11971 to avoid having to create and discard all that rtl. */
11973 static inline int
11974 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11976 int len = 4;
11978 if (offset == 0)
11980 /* EBP and R13 cannot be encoded without an offset. */
11981 len = (regno == BP_REG || regno == R13_REG);
11983 else if (IN_RANGE (offset, -128, 127))
11984 len = 1;
11986 /* ESP and R12 must be encoded with a SIB byte. */
11987 if (regno == SP_REG || regno == R12_REG)
11988 len++;
11990 return len;
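/* A few illustrative values (x86-64 register numbering):

	(%rax)  with offset 0    -> 0  bytes
	(%rbp)  with offset 0    -> 1  (%rbp cannot be encoded without a disp8)
	8(%rsp)                  -> 2  (disp8 plus the mandatory SIB byte)
	1024(%rbx)               -> 4  (disp32)

   These count only the displacement/SIB bytes estimated by the function,
   not full instruction lengths.  */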
11993 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11994 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11996 static bool
11997 sp_valid_at (HOST_WIDE_INT cfa_offset)
11999 const struct machine_frame_state &fs = cfun->machine->fs;
12000 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
12002 /* Validate that the cfa_offset isn't in a "no-man's land". */
12003 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
12004 return false;
12006 return fs.sp_valid;
12009 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
12010 the frame save area. The register is saved at CFA - CFA_OFFSET. */
12012 static inline bool
12013 fp_valid_at (HOST_WIDE_INT cfa_offset)
12015 const struct machine_frame_state &fs = cfun->machine->fs;
12016 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
12018 /* Validate that the cfa_offset isn't in a "no-man's land". */
12019 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
12020 return false;
12022 return fs.fp_valid;
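/* Restating the two predicates above: once the stack has been re-aligned,
   a slot with cfa_offset > fs.sp_realigned_offset can be addressed from
   the (re-aligned) stack pointer, and a slot with cfa_offset
   <= fs.sp_realigned_fp_last can be addressed from the frame pointer.
   A cfa_offset falling strictly between those bounds would be in
   "no-man's land", reachable from neither; the asserts above check that
   this never happens.  */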
12025 /* Choose a base register based upon alignment requested, speed and/or
12026 size. */
12028 static void
12029 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
12030 HOST_WIDE_INT &base_offset,
12031 unsigned int align_reqested, unsigned int *align)
12033 const struct machine_function *m = cfun->machine;
12034 unsigned int hfp_align;
12035 unsigned int drap_align;
12036 unsigned int sp_align;
12037 bool hfp_ok = fp_valid_at (cfa_offset);
12038 bool drap_ok = m->fs.drap_valid;
12039 bool sp_ok = sp_valid_at (cfa_offset);
12041 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
12043 /* Filter out any registers that don't meet the requested alignment
12044 criteria. */
12045 if (align_reqested)
12047 if (m->fs.realigned)
12048 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
12049 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
12050 notes (which we would need to use a realigned stack pointer),
12051 so disable on SEH targets. */
12052 else if (m->fs.sp_realigned)
12053 sp_align = crtl->stack_alignment_needed;
12055 hfp_ok = hfp_ok && hfp_align >= align_reqested;
12056 drap_ok = drap_ok && drap_align >= align_reqested;
12057 sp_ok = sp_ok && sp_align >= align_reqested;
12060 if (m->use_fast_prologue_epilogue)
12062 /* Choose the base register most likely to allow the most scheduling
12063 opportunities. Generally FP is valid throughout the function,
12064 while DRAP must be reloaded within the epilogue. But choose either
12065 over the SP due to increased encoding size. */
12067 if (hfp_ok)
12069 base_reg = hard_frame_pointer_rtx;
12070 base_offset = m->fs.fp_offset - cfa_offset;
12072 else if (drap_ok)
12074 base_reg = crtl->drap_reg;
12075 base_offset = 0 - cfa_offset;
12077 else if (sp_ok)
12079 base_reg = stack_pointer_rtx;
12080 base_offset = m->fs.sp_offset - cfa_offset;
12083 else
12085 HOST_WIDE_INT toffset;
12086 int len = 16, tlen;
12088 /* Choose the base register with the smallest address encoding.
12089 With a tie, choose FP > DRAP > SP. */
12090 if (sp_ok)
12092 base_reg = stack_pointer_rtx;
12093 base_offset = m->fs.sp_offset - cfa_offset;
12094 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12096 if (drap_ok)
12098 toffset = 0 - cfa_offset;
12099 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12100 if (tlen <= len)
12102 base_reg = crtl->drap_reg;
12103 base_offset = toffset;
12104 len = tlen;
12107 if (hfp_ok)
12109 toffset = m->fs.fp_offset - cfa_offset;
12110 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12111 if (tlen <= len)
12113 base_reg = hard_frame_pointer_rtx;
12114 base_offset = toffset;
12115 len = tlen;
12120 /* Set the align return value. */
12121 if (align)
12123 if (base_reg == stack_pointer_rtx)
12124 *align = sp_align;
12125 else if (base_reg == crtl->drap_reg)
12126 *align = drap_align;
12127 else if (base_reg == hard_frame_pointer_rtx)
12128 *align = hfp_align;
12132 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12133 the alignment of address. If ALIGN is non-null, it should point to
12134 an alignment value (in bits) that is preferred or zero and will
12135 receive the alignment of the base register that was selected,
12136 irrespective of whether or not CFA_OFFSET is a multiple of that
12137 alignment value. If it is possible for the base register offset to be
12138 non-immediate then SCRATCH_REGNO should specify a scratch register to
12139 use.
12141 The valid base registers are taken from CFUN->MACHINE->FS. */
12143 static rtx
12144 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12145 unsigned int scratch_regno = INVALID_REGNUM)
12147 rtx base_reg = NULL;
12148 HOST_WIDE_INT base_offset = 0;
12150 /* If a specific alignment is requested, try to get a base register
12151 with that alignment first. */
12152 if (align && *align)
12153 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12155 if (!base_reg)
12156 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12158 gcc_assert (base_reg != NULL);
12160 rtx base_offset_rtx = GEN_INT (base_offset);
12162 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12164 gcc_assert (scratch_regno != INVALID_REGNUM);
12166 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12167 emit_move_insn (scratch_reg, base_offset_rtx);
12169 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12172 return plus_constant (Pmode, base_reg, base_offset);
12175 /* Emit code to save registers in the prologue. */
12177 static void
12178 ix86_emit_save_regs (void)
12180 unsigned int regno;
12181 rtx_insn *insn;
12183 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12184 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12186 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12187 RTX_FRAME_RELATED_P (insn) = 1;
12191 /* Emit a single register save at CFA - CFA_OFFSET. */
12193 static void
12194 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12195 HOST_WIDE_INT cfa_offset)
12197 struct machine_function *m = cfun->machine;
12198 rtx reg = gen_rtx_REG (mode, regno);
12199 rtx mem, addr, base, insn;
12200 unsigned int align = GET_MODE_ALIGNMENT (mode);
12202 addr = choose_baseaddr (cfa_offset, &align);
12203 mem = gen_frame_mem (mode, addr);
12205 /* The location alignment depends upon the base register. */
12206 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12207 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12208 set_mem_align (mem, align);
12210 insn = emit_insn (gen_rtx_SET (mem, reg));
12211 RTX_FRAME_RELATED_P (insn) = 1;
12213 base = addr;
12214 if (GET_CODE (base) == PLUS)
12215 base = XEXP (base, 0);
12216 gcc_checking_assert (REG_P (base));
12218 /* When saving registers into a re-aligned local stack frame, avoid
12219 any tricky guessing by dwarf2out. */
12220 if (m->fs.realigned)
12222 gcc_checking_assert (stack_realign_drap);
12224 if (regno == REGNO (crtl->drap_reg))
12226 /* A bit of a hack. We force the DRAP register to be saved in
12227 the re-aligned stack frame, which provides us with a copy
12228 of the CFA that will last past the prologue. Install it. */
12229 gcc_checking_assert (cfun->machine->fs.fp_valid);
12230 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12231 cfun->machine->fs.fp_offset - cfa_offset);
12232 mem = gen_rtx_MEM (mode, addr);
12233 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12235 else
12237 /* The frame pointer is a stable reference within the
12238 aligned frame. Use it. */
12239 gcc_checking_assert (cfun->machine->fs.fp_valid);
12240 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12241 cfun->machine->fs.fp_offset - cfa_offset);
12242 mem = gen_rtx_MEM (mode, addr);
12243 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12247 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12248 && cfa_offset >= m->fs.sp_realigned_offset)
12250 gcc_checking_assert (stack_realign_fp);
12251 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12254 /* The memory may not be relative to the current CFA register,
12255 which means that we may need to generate a new pattern for
12256 use by the unwind info. */
12257 else if (base != m->fs.cfa_reg)
12259 addr = plus_constant (Pmode, m->fs.cfa_reg,
12260 m->fs.cfa_offset - cfa_offset);
12261 mem = gen_rtx_MEM (mode, addr);
12262 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12266 /* Emit code to save registers using MOV insns.
12267 First register is stored at CFA - CFA_OFFSET. */
12268 static void
12269 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12271 unsigned int regno;
12273 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12274 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12276 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12277 cfa_offset -= UNITS_PER_WORD;
12281 /* Emit code to save SSE registers using MOV insns.
12282 First register is stored at CFA - CFA_OFFSET. */
12283 static void
12284 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12286 unsigned int regno;
12288 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12289 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12291 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12292 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12296 static GTY(()) rtx queued_cfa_restores;
12298 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
12299 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12300 Don't add the note if the previously saved value will be left untouched
12301 within the stack red zone until return, as unwinders can find the same value
12302 in the register and on the stack. */
12304 static void
12305 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12307 if (!crtl->shrink_wrapped
12308 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12309 return;
12311 if (insn)
12313 add_reg_note (insn, REG_CFA_RESTORE, reg);
12314 RTX_FRAME_RELATED_P (insn) = 1;
12316 else
12317 queued_cfa_restores
12318 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12321 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12323 static void
12324 ix86_add_queued_cfa_restore_notes (rtx insn)
12326 rtx last;
12327 if (!queued_cfa_restores)
12328 return;
12329 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12331 XEXP (last, 1) = REG_NOTES (insn);
12332 REG_NOTES (insn) = queued_cfa_restores;
12333 queued_cfa_restores = NULL_RTX;
12334 RTX_FRAME_RELATED_P (insn) = 1;
12337 /* Expand prologue or epilogue stack adjustment.
12338 The pattern exists to put a dependency on all ebp-based memory accesses.
12339 STYLE should be negative if instructions should be marked as frame related,
12340 zero if %r11 register is live and cannot be freely used and positive
12341 otherwise. */
12343 static rtx
12344 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12345 int style, bool set_cfa)
12347 struct machine_function *m = cfun->machine;
12348 rtx insn;
12349 bool add_frame_related_expr = false;
12351 if (Pmode == SImode)
12352 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12353 else if (x86_64_immediate_operand (offset, DImode))
12354 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12355 else
12357 rtx tmp;
12358 /* r11 is used by indirect sibcall return as well, set before the
12359 epilogue and used after the epilogue. */
12360 if (style)
12361 tmp = gen_rtx_REG (DImode, R11_REG);
12362 else
12364 gcc_assert (src != hard_frame_pointer_rtx
12365 && dest != hard_frame_pointer_rtx);
12366 tmp = hard_frame_pointer_rtx;
12368 insn = emit_insn (gen_rtx_SET (tmp, offset));
12369 if (style < 0)
12370 add_frame_related_expr = true;
12372 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12375 insn = emit_insn (insn);
12376 if (style >= 0)
12377 ix86_add_queued_cfa_restore_notes (insn);
12379 if (set_cfa)
12381 rtx r;
12383 gcc_assert (m->fs.cfa_reg == src);
12384 m->fs.cfa_offset += INTVAL (offset);
12385 m->fs.cfa_reg = dest;
12387 r = gen_rtx_PLUS (Pmode, src, offset);
12388 r = gen_rtx_SET (dest, r);
12389 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12390 RTX_FRAME_RELATED_P (insn) = 1;
12392 else if (style < 0)
12394 RTX_FRAME_RELATED_P (insn) = 1;
12395 if (add_frame_related_expr)
12397 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12398 r = gen_rtx_SET (dest, r);
12399 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12403 if (dest == stack_pointer_rtx)
12405 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12406 bool valid = m->fs.sp_valid;
12407 bool realigned = m->fs.sp_realigned;
12409 if (src == hard_frame_pointer_rtx)
12411 valid = m->fs.fp_valid;
12412 realigned = false;
12413 ooffset = m->fs.fp_offset;
12415 else if (src == crtl->drap_reg)
12417 valid = m->fs.drap_valid;
12418 realigned = false;
12419 ooffset = 0;
12421 else
12423 /* Else there are two possibilities: SP itself, which we set
12424 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12425 taken care of by hand along the eh_return path. */
12426 gcc_checking_assert (src == stack_pointer_rtx
12427 || offset == const0_rtx);
12430 m->fs.sp_offset = ooffset - INTVAL (offset);
12431 m->fs.sp_valid = valid;
12432 m->fs.sp_realigned = realigned;
12434 return insn;
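/* Typical use, as seen in the prologue code further below: allocating
   SIZE bytes is written as

	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				   GEN_INT (-size), -1,
				   m->fs.cfa_reg == stack_pointer_rtx);

   where a STYLE of -1 marks the adjustment as frame related and SET_CFA
   is true only while the CFA is still tracked via the stack pointer.  */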
12437 /* Find an available register to be used as dynamic realign argument
12438 pointer register. Such a register will be written in the prologue and
12439 used at the beginning of the body, so it must not be
12440 1. parameter passing register.
12441 2. GOT pointer.
12442 We reuse static-chain register if it is available. Otherwise, we
12443 use DI for i386 and R13 for x86-64. We chose R13 since it has
12444 shorter encoding.
12446 Return: the regno of chosen register. */
12448 static unsigned int
12449 find_drap_reg (void)
12451 tree decl = cfun->decl;
12453 /* Always use callee-saved register if there are no caller-saved
12454 registers. */
12455 if (TARGET_64BIT)
12457 /* Use R13 for a nested function or a function that needs a static
12458 chain. Since a function with a tail call may use any caller-saved
12459 register in the epilogue, DRAP must not use a caller-saved
12460 register in that case. */
12461 if (DECL_STATIC_CHAIN (decl)
12462 || cfun->machine->no_caller_saved_registers
12463 || crtl->tail_call_emit)
12464 return R13_REG;
12466 return R10_REG;
12468 else
12470 /* Use DI for a nested function or a function that needs a static
12471 chain. Since a function with a tail call may use any caller-saved
12472 register in the epilogue, DRAP must not use a caller-saved
12473 register in that case. */
12474 if (DECL_STATIC_CHAIN (decl)
12475 || cfun->machine->no_caller_saved_registers
12476 || crtl->tail_call_emit)
12477 return DI_REG;
12479 /* Reuse static chain register if it isn't used for parameter
12480 passing. */
12481 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12483 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12484 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12485 return CX_REG;
12487 return DI_REG;
12491 /* Handle a "force_align_arg_pointer" attribute. */
12493 static tree
12494 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12495 tree, int, bool *no_add_attrs)
12497 if (TREE_CODE (*node) != FUNCTION_TYPE
12498 && TREE_CODE (*node) != METHOD_TYPE
12499 && TREE_CODE (*node) != FIELD_DECL
12500 && TREE_CODE (*node) != TYPE_DECL)
12502 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12503 name);
12504 *no_add_attrs = true;
12507 return NULL_TREE;
12510 /* Return minimum incoming stack alignment. */
12512 static unsigned int
12513 ix86_minimum_incoming_stack_boundary (bool sibcall)
12515 unsigned int incoming_stack_boundary;
12517 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
12518 if (cfun->machine->func_type != TYPE_NORMAL)
12519 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12520 /* Prefer the one specified at command line. */
12521 else if (ix86_user_incoming_stack_boundary)
12522 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12523 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12524 when -mstackrealign is used, this isn't a sibcall check, and the
12525 estimated stack alignment is 128 bits. */
12526 else if (!sibcall
12527 && ix86_force_align_arg_pointer
12528 && crtl->stack_alignment_estimated == 128)
12529 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12530 else
12531 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12533 /* Incoming stack alignment can be changed on individual functions
12534 via force_align_arg_pointer attribute. We use the smallest
12535 incoming stack boundary. */
12536 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12537 && lookup_attribute (ix86_force_align_arg_pointer_string,
12538 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12539 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12541 /* The incoming stack frame has to be aligned at least at
12542 parm_stack_boundary. */
12543 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12544 incoming_stack_boundary = crtl->parm_stack_boundary;
12546 /* The stack at the entry of main is aligned by the runtime. We use the
12547 smallest incoming stack boundary. */
12548 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12549 && DECL_NAME (current_function_decl)
12550 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12551 && DECL_FILE_SCOPE_P (current_function_decl))
12552 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12554 return incoming_stack_boundary;
12557 /* Update incoming stack boundary and estimated stack alignment. */
12559 static void
12560 ix86_update_stack_boundary (void)
12562 ix86_incoming_stack_boundary
12563 = ix86_minimum_incoming_stack_boundary (false);
12565 /* x86-64 varargs need 16-byte stack alignment for the register save
12566 area. */
12567 if (TARGET_64BIT
12568 && cfun->stdarg
12569 && crtl->stack_alignment_estimated < 128)
12570 crtl->stack_alignment_estimated = 128;
12572 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12573 if (ix86_tls_descriptor_calls_expanded_in_cfun
12574 && crtl->preferred_stack_boundary < 128)
12575 crtl->preferred_stack_boundary = 128;
12578 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12579 needed or an rtx for DRAP otherwise. */
12581 static rtx
12582 ix86_get_drap_rtx (void)
12584 /* We must use DRAP if there are outgoing arguments on stack and
12585 ACCUMULATE_OUTGOING_ARGS is false. */
12586 if (ix86_force_drap
12587 || (cfun->machine->outgoing_args_on_stack
12588 && !ACCUMULATE_OUTGOING_ARGS))
12589 crtl->need_drap = true;
12591 if (stack_realign_drap)
12593 /* Assign DRAP to vDRAP and return vDRAP. */
12594 unsigned int regno = find_drap_reg ();
12595 rtx drap_vreg;
12596 rtx arg_ptr;
12597 rtx_insn *seq, *insn;
12599 arg_ptr = gen_rtx_REG (Pmode, regno);
12600 crtl->drap_reg = arg_ptr;
12602 start_sequence ();
12603 drap_vreg = copy_to_reg (arg_ptr);
12604 seq = get_insns ();
12605 end_sequence ();
12607 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12608 if (!optimize)
12610 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12611 RTX_FRAME_RELATED_P (insn) = 1;
12613 return drap_vreg;
12615 else
12616 return NULL;
12619 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12621 static rtx
12622 ix86_internal_arg_pointer (void)
12624 return virtual_incoming_args_rtx;
12627 struct scratch_reg {
12628 rtx reg;
12629 bool saved;
12632 /* Return a short-lived scratch register for use on function entry.
12633 In 32-bit mode, it is valid only after the registers are saved
12634 in the prologue. This register must be released by means of
12635 release_scratch_register_on_entry once it is dead. */
12637 static void
12638 get_scratch_register_on_entry (struct scratch_reg *sr)
12640 int regno;
12642 sr->saved = false;
12644 if (TARGET_64BIT)
12646 /* We always use R11 in 64-bit mode. */
12647 regno = R11_REG;
12649 else
12651 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12652 bool fastcall_p
12653 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12654 bool thiscall_p
12655 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12656 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12657 int regparm = ix86_function_regparm (fntype, decl);
12658 int drap_regno
12659 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12661 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12662 for the static chain register. */
12663 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12664 && drap_regno != AX_REG)
12665 regno = AX_REG;
12666 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12667 for the static chain register. */
12668 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12669 regno = AX_REG;
12670 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12671 regno = DX_REG;
12672 /* ecx is the static chain register. */
12673 else if (regparm < 3 && !fastcall_p && !thiscall_p
12674 && !static_chain_p
12675 && drap_regno != CX_REG)
12676 regno = CX_REG;
12677 else if (ix86_save_reg (BX_REG, true, false))
12678 regno = BX_REG;
12679 /* esi is the static chain register. */
12680 else if (!(regparm == 3 && static_chain_p)
12681 && ix86_save_reg (SI_REG, true, false))
12682 regno = SI_REG;
12683 else if (ix86_save_reg (DI_REG, true, false))
12684 regno = DI_REG;
12685 else
12687 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12688 sr->saved = true;
12692 sr->reg = gen_rtx_REG (Pmode, regno);
12693 if (sr->saved)
12695 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12696 RTX_FRAME_RELATED_P (insn) = 1;
12700 /* Release a scratch register obtained from the preceding function.
12702 If RELEASE_VIA_POP is true, we just pop the register off the stack
12703 to release it. This is what non-Linux systems use with -fstack-check.
12705 Otherwise we use OFFSET to locate the saved register and the
12706 allocated stack space becomes part of the local frame and is
12707 deallocated by the epilogue. */
12709 static void
12710 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12711 bool release_via_pop)
12713 if (sr->saved)
12715 if (release_via_pop)
12717 struct machine_function *m = cfun->machine;
12718 rtx x, insn = emit_insn (gen_pop (sr->reg));
12720 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12721 RTX_FRAME_RELATED_P (insn) = 1;
12722 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12723 x = gen_rtx_SET (stack_pointer_rtx, x);
12724 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12725 m->fs.sp_offset -= UNITS_PER_WORD;
12727 else
12729 rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12730 x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12731 emit_insn (x);
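/* The two routines above are used as a pair; the probing code below
   follows roughly this pattern:

	struct scratch_reg sr;
	get_scratch_register_on_entry (&sr);
	... use sr.reg as a temporary ...
	release_scratch_register_on_entry (&sr, size, false);

   with SIZE locating the saved value when a live register had to be
   pushed to free one up.  */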
12736 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12738 This differs from the next routine in that it tries hard to prevent
12739 attacks that jump the stack guard. Thus it is never allowed to allocate
12740 more than PROBE_INTERVAL bytes of stack space without a suitable
12741 probe.
12743 INT_REGISTERS_SAVED is true if integer registers have already been
12744 pushed on the stack. */
12746 static void
12747 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12748 const bool int_registers_saved)
12750 struct machine_function *m = cfun->machine;
12752 /* If this function does not statically allocate stack space, then
12753 no probes are needed. */
12754 if (!size)
12756 /* However, the allocation of space via pushes for register
12757 saves could be viewed as allocating space, but without the
12758 need to probe. */
12759 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12760 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12761 else
12762 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12763 return;
12766 /* If we are a noreturn function, then we have to consider the
12767 possibility that we're called via a jump rather than a call.
12769 Thus we don't have the implicit probe generated by saving the
12770 return address into the stack at the call. Thus, the stack
12771 pointer could be anywhere in the guard page. The safe thing
12772 to do is emit a probe now.
12774 The probe can be avoided if we have already emitted any callee
12775 register saves into the stack or have a frame pointer (which will
12776 have been saved as well). Those saves will function as implicit
12777 probes.
12779 ?!? This should be revamped to work like aarch64 and s390 where
12780 we track the offset from the most recent probe. Normally that
12781 offset would be zero. For a noreturn function we would reset
12782 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12783 we just probe when we cross PROBE_INTERVAL. */
12784 if (TREE_THIS_VOLATILE (cfun->decl)
12785 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12787 /* We can safely use any register here since we're just going to push
12788 its value and immediately pop it back. But we do try and avoid
12789 argument passing registers so as not to introduce dependencies in
12790 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12791 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12792 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12793 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12794 m->fs.sp_offset -= UNITS_PER_WORD;
12795 if (m->fs.cfa_reg == stack_pointer_rtx)
12797 m->fs.cfa_offset -= UNITS_PER_WORD;
12798 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12799 x = gen_rtx_SET (stack_pointer_rtx, x);
12800 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12801 RTX_FRAME_RELATED_P (insn_push) = 1;
12802 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12803 x = gen_rtx_SET (stack_pointer_rtx, x);
12804 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12805 RTX_FRAME_RELATED_P (insn_pop) = 1;
12807 emit_insn (gen_blockage ());
12810 /* If we allocate less than the size of the guard statically,
12811 then no probing is necessary, but we do need to allocate
12812 the stack. */
12813 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12815 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12816 GEN_INT (-size), -1,
12817 m->fs.cfa_reg == stack_pointer_rtx);
12818 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12819 return;
12822 /* We're allocating a large enough stack frame that we need to
12823 emit probes. Either emit them inline or in a loop depending
12824 on the size. */
12825 HOST_WIDE_INT probe_interval = get_probe_interval ();
12826 if (size <= 4 * probe_interval)
12828 HOST_WIDE_INT i;
12829 for (i = probe_interval; i <= size; i += probe_interval)
12831 /* Allocate PROBE_INTERVAL bytes. */
12832 rtx insn
12833 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12834 GEN_INT (-probe_interval), -1,
12835 m->fs.cfa_reg == stack_pointer_rtx);
12836 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12838 /* And probe at *sp. */
12839 emit_stack_probe (stack_pointer_rtx);
12840 emit_insn (gen_blockage ());
12843 /* We need to allocate space for the residual, but we do not need
12844 to probe the residual. */
12845 HOST_WIDE_INT residual = (i - probe_interval - size);
12846 if (residual)
12847 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12848 GEN_INT (residual), -1,
12849 m->fs.cfa_reg == stack_pointer_rtx);
12850 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12852 else
12854 /* We expect the GP registers to be saved when probes are used
12855 as the probing sequences might need a scratch register and
12856 the routine to allocate one assumes the integer registers
12857 have already been saved. */
12858 gcc_assert (int_registers_saved);
12860 struct scratch_reg sr;
12861 get_scratch_register_on_entry (&sr);
12863 /* If we needed to save a register, then account for any space
12864 that was pushed (we are not going to pop the register when
12865 we do the restore). */
12866 if (sr.saved)
12867 size -= UNITS_PER_WORD;
12869 /* Step 1: round SIZE down to a multiple of the interval. */
12870 HOST_WIDE_INT rounded_size = size & -probe_interval;
12872 /* Step 2: compute final value of the loop counter. Use lea if
12873 possible. */
12874 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12875 rtx insn;
12876 if (address_no_seg_operand (addr, Pmode))
12877 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12878 else
12880 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12881 insn = emit_insn (gen_rtx_SET (sr.reg,
12882 gen_rtx_PLUS (Pmode, sr.reg,
12883 stack_pointer_rtx)));
12885 if (m->fs.cfa_reg == stack_pointer_rtx)
12887 add_reg_note (insn, REG_CFA_DEF_CFA,
12888 plus_constant (Pmode, sr.reg,
12889 m->fs.cfa_offset + rounded_size));
12890 RTX_FRAME_RELATED_P (insn) = 1;
12893 /* Step 3: the loop. */
12894 rtx size_rtx = GEN_INT (rounded_size);
12895 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12896 size_rtx));
12897 if (m->fs.cfa_reg == stack_pointer_rtx)
12899 m->fs.cfa_offset += rounded_size;
12900 add_reg_note (insn, REG_CFA_DEF_CFA,
12901 plus_constant (Pmode, stack_pointer_rtx,
12902 m->fs.cfa_offset));
12903 RTX_FRAME_RELATED_P (insn) = 1;
12905 m->fs.sp_offset += rounded_size;
12906 emit_insn (gen_blockage ());
12908 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12909 is equal to ROUNDED_SIZE. */
12911 if (size != rounded_size)
12912 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12913 GEN_INT (rounded_size - size), -1,
12914 m->fs.cfa_reg == stack_pointer_rtx);
12915 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12917 /* This does not deallocate the space reserved for the scratch
12918 register. That will be deallocated in the epilogue. */
12919 release_scratch_register_on_entry (&sr, size, false);
12922 /* Make sure nothing is scheduled before we are done. */
12923 emit_insn (gen_blockage ());
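/* As an illustration only: with a 4096-byte probe interval, a frame a
   little over two intervals large takes the "inline" path above and
   expands to roughly

	sub	$4096, %rsp
	or	$0, (%rsp)		# probe
	sub	$4096, %rsp
	or	$0, (%rsp)		# probe
	sub	$residual, %rsp		# remainder, not probed

   while larger frames go through the scratch-register probing loop.  */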
12926 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12928 INT_REGISTERS_SAVED is true if integer registers have already been
12929 pushed on the stack. */
12931 static void
12932 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12933 const bool int_registers_saved)
12935 /* We skip the probe for the first interval + a small dope of 4 words and
12936 probe that many bytes past the specified size to maintain a protection
12937 area at the bottom of the stack. */
12938 const int dope = 4 * UNITS_PER_WORD;
12939 rtx size_rtx = GEN_INT (size), last;
12941 /* See if we have a constant small number of probes to generate. If so,
12942 that's the easy case. The run-time loop is made up of 9 insns in the
12943 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12944 for n # of intervals. */
12945 if (size <= 4 * get_probe_interval ())
12947 HOST_WIDE_INT i, adjust;
12948 bool first_probe = true;
12950 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12951 values of N from 1 until it exceeds SIZE. If only one probe is
12952 needed, this will not generate any code. Then adjust and probe
12953 to PROBE_INTERVAL + SIZE. */
12954 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12956 if (first_probe)
12958 adjust = 2 * get_probe_interval () + dope;
12959 first_probe = false;
12961 else
12962 adjust = get_probe_interval ();
12964 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12965 plus_constant (Pmode, stack_pointer_rtx,
12966 -adjust)));
12967 emit_stack_probe (stack_pointer_rtx);
12970 if (first_probe)
12971 adjust = size + get_probe_interval () + dope;
12972 else
12973 adjust = size + get_probe_interval () - i;
12975 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12976 plus_constant (Pmode, stack_pointer_rtx,
12977 -adjust)));
12978 emit_stack_probe (stack_pointer_rtx);
12980 /* Adjust back to account for the additional first interval. */
12981 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12982 plus_constant (Pmode, stack_pointer_rtx,
12983 (get_probe_interval ()
12984 + dope))));
12987 /* Otherwise, do the same as above, but in a loop. Note that we must be
12988 extra careful with variables wrapping around because we might be at
12989 the very top (or the very bottom) of the address space and we have
12990 to be able to handle this case properly; in particular, we use an
12991 equality test for the loop condition. */
12992 else
12994 /* We expect the GP registers to be saved when probes are used
12995 as the probing sequences might need a scratch register and
12996 the routine to allocate one assumes the integer registers
12997 have already been saved. */
12998 gcc_assert (int_registers_saved);
13000 HOST_WIDE_INT rounded_size;
13001 struct scratch_reg sr;
13003 get_scratch_register_on_entry (&sr);
13005 /* If we needed to save a register, then account for any space
13006 that was pushed (we are not going to pop the register when
13007 we do the restore). */
13008 if (sr.saved)
13009 size -= UNITS_PER_WORD;
13011 /* Step 1: round SIZE to the previous multiple of the interval. */
13013 rounded_size = ROUND_DOWN (size, get_probe_interval ());
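   /* E.g. with a 0x1000-byte interval, a SIZE of 0x4800 gives a ROUNDED_SIZE
      of 0x4000; the 0x800 remainder is handled in step 4 below.  */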
13016 /* Step 2: compute initial and final value of the loop counter. */
13018 /* SP = SP_0 + PROBE_INTERVAL. */
13019 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13020 plus_constant (Pmode, stack_pointer_rtx,
13021 - (get_probe_interval () + dope))));
13023 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
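   /* If ROUNDED_SIZE does not fit in a sign-extended 32-bit immediate,
      materialize it in the scratch register instead of folding it into
      the address.  */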
13024 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13025 emit_insn (gen_rtx_SET (sr.reg,
13026 plus_constant (Pmode, stack_pointer_rtx,
13027 -rounded_size)));
13028 else
13030 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13031 emit_insn (gen_rtx_SET (sr.reg,
13032 gen_rtx_PLUS (Pmode, sr.reg,
13033 stack_pointer_rtx)));
13037 /* Step 3: the loop
13041 SP = SP + PROBE_INTERVAL
13042 probe at SP
13044 while (SP != LAST_ADDR)
13046 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13047 values of N from 1 until it is equal to ROUNDED_SIZE. */
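   /* The loop body itself (sub/or/cmp/jne) is emitted at output time by
      output_adjust_stack_and_probe below.  */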
13049 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13052 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13053 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13055 if (size != rounded_size)
13057 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13058 plus_constant (Pmode, stack_pointer_rtx,
13059 rounded_size - size)));
13060 emit_stack_probe (stack_pointer_rtx);
13063 /* Adjust back to account for the additional first interval. */
13064 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13065 plus_constant (Pmode, stack_pointer_rtx,
13066 (get_probe_interval ()
13067 + dope))));
13069 /* This does not deallocate the space reserved for the scratch
13070 register. That will be deallocated in the epilogue. */
13071 release_scratch_register_on_entry (&sr, size, false);
13074 /* Even if the stack pointer isn't the CFA register, we need to correctly
13075 describe the adjustments made to it, in particular differentiate the
13076 frame-related ones from the frame-unrelated ones. */
13077 if (size > 0)
13079 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13080 XVECEXP (expr, 0, 0)
13081 = gen_rtx_SET (stack_pointer_rtx,
13082 plus_constant (Pmode, stack_pointer_rtx, -size));
13083 XVECEXP (expr, 0, 1)
13084 = gen_rtx_SET (stack_pointer_rtx,
13085 plus_constant (Pmode, stack_pointer_rtx,
13086 get_probe_interval () + dope + size));
13087 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13088 RTX_FRAME_RELATED_P (last) = 1;
13090 cfun->machine->fs.sp_offset += size;
13093 /* Make sure nothing is scheduled before we are done. */
13094 emit_insn (gen_blockage ());
13097 /* Adjust the stack pointer up to REG while probing it. */
13099 const char *
13100 output_adjust_stack_and_probe (rtx reg)
13102 static int labelno = 0;
13103 char loop_lab[32];
13104 rtx xops[2];
13106 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13108 /* Loop. */
13109 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13111 /* SP = SP + PROBE_INTERVAL. */
13112 xops[0] = stack_pointer_rtx;
13113 xops[1] = GEN_INT (get_probe_interval ());
13114 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13116 /* Probe at SP. */
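   /* ORing zero into the word at SP is a read-modify-write, so it touches
      the stack page without changing its contents.  */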
13117 xops[1] = const0_rtx;
13118 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13120 /* Test if SP == LAST_ADDR. */
13121 xops[0] = stack_pointer_rtx;
13122 xops[1] = reg;
13123 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13125 /* Branch. */
13126 fputs ("\tjne\t", asm_out_file);
13127 assemble_name_raw (asm_out_file, loop_lab);
13128 fputc ('\n', asm_out_file);
13130 return "";
13133 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13134 inclusive. These are offsets from the current stack pointer.
13136 INT_REGISTERS_SAVED is true if integer registers have already been
13137 pushed on the stack. */
13139 static void
13140 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
13141 const bool int_registers_saved)
13143 /* See if we have a constant small number of probes to generate. If so,
13144 that's the easy case. The run-time loop is made up of 6 insns in the
13145 generic case while the compile-time loop is made up of n insns for n #
13146 of intervals. */
13147 if (size <= 6 * get_probe_interval ())
13149 HOST_WIDE_INT i;
13151 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13152 it exceeds SIZE. If only one probe is needed, this will not
13153 generate any code. Then probe at FIRST + SIZE. */
13154 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
13155 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13156 -(first + i)));
13158 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13159 -(first + size)));
13162 /* Otherwise, do the same as above, but in a loop. Note that we must be
13163 extra careful with variables wrapping around because we might be at
13164 the very top (or the very bottom) of the address space and we have
13165 to be able to handle this case properly; in particular, we use an
13166 equality test for the loop condition. */
13167 else
13169 /* We expect the GP registers to be saved when probes are used
13170 as the probing sequences might need a scratch register and
13171 the routine to allocate one assumes the integer registers
13172 have already been saved. */
13173 gcc_assert (int_registers_saved);
13175 HOST_WIDE_INT rounded_size, last;
13176 struct scratch_reg sr;
13178 get_scratch_register_on_entry (&sr);
13181 /* Step 1: round SIZE to the previous multiple of the interval. */
13183 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13186 /* Step 2: compute initial and final value of the loop counter. */
13188 /* TEST_OFFSET = FIRST. */
13189 emit_move_insn (sr.reg, GEN_INT (-first));
13191 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13192 last = first + rounded_size;
13195 /* Step 3: the loop
13199 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13200 probe at TEST_ADDR
13202 while (TEST_ADDR != LAST_ADDR)
13204 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13205 until it is equal to ROUNDED_SIZE. */
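   /* The loop body (sub/or/cmp/jne) is emitted at output time by
      output_probe_stack_range below.  */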
13207 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13210 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13211 that SIZE is equal to ROUNDED_SIZE. */
13213 if (size != rounded_size)
13214 emit_stack_probe (plus_constant (Pmode,
13215 gen_rtx_PLUS (Pmode,
13216 stack_pointer_rtx,
13217 sr.reg),
13218 rounded_size - size));
13220 release_scratch_register_on_entry (&sr, size, true);
13223 /* Make sure nothing is scheduled before we are done. */
13224 emit_insn (gen_blockage ());
13227 /* Probe a range of stack addresses from REG to END, inclusive. These are
13228 offsets from the current stack pointer. */
13230 const char *
13231 output_probe_stack_range (rtx reg, rtx end)
13233 static int labelno = 0;
13234 char loop_lab[32];
13235 rtx xops[3];
13237 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13239 /* Loop. */
13240 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13242 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13243 xops[0] = reg;
13244 xops[1] = GEN_INT (get_probe_interval ());
13245 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13247 /* Probe at TEST_ADDR. */
13248 xops[0] = stack_pointer_rtx;
13249 xops[1] = reg;
13250 xops[2] = const0_rtx;
13251 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13253 /* Test if TEST_ADDR == LAST_ADDR. */
13254 xops[0] = reg;
13255 xops[1] = end;
13256 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13258 /* Branch. */
13259 fputs ("\tjne\t", asm_out_file);
13260 assemble_name_raw (asm_out_file, loop_lab);
13261 fputc ('\n', asm_out_file);
13263 return "";
13266 /* Return true if stack frame is required. Update STACK_ALIGNMENT
13267 to the largest alignment, in bits, of stack slot used if stack
13268 frame is required and CHECK_STACK_SLOT is true. */
13270 static bool
13271 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13272 bool check_stack_slot)
13274 HARD_REG_SET set_up_by_prologue, prologue_used;
13275 basic_block bb;
13277 CLEAR_HARD_REG_SET (prologue_used);
13278 CLEAR_HARD_REG_SET (set_up_by_prologue);
13279 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13280 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13281 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13282 HARD_FRAME_POINTER_REGNUM);
13284 /* The preferred stack alignment is the minimum stack alignment. */
13285 if (stack_alignment > crtl->preferred_stack_boundary)
13286 stack_alignment = crtl->preferred_stack_boundary;
13288 bool require_stack_frame = false;
13290 FOR_EACH_BB_FN (bb, cfun)
13292 rtx_insn *insn;
13293 FOR_BB_INSNS (bb, insn)
13294 if (NONDEBUG_INSN_P (insn)
13295 && requires_stack_frame_p (insn, prologue_used,
13296 set_up_by_prologue))
13298 require_stack_frame = true;
13300 if (check_stack_slot)
13302 /* Find the maximum stack alignment. */
13303 subrtx_iterator::array_type array;
13304 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13305 if (MEM_P (*iter)
13306 && (reg_mentioned_p (stack_pointer_rtx,
13307 *iter)
13308 || reg_mentioned_p (frame_pointer_rtx,
13309 *iter)))
13311 unsigned int alignment = MEM_ALIGN (*iter);
13312 if (alignment > stack_alignment)
13313 stack_alignment = alignment;
13319 return require_stack_frame;
13322 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13323 will guide prologue/epilogue to be generated in correct form. */
13325 static void
13326 ix86_finalize_stack_frame_flags (void)
13328 /* Check if stack realignment is really needed after reload, and
13329 store the result in cfun. */
13330 unsigned int incoming_stack_boundary
13331 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13332 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13333 unsigned int stack_alignment
13334 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13335 ? crtl->max_used_stack_slot_alignment
13336 : crtl->stack_alignment_needed);
13337 unsigned int stack_realign
13338 = (incoming_stack_boundary < stack_alignment);
13339 bool recompute_frame_layout_p = false;
13341 if (crtl->stack_realign_finalized)
13343 /* After stack_realign_needed is finalized, we can no longer
13344 change it. */
13345 gcc_assert (crtl->stack_realign_needed == stack_realign);
13346 return;
13349 /* If the only reason for frame_pointer_needed is that we conservatively
13350 assumed stack realignment might be needed or -fno-omit-frame-pointer
13351 is used, but in the end nothing that needed the stack alignment had
13352 been spilled nor stack access, clear frame_pointer_needed and say we
13353 don't need stack realignment. */
13354 if ((stack_realign || !flag_omit_frame_pointer)
13355 && frame_pointer_needed
13356 && crtl->is_leaf
13357 && crtl->sp_is_unchanging
13358 && !ix86_current_function_calls_tls_descriptor
13359 && !crtl->accesses_prior_frames
13360 && !cfun->calls_alloca
13361 && !crtl->calls_eh_return
13362 /* See ira_setup_eliminable_regset for the rationale. */
13363 && !(STACK_CHECK_MOVING_SP
13364 && flag_stack_check
13365 && flag_exceptions
13366 && cfun->can_throw_non_call_exceptions)
13367 && !ix86_frame_pointer_required ()
13368 && get_frame_size () == 0
13369 && ix86_nsaved_sseregs () == 0
13370 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13372 if (ix86_find_max_used_stack_alignment (stack_alignment,
13373 stack_realign))
13375 /* Stack frame is required. If stack alignment needed is less
13376 than incoming stack boundary, don't realign stack. */
13377 stack_realign = incoming_stack_boundary < stack_alignment;
13378 if (!stack_realign)
13380 crtl->max_used_stack_slot_alignment
13381 = incoming_stack_boundary;
13382 crtl->stack_alignment_needed
13383 = incoming_stack_boundary;
13384 /* Also update preferred_stack_boundary for leaf
13385 functions. */
13386 crtl->preferred_stack_boundary
13387 = incoming_stack_boundary;
13390 else
13392 /* If drap has been set, but it actually isn't live at the
13393 start of the function, there is no reason to set it up. */
13394 if (crtl->drap_reg)
13396 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13397 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13398 REGNO (crtl->drap_reg)))
13400 crtl->drap_reg = NULL_RTX;
13401 crtl->need_drap = false;
13404 else
13405 cfun->machine->no_drap_save_restore = true;
13407 frame_pointer_needed = false;
13408 stack_realign = false;
13409 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13410 crtl->stack_alignment_needed = incoming_stack_boundary;
13411 crtl->stack_alignment_estimated = incoming_stack_boundary;
13412 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13413 crtl->preferred_stack_boundary = incoming_stack_boundary;
13414 df_finish_pass (true);
13415 df_scan_alloc (NULL);
13416 df_scan_blocks ();
13417 df_compute_regs_ever_live (true);
13418 df_analyze ();
13420 if (flag_var_tracking)
13422 /* Since frame pointer is no longer available, replace it with
13423 stack pointer - UNITS_PER_WORD in debug insns. */
13424 df_ref ref, next;
13425 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13426 ref; ref = next)
13428 next = DF_REF_NEXT_REG (ref);
13429 if (!DF_REF_INSN_INFO (ref))
13430 continue;
13432 /* Make sure the next ref is for a different instruction,
13433 so that we're not affected by the rescan. */
13434 rtx_insn *insn = DF_REF_INSN (ref);
13435 while (next && DF_REF_INSN (next) == insn)
13436 next = DF_REF_NEXT_REG (next);
13438 if (DEBUG_INSN_P (insn))
13440 bool changed = false;
13441 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13443 rtx *loc = DF_REF_LOC (ref);
13444 if (*loc == hard_frame_pointer_rtx)
13446 *loc = plus_constant (Pmode,
13447 stack_pointer_rtx,
13448 -UNITS_PER_WORD);
13449 changed = true;
13452 if (changed)
13453 df_insn_rescan (insn);
13458 recompute_frame_layout_p = true;
13461 else if (crtl->max_used_stack_slot_alignment
13462 > crtl->preferred_stack_boundary)
13464 /* We don't need to realign stack. But we still need to keep
13465 stack frame properly aligned to satisfy the largest alignment
13466 of stack slots. */
13467 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13468 cfun->machine->max_used_stack_alignment
13469 = stack_alignment / BITS_PER_UNIT;
13472 if (crtl->stack_realign_needed != stack_realign)
13473 recompute_frame_layout_p = true;
13474 crtl->stack_realign_needed = stack_realign;
13475 crtl->stack_realign_finalized = true;
13476 if (recompute_frame_layout_p)
13477 ix86_compute_frame_layout ();
13480 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13482 static void
13483 ix86_elim_entry_set_got (rtx reg)
13485 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13486 rtx_insn *c_insn = BB_HEAD (bb);
13487 if (!NONDEBUG_INSN_P (c_insn))
13488 c_insn = next_nonnote_nondebug_insn (c_insn);
13489 if (c_insn && NONJUMP_INSN_P (c_insn))
13491 rtx pat = PATTERN (c_insn);
13492 if (GET_CODE (pat) == PARALLEL)
13494 rtx vec = XVECEXP (pat, 0, 0);
13495 if (GET_CODE (vec) == SET
13496 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13497 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13498 delete_insn (c_insn);
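/* Build a SET that stores REG to, or loads it from, the frame slot at
   FRAME_REG + OFFSET.  Helper for the gen_frame_load/gen_frame_store
   wrappers used by the ms2sysv out-of-line save/restore stubs.  */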
13503 static rtx
13504 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13506 rtx addr, mem;
13508 if (offset)
13509 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13510 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13511 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13514 static inline rtx
13515 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13517 return gen_frame_set (reg, frame_reg, offset, false);
13520 static inline rtx
13521 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13523 return gen_frame_set (reg, frame_reg, offset, true);
13526 static void
13527 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13529 struct machine_function *m = cfun->machine;
13530 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13531 + m->call_ms2sysv_extra_regs;
13532 rtvec v = rtvec_alloc (ncregs + 1);
13533 unsigned int align, i, vi = 0;
13534 rtx_insn *insn;
13535 rtx sym, addr;
13536 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13537 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13539 /* AL should only be live with sysv_abi. */
13540 gcc_assert (!ix86_eax_live_at_start_p ());
13541 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13543 /* Setup RAX as the stub's base pointer. We use stack_realign_offset
13544 regardless of whether we've actually realigned the stack or not. */
13545 align = GET_MODE_ALIGNMENT (V4SFmode);
13546 addr = choose_baseaddr (frame.stack_realign_offset
13547 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13548 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13550 emit_insn (gen_rtx_SET (rax, addr));
13552 /* Get the stub symbol. */
13553 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13554 : XLOGUE_STUB_SAVE);
13555 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13557 for (i = 0; i < ncregs; ++i)
13559 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13560 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13561 r.regno);
13562 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13565 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13567 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13568 RTX_FRAME_RELATED_P (insn) = true;
13571 /* Expand the prologue into a bunch of separate insns. */
13573 void
13574 ix86_expand_prologue (void)
13576 struct machine_function *m = cfun->machine;
13577 rtx insn, t;
13578 HOST_WIDE_INT allocate;
13579 bool int_registers_saved;
13580 bool sse_registers_saved;
13581 bool save_stub_call_needed;
13582 rtx static_chain = NULL_RTX;
13584 if (ix86_function_naked (current_function_decl))
13585 return;
13587 ix86_finalize_stack_frame_flags ();
13589 /* DRAP should not coexist with stack_realign_fp */
13590 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13592 memset (&m->fs, 0, sizeof (m->fs));
13594 /* Initialize CFA state for before the prologue. */
13595 m->fs.cfa_reg = stack_pointer_rtx;
13596 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13598 /* Track SP offset to the CFA. We continue tracking this after we've
13599 swapped the CFA register away from SP. In the case of re-alignment
13600 this is fudged; we're interested in offsets within the local frame. */
13601 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13602 m->fs.sp_valid = true;
13603 m->fs.sp_realigned = false;
13605 const struct ix86_frame &frame = cfun->machine->frame;
13607 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13609 /* We should have already generated an error for any use of
13610 ms_hook on a nested function. */
13611 gcc_checking_assert (!ix86_static_chain_on_stack);
13613 /* Check if profiling is active and we shall use the profiling
13614 before prologue variant. If so, sorry. */
13615 if (crtl->profile && flag_fentry != 0)
13616 sorry ("ms_hook_prologue attribute isn%'t compatible "
13617 "with -mfentry for 32-bit");
13619 /* In ix86_asm_output_function_label we emitted:
13620 8b ff movl.s %edi,%edi
13621 55 push %ebp
13622 8b ec movl.s %esp,%ebp
13624 This matches the hookable function prologue in Win32 API
13625 functions in Microsoft Windows XP Service Pack 2 and newer.
13626 Wine uses this to enable Windows apps to hook the Win32 API
13627 functions provided by Wine.
13629 What that means is that we've already set up the frame pointer. */
13631 if (frame_pointer_needed
13632 && !(crtl->drap_reg && crtl->stack_realign_needed))
13634 rtx push, mov;
13636 /* We've decided to use the frame pointer already set up.
13637 Describe this to the unwinder by pretending that both
13638 push and mov insns happen right here.
13640 Putting the unwind info here at the end of the ms_hook
13641 is done so that we can make absolutely certain we get
13642 the required byte sequence at the start of the function,
13643 rather than relying on an assembler that can produce
13644 the exact encoding required.
13646 However it does mean (in the unpatched case) that we have
13647 a 1 insn window where the asynchronous unwind info is
13648 incorrect. However, if we placed the unwind info at
13649 its correct location we would have incorrect unwind info
13650 in the patched case. Which is probably all moot since
13651 I don't expect Wine generates dwarf2 unwind info for the
13652 system libraries that use this feature. */
13654 insn = emit_insn (gen_blockage ());
13656 push = gen_push (hard_frame_pointer_rtx);
13657 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13658 stack_pointer_rtx);
13659 RTX_FRAME_RELATED_P (push) = 1;
13660 RTX_FRAME_RELATED_P (mov) = 1;
13662 RTX_FRAME_RELATED_P (insn) = 1;
13663 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13664 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13666 /* Note that gen_push incremented m->fs.cfa_offset, even
13667 though we didn't emit the push insn here. */
13668 m->fs.cfa_reg = hard_frame_pointer_rtx;
13669 m->fs.fp_offset = m->fs.cfa_offset;
13670 m->fs.fp_valid = true;
13672 else
13674 /* The frame pointer is not needed so pop %ebp again.
13675 This leaves us with a pristine state. */
13676 emit_insn (gen_pop (hard_frame_pointer_rtx));
13680 /* The first insn of a function that accepts its static chain on the
13681 stack is to push the register that would be filled in by a direct
13682 call. This insn will be skipped by the trampoline. */
13683 else if (ix86_static_chain_on_stack)
13685 static_chain = ix86_static_chain (cfun->decl, false);
13686 insn = emit_insn (gen_push (static_chain));
13687 emit_insn (gen_blockage ());
13689 /* We don't want to interpret this push insn as a register save,
13690 only as a stack adjustment. The real copy of the register as
13691 a save will be done later, if needed. */
13692 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13693 t = gen_rtx_SET (stack_pointer_rtx, t);
13694 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13695 RTX_FRAME_RELATED_P (insn) = 1;
13698 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
13699 DRAP is needed and stack realignment is really needed after reload. */
13700 if (stack_realign_drap)
13702 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13704 /* Can't use DRAP in interrupt function. */
13705 if (cfun->machine->func_type != TYPE_NORMAL)
13706 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13707 "in interrupt service routine. This may be worked "
13708 "around by avoiding functions with aggregate return.");
13710 /* Only need to push parameter pointer reg if it is caller saved. */
13711 if (!call_used_regs[REGNO (crtl->drap_reg)])
13713 /* Push arg pointer reg */
13714 insn = emit_insn (gen_push (crtl->drap_reg));
13715 RTX_FRAME_RELATED_P (insn) = 1;
13718 /* Grab the argument pointer. */
13719 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13720 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13721 RTX_FRAME_RELATED_P (insn) = 1;
13722 m->fs.cfa_reg = crtl->drap_reg;
13723 m->fs.cfa_offset = 0;
13725 /* Align the stack. */
13726 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13727 stack_pointer_rtx,
13728 GEN_INT (-align_bytes)));
13729 RTX_FRAME_RELATED_P (insn) = 1;
13731 /* Replicate the return address on the stack so that return
13732 address can be reached via (argp - 1) slot. This is needed
13733 to implement macro RETURN_ADDR_RTX and intrinsic function
13734 expand_builtin_return_addr etc. */
13735 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13736 t = gen_frame_mem (word_mode, t);
13737 insn = emit_insn (gen_push (t));
13738 RTX_FRAME_RELATED_P (insn) = 1;
13740 /* For the purposes of frame and register save area addressing,
13741 we've started over with a new frame. */
13742 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13743 m->fs.realigned = true;
13745 if (static_chain)
13747 /* Replicate static chain on the stack so that static chain
13748 can be reached via (argp - 2) slot. This is needed for
13749 nested function with stack realignment. */
13750 insn = emit_insn (gen_push (static_chain));
13751 RTX_FRAME_RELATED_P (insn) = 1;
13755 int_registers_saved = (frame.nregs == 0);
13756 sse_registers_saved = (frame.nsseregs == 0);
13757 save_stub_call_needed = (m->call_ms2sysv);
13758 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13760 if (frame_pointer_needed && !m->fs.fp_valid)
13762 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13763 slower on all targets. Also sdb didn't like it. */
13764 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13765 RTX_FRAME_RELATED_P (insn) = 1;
13767 /* Push registers now, before setting the frame pointer
13768 on SEH target. */
13769 if (!int_registers_saved
13770 && TARGET_SEH
13771 && !frame.save_regs_using_mov)
13773 ix86_emit_save_regs ();
13774 int_registers_saved = true;
13775 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13778 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13780 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13781 RTX_FRAME_RELATED_P (insn) = 1;
13783 if (m->fs.cfa_reg == stack_pointer_rtx)
13784 m->fs.cfa_reg = hard_frame_pointer_rtx;
13785 m->fs.fp_offset = m->fs.sp_offset;
13786 m->fs.fp_valid = true;
13790 if (!int_registers_saved)
13792 /* If saving registers via PUSH, do so now. */
13793 if (!frame.save_regs_using_mov)
13795 ix86_emit_save_regs ();
13796 int_registers_saved = true;
13797 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13800 /* When using the red zone we may start register saving before allocating
13801 the stack frame, saving one cycle of the prologue. However, avoid
13802 doing this if we have to probe the stack; at least on x86_64 the
13803 stack probe can turn into a call that clobbers a red zone location. */
13804 else if (ix86_using_red_zone ()
13805 && (! TARGET_STACK_PROBE
13806 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13808 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13809 int_registers_saved = true;
13813 if (stack_realign_fp)
13815 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13816 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13818 /* Record last valid frame pointer offset. */
13819 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13821 /* The computation of the size of the re-aligned stack frame means
13822 that we must allocate the size of the register save area before
13823 performing the actual alignment. Otherwise we cannot guarantee
13824 that there's enough storage above the realignment point. */
13825 allocate = frame.reg_save_offset - m->fs.sp_offset
13826 + frame.stack_realign_allocate;
13827 if (allocate)
13828 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13829 GEN_INT (-allocate), -1, false);
13831 /* Align the stack. */
13832 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13833 stack_pointer_rtx,
13834 GEN_INT (-align_bytes)));
13835 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13836 m->fs.sp_realigned_offset = m->fs.sp_offset
13837 - frame.stack_realign_allocate;
13838 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13839 Beyond this point, stack access should be done via choose_baseaddr or
13840 by using sp_valid_at and fp_valid_at to determine the correct base
13841 register. Henceforth, any CFA offset should be thought of as logical
13842 and not physical. */
13843 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13844 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13845 m->fs.sp_realigned = true;
13847 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13848 is needed to describe where a register is saved using a realigned
13849 stack pointer, so we need to invalidate the stack pointer for that
13850 target. */
13851 if (TARGET_SEH)
13852 m->fs.sp_valid = false;
13854 /* If SP offset is non-immediate after allocation of the stack frame,
13855 then emit SSE saves or stub call prior to allocating the rest of the
13856 stack frame. This is less efficient for the out-of-line stub because
13857 we can't combine allocations across the call barrier, but it's better
13858 than using a scratch register. */
13859 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13860 - m->fs.sp_realigned_offset),
13861 Pmode))
13863 if (!sse_registers_saved)
13865 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13866 sse_registers_saved = true;
13868 else if (save_stub_call_needed)
13870 ix86_emit_outlined_ms2sysv_save (frame);
13871 save_stub_call_needed = false;
13876 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13878 if (flag_stack_usage_info)
13880 /* We start to count from ARG_POINTER. */
13881 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13883 /* If it was realigned, take into account the fake frame. */
13884 if (stack_realign_drap)
13886 if (ix86_static_chain_on_stack)
13887 stack_size += UNITS_PER_WORD;
13889 if (!call_used_regs[REGNO (crtl->drap_reg)])
13890 stack_size += UNITS_PER_WORD;
13892 /* This over-estimates by 1 minimal-stack-alignment-unit but
13893 mitigates that by counting in the new return address slot. */
13894 current_function_dynamic_stack_size
13895 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13898 current_function_static_stack_size = stack_size;
13901 /* On SEH target with very large frame size, allocate an area to save
13902 SSE registers (as the very large allocation won't be described). */
13903 if (TARGET_SEH
13904 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13905 && !sse_registers_saved)
13907 HOST_WIDE_INT sse_size =
13908 frame.sse_reg_save_offset - frame.reg_save_offset;
13910 gcc_assert (int_registers_saved);
13912 /* No need to do stack checking as the area will be immediately
13913 written. */
13914 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13915 GEN_INT (-sse_size), -1,
13916 m->fs.cfa_reg == stack_pointer_rtx);
13917 allocate -= sse_size;
13918 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13919 sse_registers_saved = true;
13922 /* The stack has already been decremented by the instruction calling us
13923 so probe if the size is non-negative to preserve the protection area. */
13924 if (allocate >= 0
13925 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13926 || flag_stack_clash_protection))
13928 if (flag_stack_clash_protection)
13930 ix86_adjust_stack_and_probe_stack_clash (allocate,
13931 int_registers_saved);
13932 allocate = 0;
13934 else if (STACK_CHECK_MOVING_SP)
13936 if (!(crtl->is_leaf && !cfun->calls_alloca
13937 && allocate <= get_probe_interval ()))
13939 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13940 allocate = 0;
13943 else
13945 HOST_WIDE_INT size = allocate;
13947 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13948 size = 0x80000000 - get_stack_check_protect () - 1;
13950 if (TARGET_STACK_PROBE)
13952 if (crtl->is_leaf && !cfun->calls_alloca)
13954 if (size > get_probe_interval ())
13955 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13957 else
13958 ix86_emit_probe_stack_range (0,
13959 size + get_stack_check_protect (),
13960 int_registers_saved);
13962 else
13964 if (crtl->is_leaf && !cfun->calls_alloca)
13966 if (size > get_probe_interval ()
13967 && size > get_stack_check_protect ())
13968 ix86_emit_probe_stack_range (get_stack_check_protect (),
13969 (size
13970 - get_stack_check_protect ()),
13971 int_registers_saved);
13973 else
13974 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13975 int_registers_saved);
13980 if (allocate == 0)
13982 else if (!ix86_target_stack_probe ()
13983 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13985 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13986 GEN_INT (-allocate), -1,
13987 m->fs.cfa_reg == stack_pointer_rtx);
13989 else
13991 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13992 rtx r10 = NULL;
13993 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13994 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13995 bool eax_live = ix86_eax_live_at_start_p ();
13996 bool r10_live = false;
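      /* In 64-bit mode R10 carries the static chain, so it is saved and
	 restored around the call to the stack allocation stub below.  */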
13998 if (TARGET_64BIT)
13999 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14001 if (eax_live)
14003 insn = emit_insn (gen_push (eax));
14004 allocate -= UNITS_PER_WORD;
14005 /* Note that SEH directives need to continue tracking the stack
14006 pointer even after the frame pointer has been set up. */
14007 if (sp_is_cfa_reg || TARGET_SEH)
14009 if (sp_is_cfa_reg)
14010 m->fs.cfa_offset += UNITS_PER_WORD;
14011 RTX_FRAME_RELATED_P (insn) = 1;
14012 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14013 gen_rtx_SET (stack_pointer_rtx,
14014 plus_constant (Pmode, stack_pointer_rtx,
14015 -UNITS_PER_WORD)));
14019 if (r10_live)
14021 r10 = gen_rtx_REG (Pmode, R10_REG);
14022 insn = emit_insn (gen_push (r10));
14023 allocate -= UNITS_PER_WORD;
14024 if (sp_is_cfa_reg || TARGET_SEH)
14026 if (sp_is_cfa_reg)
14027 m->fs.cfa_offset += UNITS_PER_WORD;
14028 RTX_FRAME_RELATED_P (insn) = 1;
14029 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14030 gen_rtx_SET (stack_pointer_rtx,
14031 plus_constant (Pmode, stack_pointer_rtx,
14032 -UNITS_PER_WORD)));
14036 emit_move_insn (eax, GEN_INT (allocate));
14037 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14039 /* Use the fact that AX still contains ALLOCATE. */
14040 adjust_stack_insn = (Pmode == DImode
14041 ? gen_pro_epilogue_adjust_stack_di_sub
14042 : gen_pro_epilogue_adjust_stack_si_sub);
14044 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14045 stack_pointer_rtx, eax));
14047 if (sp_is_cfa_reg || TARGET_SEH)
14049 if (sp_is_cfa_reg)
14050 m->fs.cfa_offset += allocate;
14051 RTX_FRAME_RELATED_P (insn) = 1;
14052 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14053 gen_rtx_SET (stack_pointer_rtx,
14054 plus_constant (Pmode, stack_pointer_rtx,
14055 -allocate)));
14057 m->fs.sp_offset += allocate;
14059 /* Use stack_pointer_rtx for relative addressing so that code
14060 works for realigned stack, too. */
14061 if (r10_live && eax_live)
14063 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14064 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14065 gen_frame_mem (word_mode, t));
14066 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14067 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14068 gen_frame_mem (word_mode, t));
14070 else if (eax_live || r10_live)
14072 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14073 emit_move_insn (gen_rtx_REG (word_mode,
14074 (eax_live ? AX_REG : R10_REG)),
14075 gen_frame_mem (word_mode, t));
14078 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14080 /* If we haven't already set up the frame pointer, do so now. */
14081 if (frame_pointer_needed && !m->fs.fp_valid)
14083 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14084 GEN_INT (frame.stack_pointer_offset
14085 - frame.hard_frame_pointer_offset));
14086 insn = emit_insn (insn);
14087 RTX_FRAME_RELATED_P (insn) = 1;
14088 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14090 if (m->fs.cfa_reg == stack_pointer_rtx)
14091 m->fs.cfa_reg = hard_frame_pointer_rtx;
14092 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14093 m->fs.fp_valid = true;
14096 if (!int_registers_saved)
14097 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14098 if (!sse_registers_saved)
14099 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14100 else if (save_stub_call_needed)
14101 ix86_emit_outlined_ms2sysv_save (frame);
14103 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14104 in PROLOGUE. */
14105 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14107 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14108 insn = emit_insn (gen_set_got (pic));
14109 RTX_FRAME_RELATED_P (insn) = 1;
14110 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14111 emit_insn (gen_prologue_use (pic));
14112 /* Delete the already emitted SET_GOT if it exists and is allocated to
14113 REAL_PIC_OFFSET_TABLE_REGNUM. */
14114 ix86_elim_entry_set_got (pic);
14117 if (crtl->drap_reg && !crtl->stack_realign_needed)
14119 /* vDRAP was set up, but after reload it turns out stack realignment
14120 isn't necessary; here we emit the prologue to set up DRAP
14121 without the stack realignment adjustment. */
14122 t = choose_baseaddr (0, NULL);
14123 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14126 /* Prevent instructions from being scheduled into register save push
14127 sequence when access to the redzone area is done through frame pointer.
14128 The offset between the frame pointer and the stack pointer is calculated
14129 relative to the value of the stack pointer at the end of the function
14130 prologue, and moving instructions that access redzone area via frame
14131 pointer inside push sequence violates this assumption. */
14132 if (frame_pointer_needed && frame.red_zone_size)
14133 emit_insn (gen_memory_blockage ());
14135 /* SEH requires that the prologue end within 256 bytes of the start of
14136 the function. Prevent instruction schedules that would extend that.
14137 Further, prevent alloca modifications to the stack pointer from being
14138 combined with prologue modifications. */
14139 if (TARGET_SEH)
14140 emit_insn (gen_prologue_use (stack_pointer_rtx));
14143 /* Emit code to restore REG using a POP insn. */
14145 static void
14146 ix86_emit_restore_reg_using_pop (rtx reg)
14148 struct machine_function *m = cfun->machine;
14149 rtx_insn *insn = emit_insn (gen_pop (reg));
14151 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14152 m->fs.sp_offset -= UNITS_PER_WORD;
14154 if (m->fs.cfa_reg == crtl->drap_reg
14155 && REGNO (reg) == REGNO (crtl->drap_reg))
14157 /* Previously we'd represented the CFA as an expression
14158 like *(%ebp - 8). We've just popped that value from
14159 the stack, which means we need to reset the CFA to
14160 the drap register. This will remain until we restore
14161 the stack pointer. */
14162 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14163 RTX_FRAME_RELATED_P (insn) = 1;
14165 /* This means that the DRAP register is valid for addressing too. */
14166 m->fs.drap_valid = true;
14167 return;
14170 if (m->fs.cfa_reg == stack_pointer_rtx)
14172 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14173 x = gen_rtx_SET (stack_pointer_rtx, x);
14174 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14175 RTX_FRAME_RELATED_P (insn) = 1;
14177 m->fs.cfa_offset -= UNITS_PER_WORD;
14180 /* When the frame pointer is the CFA, and we pop it, we are
14181 swapping back to the stack pointer as the CFA. This happens
14182 for stack frames that don't allocate other data, so we assume
14183 the stack pointer is now pointing at the return address, i.e.
14184 the function entry state, which makes the offset be 1 word. */
14185 if (reg == hard_frame_pointer_rtx)
14187 m->fs.fp_valid = false;
14188 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14190 m->fs.cfa_reg = stack_pointer_rtx;
14191 m->fs.cfa_offset -= UNITS_PER_WORD;
14193 add_reg_note (insn, REG_CFA_DEF_CFA,
14194 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14195 GEN_INT (m->fs.cfa_offset)));
14196 RTX_FRAME_RELATED_P (insn) = 1;
14201 /* Emit code to restore saved registers using POP insns. */
14203 static void
14204 ix86_emit_restore_regs_using_pop (void)
14206 unsigned int regno;
14208 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14209 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14210 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14213 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14214 omits the emit and only attaches the notes. */
14216 static void
14217 ix86_emit_leave (rtx_insn *insn)
14219 struct machine_function *m = cfun->machine;
14220 if (!insn)
14221 insn = emit_insn (ix86_gen_leave ());
14223 ix86_add_queued_cfa_restore_notes (insn);
14225 gcc_assert (m->fs.fp_valid);
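  /* A leave is equivalent to 'mov %bp, %sp' followed by 'pop %bp', so
     afterwards SP sits one word above the slot that held the saved frame
     pointer, i.e. sp_offset becomes fp_offset - UNITS_PER_WORD.  */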
14226 m->fs.sp_valid = true;
14227 m->fs.sp_realigned = false;
14228 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14229 m->fs.fp_valid = false;
14231 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14233 m->fs.cfa_reg = stack_pointer_rtx;
14234 m->fs.cfa_offset = m->fs.sp_offset;
14236 add_reg_note (insn, REG_CFA_DEF_CFA,
14237 plus_constant (Pmode, stack_pointer_rtx,
14238 m->fs.sp_offset));
14239 RTX_FRAME_RELATED_P (insn) = 1;
14241 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14242 m->fs.fp_offset);
14245 /* Emit code to restore saved registers using MOV insns.
14246 First register is restored from CFA - CFA_OFFSET. */
14247 static void
14248 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14249 bool maybe_eh_return)
14251 struct machine_function *m = cfun->machine;
14252 unsigned int regno;
14254 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14255 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14257 rtx reg = gen_rtx_REG (word_mode, regno);
14258 rtx mem;
14259 rtx_insn *insn;
14261 mem = choose_baseaddr (cfa_offset, NULL);
14262 mem = gen_frame_mem (word_mode, mem);
14263 insn = emit_move_insn (reg, mem);
14265 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14267 /* Previously we'd represented the CFA as an expression
14268 like *(%ebp - 8). We've just popped that value from
14269 the stack, which means we need to reset the CFA to
14270 the drap register. This will remain until we restore
14271 the stack pointer. */
14272 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14273 RTX_FRAME_RELATED_P (insn) = 1;
14275 /* This means that the DRAP register is valid for addressing. */
14276 m->fs.drap_valid = true;
14278 else
14279 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14281 cfa_offset -= UNITS_PER_WORD;
14285 /* Emit code to restore saved SSE registers using MOV insns.
14286 First register is restored from CFA - CFA_OFFSET. */
14287 static void
14288 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14289 bool maybe_eh_return)
14291 unsigned int regno;
14293 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14294 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14296 rtx reg = gen_rtx_REG (V4SFmode, regno);
14297 rtx mem;
14298 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14300 mem = choose_baseaddr (cfa_offset, &align);
14301 mem = gen_rtx_MEM (V4SFmode, mem);
14303 /* The location alignment depends upon the base register. */
14304 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14305 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14306 set_mem_align (mem, align);
14307 emit_insn (gen_rtx_SET (reg, mem));
14309 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14311 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14315 static void
14316 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14317 bool use_call, int style)
14319 struct machine_function *m = cfun->machine;
14320 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14321 + m->call_ms2sysv_extra_regs;
14322 rtvec v;
14323 unsigned int elems_needed, align, i, vi = 0;
14324 rtx_insn *insn;
14325 rtx sym, tmp;
14326 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14327 rtx r10 = NULL_RTX;
14328 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14329 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14330 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14331 rtx rsi_frame_load = NULL_RTX;
14332 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14333 enum xlogue_stub stub;
14335 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14337 /* If using a realigned stack, we should never start with padding. */
14338 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14340 /* Setup RSI as the stub's base pointer. */
14341 align = GET_MODE_ALIGNMENT (V4SFmode);
14342 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14343 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14345 emit_insn (gen_rtx_SET (rsi, tmp));
14347 /* Get a symbol for the stub. */
14348 if (frame_pointer_needed)
14349 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14350 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14351 else
14352 stub = use_call ? XLOGUE_STUB_RESTORE
14353 : XLOGUE_STUB_RESTORE_TAIL;
14354 sym = xlogue.get_stub_rtx (stub);
14356 elems_needed = ncregs;
14357 if (use_call)
14358 elems_needed += 1;
14359 else
14360 elems_needed += frame_pointer_needed ? 5 : 3;
14361 v = rtvec_alloc (elems_needed);
14363 /* We call the epilogue stub when we need to pop incoming args or we are
14364 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
14365 epilogue stub and it is the tail-call. */
14366 if (use_call)
14367 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14368 else
14370 RTVEC_ELT (v, vi++) = ret_rtx;
14371 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14372 if (frame_pointer_needed)
14374 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14375 gcc_assert (m->fs.fp_valid);
14376 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14378 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14379 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14380 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14381 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14382 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14384 else
14386 /* If no hard frame pointer, we set R10 to the SP restore value. */
14387 gcc_assert (!m->fs.fp_valid);
14388 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14389 gcc_assert (m->fs.sp_valid);
14391 r10 = gen_rtx_REG (DImode, R10_REG);
14392 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14393 emit_insn (gen_rtx_SET (r10, tmp));
14395 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14399 /* Generate frame load insns and restore notes. */
14400 for (i = 0; i < ncregs; ++i)
14402 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14403 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14404 rtx reg, frame_load;
14406 reg = gen_rtx_REG (mode, r.regno);
14407 frame_load = gen_frame_load (reg, rsi, r.offset);
14409 /* Save RSI frame load insn & note to add last. */
14410 if (r.regno == SI_REG)
14412 gcc_assert (!rsi_frame_load);
14413 rsi_frame_load = frame_load;
14414 rsi_restore_offset = r.offset;
14416 else
14418 RTVEC_ELT (v, vi++) = frame_load;
14419 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14423 /* Add RSI frame load & restore note at the end. */
14424 gcc_assert (rsi_frame_load);
14425 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14426 RTVEC_ELT (v, vi++) = rsi_frame_load;
14427 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14428 rsi_restore_offset);
14430 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14431 if (!use_call && !frame_pointer_needed)
14433 gcc_assert (m->fs.sp_valid);
14434 gcc_assert (!m->fs.sp_realigned);
14436 /* At this point, R10 should point to frame.stack_realign_offset. */
14437 if (m->fs.cfa_reg == stack_pointer_rtx)
14438 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14439 m->fs.sp_offset = frame.stack_realign_offset;
14442 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14443 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14444 if (use_call)
14445 insn = emit_insn (tmp);
14446 else
14448 insn = emit_jump_insn (tmp);
14449 JUMP_LABEL (insn) = ret_rtx;
14451 if (frame_pointer_needed)
14452 ix86_emit_leave (insn);
14453 else
14455 /* Need CFA adjust note. */
14456 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14457 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14461 RTX_FRAME_RELATED_P (insn) = true;
14462 ix86_add_queued_cfa_restore_notes (insn);
14464 /* If we're not doing a tail-call, we need to adjust the stack. */
14465 if (use_call && m->fs.sp_valid)
14467 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14468 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14469 GEN_INT (dealloc), style,
14470 m->fs.cfa_reg == stack_pointer_rtx);
14474 /* Restore function stack, frame, and registers. */
14476 void
14477 ix86_expand_epilogue (int style)
14479 struct machine_function *m = cfun->machine;
14480 struct machine_frame_state frame_state_save = m->fs;
14481 bool restore_regs_via_mov;
14482 bool using_drap;
14483 bool restore_stub_is_tail = false;
14485 if (ix86_function_naked (current_function_decl))
14487 /* The program should not reach this point. */
14488 emit_insn (gen_ud2 ());
14489 return;
14492 ix86_finalize_stack_frame_flags ();
14493 const struct ix86_frame &frame = cfun->machine->frame;
14495 m->fs.sp_realigned = stack_realign_fp;
14496 m->fs.sp_valid = stack_realign_fp
14497 || !frame_pointer_needed
14498 || crtl->sp_is_unchanging;
14499 gcc_assert (!m->fs.sp_valid
14500 || m->fs.sp_offset == frame.stack_pointer_offset);
14502 /* The FP must be valid if the frame pointer is present. */
14503 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14504 gcc_assert (!m->fs.fp_valid
14505 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14507 /* We must have *some* valid pointer to the stack frame. */
14508 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14510 /* The DRAP is never valid at this point. */
14511 gcc_assert (!m->fs.drap_valid);
14513 /* See the comment about red zone and frame
14514 pointer usage in ix86_expand_prologue. */
14515 if (frame_pointer_needed && frame.red_zone_size)
14516 emit_insn (gen_memory_blockage ());
14518 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14519 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14521 /* Determine the CFA offset of the end of the red-zone. */
14522 m->fs.red_zone_offset = 0;
14523 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14525 /* The red-zone begins below the return address and error code in
14526 the exception handler. */
14527 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14529 /* When the register save area is in the aligned portion of
14530 the stack, determine the maximum runtime displacement that
14531 matches up with the aligned frame. */
14532 if (stack_realign_drap)
14533 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14534 + UNITS_PER_WORD);
14537 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14539 /* Special care must be taken for the normal return case of a function
14540 using eh_return: the eax and edx registers are marked as saved, but
14541 not restored along this path. Adjust the save location to match. */
14542 if (crtl->calls_eh_return && style != 2)
14543 reg_save_offset -= 2 * UNITS_PER_WORD;
14545 /* EH_RETURN requires the use of moves to function properly. */
14546 if (crtl->calls_eh_return)
14547 restore_regs_via_mov = true;
14548 /* SEH requires the use of pops to identify the epilogue. */
14549 else if (TARGET_SEH)
14550 restore_regs_via_mov = false;
14551 /* If we're only restoring one register and sp cannot be used then
14552 use a move instruction to restore the register, since it's
14553 less work than reloading sp and popping the register. */
14554 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14555 restore_regs_via_mov = true;
14556 else if (TARGET_EPILOGUE_USING_MOVE
14557 && cfun->machine->use_fast_prologue_epilogue
14558 && (frame.nregs > 1
14559 || m->fs.sp_offset != reg_save_offset))
14560 restore_regs_via_mov = true;
14561 else if (frame_pointer_needed
14562 && !frame.nregs
14563 && m->fs.sp_offset != reg_save_offset)
14564 restore_regs_via_mov = true;
14565 else if (frame_pointer_needed
14566 && TARGET_USE_LEAVE
14567 && cfun->machine->use_fast_prologue_epilogue
14568 && frame.nregs == 1)
14569 restore_regs_via_mov = true;
14570 else
14571 restore_regs_via_mov = false;
14573 if (restore_regs_via_mov || frame.nsseregs)
14575 /* Ensure that the entire register save area is addressable via
14576 the stack pointer, if we will restore SSE regs via sp. */
14577 if (TARGET_64BIT
14578 && m->fs.sp_offset > 0x7fffffff
14579 && sp_valid_at (frame.stack_realign_offset + 1)
14580 && (frame.nsseregs + frame.nregs) != 0)
14582 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14583 GEN_INT (m->fs.sp_offset
14584 - frame.sse_reg_save_offset),
14585 style,
14586 m->fs.cfa_reg == stack_pointer_rtx);
14590 /* If there are any SSE registers to restore, then we have to do it
14591 via moves, since there's obviously no pop for SSE regs. */
14592 if (frame.nsseregs)
14593 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14594 style == 2);
14596 if (m->call_ms2sysv)
14598 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14600 /* We cannot use a tail-call for the stub if:
14601 1. We have to pop incoming args,
14602 2. We have additional int regs to restore, or
14603 3. A sibling call will be the tail-call, or
14604 4. We are emitting an eh_return_internal epilogue.
14606 TODO: Item 4 has not yet been tested!
14608 If any of the above are true, we will call the stub rather than
14609 jump to it. */
14610 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14611 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14614 /* If using an out-of-line stub that is a tail-call, then... */
14615 if (m->call_ms2sysv && restore_stub_is_tail)
14617 /* TODO: paranoid tests. (remove eventually) */
14618 gcc_assert (m->fs.sp_valid);
14619 gcc_assert (!m->fs.sp_realigned);
14620 gcc_assert (!m->fs.fp_valid);
14621 gcc_assert (!m->fs.realigned);
14622 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14623 gcc_assert (!crtl->drap_reg);
14624 gcc_assert (!frame.nregs);
14626 else if (restore_regs_via_mov)
14628 rtx t;
14630 if (frame.nregs)
14631 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14633 /* eh_return epilogues need %ecx added to the stack pointer. */
14634 if (style == 2)
14636 rtx sa = EH_RETURN_STACKADJ_RTX;
14637 rtx_insn *insn;
14639 /* %ecx can't be used for both DRAP register and eh_return. */
14640 if (crtl->drap_reg)
14641 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14643 /* regparm nested functions don't work with eh_return. */
14644 gcc_assert (!ix86_static_chain_on_stack);
14646 if (frame_pointer_needed)
14648 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14649 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14650 emit_insn (gen_rtx_SET (sa, t));
14652 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14653 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14655 /* Note that we use SA as a temporary CFA, as the return
14656 address is at the proper place relative to it. We
14657 pretend this happens at the FP restore insn because
14658 prior to this insn the FP would be stored at the wrong
14659 offset relative to SA, and after this insn we have no
14660 other reasonable register to use for the CFA. We don't
14661 bother resetting the CFA to the SP for the duration of
14662 the return insn, unless the control flow instrumentation
14663 is done. In this case the SP is used later and we have
14664 to reset CFA to SP. */
14665 add_reg_note (insn, REG_CFA_DEF_CFA,
14666 plus_constant (Pmode, sa, UNITS_PER_WORD));
14667 ix86_add_queued_cfa_restore_notes (insn);
14668 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14669 RTX_FRAME_RELATED_P (insn) = 1;
14671 m->fs.cfa_reg = sa;
14672 m->fs.cfa_offset = UNITS_PER_WORD;
14673 m->fs.fp_valid = false;
14675 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14676 const0_rtx, style,
14677 flag_cf_protection);
14679 else
14681 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14682 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14683 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14684 ix86_add_queued_cfa_restore_notes (insn);
14686 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14687 if (m->fs.cfa_offset != UNITS_PER_WORD)
14689 m->fs.cfa_offset = UNITS_PER_WORD;
14690 add_reg_note (insn, REG_CFA_DEF_CFA,
14691 plus_constant (Pmode, stack_pointer_rtx,
14692 UNITS_PER_WORD));
14693 RTX_FRAME_RELATED_P (insn) = 1;
14696 m->fs.sp_offset = UNITS_PER_WORD;
14697 m->fs.sp_valid = true;
14698 m->fs.sp_realigned = false;
14701 else
14703 /* SEH requires that the function end with (1) a stack adjustment
14704 if necessary, (2) a sequence of pops, and (3) a return or
14705 jump instruction. Prevent insns from the function body from
14706 being scheduled into this sequence. */
14707 if (TARGET_SEH)
14709 /* Prevent a catch region from being adjacent to the standard
14710 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14711 several other flags that would be interesting to test are
14712 set up yet. */
14713 if (flag_non_call_exceptions)
14714 emit_insn (gen_nops (const1_rtx));
14715 else
14716 emit_insn (gen_blockage ());
14719 /* First step is to deallocate the stack frame so that we can
14720 pop the registers. If the stack pointer was realigned, it needs
14721 to be restored now. Also do it on SEH targets for very large
14722 frames, as the emitted instructions aren't allowed by the ABI
14723 in epilogues. */
14724 if (!m->fs.sp_valid || m->fs.sp_realigned
14725 || (TARGET_SEH
14726 && (m->fs.sp_offset - reg_save_offset
14727 >= SEH_MAX_FRAME_SIZE)))
14729 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14730 GEN_INT (m->fs.fp_offset
14731 - reg_save_offset),
14732 style, false);
14734 else if (m->fs.sp_offset != reg_save_offset)
14736 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14737 GEN_INT (m->fs.sp_offset
14738 - reg_save_offset),
14739 style,
14740 m->fs.cfa_reg == stack_pointer_rtx);
14743 ix86_emit_restore_regs_using_pop ();
14746 /* If we used a frame pointer and haven't already got rid of it,
14747 then do so now. */
14748 if (m->fs.fp_valid)
14750 /* If the stack pointer is valid and pointing at the frame
14751 pointer store address, then we only need a pop. */
14752 if (sp_valid_at (frame.hfp_save_offset)
14753 && m->fs.sp_offset == frame.hfp_save_offset)
14754 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14755 /* The "leave" instruction results in shorter dependency chains on
14756 CPUs that are able to grok it fast. */
14757 else if (TARGET_USE_LEAVE
14758 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14759 || !cfun->machine->use_fast_prologue_epilogue)
14760 ix86_emit_leave (NULL);
14761 else
14763 pro_epilogue_adjust_stack (stack_pointer_rtx,
14764 hard_frame_pointer_rtx,
14765 const0_rtx, style, !using_drap);
14766 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14770 if (using_drap)
14772 int param_ptr_offset = UNITS_PER_WORD;
14773 rtx_insn *insn;
14775 gcc_assert (stack_realign_drap);
14777 if (ix86_static_chain_on_stack)
14778 param_ptr_offset += UNITS_PER_WORD;
14779 if (!call_used_regs[REGNO (crtl->drap_reg)])
14780 param_ptr_offset += UNITS_PER_WORD;
14782 insn = emit_insn (gen_rtx_SET
14783 (stack_pointer_rtx,
14784 gen_rtx_PLUS (Pmode,
14785 crtl->drap_reg,
14786 GEN_INT (-param_ptr_offset))));
14787 m->fs.cfa_reg = stack_pointer_rtx;
14788 m->fs.cfa_offset = param_ptr_offset;
14789 m->fs.sp_offset = param_ptr_offset;
14790 m->fs.realigned = false;
14792 add_reg_note (insn, REG_CFA_DEF_CFA,
14793 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14794 GEN_INT (param_ptr_offset)));
14795 RTX_FRAME_RELATED_P (insn) = 1;
14797 if (!call_used_regs[REGNO (crtl->drap_reg)])
14798 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14801 /* At this point the stack pointer must be valid, and we must have
14802 restored all of the registers. We may not have deallocated the
14803 entire stack frame. We've delayed this until now because it may
14804 be possible to merge the local stack deallocation with the
14805 deallocation forced by ix86_static_chain_on_stack. */
14806 gcc_assert (m->fs.sp_valid);
14807 gcc_assert (!m->fs.sp_realigned);
14808 gcc_assert (!m->fs.fp_valid);
14809 gcc_assert (!m->fs.realigned);
14810 if (m->fs.sp_offset != UNITS_PER_WORD)
14812 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14813 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14814 style, true);
14816 else
14817 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14819 /* Sibcall epilogues don't want a return instruction. */
14820 if (style == 0)
14822 m->fs = frame_state_save;
14823 return;
14826 if (cfun->machine->func_type != TYPE_NORMAL)
14827 emit_jump_insn (gen_interrupt_return ());
14828 else if (crtl->args.pops_args && crtl->args.size)
14830 rtx popc = GEN_INT (crtl->args.pops_args);
14832 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14833 address, do explicit add, and jump indirectly to the caller. */
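/* An illustrative sketch (assumed shape of the emitted code, not literal
   output) of the >64K case handled below:
       pop   %ecx          # retrieve the return address
       add   $pops, %esp   # explicitly release the argument area
       jmp   *%ecx         # return to the caller indirectly
   The corresponding RTL is built by the gen_pop, pro_epilogue_adjust_stack
   and gen_simple_return_indirect_internal calls that follow.  */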
14835 if (crtl->args.pops_args >= 65536)
14837 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14838 rtx_insn *insn;
14840 /* There is no "pascal" calling convention in any 64bit ABI. */
14841 gcc_assert (!TARGET_64BIT);
14843 insn = emit_insn (gen_pop (ecx));
14844 m->fs.cfa_offset -= UNITS_PER_WORD;
14845 m->fs.sp_offset -= UNITS_PER_WORD;
14847 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14848 x = gen_rtx_SET (stack_pointer_rtx, x);
14849 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14850 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14851 RTX_FRAME_RELATED_P (insn) = 1;
14853 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14854 popc, -1, true);
14855 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14857 else
14858 emit_jump_insn (gen_simple_return_pop_internal (popc));
14860 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14862 /* In case of a return from EH, a simple return cannot be used,
14863 as the return address will be compared with a shadow stack
14864 return address. Use an indirect jump instead. */
14865 if (style == 2 && flag_cf_protection)
14867 /* Register used in indirect jump must be in word_mode. But
14868 Pmode may not be the same as word_mode for x32. */
14869 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14870 rtx_insn *insn;
14872 insn = emit_insn (gen_pop (ecx));
14873 m->fs.cfa_offset -= UNITS_PER_WORD;
14874 m->fs.sp_offset -= UNITS_PER_WORD;
14876 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14877 x = gen_rtx_SET (stack_pointer_rtx, x);
14878 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14879 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14880 RTX_FRAME_RELATED_P (insn) = 1;
14882 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14884 else
14885 emit_jump_insn (gen_simple_return_internal ());
14888 /* Restore the state back to the state from the prologue,
14889 so that it's correct for the next epilogue. */
14890 m->fs = frame_state_save;
14893 /* Reset from the function's potential modifications. */
14895 static void
14896 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14898 if (pic_offset_table_rtx
14899 && !ix86_use_pseudo_pic_reg ())
14900 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14902 if (TARGET_MACHO)
14904 rtx_insn *insn = get_last_insn ();
14905 rtx_insn *deleted_debug_label = NULL;
14907 /* Mach-O doesn't support labels at the end of objects, so if
14908 it looks like we might want one, take special action.
14909 First, collect any sequence of deleted debug labels. */
14910 while (insn
14911 && NOTE_P (insn)
14912 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14914 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14915 notes; instead set their CODE_LABEL_NUMBER to -1,
14916 otherwise there would be code generation differences
14917 between -g and -g0. */
14918 if (NOTE_P (insn) && NOTE_KIND (insn)
14919 == NOTE_INSN_DELETED_DEBUG_LABEL)
14920 deleted_debug_label = insn;
14921 insn = PREV_INSN (insn);
14924 /* If we have:
14925 label:
14926 barrier
14927 then this needs to be detected, so skip past the barrier. */
14929 if (insn && BARRIER_P (insn))
14930 insn = PREV_INSN (insn);
14932 /* Up to now we've only seen notes or barriers. */
14933 if (insn)
14935 if (LABEL_P (insn)
14936 || (NOTE_P (insn)
14937 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14938 /* Trailing label. */
14939 fputs ("\tnop\n", file);
14940 else if (cfun && ! cfun->is_thunk)
14942 /* See if we have a completely empty function body, skipping
14943 the special case of the picbase thunk emitted as asm. */
14944 while (insn && ! INSN_P (insn))
14945 insn = PREV_INSN (insn);
14946 /* If we don't find any insns, we've got an empty function body;
14947 i.e. completely empty, without a return or branch. This is
14948 taken as the case where a function body has been removed
14949 because it contains an inline __builtin_unreachable(). GCC
14950 declares that reaching __builtin_unreachable() means UB so
14951 we're not obliged to do anything special; however, we want
14952 non-zero-sized function bodies. To meet this, and help the
14953 user out, let's trap the case. */
14954 if (insn == NULL)
14955 fputs ("\tud2\n", file);
14958 else if (deleted_debug_label)
14959 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14960 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14961 CODE_LABEL_NUMBER (insn) = -1;
14965 /* Return a scratch register to use in the split stack prologue. The
14966 split stack prologue is used for -fsplit-stack. It consists of the
14967 first instructions in the function, even before the regular prologue.
14968 The scratch register can be any caller-saved register which is not
14969 used for parameters or for the static chain. */
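/* As a quick map of the choices made below: 64-bit targets always use
   %r11; in 32-bit mode, fastcall functions get %eax, thiscall functions
   get %edx (or %eax when a static chain is live), and otherwise %ecx or
   %edx is picked depending on regparm and the static chain.  Unsupported
   combinations are rejected with sorry ().  */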
14971 static unsigned int
14972 split_stack_prologue_scratch_regno (void)
14974 if (TARGET_64BIT)
14975 return R11_REG;
14976 else
14978 bool is_fastcall, is_thiscall;
14979 int regparm;
14981 is_fastcall = (lookup_attribute ("fastcall",
14982 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14983 != NULL);
14984 is_thiscall = (lookup_attribute ("thiscall",
14985 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14986 != NULL);
14987 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14989 if (is_fastcall)
14991 if (DECL_STATIC_CHAIN (cfun->decl))
14993 sorry ("-fsplit-stack does not support fastcall with "
14994 "nested function");
14995 return INVALID_REGNUM;
14997 return AX_REG;
14999 else if (is_thiscall)
15001 if (!DECL_STATIC_CHAIN (cfun->decl))
15002 return DX_REG;
15003 return AX_REG;
15005 else if (regparm < 3)
15007 if (!DECL_STATIC_CHAIN (cfun->decl))
15008 return CX_REG;
15009 else
15011 if (regparm >= 2)
15013 sorry ("-fsplit-stack does not support 2 register "
15014 "parameters for a nested function");
15015 return INVALID_REGNUM;
15017 return DX_REG;
15020 else
15022 /* FIXME: We could make this work by pushing a register
15023 around the addition and comparison. */
15024 sorry ("-fsplit-stack does not support 3 register parameters");
15025 return INVALID_REGNUM;
15030 /* A SYMBOL_REF for the function which allocates new stackspace for
15031 -fsplit-stack. */
15033 static GTY(()) rtx split_stack_fn;
15035 /* A SYMBOL_REF for the more stack function when using the large
15036 model. */
15038 static GTY(()) rtx split_stack_fn_large;
15040 /* Return location of the stack guard value in the TLS block. */
15043 ix86_split_stack_guard (void)
15045 int offset;
15046 addr_space_t as = DEFAULT_TLS_SEG_REG;
15047 rtx r;
15049 gcc_assert (flag_split_stack);
15051 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15052 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15053 #else
15054 gcc_unreachable ();
15055 #endif
15057 r = GEN_INT (offset);
15058 r = gen_const_mem (Pmode, r);
15059 set_mem_addr_space (r, as);
15061 return r;
15064 /* Handle -fsplit-stack. These are the first instructions in the
15065 function, even before the regular prologue. */
15067 void
15068 ix86_expand_split_stack_prologue (void)
15070 HOST_WIDE_INT allocate;
15071 unsigned HOST_WIDE_INT args_size;
15072 rtx_code_label *label;
15073 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15074 rtx scratch_reg = NULL_RTX;
15075 rtx_code_label *varargs_label = NULL;
15076 rtx fn;
15078 gcc_assert (flag_split_stack && reload_completed);
15080 ix86_finalize_stack_frame_flags ();
15081 struct ix86_frame &frame = cfun->machine->frame;
15082 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15084 /* This is the label we will branch to if we have enough stack
15085 space. We expect the basic block reordering pass to reverse this
15086 branch if optimizing, so that we branch in the unlikely case. */
15087 label = gen_label_rtx ();
15089 /* We need to compare the stack pointer minus the frame size with
15090 the stack boundary in the TCB. The stack boundary always gives
15091 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15092 can compare directly. Otherwise we need to do an addition. */
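/* Roughly, the two comparison shapes are (an assumed assembly rendering,
   for illustration only):
     allocate < SPLIT_STACK_AVAILABLE:  cmp <tls-seg>:<guard>, %sp
     otherwise:                         lea -allocate(%sp), %scratch
                                        cmp <tls-seg>:<guard>, %scratch
   where <guard> is TARGET_THREAD_SPLIT_STACK_OFFSET, as read by
   ix86_split_stack_guard.  */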
15094 limit = ix86_split_stack_guard ();
15096 if (allocate < SPLIT_STACK_AVAILABLE)
15097 current = stack_pointer_rtx;
15098 else
15100 unsigned int scratch_regno;
15101 rtx offset;
15103 /* We need a scratch register to hold the stack pointer minus
15104 the required frame size. Since this is the very start of the
15105 function, the scratch register can be any caller-saved
15106 register which is not used for parameters. */
15107 offset = GEN_INT (- allocate);
15108 scratch_regno = split_stack_prologue_scratch_regno ();
15109 if (scratch_regno == INVALID_REGNUM)
15110 return;
15111 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15112 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15114 /* We don't use ix86_gen_add3 in this case because it will
15115 want to split to lea, but when not optimizing the insn
15116 will not be split after this point. */
15117 emit_insn (gen_rtx_SET (scratch_reg,
15118 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15119 offset)));
15121 else
15123 emit_move_insn (scratch_reg, offset);
15124 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15125 stack_pointer_rtx));
15127 current = scratch_reg;
15130 ix86_expand_branch (GEU, current, limit, label);
15131 rtx_insn *jump_insn = get_last_insn ();
15132 JUMP_LABEL (jump_insn) = label;
15134 /* Mark the jump as very likely to be taken. */
15135 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15137 if (split_stack_fn == NULL_RTX)
15139 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15140 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15142 fn = split_stack_fn;
15144 /* Get more stack space. We pass in the desired stack space and the
15145 size of the arguments to copy to the new stack. In 32-bit mode
15146 we push the parameters; __morestack will return on a new stack
15147 anyhow. In 64-bit mode we pass the parameters in r10 and
15148 r11. */
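/* A sketch of the two calling shapes built below (illustrative, assuming
   the usual __morestack protocol):
     64-bit:  r10 = frame size, r11 = argument size, call __morestack
     32-bit:  push argument size; push frame size; call __morestack;
              the two pushed words are released via the POP value.  */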
15149 allocate_rtx = GEN_INT (allocate);
15150 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
15151 call_fusage = NULL_RTX;
15152 rtx pop = NULL_RTX;
15153 if (TARGET_64BIT)
15155 rtx reg10, reg11;
15157 reg10 = gen_rtx_REG (Pmode, R10_REG);
15158 reg11 = gen_rtx_REG (Pmode, R11_REG);
15160 /* If this function uses a static chain, it will be in %r10.
15161 Preserve it across the call to __morestack. */
15162 if (DECL_STATIC_CHAIN (cfun->decl))
15164 rtx rax;
15166 rax = gen_rtx_REG (word_mode, AX_REG);
15167 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15168 use_reg (&call_fusage, rax);
15171 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15172 && !TARGET_PECOFF)
15174 HOST_WIDE_INT argval;
15176 gcc_assert (Pmode == DImode);
15177 /* When using the large model we need to load the address
15178 into a register, and we've run out of registers. So we
15179 switch to a different calling convention, and we call a
15180 different function: __morestack_large. We pass the
15181 argument size in the upper 32 bits of r10 and pass the
15182 frame size in the lower 32 bits. */
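/* For example (illustrative values only): allocate == 0x120 and
   args_size == 0x18 yield r10 == 0x0000001800000120, i.e. the argument
   size in the upper half and the frame size in the lower half, matching
   the computation of argval below.  */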
15183 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15184 gcc_assert ((args_size & 0xffffffff) == args_size);
15186 if (split_stack_fn_large == NULL_RTX)
15188 split_stack_fn_large =
15189 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15190 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15192 if (ix86_cmodel == CM_LARGE_PIC)
15194 rtx_code_label *label;
15195 rtx x;
15197 label = gen_label_rtx ();
15198 emit_label (label);
15199 LABEL_PRESERVE_P (label) = 1;
15200 emit_insn (gen_set_rip_rex64 (reg10, label));
15201 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15202 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15203 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15204 UNSPEC_GOT);
15205 x = gen_rtx_CONST (Pmode, x);
15206 emit_move_insn (reg11, x);
15207 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15208 x = gen_const_mem (Pmode, x);
15209 emit_move_insn (reg11, x);
15211 else
15212 emit_move_insn (reg11, split_stack_fn_large);
15214 fn = reg11;
15216 argval = ((args_size << 16) << 16) + allocate;
15217 emit_move_insn (reg10, GEN_INT (argval));
15219 else
15221 emit_move_insn (reg10, allocate_rtx);
15222 emit_move_insn (reg11, GEN_INT (args_size));
15223 use_reg (&call_fusage, reg11);
15226 use_reg (&call_fusage, reg10);
15228 else
15230 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15231 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15232 insn = emit_insn (gen_push (allocate_rtx));
15233 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15234 pop = GEN_INT (2 * UNITS_PER_WORD);
15236 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15237 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15238 pop, false);
15239 add_function_usage_to (call_insn, call_fusage);
15240 if (!TARGET_64BIT)
15241 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15242 /* Indicate that this function can't jump to non-local gotos. */
15243 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15245 /* In order to make call/return prediction work right, we now need
15246 to execute a return instruction. See
15247 libgcc/config/i386/morestack.S for the details on how this works.
15249 For flow purposes gcc must not see this as a return
15250 instruction--we need control flow to continue at the subsequent
15251 label. Therefore, we use an unspec. */
15252 gcc_assert (crtl->args.pops_args < 65536);
15253 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15255 /* If we are in 64-bit mode and this function uses a static chain,
15256 we saved %r10 in %rax before calling __morestack. */
15257 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15258 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15259 gen_rtx_REG (word_mode, AX_REG));
15261 /* If this function calls va_start, we need to store a pointer to
15262 the arguments on the old stack, because they may not have been
15263 all copied to the new stack. At this point the old stack can be
15264 found at the frame pointer value used by __morestack, because
15265 __morestack has set that up before calling back to us. Here we
15266 store that pointer in a scratch register, and in
15267 ix86_expand_prologue we store the scratch register in a stack
15268 slot. */
15269 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15271 unsigned int scratch_regno;
15272 rtx frame_reg;
15273 int words;
15275 scratch_regno = split_stack_prologue_scratch_regno ();
15276 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15277 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15279 /* 64-bit:
15280 fp -> old fp value
15281 return address within this function
15282 return address of caller of this function
15283 stack arguments
15284 So we add three words to get to the stack arguments.
15286 32-bit:
15287 fp -> old fp value
15288 return address within this function
15289 first argument to __morestack
15290 second argument to __morestack
15291 return address of caller of this function
15292 stack arguments
15293 So we add five words to get to the stack arguments.
15295 words = TARGET_64BIT ? 3 : 5;
15296 emit_insn (gen_rtx_SET (scratch_reg,
15297 gen_rtx_PLUS (Pmode, frame_reg,
15298 GEN_INT (words * UNITS_PER_WORD))));
15300 varargs_label = gen_label_rtx ();
15301 emit_jump_insn (gen_jump (varargs_label));
15302 JUMP_LABEL (get_last_insn ()) = varargs_label;
15304 emit_barrier ();
15307 emit_label (label);
15308 LABEL_NUSES (label) = 1;
15310 /* If this function calls va_start, we now have to set the scratch
15311 register for the case where we do not call __morestack. In this
15312 case we need to set it based on the stack pointer. */
15313 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15315 emit_insn (gen_rtx_SET (scratch_reg,
15316 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15317 GEN_INT (UNITS_PER_WORD))));
15319 emit_label (varargs_label);
15320 LABEL_NUSES (varargs_label) = 1;
15324 /* We may have to tell the dataflow pass that the split stack prologue
15325 is initializing a scratch register. */
15327 static void
15328 ix86_live_on_entry (bitmap regs)
15330 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15332 gcc_assert (flag_split_stack);
15333 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15337 /* Extract the parts of an RTL expression that is a valid memory address
15338 for an instruction. Return 0 if the structure of the address is
15339 grossly off. Return -1 if the address contains ASHIFT, so it is not
15340 strictly valid, but is still used for computing the length of the lea instruction. */
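/* For instance (an illustrative decomposition, not an exhaustive list),
   the address 16(%ebx,%esi,4), i.e. the RTL
     (plus (plus (mult (reg %esi) (const_int 4)) (reg %ebx)) (const_int 16))
   yields out->base = %ebx, out->index = %esi, out->scale = 4,
   out->disp = (const_int 16) and out->seg = ADDR_SPACE_GENERIC.  */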
15343 ix86_decompose_address (rtx addr, struct ix86_address *out)
15345 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15346 rtx base_reg, index_reg;
15347 HOST_WIDE_INT scale = 1;
15348 rtx scale_rtx = NULL_RTX;
15349 rtx tmp;
15350 int retval = 1;
15351 addr_space_t seg = ADDR_SPACE_GENERIC;
15353 /* Allow zero-extended SImode addresses;
15354 they will be emitted with the addr32 prefix. */
15355 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15357 if (GET_CODE (addr) == ZERO_EXTEND
15358 && GET_MODE (XEXP (addr, 0)) == SImode)
15360 addr = XEXP (addr, 0);
15361 if (CONST_INT_P (addr))
15362 return 0;
15364 else if (GET_CODE (addr) == AND
15365 && const_32bit_mask (XEXP (addr, 1), DImode))
15367 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15368 if (addr == NULL_RTX)
15369 return 0;
15371 if (CONST_INT_P (addr))
15372 return 0;
15376 /* Allow SImode subregs of DImode addresses;
15377 they will be emitted with the addr32 prefix. */
15378 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15380 if (SUBREG_P (addr)
15381 && GET_MODE (SUBREG_REG (addr)) == DImode)
15383 addr = SUBREG_REG (addr);
15384 if (CONST_INT_P (addr))
15385 return 0;
15389 if (REG_P (addr))
15390 base = addr;
15391 else if (SUBREG_P (addr))
15393 if (REG_P (SUBREG_REG (addr)))
15394 base = addr;
15395 else
15396 return 0;
15398 else if (GET_CODE (addr) == PLUS)
15400 rtx addends[4], op;
15401 int n = 0, i;
15403 op = addr;
15406 if (n >= 4)
15407 return 0;
15408 addends[n++] = XEXP (op, 1);
15409 op = XEXP (op, 0);
15411 while (GET_CODE (op) == PLUS);
15412 if (n >= 4)
15413 return 0;
15414 addends[n] = op;
15416 for (i = n; i >= 0; --i)
15418 op = addends[i];
15419 switch (GET_CODE (op))
15421 case MULT:
15422 if (index)
15423 return 0;
15424 index = XEXP (op, 0);
15425 scale_rtx = XEXP (op, 1);
15426 break;
15428 case ASHIFT:
15429 if (index)
15430 return 0;
15431 index = XEXP (op, 0);
15432 tmp = XEXP (op, 1);
15433 if (!CONST_INT_P (tmp))
15434 return 0;
15435 scale = INTVAL (tmp);
15436 if ((unsigned HOST_WIDE_INT) scale > 3)
15437 return 0;
15438 scale = 1 << scale;
15439 break;
15441 case ZERO_EXTEND:
15442 op = XEXP (op, 0);
15443 if (GET_CODE (op) != UNSPEC)
15444 return 0;
15445 /* FALLTHRU */
15447 case UNSPEC:
15448 if (XINT (op, 1) == UNSPEC_TP
15449 && TARGET_TLS_DIRECT_SEG_REFS
15450 && seg == ADDR_SPACE_GENERIC)
15451 seg = DEFAULT_TLS_SEG_REG;
15452 else
15453 return 0;
15454 break;
15456 case SUBREG:
15457 if (!REG_P (SUBREG_REG (op)))
15458 return 0;
15459 /* FALLTHRU */
15461 case REG:
15462 if (!base)
15463 base = op;
15464 else if (!index)
15465 index = op;
15466 else
15467 return 0;
15468 break;
15470 case CONST:
15471 case CONST_INT:
15472 case SYMBOL_REF:
15473 case LABEL_REF:
15474 if (disp)
15475 return 0;
15476 disp = op;
15477 break;
15479 default:
15480 return 0;
15484 else if (GET_CODE (addr) == MULT)
15486 index = XEXP (addr, 0); /* index*scale */
15487 scale_rtx = XEXP (addr, 1);
15489 else if (GET_CODE (addr) == ASHIFT)
15491 /* We're called for lea too, which implements ashift on occasion. */
15492 index = XEXP (addr, 0);
15493 tmp = XEXP (addr, 1);
15494 if (!CONST_INT_P (tmp))
15495 return 0;
15496 scale = INTVAL (tmp);
15497 if ((unsigned HOST_WIDE_INT) scale > 3)
15498 return 0;
15499 scale = 1 << scale;
15500 retval = -1;
15502 else
15503 disp = addr; /* displacement */
15505 if (index)
15507 if (REG_P (index))
15509 else if (SUBREG_P (index)
15510 && REG_P (SUBREG_REG (index)))
15512 else
15513 return 0;
15516 /* Extract the integral value of scale. */
15517 if (scale_rtx)
15519 if (!CONST_INT_P (scale_rtx))
15520 return 0;
15521 scale = INTVAL (scale_rtx);
15524 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15525 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15527 /* Avoid useless 0 displacement. */
15528 if (disp == const0_rtx && (base || index))
15529 disp = NULL_RTX;
15531 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
15532 if (base_reg && index_reg && scale == 1
15533 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15534 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15535 || REGNO (index_reg) == SP_REG))
15537 std::swap (base, index);
15538 std::swap (base_reg, index_reg);
15541 /* Special case: %ebp cannot be encoded as a base without a displacement.
15542 Similarly %r13. */
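/* This restriction follows from the ModRM/SIB encoding: an encoding of
   101b for the base with mod == 00 means "disp32 with no base" (or
   RIP-relative in 64-bit mode), so %ebp and %r13 can only be encoded as
   a base together with a displacement (mod == 01 or 10).  */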
15543 if (!disp && base_reg
15544 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15545 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15546 || REGNO (base_reg) == BP_REG
15547 || REGNO (base_reg) == R13_REG))
15548 disp = const0_rtx;
15550 /* Special case: on K6, [%esi] causes the instruction to be vector
15551 decoded. Avoid this by transforming to [%esi+0].
15552 Reload calls address legitimization without cfun defined, so we need
15553 to test cfun for being non-NULL. */
15554 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15555 && base_reg && !index_reg && !disp
15556 && REGNO (base_reg) == SI_REG)
15557 disp = const0_rtx;
15559 /* Special case: encode reg+reg instead of reg*2. */
15560 if (!base && index && scale == 2)
15561 base = index, base_reg = index_reg, scale = 1;
15563 /* Special case: scaling cannot be encoded without base or displacement. */
15564 if (!base && !disp && index && scale != 1)
15565 disp = const0_rtx;
15567 out->base = base;
15568 out->index = index;
15569 out->disp = disp;
15570 out->scale = scale;
15571 out->seg = seg;
15573 return retval;
15576 /* Return cost of the memory address x.
15577 For i386, it is better to use a complex address than let gcc copy
15578 the address into a reg and make a new pseudo. But not if the address
15579 requires two regs - that would mean more pseudos with longer
15580 lifetimes. */
15581 static int
15582 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15584 struct ix86_address parts;
15585 int cost = 1;
15586 int ok = ix86_decompose_address (x, &parts);
15588 gcc_assert (ok);
15590 if (parts.base && SUBREG_P (parts.base))
15591 parts.base = SUBREG_REG (parts.base);
15592 if (parts.index && SUBREG_P (parts.index))
15593 parts.index = SUBREG_REG (parts.index);
15595 /* Attempt to minimize number of registers in the address by increasing
15596 address cost for each used register. We don't increase address cost
15597 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15598 is not invariant itself it most likely means that base or index is not
15599 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15600 which is not profitable for x86. */
15601 if (parts.base
15602 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15603 && (current_pass->type == GIMPLE_PASS
15604 || !pic_offset_table_rtx
15605 || !REG_P (parts.base)
15606 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15607 cost++;
15609 if (parts.index
15610 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15611 && (current_pass->type == GIMPLE_PASS
15612 || !pic_offset_table_rtx
15613 || !REG_P (parts.index)
15614 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15615 cost++;
15617 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15618 since its predecode logic can't detect the length of instructions
15619 and it degenerates to vector decoding. Increase the cost of such
15620 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15621 to split such addresses or even refuse such addresses at all.
15623 The following addressing modes are affected:
15624 [base+scale*index]
15625 [scale*index+disp]
15626 [base+index]
15628 The first and last case may be avoidable by explicitly coding the zero
15629 into the memory address, but I don't have an AMD-K6 machine handy to
15630 check this theory. */
15632 if (TARGET_K6
15633 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15634 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15635 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15636 cost += 10;
15638 return cost;
15641 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15642 this is used to form addresses to local data when -fPIC is in
15643 use. */
15645 static bool
15646 darwin_local_data_pic (rtx disp)
15648 return (GET_CODE (disp) == UNSPEC
15649 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15652 /* True if operand X should be loaded from GOT. */
15654 bool
15655 ix86_force_load_from_GOT_p (rtx x)
15657 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15658 && !TARGET_PECOFF && !TARGET_MACHO
15659 && !flag_plt && !flag_pic
15660 && ix86_cmodel != CM_LARGE
15661 && GET_CODE (x) == SYMBOL_REF
15662 && SYMBOL_REF_FUNCTION_P (x)
15663 && !SYMBOL_REF_LOCAL_P (x));
15666 /* Determine if a given RTX is a valid constant. We already know this
15667 satisfies CONSTANT_P. */
15669 static bool
15670 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15672 /* Pointer bounds constants are not valid. */
15673 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15674 return false;
15676 switch (GET_CODE (x))
15678 case CONST:
15679 x = XEXP (x, 0);
15681 if (GET_CODE (x) == PLUS)
15683 if (!CONST_INT_P (XEXP (x, 1)))
15684 return false;
15685 x = XEXP (x, 0);
15688 if (TARGET_MACHO && darwin_local_data_pic (x))
15689 return true;
15691 /* Only some unspecs are valid as "constants". */
15692 if (GET_CODE (x) == UNSPEC)
15693 switch (XINT (x, 1))
15695 case UNSPEC_GOT:
15696 case UNSPEC_GOTOFF:
15697 case UNSPEC_PLTOFF:
15698 return TARGET_64BIT;
15699 case UNSPEC_TPOFF:
15700 case UNSPEC_NTPOFF:
15701 x = XVECEXP (x, 0, 0);
15702 return (GET_CODE (x) == SYMBOL_REF
15703 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15704 case UNSPEC_DTPOFF:
15705 x = XVECEXP (x, 0, 0);
15706 return (GET_CODE (x) == SYMBOL_REF
15707 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15708 default:
15709 return false;
15712 /* We must have drilled down to a symbol. */
15713 if (GET_CODE (x) == LABEL_REF)
15714 return true;
15715 if (GET_CODE (x) != SYMBOL_REF)
15716 return false;
15717 /* FALLTHRU */
15719 case SYMBOL_REF:
15720 /* TLS symbols are never valid. */
15721 if (SYMBOL_REF_TLS_MODEL (x))
15722 return false;
15724 /* DLLIMPORT symbols are never valid. */
15725 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15726 && SYMBOL_REF_DLLIMPORT_P (x))
15727 return false;
15729 #if TARGET_MACHO
15730 /* mdynamic-no-pic */
15731 if (MACHO_DYNAMIC_NO_PIC_P)
15732 return machopic_symbol_defined_p (x);
15733 #endif
15735 /* An external function address should be loaded
15736 via the GOT slot to avoid the PLT. */
15737 if (ix86_force_load_from_GOT_p (x))
15738 return false;
15740 break;
15742 CASE_CONST_SCALAR_INT:
15743 switch (mode)
15745 case E_TImode:
15746 if (TARGET_64BIT)
15747 return true;
15748 /* FALLTHRU */
15749 case E_OImode:
15750 case E_XImode:
15751 if (!standard_sse_constant_p (x, mode))
15752 return false;
15753 default:
15754 break;
15756 break;
15758 case CONST_VECTOR:
15759 if (!standard_sse_constant_p (x, mode))
15760 return false;
15762 default:
15763 break;
15766 /* Otherwise we handle everything else in the move patterns. */
15767 return true;
15770 /* Determine if it's legal to put X into the constant pool. This
15771 is not possible for the address of thread-local symbols, which
15772 is checked above. */
15774 static bool
15775 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15777 /* We can put any immediate constant in memory. */
15778 switch (GET_CODE (x))
15780 CASE_CONST_ANY:
15781 return false;
15783 default:
15784 break;
15787 return !ix86_legitimate_constant_p (mode, x);
15790 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15791 otherwise zero. */
15793 static bool
15794 is_imported_p (rtx x)
15796 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15797 || GET_CODE (x) != SYMBOL_REF)
15798 return false;
15800 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15804 /* Nonzero if the constant value X is a legitimate general operand
15805 when generating PIC code. It is given that flag_pic is on and
15806 that X satisfies CONSTANT_P. */
15808 bool
15809 legitimate_pic_operand_p (rtx x)
15811 rtx inner;
15813 switch (GET_CODE (x))
15815 case CONST:
15816 inner = XEXP (x, 0);
15817 if (GET_CODE (inner) == PLUS
15818 && CONST_INT_P (XEXP (inner, 1)))
15819 inner = XEXP (inner, 0);
15821 /* Only some unspecs are valid as "constants". */
15822 if (GET_CODE (inner) == UNSPEC)
15823 switch (XINT (inner, 1))
15825 case UNSPEC_GOT:
15826 case UNSPEC_GOTOFF:
15827 case UNSPEC_PLTOFF:
15828 return TARGET_64BIT;
15829 case UNSPEC_TPOFF:
15830 x = XVECEXP (inner, 0, 0);
15831 return (GET_CODE (x) == SYMBOL_REF
15832 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15833 case UNSPEC_MACHOPIC_OFFSET:
15834 return legitimate_pic_address_disp_p (x);
15835 default:
15836 return false;
15838 /* FALLTHRU */
15840 case SYMBOL_REF:
15841 case LABEL_REF:
15842 return legitimate_pic_address_disp_p (x);
15844 default:
15845 return true;
15849 /* Determine if a given CONST RTX is a valid memory displacement
15850 in PIC mode. */
15852 bool
15853 legitimate_pic_address_disp_p (rtx disp)
15855 bool saw_plus;
15857 /* In 64bit mode we can allow direct addresses of symbols and labels
15858 when they are not dynamic symbols. */
15859 if (TARGET_64BIT)
15861 rtx op0 = disp, op1;
15863 switch (GET_CODE (disp))
15865 case LABEL_REF:
15866 return true;
15868 case CONST:
15869 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15870 break;
15871 op0 = XEXP (XEXP (disp, 0), 0);
15872 op1 = XEXP (XEXP (disp, 0), 1);
15873 if (!CONST_INT_P (op1))
15874 break;
15875 if (GET_CODE (op0) == UNSPEC
15876 && (XINT (op0, 1) == UNSPEC_DTPOFF
15877 || XINT (op0, 1) == UNSPEC_NTPOFF)
15878 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15879 return true;
15880 if (INTVAL (op1) >= 16*1024*1024
15881 || INTVAL (op1) < -16*1024*1024)
15882 break;
15883 if (GET_CODE (op0) == LABEL_REF)
15884 return true;
15885 if (GET_CODE (op0) == CONST
15886 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15887 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15888 return true;
15889 if (GET_CODE (op0) == UNSPEC
15890 && XINT (op0, 1) == UNSPEC_PCREL)
15891 return true;
15892 if (GET_CODE (op0) != SYMBOL_REF)
15893 break;
15894 /* FALLTHRU */
15896 case SYMBOL_REF:
15897 /* TLS references should always be enclosed in UNSPEC.
15898 A dllimported symbol always needs to be resolved. */
15899 if (SYMBOL_REF_TLS_MODEL (op0)
15900 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15901 return false;
15903 if (TARGET_PECOFF)
15905 if (is_imported_p (op0))
15906 return true;
15908 if (SYMBOL_REF_FAR_ADDR_P (op0)
15909 || !SYMBOL_REF_LOCAL_P (op0))
15910 break;
15912 /* Function symbols need to be resolved only for
15913 the large model.
15914 For the small model we don't need to resolve anything
15915 here. */
15916 if ((ix86_cmodel != CM_LARGE_PIC
15917 && SYMBOL_REF_FUNCTION_P (op0))
15918 || ix86_cmodel == CM_SMALL_PIC)
15919 return true;
15920 /* Non-external symbols don't need to be resolved for
15921 the large and medium models. */
15922 if ((ix86_cmodel == CM_LARGE_PIC
15923 || ix86_cmodel == CM_MEDIUM_PIC)
15924 && !SYMBOL_REF_EXTERNAL_P (op0))
15925 return true;
15927 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15928 && (SYMBOL_REF_LOCAL_P (op0)
15929 || (HAVE_LD_PIE_COPYRELOC
15930 && flag_pie
15931 && !SYMBOL_REF_WEAK (op0)
15932 && !SYMBOL_REF_FUNCTION_P (op0)))
15933 && ix86_cmodel != CM_LARGE_PIC)
15934 return true;
15935 break;
15937 default:
15938 break;
15941 if (GET_CODE (disp) != CONST)
15942 return false;
15943 disp = XEXP (disp, 0);
15945 if (TARGET_64BIT)
15947 /* It is unsafe to allow PLUS expressions. This limits the allowed
15948 distance of GOT tables. We should not need these anyway. */
15949 if (GET_CODE (disp) != UNSPEC
15950 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15951 && XINT (disp, 1) != UNSPEC_GOTOFF
15952 && XINT (disp, 1) != UNSPEC_PCREL
15953 && XINT (disp, 1) != UNSPEC_PLTOFF))
15954 return false;
15956 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15957 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15958 return false;
15959 return true;
15962 saw_plus = false;
15963 if (GET_CODE (disp) == PLUS)
15965 if (!CONST_INT_P (XEXP (disp, 1)))
15966 return false;
15967 disp = XEXP (disp, 0);
15968 saw_plus = true;
15971 if (TARGET_MACHO && darwin_local_data_pic (disp))
15972 return true;
15974 if (GET_CODE (disp) != UNSPEC)
15975 return false;
15977 switch (XINT (disp, 1))
15979 case UNSPEC_GOT:
15980 if (saw_plus)
15981 return false;
15982 /* We need to check for both symbols and labels because VxWorks loads
15983 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15984 details. */
15985 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15986 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15987 case UNSPEC_GOTOFF:
15988 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15989 While the ABI also specifies a 32bit relocation, we don't produce it in
15990 the small PIC model at all. */
15991 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15992 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15993 && !TARGET_64BIT)
15994 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15995 return false;
15996 case UNSPEC_GOTTPOFF:
15997 case UNSPEC_GOTNTPOFF:
15998 case UNSPEC_INDNTPOFF:
15999 if (saw_plus)
16000 return false;
16001 disp = XVECEXP (disp, 0, 0);
16002 return (GET_CODE (disp) == SYMBOL_REF
16003 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16004 case UNSPEC_NTPOFF:
16005 disp = XVECEXP (disp, 0, 0);
16006 return (GET_CODE (disp) == SYMBOL_REF
16007 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16008 case UNSPEC_DTPOFF:
16009 disp = XVECEXP (disp, 0, 0);
16010 return (GET_CODE (disp) == SYMBOL_REF
16011 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16014 return false;
16017 /* Determine if op is a suitable RTX for an address register.
16018 Return the naked register if a register or a register subreg is
16019 found, otherwise return NULL_RTX. */
16021 static rtx
16022 ix86_validate_address_register (rtx op)
16024 machine_mode mode = GET_MODE (op);
16026 /* Only SImode or DImode registers can form the address. */
16027 if (mode != SImode && mode != DImode)
16028 return NULL_RTX;
16030 if (REG_P (op))
16031 return op;
16032 else if (SUBREG_P (op))
16034 rtx reg = SUBREG_REG (op);
16036 if (!REG_P (reg))
16037 return NULL_RTX;
16039 mode = GET_MODE (reg);
16041 /* Don't allow SUBREGs that span more than a word. It can
16042 lead to spill failures when the register is one word out
16043 of a two word structure. */
16044 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16045 return NULL_RTX;
16047 /* Allow only SUBREGs of non-eliminable hard registers. */
16048 if (register_no_elim_operand (reg, mode))
16049 return reg;
16052 /* Op is not a register. */
16053 return NULL_RTX;
16056 /* Recognizes RTL expressions that are valid memory addresses for an
16057 instruction. The MODE argument is the machine mode for the MEM
16058 expression that wants to use this address.
16060 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16061 convert common non-canonical forms to canonical form so that they will
16062 be recognized. */
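/* As an example of a non-canonical form rejected here: an address
   containing (ashift (reg) (const_int 2)) is the non-canonical spelling
   of (mult (reg) (const_int 4)); ix86_decompose_address returns -1 for
   it, so this predicate refuses it and legitimization is expected to
   present the MULT form instead.  */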
16064 static bool
16065 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16067 struct ix86_address parts;
16068 rtx base, index, disp;
16069 HOST_WIDE_INT scale;
16070 addr_space_t seg;
16072 if (ix86_decompose_address (addr, &parts) <= 0)
16073 /* Decomposition failed. */
16074 return false;
16076 base = parts.base;
16077 index = parts.index;
16078 disp = parts.disp;
16079 scale = parts.scale;
16080 seg = parts.seg;
16082 /* Validate base register. */
16083 if (base)
16085 rtx reg = ix86_validate_address_register (base);
16087 if (reg == NULL_RTX)
16088 return false;
16090 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16091 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16092 /* Base is not valid. */
16093 return false;
16096 /* Validate index register. */
16097 if (index)
16099 rtx reg = ix86_validate_address_register (index);
16101 if (reg == NULL_RTX)
16102 return false;
16104 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16105 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16106 /* Index is not valid. */
16107 return false;
16110 /* Index and base should have the same mode. */
16111 if (base && index
16112 && GET_MODE (base) != GET_MODE (index))
16113 return false;
16115 /* Address override works only on the (%reg) part of %fs:(%reg). */
16116 if (seg != ADDR_SPACE_GENERIC
16117 && ((base && GET_MODE (base) != word_mode)
16118 || (index && GET_MODE (index) != word_mode)))
16119 return false;
16121 /* Validate scale factor. */
16122 if (scale != 1)
16124 if (!index)
16125 /* Scale without index. */
16126 return false;
16128 if (scale != 2 && scale != 4 && scale != 8)
16129 /* Scale is not a valid multiplier. */
16130 return false;
16133 /* Validate displacement. */
16134 if (disp)
16136 if (GET_CODE (disp) == CONST
16137 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16138 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16139 switch (XINT (XEXP (disp, 0), 1))
16141 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
16142 when used. While the ABI also specifies 32bit relocations, we
16143 don't produce them at all and use IP-relative addressing instead.
16144 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16145 should be loaded via the GOT. */
16146 case UNSPEC_GOT:
16147 if (!TARGET_64BIT
16148 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16149 goto is_legitimate_pic;
16150 /* FALLTHRU */
16151 case UNSPEC_GOTOFF:
16152 gcc_assert (flag_pic);
16153 if (!TARGET_64BIT)
16154 goto is_legitimate_pic;
16156 /* 64bit address unspec. */
16157 return false;
16159 case UNSPEC_GOTPCREL:
16160 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16161 goto is_legitimate_pic;
16162 /* FALLTHRU */
16163 case UNSPEC_PCREL:
16164 gcc_assert (flag_pic);
16165 goto is_legitimate_pic;
16167 case UNSPEC_GOTTPOFF:
16168 case UNSPEC_GOTNTPOFF:
16169 case UNSPEC_INDNTPOFF:
16170 case UNSPEC_NTPOFF:
16171 case UNSPEC_DTPOFF:
16172 break;
16174 default:
16175 /* Invalid address unspec. */
16176 return false;
16179 else if (SYMBOLIC_CONST (disp)
16180 && (flag_pic
16181 || (TARGET_MACHO
16182 #if TARGET_MACHO
16183 && MACHOPIC_INDIRECT
16184 && !machopic_operand_p (disp)
16185 #endif
16189 is_legitimate_pic:
16190 if (TARGET_64BIT && (index || base))
16192 /* foo@dtpoff(%rX) is ok. */
16193 if (GET_CODE (disp) != CONST
16194 || GET_CODE (XEXP (disp, 0)) != PLUS
16195 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16196 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16197 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16198 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16199 /* Non-constant pic memory reference. */
16200 return false;
16202 else if ((!TARGET_MACHO || flag_pic)
16203 && ! legitimate_pic_address_disp_p (disp))
16204 /* Displacement is an invalid pic construct. */
16205 return false;
16206 #if TARGET_MACHO
16207 else if (MACHO_DYNAMIC_NO_PIC_P
16208 && !ix86_legitimate_constant_p (Pmode, disp))
16209 /* displacement must be referenced via non_lazy_pointer */
16210 return false;
16211 #endif
16213 /* This code used to verify that a symbolic pic displacement
16214 includes the pic_offset_table_rtx register.
16216 While this is a good idea, unfortunately these constructs may
16217 be created by the "adds using lea" optimization for incorrect
16218 code like:
16220 int a;
16221 int foo(int i)
16223 return *(&a+i);
16226 This code is nonsensical, but results in addressing the
16227 GOT table with a pic_offset_table_rtx base. We can't
16228 just refuse it easily, since it gets matched by the
16229 "addsi3" pattern, which later gets split to lea when the
16230 output register differs from the input. While this
16231 could be handled by a separate addsi pattern for this case
16232 that never results in lea, disabling this test seems to be
16233 the easier and correct fix for the crash. */
16235 else if (GET_CODE (disp) != LABEL_REF
16236 && !CONST_INT_P (disp)
16237 && (GET_CODE (disp) != CONST
16238 || !ix86_legitimate_constant_p (Pmode, disp))
16239 && (GET_CODE (disp) != SYMBOL_REF
16240 || !ix86_legitimate_constant_p (Pmode, disp)))
16241 /* Displacement is not constant. */
16242 return false;
16243 else if (TARGET_64BIT
16244 && !x86_64_immediate_operand (disp, VOIDmode))
16245 /* Displacement is out of range. */
16246 return false;
16247 /* In x32 mode, constant addresses are sign extended to 64bit, so
16248 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16249 else if (TARGET_X32 && !(index || base)
16250 && CONST_INT_P (disp)
16251 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16252 return false;
16255 /* Everything looks valid. */
16256 return true;
16259 /* Determine if a given RTX is a valid constant address. */
16261 bool
16262 constant_address_p (rtx x)
16264 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16267 /* Return a unique alias set for the GOT. */
16269 static alias_set_type
16270 ix86_GOT_alias_set (void)
16272 static alias_set_type set = -1;
16273 if (set == -1)
16274 set = new_alias_set ();
16275 return set;
16278 /* Return a legitimate reference for ORIG (an address) using the
16279 register REG. If REG is 0, a new pseudo is generated.
16281 There are two types of references that must be handled:
16283 1. Global data references must load the address from the GOT, via
16284 the PIC reg. An insn is emitted to do this load, and the reg is
16285 returned.
16287 2. Static data references, constant pool addresses, and code labels
16288 compute the address as an offset from the GOT, whose base is in
16289 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16290 differentiate them from global data objects. The returned
16291 address is the PIC reg + an unspec constant.
16293 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16294 reg also appears in the address. */
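/* Roughly, the shapes built below are (a sketch of the 32-bit case):
     global data:  (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT))))
     local data:   (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))
   The 64-bit small model uses an UNSPEC_GOTPCREL memory reference
   instead, and 64-bit PE-COFF uses a direct UNSPEC_PCREL constant.  */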
16296 static rtx
16297 legitimize_pic_address (rtx orig, rtx reg)
16299 rtx addr = orig;
16300 rtx new_rtx = orig;
16302 #if TARGET_MACHO
16303 if (TARGET_MACHO && !TARGET_64BIT)
16305 if (reg == 0)
16306 reg = gen_reg_rtx (Pmode);
16307 /* Use the generic Mach-O PIC machinery. */
16308 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16310 #endif
16312 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16314 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16315 if (tmp)
16316 return tmp;
16319 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16320 new_rtx = addr;
16321 else if ((!TARGET_64BIT
16322 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16323 && !TARGET_PECOFF
16324 && gotoff_operand (addr, Pmode))
16326 /* This symbol may be referenced via a displacement
16327 from the PIC base address (@GOTOFF). */
16328 if (GET_CODE (addr) == CONST)
16329 addr = XEXP (addr, 0);
16331 if (GET_CODE (addr) == PLUS)
16333 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16334 UNSPEC_GOTOFF);
16335 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16337 else
16338 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16340 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16342 if (TARGET_64BIT)
16343 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16345 if (reg != 0)
16347 gcc_assert (REG_P (reg));
16348 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16349 new_rtx, reg, 1, OPTAB_DIRECT);
16351 else
16352 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16354 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16355 /* We can't use @GOTOFF for text labels
16356 on VxWorks, see gotoff_operand. */
16357 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16359 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16360 if (tmp)
16361 return tmp;
16363 /* For x64 PE-COFF there is no GOT table,
16364 so we use address directly. */
16365 if (TARGET_64BIT && TARGET_PECOFF)
16367 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16368 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16370 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16372 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16373 UNSPEC_GOTPCREL);
16374 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16375 new_rtx = gen_const_mem (Pmode, new_rtx);
16376 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16378 else
16380 /* This symbol must be referenced via a load
16381 from the Global Offset Table (@GOT). */
16382 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16383 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16384 if (TARGET_64BIT)
16385 new_rtx = force_reg (Pmode, new_rtx);
16386 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16387 new_rtx = gen_const_mem (Pmode, new_rtx);
16388 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16391 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16393 else
16395 if (CONST_INT_P (addr)
16396 && !x86_64_immediate_operand (addr, VOIDmode))
16397 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16398 else if (GET_CODE (addr) == CONST)
16400 addr = XEXP (addr, 0);
16402 /* We must match stuff we generate before. Assume the only
16403 unspecs that can get here are ours. Not that we could do
16404 anything with them anyway.... */
16405 if (GET_CODE (addr) == UNSPEC
16406 || (GET_CODE (addr) == PLUS
16407 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16408 return orig;
16409 gcc_assert (GET_CODE (addr) == PLUS);
16412 if (GET_CODE (addr) == PLUS)
16414 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16416 /* Check first to see if this is a constant
16417 offset from a @GOTOFF symbol reference. */
16418 if (!TARGET_PECOFF
16419 && gotoff_operand (op0, Pmode)
16420 && CONST_INT_P (op1))
16422 if (!TARGET_64BIT)
16424 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16425 UNSPEC_GOTOFF);
16426 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16427 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16429 if (reg != 0)
16431 gcc_assert (REG_P (reg));
16432 new_rtx = expand_simple_binop (Pmode, PLUS,
16433 pic_offset_table_rtx,
16434 new_rtx, reg, 1,
16435 OPTAB_DIRECT);
16437 else
16438 new_rtx
16439 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16441 else
16443 if (INTVAL (op1) < -16*1024*1024
16444 || INTVAL (op1) >= 16*1024*1024)
16446 if (!x86_64_immediate_operand (op1, Pmode))
16447 op1 = force_reg (Pmode, op1);
16449 new_rtx
16450 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16454 else
16456 rtx base = legitimize_pic_address (op0, reg);
16457 machine_mode mode = GET_MODE (base);
16458 new_rtx
16459 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16461 if (CONST_INT_P (new_rtx))
16463 if (INTVAL (new_rtx) < -16*1024*1024
16464 || INTVAL (new_rtx) >= 16*1024*1024)
16466 if (!x86_64_immediate_operand (new_rtx, mode))
16467 new_rtx = force_reg (mode, new_rtx);
16469 new_rtx
16470 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16472 else
16473 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16475 else
16477 /* For %rip addressing, we have to use
16478 just disp32, neither base nor index. */
16479 if (TARGET_64BIT
16480 && (GET_CODE (base) == SYMBOL_REF
16481 || GET_CODE (base) == LABEL_REF))
16482 base = force_reg (mode, base);
16483 if (GET_CODE (new_rtx) == PLUS
16484 && CONSTANT_P (XEXP (new_rtx, 1)))
16486 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16487 new_rtx = XEXP (new_rtx, 1);
16489 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16494 return new_rtx;
16497 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16499 static rtx
16500 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16502 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16504 if (GET_MODE (tp) != tp_mode)
16506 gcc_assert (GET_MODE (tp) == SImode);
16507 gcc_assert (tp_mode == DImode);
16509 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16512 if (to_reg)
16513 tp = copy_to_mode_reg (tp_mode, tp);
16515 return tp;
16518 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16520 static GTY(()) rtx ix86_tls_symbol;
16522 static rtx
16523 ix86_tls_get_addr (void)
16525 if (!ix86_tls_symbol)
16527 const char *sym
16528 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16529 ? "___tls_get_addr" : "__tls_get_addr");
16531 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16534 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16536 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16537 UNSPEC_PLTOFF);
16538 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16539 gen_rtx_CONST (Pmode, unspec));
16542 return ix86_tls_symbol;
16545 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16547 static GTY(()) rtx ix86_tls_module_base_symbol;
16550 ix86_tls_module_base (void)
16552 if (!ix86_tls_module_base_symbol)
16554 ix86_tls_module_base_symbol
16555 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16557 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16558 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16561 return ix86_tls_module_base_symbol;
16564 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16565 false if we expect this to be used for a memory address and true if
16566 we expect to load the address into a register. */
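/* A condensed summary of what each model below builds (see the
   individual cases for the authoritative details):
     GLOBAL_DYNAMIC: a call to __tls_get_addr (or the GNU2 descriptor
                     form) that returns the address directly.
     LOCAL_DYNAMIC:  one base call (UNSPEC_TLS_LD_BASE) plus a
                     per-symbol UNSPEC_DTPOFF offset.
     INITIAL_EXEC:   a GOT load of the offset (UNSPEC_GOTTPOFF /
                     UNSPEC_GOTNTPOFF / UNSPEC_INDNTPOFF) combined with
                     the thread pointer.
     LOCAL_EXEC:     a constant UNSPEC_NTPOFF / UNSPEC_TPOFF offset
                     combined with the thread pointer.  */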
16568 static rtx
16569 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16571 rtx dest, base, off;
16572 rtx pic = NULL_RTX, tp = NULL_RTX;
16573 machine_mode tp_mode = Pmode;
16574 int type;
16576 /* Fall back to the global dynamic model if the toolchain cannot
16577 support local dynamic. */
16578 if (TARGET_SUN_TLS && !TARGET_64BIT
16579 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16580 && model == TLS_MODEL_LOCAL_DYNAMIC)
16581 model = TLS_MODEL_GLOBAL_DYNAMIC;
16583 switch (model)
16585 case TLS_MODEL_GLOBAL_DYNAMIC:
16586 dest = gen_reg_rtx (Pmode);
16588 if (!TARGET_64BIT)
16590 if (flag_pic && !TARGET_PECOFF)
16591 pic = pic_offset_table_rtx;
16592 else
16594 pic = gen_reg_rtx (Pmode);
16595 emit_insn (gen_set_got (pic));
16599 if (TARGET_GNU2_TLS)
16601 if (TARGET_64BIT)
16602 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16603 else
16604 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16606 tp = get_thread_pointer (Pmode, true);
16607 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16609 if (GET_MODE (x) != Pmode)
16610 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16612 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16614 else
16616 rtx caddr = ix86_tls_get_addr ();
16618 if (TARGET_64BIT)
16620 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16621 rtx_insn *insns;
16623 start_sequence ();
16624 emit_call_insn
16625 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16626 insns = get_insns ();
16627 end_sequence ();
16629 if (GET_MODE (x) != Pmode)
16630 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16632 RTL_CONST_CALL_P (insns) = 1;
16633 emit_libcall_block (insns, dest, rax, x);
16635 else
16636 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16638 break;
16640 case TLS_MODEL_LOCAL_DYNAMIC:
16641 base = gen_reg_rtx (Pmode);
16643 if (!TARGET_64BIT)
16645 if (flag_pic)
16646 pic = pic_offset_table_rtx;
16647 else
16649 pic = gen_reg_rtx (Pmode);
16650 emit_insn (gen_set_got (pic));
16654 if (TARGET_GNU2_TLS)
16656 rtx tmp = ix86_tls_module_base ();
16658 if (TARGET_64BIT)
16659 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16660 else
16661 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16663 tp = get_thread_pointer (Pmode, true);
16664 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16665 gen_rtx_MINUS (Pmode, tmp, tp));
16667 else
16669 rtx caddr = ix86_tls_get_addr ();
16671 if (TARGET_64BIT)
16673 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16674 rtx_insn *insns;
16675 rtx eqv;
16677 start_sequence ();
16678 emit_call_insn
16679 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16680 insns = get_insns ();
16681 end_sequence ();
16683 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16684 share the LD_BASE result with other LD model accesses. */
16685 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16686 UNSPEC_TLS_LD_BASE);
16688 RTL_CONST_CALL_P (insns) = 1;
16689 emit_libcall_block (insns, base, rax, eqv);
16691 else
16692 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16695 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16696 off = gen_rtx_CONST (Pmode, off);
16698 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16700 if (TARGET_GNU2_TLS)
16702 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16704 if (GET_MODE (x) != Pmode)
16705 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16707 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16709 break;
16711 case TLS_MODEL_INITIAL_EXEC:
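/* Initial exec: the variable's offset from the thread pointer is
   loaded from the GOT (@gottpoff/@gotntpoff, or @indntpoff for
   non-PIC 32-bit GNU TLS) and combined with the thread pointer in
   %fs/%gs, e.g. on x86-64
     movq x@gottpoff(%rip), %reg
   with the access then using %fs:(%reg).  */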
16712 if (TARGET_64BIT)
16714 if (TARGET_SUN_TLS && !TARGET_X32)
16716 /* The Sun linker took the AMD64 TLS spec literally
16717 and can only handle %rax as destination of the
16718 initial executable code sequence. */
16720 dest = gen_reg_rtx (DImode);
16721 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16722 return dest;
16725 /* Generate DImode references to avoid %fs:(%reg32)
16726 problems and the linker IE->LE relaxation bug. */
16727 tp_mode = DImode;
16728 pic = NULL;
16729 type = UNSPEC_GOTNTPOFF;
16731 else if (flag_pic)
16733 pic = pic_offset_table_rtx;
16734 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16736 else if (!TARGET_ANY_GNU_TLS)
16738 pic = gen_reg_rtx (Pmode);
16739 emit_insn (gen_set_got (pic));
16740 type = UNSPEC_GOTTPOFF;
16742 else
16744 pic = NULL;
16745 type = UNSPEC_INDNTPOFF;
16748 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16749 off = gen_rtx_CONST (tp_mode, off);
16750 if (pic)
16751 off = gen_rtx_PLUS (tp_mode, pic, off);
16752 off = gen_const_mem (tp_mode, off);
16753 set_mem_alias_set (off, ix86_GOT_alias_set ());
16755 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16757 base = get_thread_pointer (tp_mode,
16758 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16759 off = force_reg (tp_mode, off);
16760 dest = gen_rtx_PLUS (tp_mode, base, off);
16761 if (tp_mode != Pmode)
16762 dest = convert_to_mode (Pmode, dest, 1);
16764 else
16766 base = get_thread_pointer (Pmode, true);
16767 dest = gen_reg_rtx (Pmode);
16768 emit_insn (ix86_gen_sub3 (dest, base, off));
16770 break;
16772 case TLS_MODEL_LOCAL_EXEC:
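/* Local exec: the offset from the thread pointer is known at link
   time, so the address is simply the thread pointer plus x@tpoff
   (or @ntpoff); with direct segment references a load can be emitted
   as e.g. "movq %fs:x@tpoff, %reg" on x86-64.  For the non-GNU 32-bit
   model the positive @tpoff value is subtracted from the thread
   pointer instead.  */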
16773 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16774 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16775 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16776 off = gen_rtx_CONST (Pmode, off);
16778 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16780 base = get_thread_pointer (Pmode,
16781 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16782 return gen_rtx_PLUS (Pmode, base, off);
16784 else
16786 base = get_thread_pointer (Pmode, true);
16787 dest = gen_reg_rtx (Pmode);
16788 emit_insn (ix86_gen_sub3 (dest, base, off));
16790 break;
16792 default:
16793 gcc_unreachable ();
16796 return dest;
16799 /* Return true if OP refers to a TLS address. */
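/* That is, return true if OP contains a MEM whose address mentions the
   thread pointer UNSPEC_TP.  Such instructions are candidates for
   ix86_rewrite_tls_address below, which turns the explicit thread
   pointer term into a %fs/%gs address space override.  */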
16800 bool
16801 ix86_tls_address_pattern_p (rtx op)
16803 subrtx_var_iterator::array_type array;
16804 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16806 rtx op = *iter;
16807 if (MEM_P (op))
16809 rtx *x = &XEXP (op, 0);
16810 while (GET_CODE (*x) == PLUS)
16812 int i;
16813 for (i = 0; i < 2; i++)
16815 rtx u = XEXP (*x, i);
16816 if (GET_CODE (u) == ZERO_EXTEND)
16817 u = XEXP (u, 0);
16818 if (GET_CODE (u) == UNSPEC
16819 && XINT (u, 1) == UNSPEC_TP)
16820 return true;
16822 x = &XEXP (*x, 0);
16825 iter.skip_subrtxes ();
16829 return false;
16832 /* Rewrite *LOC so that it refers to the default TLS address space. */
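/* Concretely, drop the explicit UNSPEC_TP term from the address and
   mark the MEM as living in DEFAULT_TLS_SEG_REG, so that it is printed
   with a plain %fs:/%gs: segment override instead of a separate thread
   pointer addition.  */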
16833 void
16834 ix86_rewrite_tls_address_1 (rtx *loc)
16836 subrtx_ptr_iterator::array_type array;
16837 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16839 rtx *loc = *iter;
16840 if (MEM_P (*loc))
16842 rtx addr = XEXP (*loc, 0);
16843 rtx *x = &addr;
16844 while (GET_CODE (*x) == PLUS)
16846 int i;
16847 for (i = 0; i < 2; i++)
16849 rtx u = XEXP (*x, i);
16850 if (GET_CODE (u) == ZERO_EXTEND)
16851 u = XEXP (u, 0);
16852 if (GET_CODE (u) == UNSPEC
16853 && XINT (u, 1) == UNSPEC_TP)
16855 addr_space_t as = DEFAULT_TLS_SEG_REG;
16857 *x = XEXP (*x, 1 - i);
16859 *loc = replace_equiv_address_nv (*loc, addr, true);
16860 set_mem_addr_space (*loc, as);
16861 return;
16864 x = &XEXP (*x, 0);
16867 iter.skip_subrtxes ();
16872 /* Rewrite an instruction pattern involving a TLS address
16873 so that it refers to the default TLS address space. */
16875 ix86_rewrite_tls_address (rtx pattern)
16877 pattern = copy_insn (pattern);
16878 ix86_rewrite_tls_address_1 (&pattern);
16879 return pattern;
16882 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16883 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16884 unique refptr-DECL symbol corresponding to symbol DECL. */
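/* Roughly speaking, __imp_DECL names the import address table slot
   that the dynamic loader fills in with DECL's address, so dllimport
   references become loads through that pointer; refptr-DECL names a
   locally emitted pointer stub used to reach external symbols under
   the medium/large PIC code models on PE-COFF targets.  */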
16886 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16888 static inline hashval_t hash (tree_map *m) { return m->hash; }
16889 static inline bool
16890 equal (tree_map *a, tree_map *b)
16892 return a->base.from == b->base.from;
16895 static int
16896 keep_cache_entry (tree_map *&m)
16898 return ggc_marked_p (m->base.from);
16902 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16904 static tree
16905 get_dllimport_decl (tree decl, bool beimport)
16907 struct tree_map *h, in;
16908 const char *name;
16909 const char *prefix;
16910 size_t namelen, prefixlen;
16911 char *imp_name;
16912 tree to;
16913 rtx rtl;
16915 if (!dllimport_map)
16916 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16918 in.hash = htab_hash_pointer (decl);
16919 in.base.from = decl;
16920 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16921 h = *loc;
16922 if (h)
16923 return h->to;
16925 *loc = h = ggc_alloc<tree_map> ();
16926 h->hash = in.hash;
16927 h->base.from = decl;
16928 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16929 VAR_DECL, NULL, ptr_type_node);
16930 DECL_ARTIFICIAL (to) = 1;
16931 DECL_IGNORED_P (to) = 1;
16932 DECL_EXTERNAL (to) = 1;
16933 TREE_READONLY (to) = 1;
16935 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16936 name = targetm.strip_name_encoding (name);
16937 if (beimport)
16938 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16939 ? "*__imp_" : "*__imp__";
16940 else
16941 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16942 namelen = strlen (name);
16943 prefixlen = strlen (prefix);
16944 imp_name = (char *) alloca (namelen + prefixlen + 1);
16945 memcpy (imp_name, prefix, prefixlen);
16946 memcpy (imp_name + prefixlen, name, namelen + 1);
16948 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16949 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16950 SET_SYMBOL_REF_DECL (rtl, to);
16951 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16952 if (!beimport)
16954 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16955 #ifdef SUB_TARGET_RECORD_STUB
16956 SUB_TARGET_RECORD_STUB (name);
16957 #endif
16960 rtl = gen_const_mem (Pmode, rtl);
16961 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16963 SET_DECL_RTL (to, rtl);
16964 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16966 return to;
16969 /* Expand SYMBOL into its corresponding far-address symbol.
16970 WANT_REG is true if we require the result be a register. */
16972 static rtx
16973 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16975 tree imp_decl;
16976 rtx x;
16978 gcc_assert (SYMBOL_REF_DECL (symbol));
16979 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16981 x = DECL_RTL (imp_decl);
16982 if (want_reg)
16983 x = force_reg (Pmode, x);
16984 return x;
16987 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16988 true if we require the result be a register. */
16990 static rtx
16991 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16993 tree imp_decl;
16994 rtx x;
16996 gcc_assert (SYMBOL_REF_DECL (symbol));
16997 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16999 x = DECL_RTL (imp_decl);
17000 if (want_reg)
17001 x = force_reg (Pmode, x);
17002 return x;
17005 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
17006 is true if we require the result be a register. */
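/* Returns NULL_RTX when no PE-COFF specific rewriting applies, in
   which case the caller falls back to the generic legitimization
   code.  */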
17008 static rtx
17009 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17011 if (!TARGET_PECOFF)
17012 return NULL_RTX;
17014 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17016 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17017 return legitimize_dllimport_symbol (addr, inreg);
17018 if (GET_CODE (addr) == CONST
17019 && GET_CODE (XEXP (addr, 0)) == PLUS
17020 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17021 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17023 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17024 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17028 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17029 return NULL_RTX;
17030 if (GET_CODE (addr) == SYMBOL_REF
17031 && !is_imported_p (addr)
17032 && SYMBOL_REF_EXTERNAL_P (addr)
17033 && SYMBOL_REF_DECL (addr))
17034 return legitimize_pe_coff_extern_decl (addr, inreg);
17036 if (GET_CODE (addr) == CONST
17037 && GET_CODE (XEXP (addr, 0)) == PLUS
17038 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17039 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17040 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17041 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17043 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17044 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17046 return NULL_RTX;
17049 /* Try machine-dependent ways of modifying an illegitimate address
17050 to be legitimate. If we find one, return the new, valid address.
17051 This macro is used in only one place: `memory_address' in explow.c.
17053 OLDX is the address as it was before break_out_memory_refs was called.
17054 In some cases it is useful to look at this to decide what needs to be done.
17056 It is always safe for this macro to do nothing. It exists to recognize
17057 opportunities to optimize the output.
17059 For the 80386, we handle X+REG by loading X into a register R and
17060 using R+REG. R will go in a general reg and indexing will be used.
17061 However, if REG is a broken-out memory address or multiplication,
17062 nothing needs to be done because REG can certainly go in a general reg.
17064 When -fpic is used, special handling is needed for symbolic references.
17065 See comments by legitimize_pic_address in i386.c for details. */
17067 static rtx
17068 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17070 bool changed = false;
17071 unsigned log;
17073 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17074 if (log)
17075 return legitimize_tls_address (x, (enum tls_model) log, false);
17076 if (GET_CODE (x) == CONST
17077 && GET_CODE (XEXP (x, 0)) == PLUS
17078 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17079 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17081 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17082 (enum tls_model) log, false);
17083 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17086 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17088 rtx tmp = legitimize_pe_coff_symbol (x, true);
17089 if (tmp)
17090 return tmp;
17093 if (flag_pic && SYMBOLIC_CONST (x))
17094 return legitimize_pic_address (x, 0);
17096 #if TARGET_MACHO
17097 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17098 return machopic_indirect_data_reference (x, 0);
17099 #endif
17101 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
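/* E.g. (ashift r 2) becomes (mult r 4), which matches the scaled index
   part of an x86 effective address such as (%base,%index,4).  */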
17102 if (GET_CODE (x) == ASHIFT
17103 && CONST_INT_P (XEXP (x, 1))
17104 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17106 changed = true;
17107 log = INTVAL (XEXP (x, 1));
17108 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17109 GEN_INT (1 << log));
17112 if (GET_CODE (x) == PLUS)
17114 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17116 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17117 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17118 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17120 changed = true;
17121 log = INTVAL (XEXP (XEXP (x, 0), 1));
17122 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17123 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17124 GEN_INT (1 << log));
17127 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17128 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17129 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17131 changed = true;
17132 log = INTVAL (XEXP (XEXP (x, 1), 1));
17133 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17134 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17135 GEN_INT (1 << log));
17138 /* Put multiply first if it isn't already. */
17139 if (GET_CODE (XEXP (x, 1)) == MULT)
17141 std::swap (XEXP (x, 0), XEXP (x, 1));
17142 changed = true;
17145 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17146 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17147 created by virtual register instantiation, register elimination, and
17148 similar optimizations. */
17149 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17151 changed = true;
17152 x = gen_rtx_PLUS (Pmode,
17153 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17154 XEXP (XEXP (x, 1), 0)),
17155 XEXP (XEXP (x, 1), 1));
17158 /* Canonicalize
17159 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17160 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17161 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17162 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17163 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17164 && CONSTANT_P (XEXP (x, 1)))
17166 rtx constant;
17167 rtx other = NULL_RTX;
17169 if (CONST_INT_P (XEXP (x, 1)))
17171 constant = XEXP (x, 1);
17172 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17174 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17176 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17177 other = XEXP (x, 1);
17179 else
17180 constant = 0;
17182 if (constant)
17184 changed = true;
17185 x = gen_rtx_PLUS (Pmode,
17186 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17187 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17188 plus_constant (Pmode, other,
17189 INTVAL (constant)));
17193 if (changed && ix86_legitimate_address_p (mode, x, false))
17194 return x;
17196 if (GET_CODE (XEXP (x, 0)) == MULT)
17198 changed = true;
17199 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17202 if (GET_CODE (XEXP (x, 1)) == MULT)
17204 changed = true;
17205 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17208 if (changed
17209 && REG_P (XEXP (x, 1))
17210 && REG_P (XEXP (x, 0)))
17211 return x;
17213 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17215 changed = true;
17216 x = legitimize_pic_address (x, 0);
17219 if (changed && ix86_legitimate_address_p (mode, x, false))
17220 return x;
17222 if (REG_P (XEXP (x, 0)))
17224 rtx temp = gen_reg_rtx (Pmode);
17225 rtx val = force_operand (XEXP (x, 1), temp);
17226 if (val != temp)
17228 val = convert_to_mode (Pmode, val, 1);
17229 emit_move_insn (temp, val);
17232 XEXP (x, 1) = temp;
17233 return x;
17236 else if (REG_P (XEXP (x, 1)))
17238 rtx temp = gen_reg_rtx (Pmode);
17239 rtx val = force_operand (XEXP (x, 0), temp);
17240 if (val != temp)
17242 val = convert_to_mode (Pmode, val, 1);
17243 emit_move_insn (temp, val);
17246 XEXP (x, 0) = temp;
17247 return x;
17251 return x;
17254 /* Print an integer constant expression in assembler syntax. Addition
17255 and subtraction are the only arithmetic that may appear in these
17256 expressions. FILE is the stdio stream to write to, X is the rtx, and
17257 CODE is the operand print code from the output string. */
17259 static void
17260 output_pic_addr_const (FILE *file, rtx x, int code)
17262 char buf[256];
17264 switch (GET_CODE (x))
17266 case PC:
17267 gcc_assert (flag_pic);
17268 putc ('.', file);
17269 break;
17271 case SYMBOL_REF:
17272 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17273 output_addr_const (file, x);
17274 else
17276 const char *name = XSTR (x, 0);
17278 /* Mark the decl as referenced so that cgraph will
17279 output the function. */
17280 if (SYMBOL_REF_DECL (x))
17281 mark_decl_referenced (SYMBOL_REF_DECL (x));
17283 #if TARGET_MACHO
17284 if (MACHOPIC_INDIRECT
17285 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17286 name = machopic_indirection_name (x, /*stub_p=*/true);
17287 #endif
17288 assemble_name (file, name);
17290 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17291 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17292 fputs ("@PLT", file);
17293 break;
17295 case LABEL_REF:
17296 x = XEXP (x, 0);
17297 /* FALLTHRU */
17298 case CODE_LABEL:
17299 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17300 assemble_name (asm_out_file, buf);
17301 break;
17303 case CONST_INT:
17304 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17305 break;
17307 case CONST:
17308 /* This used to output parentheses around the expression,
17309 but that does not work on the 386 (either ATT or BSD assembler). */
17310 output_pic_addr_const (file, XEXP (x, 0), code);
17311 break;
17313 case CONST_DOUBLE:
17314 /* We can't handle floating point constants;
17315 TARGET_PRINT_OPERAND must handle them. */
17316 output_operand_lossage ("floating constant misused");
17317 break;
17319 case PLUS:
17320 /* Some assemblers need integer constants to appear first. */
17321 if (CONST_INT_P (XEXP (x, 0)))
17323 output_pic_addr_const (file, XEXP (x, 0), code);
17324 putc ('+', file);
17325 output_pic_addr_const (file, XEXP (x, 1), code);
17327 else
17329 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17330 output_pic_addr_const (file, XEXP (x, 1), code);
17331 putc ('+', file);
17332 output_pic_addr_const (file, XEXP (x, 0), code);
17334 break;
17336 case MINUS:
17337 if (!TARGET_MACHO)
17338 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17339 output_pic_addr_const (file, XEXP (x, 0), code);
17340 putc ('-', file);
17341 output_pic_addr_const (file, XEXP (x, 1), code);
17342 if (!TARGET_MACHO)
17343 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17344 break;
17346 case UNSPEC:
17347 gcc_assert (XVECLEN (x, 0) == 1);
17348 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17349 switch (XINT (x, 1))
17351 case UNSPEC_GOT:
17352 fputs ("@GOT", file);
17353 break;
17354 case UNSPEC_GOTOFF:
17355 fputs ("@GOTOFF", file);
17356 break;
17357 case UNSPEC_PLTOFF:
17358 fputs ("@PLTOFF", file);
17359 break;
17360 case UNSPEC_PCREL:
17361 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17362 "(%rip)" : "[rip]", file);
17363 break;
17364 case UNSPEC_GOTPCREL:
17365 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17366 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17367 break;
17368 case UNSPEC_GOTTPOFF:
17369 /* FIXME: This might be @TPOFF in Sun ld too. */
17370 fputs ("@gottpoff", file);
17371 break;
17372 case UNSPEC_TPOFF:
17373 fputs ("@tpoff", file);
17374 break;
17375 case UNSPEC_NTPOFF:
17376 if (TARGET_64BIT)
17377 fputs ("@tpoff", file);
17378 else
17379 fputs ("@ntpoff", file);
17380 break;
17381 case UNSPEC_DTPOFF:
17382 fputs ("@dtpoff", file);
17383 break;
17384 case UNSPEC_GOTNTPOFF:
17385 if (TARGET_64BIT)
17386 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17387 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17388 else
17389 fputs ("@gotntpoff", file);
17390 break;
17391 case UNSPEC_INDNTPOFF:
17392 fputs ("@indntpoff", file);
17393 break;
17394 #if TARGET_MACHO
17395 case UNSPEC_MACHOPIC_OFFSET:
17396 putc ('-', file);
17397 machopic_output_function_base_name (file);
17398 break;
17399 #endif
17400 default:
17401 output_operand_lossage ("invalid UNSPEC as operand");
17402 break;
17404 break;
17406 default:
17407 output_operand_lossage ("invalid expression as operand");
17411 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17412 We need to emit DTP-relative relocations. */
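/* The value is emitted as e.g. ".long x@dtpoff"; for SIZE == 8 a
   trailing ", 0" supplies the zero upper half, since the .long
   relocation itself only covers the low 32 bits.  */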
17414 static void ATTRIBUTE_UNUSED
17415 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17417 fputs (ASM_LONG, file);
17418 output_addr_const (file, x);
17419 fputs ("@dtpoff", file);
17420 switch (size)
17422 case 4:
17423 break;
17424 case 8:
17425 fputs (", 0", file);
17426 break;
17427 default:
17428 gcc_unreachable ();
17432 /* Return true if X is a representation of the PIC register. This copes
17433 with calls from ix86_find_base_term, where the register might have
17434 been replaced by a cselib value. */
17436 static bool
17437 ix86_pic_register_p (rtx x)
17439 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17440 return (pic_offset_table_rtx
17441 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17442 else if (!REG_P (x))
17443 return false;
17444 else if (pic_offset_table_rtx)
17446 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17447 return true;
17448 if (HARD_REGISTER_P (x)
17449 && !HARD_REGISTER_P (pic_offset_table_rtx)
17450 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17451 return true;
17452 return false;
17454 else
17455 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17458 /* Helper function for ix86_delegitimize_address.
17459 Attempt to delegitimize TLS local-exec accesses. */
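/* I.e. recognize a %fs:/%gs: based address whose displacement is an
   UNSPEC_NTPOFF and hand back the underlying SYMBOL_REF (plus any
   base, index and constant parts), which is what the debug info and
   alias analysis code want to see.  */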
17461 static rtx
17462 ix86_delegitimize_tls_address (rtx orig_x)
17464 rtx x = orig_x, unspec;
17465 struct ix86_address addr;
17467 if (!TARGET_TLS_DIRECT_SEG_REFS)
17468 return orig_x;
17469 if (MEM_P (x))
17470 x = XEXP (x, 0);
17471 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17472 return orig_x;
17473 if (ix86_decompose_address (x, &addr) == 0
17474 || addr.seg != DEFAULT_TLS_SEG_REG
17475 || addr.disp == NULL_RTX
17476 || GET_CODE (addr.disp) != CONST)
17477 return orig_x;
17478 unspec = XEXP (addr.disp, 0);
17479 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17480 unspec = XEXP (unspec, 0);
17481 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17482 return orig_x;
17483 x = XVECEXP (unspec, 0, 0);
17484 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17485 if (unspec != XEXP (addr.disp, 0))
17486 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17487 if (addr.index)
17489 rtx idx = addr.index;
17490 if (addr.scale != 1)
17491 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17492 x = gen_rtx_PLUS (Pmode, idx, x);
17494 if (addr.base)
17495 x = gen_rtx_PLUS (Pmode, addr.base, x);
17496 if (MEM_P (orig_x))
17497 x = replace_equiv_address_nv (orig_x, x);
17498 return x;
17501 /* In the name of slightly smaller debug output, and to cater to
17502 general assembler lossage, recognize PIC+GOTOFF and turn it back
17503 into a direct symbol reference.
17505 On Darwin, this is necessary to avoid a crash, because Darwin
17506 has a different PIC label for each routine but the DWARF debugging
17507 information is not associated with any particular routine, so it's
17508 necessary to remove references to the PIC label from RTL stored by
17509 the DWARF output code.
17511 This helper is used in the normal ix86_delegitimize_address
17512 entrypoint (e.g. used in the target delegitimization hook) and
17513 in ix86_find_base_term. As compile time memory optimization, we
17514 avoid allocating rtxes that will not change anything on the outcome
17515 of the callers (find_base_value and find_base_term). */
17517 static inline rtx
17518 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17520 rtx orig_x = delegitimize_mem_from_attrs (x);
17521 /* addend is NULL or some rtx if x is something+GOTOFF where
17522 something doesn't include the PIC register. */
17523 rtx addend = NULL_RTX;
17524 /* reg_addend is NULL or a multiple of some register. */
17525 rtx reg_addend = NULL_RTX;
17526 /* const_addend is NULL or a const_int. */
17527 rtx const_addend = NULL_RTX;
17528 /* This is the result, or NULL. */
17529 rtx result = NULL_RTX;
17531 x = orig_x;
17533 if (MEM_P (x))
17534 x = XEXP (x, 0);
17536 if (TARGET_64BIT)
17538 if (GET_CODE (x) == CONST
17539 && GET_CODE (XEXP (x, 0)) == PLUS
17540 && GET_MODE (XEXP (x, 0)) == Pmode
17541 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17542 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17543 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17545 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17546 base. A CONST can't be arg_pointer_rtx based. */
17547 if (base_term_p && MEM_P (orig_x))
17548 return orig_x;
17549 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17550 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17551 if (MEM_P (orig_x))
17552 x = replace_equiv_address_nv (orig_x, x);
17553 return x;
17556 if (GET_CODE (x) == CONST
17557 && GET_CODE (XEXP (x, 0)) == UNSPEC
17558 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17559 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17560 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17562 x = XVECEXP (XEXP (x, 0), 0, 0);
17563 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17565 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17566 if (x == NULL_RTX)
17567 return orig_x;
17569 return x;
17572 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17573 return ix86_delegitimize_tls_address (orig_x);
17575 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17576 and -mcmodel=medium -fpic. */
17579 if (GET_CODE (x) != PLUS
17580 || GET_CODE (XEXP (x, 1)) != CONST)
17581 return ix86_delegitimize_tls_address (orig_x);
17583 if (ix86_pic_register_p (XEXP (x, 0)))
17584 /* %ebx + GOT/GOTOFF */
17586 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17588 /* %ebx + %reg * scale + GOT/GOTOFF */
17589 reg_addend = XEXP (x, 0);
17590 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17591 reg_addend = XEXP (reg_addend, 1);
17592 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17593 reg_addend = XEXP (reg_addend, 0);
17594 else
17596 reg_addend = NULL_RTX;
17597 addend = XEXP (x, 0);
17600 else
17601 addend = XEXP (x, 0);
17603 x = XEXP (XEXP (x, 1), 0);
17604 if (GET_CODE (x) == PLUS
17605 && CONST_INT_P (XEXP (x, 1)))
17607 const_addend = XEXP (x, 1);
17608 x = XEXP (x, 0);
17611 if (GET_CODE (x) == UNSPEC
17612 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17613 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17614 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17615 && !MEM_P (orig_x) && !addend)))
17616 result = XVECEXP (x, 0, 0);
17618 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17619 && !MEM_P (orig_x))
17620 result = XVECEXP (x, 0, 0);
17622 if (! result)
17623 return ix86_delegitimize_tls_address (orig_x);
17625 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17626 recurse on the first operand. */
17627 if (const_addend && !base_term_p)
17628 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17629 if (reg_addend)
17630 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17631 if (addend)
17633 /* If the rest of original X doesn't involve the PIC register, add
17634 addend and subtract pic_offset_table_rtx. This can happen e.g.
17635 for code like:
17636 leal (%ebx, %ecx, 4), %ecx
17638 movl foo@GOTOFF(%ecx), %edx
17639 in which case we return (%ecx - %ebx) + foo
17640 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17641 and reload has completed. Don't do the latter for debug,
17642 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17643 if (pic_offset_table_rtx
17644 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17645 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17646 pic_offset_table_rtx),
17647 result);
17648 else if (base_term_p
17649 && pic_offset_table_rtx
17650 && !TARGET_MACHO
17651 && !TARGET_VXWORKS_RTP)
17653 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17654 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17655 result = gen_rtx_PLUS (Pmode, tmp, result);
17657 else
17658 return orig_x;
17660 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17662 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17663 if (result == NULL_RTX)
17664 return orig_x;
17666 return result;
17669 /* The normal instantiation of the above template. */
17671 static rtx
17672 ix86_delegitimize_address (rtx x)
17674 return ix86_delegitimize_address_1 (x, false);
17677 /* If X is a machine specific address (i.e. a symbol or label being
17678 referenced as a displacement from the GOT implemented using an
17679 UNSPEC), then return the base term. Otherwise return X. */
17682 ix86_find_base_term (rtx x)
17684 rtx term;
17686 if (TARGET_64BIT)
17688 if (GET_CODE (x) != CONST)
17689 return x;
17690 term = XEXP (x, 0);
17691 if (GET_CODE (term) == PLUS
17692 && CONST_INT_P (XEXP (term, 1)))
17693 term = XEXP (term, 0);
17694 if (GET_CODE (term) != UNSPEC
17695 || (XINT (term, 1) != UNSPEC_GOTPCREL
17696 && XINT (term, 1) != UNSPEC_PCREL))
17697 return x;
17699 return XVECEXP (term, 0, 0);
17702 return ix86_delegitimize_address_1 (x, true);
17705 /* Return true if X shouldn't be emitted into the debug info.
17706 Disallow UNSPECs other than @gotoff - we can't emit the _GLOBAL_OFFSET_TABLE_
17707 symbol easily into the .debug_info section, so we do not delegitimize it,
17708 but instead assemble it as @gotoff.
17709 Disallow the _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17710 assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
17712 static bool
17713 ix86_const_not_ok_for_debug_p (rtx x)
17715 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17716 return true;
17718 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17719 return true;
17721 return false;
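/* Output to FILE the instruction suffix (e, ne, s, ns, b, a, l, g, ...)
   for comparison CODE performed in flags mode MODE.  If REVERSE is
   true, print the suffix for the reversed condition.  FP is true when
   the flags were set by a floating point comparison, which selects
   slightly different spellings for some of the suffixes.  */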
17724 static void
17725 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17726 bool fp, FILE *file)
17728 const char *suffix;
17730 if (mode == CCFPmode)
17732 code = ix86_fp_compare_code_to_integer (code);
17733 mode = CCmode;
17735 if (reverse)
17736 code = reverse_condition (code);
17738 switch (code)
17740 case EQ:
17741 gcc_assert (mode != CCGZmode);
17742 switch (mode)
17744 case E_CCAmode:
17745 suffix = "a";
17746 break;
17747 case E_CCCmode:
17748 suffix = "c";
17749 break;
17750 case E_CCOmode:
17751 suffix = "o";
17752 break;
17753 case E_CCPmode:
17754 suffix = "p";
17755 break;
17756 case E_CCSmode:
17757 suffix = "s";
17758 break;
17759 default:
17760 suffix = "e";
17761 break;
17763 break;
17764 case NE:
17765 gcc_assert (mode != CCGZmode);
17766 switch (mode)
17768 case E_CCAmode:
17769 suffix = "na";
17770 break;
17771 case E_CCCmode:
17772 suffix = "nc";
17773 break;
17774 case E_CCOmode:
17775 suffix = "no";
17776 break;
17777 case E_CCPmode:
17778 suffix = "np";
17779 break;
17780 case E_CCSmode:
17781 suffix = "ns";
17782 break;
17783 default:
17784 suffix = "ne";
17785 break;
17787 break;
17788 case GT:
17789 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17790 suffix = "g";
17791 break;
17792 case GTU:
17793 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17794 Those same assemblers have the same but opposite lossage on cmov. */
17795 if (mode == CCmode)
17796 suffix = fp ? "nbe" : "a";
17797 else
17798 gcc_unreachable ();
17799 break;
17800 case LT:
17801 switch (mode)
17803 case E_CCNOmode:
17804 case E_CCGOCmode:
17805 suffix = "s";
17806 break;
17808 case E_CCmode:
17809 case E_CCGCmode:
17810 case E_CCGZmode:
17811 suffix = "l";
17812 break;
17814 default:
17815 gcc_unreachable ();
17817 break;
17818 case LTU:
17819 if (mode == CCmode || mode == CCGZmode)
17820 suffix = "b";
17821 else if (mode == CCCmode)
17822 suffix = fp ? "b" : "c";
17823 else
17824 gcc_unreachable ();
17825 break;
17826 case GE:
17827 switch (mode)
17829 case E_CCNOmode:
17830 case E_CCGOCmode:
17831 suffix = "ns";
17832 break;
17834 case E_CCmode:
17835 case E_CCGCmode:
17836 case E_CCGZmode:
17837 suffix = "ge";
17838 break;
17840 default:
17841 gcc_unreachable ();
17843 break;
17844 case GEU:
17845 if (mode == CCmode || mode == CCGZmode)
17846 suffix = "nb";
17847 else if (mode == CCCmode)
17848 suffix = fp ? "nb" : "nc";
17849 else
17850 gcc_unreachable ();
17851 break;
17852 case LE:
17853 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17854 suffix = "le";
17855 break;
17856 case LEU:
17857 if (mode == CCmode)
17858 suffix = "be";
17859 else
17860 gcc_unreachable ();
17861 break;
17862 case UNORDERED:
17863 suffix = fp ? "u" : "p";
17864 break;
17865 case ORDERED:
17866 suffix = fp ? "nu" : "np";
17867 break;
17868 default:
17869 gcc_unreachable ();
17871 fputs (suffix, file);
17874 /* Print the name of register X to FILE based on its machine mode and number.
17875 If CODE is 'w', pretend the mode is HImode.
17876 If CODE is 'b', pretend the mode is QImode.
17877 If CODE is 'k', pretend the mode is SImode.
17878 If CODE is 'q', pretend the mode is DImode.
17879 If CODE is 'x', pretend the mode is V4SFmode.
17880 If CODE is 't', pretend the mode is V8SFmode.
17881 If CODE is 'g', pretend the mode is V16SFmode.
17882 If CODE is 'h', pretend the reg is the 'high' byte register.
17883 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
17884 If CODE is 'd', duplicate the operand for AVX instruction.
17885 If CODE is 'V', print naked full integer register name without %.
17888 void
17889 print_reg (rtx x, int code, FILE *file)
17891 const char *reg;
17892 int msize;
17893 unsigned int regno;
17894 bool duplicated;
17896 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17897 putc ('%', file);
17899 if (x == pc_rtx)
17901 gcc_assert (TARGET_64BIT);
17902 fputs ("rip", file);
17903 return;
17906 if (code == 'y' && STACK_TOP_P (x))
17908 fputs ("st(0)", file);
17909 return;
17912 if (code == 'w')
17913 msize = 2;
17914 else if (code == 'b')
17915 msize = 1;
17916 else if (code == 'k')
17917 msize = 4;
17918 else if (code == 'q')
17919 msize = 8;
17920 else if (code == 'h')
17921 msize = 0;
17922 else if (code == 'x')
17923 msize = 16;
17924 else if (code == 't')
17925 msize = 32;
17926 else if (code == 'g')
17927 msize = 64;
17928 else
17929 msize = GET_MODE_SIZE (GET_MODE (x));
17931 regno = REGNO (x);
17933 if (regno == ARG_POINTER_REGNUM
17934 || regno == FRAME_POINTER_REGNUM
17935 || regno == FPSR_REG
17936 || regno == FPCR_REG)
17938 output_operand_lossage
17939 ("invalid use of register '%s'", reg_names[regno]);
17940 return;
17942 else if (regno == FLAGS_REG)
17944 output_operand_lossage ("invalid use of asm flag output");
17945 return;
17948 if (code == 'V')
17950 if (GENERAL_REGNO_P (regno))
17951 msize = GET_MODE_SIZE (word_mode);
17952 else
17953 error ("'V' modifier on non-integer register");
17956 duplicated = code == 'd' && TARGET_AVX;
17958 switch (msize)
17960 case 16:
17961 case 12:
17962 case 8:
17963 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17964 warning (0, "unsupported size for integer register");
17965 /* FALLTHRU */
17966 case 4:
17967 if (LEGACY_INT_REGNO_P (regno))
17968 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17969 /* FALLTHRU */
17970 case 2:
17971 normal:
17972 reg = hi_reg_name[regno];
17973 break;
17974 case 1:
17975 if (regno >= ARRAY_SIZE (qi_reg_name))
17976 goto normal;
17977 if (!ANY_QI_REGNO_P (regno))
17978 error ("unsupported size for integer register");
17979 reg = qi_reg_name[regno];
17980 break;
17981 case 0:
17982 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17983 goto normal;
17984 reg = qi_high_reg_name[regno];
17985 break;
17986 case 32:
17987 case 64:
17988 if (SSE_REGNO_P (regno))
17990 gcc_assert (!duplicated);
17991 putc (msize == 32 ? 'y' : 'z', file);
17992 reg = hi_reg_name[regno] + 1;
17993 break;
17995 goto normal;
17996 default:
17997 gcc_unreachable ();
18000 fputs (reg, file);
18002 /* Irritatingly, the AMD extended registers use a
18003 different naming convention: "r%d[bwd]". */
18004 if (REX_INT_REGNO_P (regno))
18006 gcc_assert (TARGET_64BIT);
18007 switch (msize)
18009 case 0:
18010 error ("extended registers have no high halves");
18011 break;
18012 case 1:
18013 putc ('b', file);
18014 break;
18015 case 2:
18016 putc ('w', file);
18017 break;
18018 case 4:
18019 putc ('d', file);
18020 break;
18021 case 8:
18022 /* no suffix */
18023 break;
18024 default:
18025 error ("unsupported operand size for extended register");
18026 break;
18028 return;
18031 if (duplicated)
18033 if (ASSEMBLER_DIALECT == ASM_ATT)
18034 fprintf (file, ", %%%s", reg);
18035 else
18036 fprintf (file, ", %s", reg);
18040 /* Meaning of CODE:
18041 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18042 C -- print opcode suffix for set/cmov insn.
18043 c -- like C, but print reversed condition
18044 F,f -- likewise, but for floating-point.
18045 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18046 otherwise nothing
18047 R -- print embedded rounding and sae.
18048 r -- print only sae.
18049 z -- print the opcode suffix for the size of the current operand.
18050 Z -- likewise, with special suffixes for x87 instructions.
18051 * -- print a star (in certain assembler syntax)
18052 A -- print an absolute memory reference.
18053 E -- print address with DImode register names if TARGET_64BIT.
18054 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18055 s -- print a shift double count, followed by the assembler's argument
18056 delimiter.
18057 b -- print the QImode name of the register for the indicated operand.
18058 %b0 would print %al if operands[0] is reg 0.
18059 w -- likewise, print the HImode name of the register.
18060 k -- likewise, print the SImode name of the register.
18061 q -- likewise, print the DImode name of the register.
18062 x -- likewise, print the V4SFmode name of the register.
18063 t -- likewise, print the V8SFmode name of the register.
18064 g -- likewise, print the V16SFmode name of the register.
18065 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18066 y -- print "st(0)" instead of "st" as a register.
18067 d -- print duplicated register operand for AVX instruction.
18068 D -- print condition for SSE cmp instruction.
18069 P -- if PIC, print an @PLT suffix.
18070 p -- print raw symbol name.
18071 X -- don't print any sort of PIC '@' suffix for a symbol.
18072 & -- print some in-use local-dynamic symbol name.
18073 H -- print a memory address offset by 8; used for sse high-parts
18074 Y -- print condition for XOP pcom* instruction.
18075 V -- print naked full integer register name without %.
18076 + -- print a branch hint as a 'cs' or 'ds' prefix
18077 ; -- print a semicolon (after prefixes, due to a bug in older gas).
18078 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18079 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18080 ! -- print MPX prefix for jxx/call/ret instructions if required.
18083 void
18084 ix86_print_operand (FILE *file, rtx x, int code)
18086 if (code)
18088 switch (code)
18090 case 'A':
18091 switch (ASSEMBLER_DIALECT)
18093 case ASM_ATT:
18094 putc ('*', file);
18095 break;
18097 case ASM_INTEL:
18098 /* Intel syntax. For absolute addresses, registers should not
18099 be surrounded by brackets. */
18100 if (!REG_P (x))
18102 putc ('[', file);
18103 ix86_print_operand (file, x, 0);
18104 putc (']', file);
18105 return;
18107 break;
18109 default:
18110 gcc_unreachable ();
18113 ix86_print_operand (file, x, 0);
18114 return;
18116 case 'E':
18117 /* Wrap address in an UNSPEC to declare special handling. */
18118 if (TARGET_64BIT)
18119 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18121 output_address (VOIDmode, x);
18122 return;
18124 case 'L':
18125 if (ASSEMBLER_DIALECT == ASM_ATT)
18126 putc ('l', file);
18127 return;
18129 case 'W':
18130 if (ASSEMBLER_DIALECT == ASM_ATT)
18131 putc ('w', file);
18132 return;
18134 case 'B':
18135 if (ASSEMBLER_DIALECT == ASM_ATT)
18136 putc ('b', file);
18137 return;
18139 case 'Q':
18140 if (ASSEMBLER_DIALECT == ASM_ATT)
18141 putc ('l', file);
18142 return;
18144 case 'S':
18145 if (ASSEMBLER_DIALECT == ASM_ATT)
18146 putc ('s', file);
18147 return;
18149 case 'T':
18150 if (ASSEMBLER_DIALECT == ASM_ATT)
18151 putc ('t', file);
18152 return;
18154 case 'O':
18155 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18156 if (ASSEMBLER_DIALECT != ASM_ATT)
18157 return;
18159 switch (GET_MODE_SIZE (GET_MODE (x)))
18161 case 2:
18162 putc ('w', file);
18163 break;
18165 case 4:
18166 putc ('l', file);
18167 break;
18169 case 8:
18170 putc ('q', file);
18171 break;
18173 default:
18174 output_operand_lossage ("invalid operand size for operand "
18175 "code 'O'");
18176 return;
18179 putc ('.', file);
18180 #endif
18181 return;
18183 case 'z':
18184 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18186 /* Opcodes don't get size suffixes when using Intel syntax. */
18187 if (ASSEMBLER_DIALECT == ASM_INTEL)
18188 return;
18190 switch (GET_MODE_SIZE (GET_MODE (x)))
18192 case 1:
18193 putc ('b', file);
18194 return;
18196 case 2:
18197 putc ('w', file);
18198 return;
18200 case 4:
18201 putc ('l', file);
18202 return;
18204 case 8:
18205 putc ('q', file);
18206 return;
18208 default:
18209 output_operand_lossage ("invalid operand size for operand "
18210 "code 'z'");
18211 return;
18215 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18216 warning (0, "non-integer operand used with operand code 'z'");
18217 /* FALLTHRU */
18219 case 'Z':
18220 /* 387 opcodes don't get size suffixes when using Intel syntax. */
18221 if (ASSEMBLER_DIALECT == ASM_INTEL)
18222 return;
18224 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18226 switch (GET_MODE_SIZE (GET_MODE (x)))
18228 case 2:
18229 #ifdef HAVE_AS_IX86_FILDS
18230 putc ('s', file);
18231 #endif
18232 return;
18234 case 4:
18235 putc ('l', file);
18236 return;
18238 case 8:
18239 #ifdef HAVE_AS_IX86_FILDQ
18240 putc ('q', file);
18241 #else
18242 fputs ("ll", file);
18243 #endif
18244 return;
18246 default:
18247 break;
18250 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18252 /* 387 opcodes don't get size suffixes
18253 if the operands are registers. */
18254 if (STACK_REG_P (x))
18255 return;
18257 switch (GET_MODE_SIZE (GET_MODE (x)))
18259 case 4:
18260 putc ('s', file);
18261 return;
18263 case 8:
18264 putc ('l', file);
18265 return;
18267 case 12:
18268 case 16:
18269 putc ('t', file);
18270 return;
18272 default:
18273 break;
18276 else
18278 output_operand_lossage ("invalid operand type used with "
18279 "operand code 'Z'");
18280 return;
18283 output_operand_lossage ("invalid operand size for operand code 'Z'");
18284 return;
18286 case 'd':
18287 case 'b':
18288 case 'w':
18289 case 'k':
18290 case 'q':
18291 case 'h':
18292 case 't':
18293 case 'g':
18294 case 'y':
18295 case 'x':
18296 case 'X':
18297 case 'P':
18298 case 'p':
18299 case 'V':
18300 break;
18302 case 's':
18303 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18305 ix86_print_operand (file, x, 0);
18306 fputs (", ", file);
18308 return;
18310 case 'Y':
18311 switch (GET_CODE (x))
18313 case NE:
18314 fputs ("neq", file);
18315 break;
18316 case EQ:
18317 fputs ("eq", file);
18318 break;
18319 case GE:
18320 case GEU:
18321 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18322 break;
18323 case GT:
18324 case GTU:
18325 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18326 break;
18327 case LE:
18328 case LEU:
18329 fputs ("le", file);
18330 break;
18331 case LT:
18332 case LTU:
18333 fputs ("lt", file);
18334 break;
18335 case UNORDERED:
18336 fputs ("unord", file);
18337 break;
18338 case ORDERED:
18339 fputs ("ord", file);
18340 break;
18341 case UNEQ:
18342 fputs ("ueq", file);
18343 break;
18344 case UNGE:
18345 fputs ("nlt", file);
18346 break;
18347 case UNGT:
18348 fputs ("nle", file);
18349 break;
18350 case UNLE:
18351 fputs ("ule", file);
18352 break;
18353 case UNLT:
18354 fputs ("ult", file);
18355 break;
18356 case LTGT:
18357 fputs ("une", file);
18358 break;
18359 default:
18360 output_operand_lossage ("operand is not a condition code, "
18361 "invalid operand code 'Y'");
18362 return;
18364 return;
18366 case 'D':
18367 /* Little bit of braindamage here. The SSE compare instructions
18368 use completely different names for the comparisons than the
18369 fp conditional moves do. */
18370 switch (GET_CODE (x))
18372 case UNEQ:
18373 if (TARGET_AVX)
18375 fputs ("eq_us", file);
18376 break;
18378 /* FALLTHRU */
18379 case EQ:
18380 fputs ("eq", file);
18381 break;
18382 case UNLT:
18383 if (TARGET_AVX)
18385 fputs ("nge", file);
18386 break;
18388 /* FALLTHRU */
18389 case LT:
18390 fputs ("lt", file);
18391 break;
18392 case UNLE:
18393 if (TARGET_AVX)
18395 fputs ("ngt", file);
18396 break;
18398 /* FALLTHRU */
18399 case LE:
18400 fputs ("le", file);
18401 break;
18402 case UNORDERED:
18403 fputs ("unord", file);
18404 break;
18405 case LTGT:
18406 if (TARGET_AVX)
18408 fputs ("neq_oq", file);
18409 break;
18411 /* FALLTHRU */
18412 case NE:
18413 fputs ("neq", file);
18414 break;
18415 case GE:
18416 if (TARGET_AVX)
18418 fputs ("ge", file);
18419 break;
18421 /* FALLTHRU */
18422 case UNGE:
18423 fputs ("nlt", file);
18424 break;
18425 case GT:
18426 if (TARGET_AVX)
18428 fputs ("gt", file);
18429 break;
18431 /* FALLTHRU */
18432 case UNGT:
18433 fputs ("nle", file);
18434 break;
18435 case ORDERED:
18436 fputs ("ord", file);
18437 break;
18438 default:
18439 output_operand_lossage ("operand is not a condition code, "
18440 "invalid operand code 'D'");
18441 return;
18443 return;
18445 case 'F':
18446 case 'f':
18447 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18448 if (ASSEMBLER_DIALECT == ASM_ATT)
18449 putc ('.', file);
18450 gcc_fallthrough ();
18451 #endif
18453 case 'C':
18454 case 'c':
18455 if (!COMPARISON_P (x))
18457 output_operand_lossage ("operand is not a condition code, "
18458 "invalid operand code '%c'", code);
18459 return;
18461 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18462 code == 'c' || code == 'f',
18463 code == 'F' || code == 'f',
18464 file);
18465 return;
18467 case 'H':
18468 if (!offsettable_memref_p (x))
18470 output_operand_lossage ("operand is not an offsettable memory "
18471 "reference, invalid operand code 'H'");
18472 return;
18474 /* It doesn't actually matter what mode we use here, as we're
18475 only going to use this for printing. */
18476 x = adjust_address_nv (x, DImode, 8);
18477 /* Output 'qword ptr' for intel assembler dialect. */
18478 if (ASSEMBLER_DIALECT == ASM_INTEL)
18479 code = 'q';
18480 break;
18482 case 'K':
18483 if (!CONST_INT_P (x))
18485 output_operand_lossage ("operand is not an integer, invalid "
18486 "operand code 'K'");
18487 return;
18490 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18491 #ifdef HAVE_AS_IX86_HLE
18492 fputs ("xacquire ", file);
18493 #else
18494 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18495 #endif
18496 else if (INTVAL (x) & IX86_HLE_RELEASE)
18497 #ifdef HAVE_AS_IX86_HLE
18498 fputs ("xrelease ", file);
18499 #else
18500 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18501 #endif
18502 /* We do not want to print value of the operand. */
18503 return;
18505 case 'N':
18506 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18507 fputs ("{z}", file);
18508 return;
18510 case 'r':
18511 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18513 output_operand_lossage ("operand is not a specific integer, "
18514 "invalid operand code 'r'");
18515 return;
18518 if (ASSEMBLER_DIALECT == ASM_INTEL)
18519 fputs (", ", file);
18521 fputs ("{sae}", file);
18523 if (ASSEMBLER_DIALECT == ASM_ATT)
18524 fputs (", ", file);
18526 return;
18528 case 'R':
18529 if (!CONST_INT_P (x))
18531 output_operand_lossage ("operand is not an integer, invalid "
18532 "operand code 'R'");
18533 return;
18536 if (ASSEMBLER_DIALECT == ASM_INTEL)
18537 fputs (", ", file);
18539 switch (INTVAL (x))
18541 case ROUND_NEAREST_INT | ROUND_SAE:
18542 fputs ("{rn-sae}", file);
18543 break;
18544 case ROUND_NEG_INF | ROUND_SAE:
18545 fputs ("{rd-sae}", file);
18546 break;
18547 case ROUND_POS_INF | ROUND_SAE:
18548 fputs ("{ru-sae}", file);
18549 break;
18550 case ROUND_ZERO | ROUND_SAE:
18551 fputs ("{rz-sae}", file);
18552 break;
18553 default:
18554 output_operand_lossage ("operand is not a specific integer, "
18555 "invalid operand code 'R'");
18558 if (ASSEMBLER_DIALECT == ASM_ATT)
18559 fputs (", ", file);
18561 return;
18563 case '*':
18564 if (ASSEMBLER_DIALECT == ASM_ATT)
18565 putc ('*', file);
18566 return;
18568 case '&':
18570 const char *name = get_some_local_dynamic_name ();
18571 if (name == NULL)
18572 output_operand_lossage ("'%%&' used without any "
18573 "local dynamic TLS references");
18574 else
18575 assemble_name (file, name);
18576 return;
18579 case '+':
18581 rtx x;
18583 if (!optimize
18584 || optimize_function_for_size_p (cfun)
18585 || !TARGET_BRANCH_PREDICTION_HINTS)
18586 return;
18588 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18589 if (x)
18591 int pred_val = profile_probability::from_reg_br_prob_note
18592 (XINT (x, 0)).to_reg_br_prob_base ();
18594 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18595 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18597 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18598 bool cputaken
18599 = final_forward_branch_p (current_output_insn) == 0;
18601 /* Emit hints only in the case default branch prediction
18602 heuristics would fail. */
18603 if (taken != cputaken)
18605 /* We use 3e (DS) prefix for taken branches and
18606 2e (CS) prefix for not taken branches. */
18607 if (taken)
18608 fputs ("ds ; ", file);
18609 else
18610 fputs ("cs ; ", file);
18614 return;
18617 case ';':
18618 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18619 putc (';', file);
18620 #endif
18621 return;
18623 case '~':
18624 putc (TARGET_AVX2 ? 'i' : 'f', file);
18625 return;
18627 case '^':
18628 if (TARGET_64BIT && Pmode != word_mode)
18629 fputs ("addr32 ", file);
18630 return;
18632 case '!':
18633 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18634 fputs ("bnd ", file);
18635 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18636 fputs ("notrack ", file);
18637 return;
18639 default:
18640 output_operand_lossage ("invalid operand code '%c'", code);
18644 if (REG_P (x))
18645 print_reg (x, code, file);
18647 else if (MEM_P (x))
18649 rtx addr = XEXP (x, 0);
18651 /* No `byte ptr' prefix for call instructions ... */
18652 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18654 machine_mode mode = GET_MODE (x);
18655 const char *size;
18657 /* Check for explicit size override codes. */
18658 if (code == 'b')
18659 size = "BYTE";
18660 else if (code == 'w')
18661 size = "WORD";
18662 else if (code == 'k')
18663 size = "DWORD";
18664 else if (code == 'q')
18665 size = "QWORD";
18666 else if (code == 'x')
18667 size = "XMMWORD";
18668 else if (code == 't')
18669 size = "YMMWORD";
18670 else if (code == 'g')
18671 size = "ZMMWORD";
18672 else if (mode == BLKmode)
18673 /* ... or BLKmode operands, when not overridden. */
18674 size = NULL;
18675 else
18676 switch (GET_MODE_SIZE (mode))
18678 case 1: size = "BYTE"; break;
18679 case 2: size = "WORD"; break;
18680 case 4: size = "DWORD"; break;
18681 case 8: size = "QWORD"; break;
18682 case 12: size = "TBYTE"; break;
18683 case 16:
18684 if (mode == XFmode)
18685 size = "TBYTE";
18686 else
18687 size = "XMMWORD";
18688 break;
18689 case 32: size = "YMMWORD"; break;
18690 case 64: size = "ZMMWORD"; break;
18691 default:
18692 gcc_unreachable ();
18694 if (size)
18696 fputs (size, file);
18697 fputs (" PTR ", file);
18701 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18702 output_operand_lossage ("invalid constraints for operand");
18703 else
18704 ix86_print_operand_address_as
18705 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18708 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18710 long l;
18712 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18714 if (ASSEMBLER_DIALECT == ASM_ATT)
18715 putc ('$', file);
18716 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18717 if (code == 'q')
18718 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18719 (unsigned long long) (int) l);
18720 else
18721 fprintf (file, "0x%08x", (unsigned int) l);
18724 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18726 long l[2];
18728 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18730 if (ASSEMBLER_DIALECT == ASM_ATT)
18731 putc ('$', file);
18732 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18735 /* These float cases don't actually occur as immediate operands. */
18736 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18738 char dstr[30];
18740 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18741 fputs (dstr, file);
18744 else
18746 /* We have patterns that allow zero sets of memory, for instance.
18747 In 64-bit mode, we should probably support all 8-byte vectors,
18748 since we can in fact encode that into an immediate. */
18749 if (GET_CODE (x) == CONST_VECTOR)
18751 if (x != CONST0_RTX (GET_MODE (x)))
18752 output_operand_lossage ("invalid vector immediate");
18753 x = const0_rtx;
18756 if (code != 'P' && code != 'p')
18758 if (CONST_INT_P (x))
18760 if (ASSEMBLER_DIALECT == ASM_ATT)
18761 putc ('$', file);
18763 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18764 || GET_CODE (x) == LABEL_REF)
18766 if (ASSEMBLER_DIALECT == ASM_ATT)
18767 putc ('$', file);
18768 else
18769 fputs ("OFFSET FLAT:", file);
18772 if (CONST_INT_P (x))
18773 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18774 else if (flag_pic || MACHOPIC_INDIRECT)
18775 output_pic_addr_const (file, x, code);
18776 else
18777 output_addr_const (file, x);
18781 static bool
18782 ix86_print_operand_punct_valid_p (unsigned char code)
18784 return (code == '*' || code == '+' || code == '&' || code == ';'
18785 || code == '~' || code == '^' || code == '!');
18788 /* Print a memory operand whose address is ADDR. */
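/* AS is the address space of the operand; if it is generic, the
   segment recovered by ix86_decompose_address is used instead.
   NO_RIP suppresses the implicit %rip-relative form for bare
   symbol/label displacements in 64-bit mode; it is set for the
   'p'/'P' raw-symbol output codes.  */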
18790 static void
18791 ix86_print_operand_address_as (FILE *file, rtx addr,
18792 addr_space_t as, bool no_rip)
18794 struct ix86_address parts;
18795 rtx base, index, disp;
18796 int scale;
18797 int ok;
18798 bool vsib = false;
18799 int code = 0;
18801 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18803 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18804 gcc_assert (parts.index == NULL_RTX);
18805 parts.index = XVECEXP (addr, 0, 1);
18806 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18807 addr = XVECEXP (addr, 0, 0);
18808 vsib = true;
18810 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18812 gcc_assert (TARGET_64BIT);
18813 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18814 code = 'q';
18816 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18818 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18819 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18820 if (parts.base != NULL_RTX)
18822 parts.index = parts.base;
18823 parts.scale = 1;
18825 parts.base = XVECEXP (addr, 0, 0);
18826 addr = XVECEXP (addr, 0, 0);
18828 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18830 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18831 gcc_assert (parts.index == NULL_RTX);
18832 parts.index = XVECEXP (addr, 0, 1);
18833 addr = XVECEXP (addr, 0, 0);
18835 else
18836 ok = ix86_decompose_address (addr, &parts);
18838 gcc_assert (ok);
18840 base = parts.base;
18841 index = parts.index;
18842 disp = parts.disp;
18843 scale = parts.scale;
18845 if (ADDR_SPACE_GENERIC_P (as))
18846 as = parts.seg;
18847 else
18848 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18850 if (!ADDR_SPACE_GENERIC_P (as))
18852 const char *string;
18854 if (as == ADDR_SPACE_SEG_FS)
18855 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18856 else if (as == ADDR_SPACE_SEG_GS)
18857 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18858 else
18859 gcc_unreachable ();
18860 fputs (string, file);
18863 /* Use the one-byte shorter RIP-relative addressing in 64-bit mode. */
18864 if (TARGET_64BIT && !base && !index && !no_rip)
18866 rtx symbol = disp;
18868 if (GET_CODE (disp) == CONST
18869 && GET_CODE (XEXP (disp, 0)) == PLUS
18870 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18871 symbol = XEXP (XEXP (disp, 0), 0);
18873 if (GET_CODE (symbol) == LABEL_REF
18874 || (GET_CODE (symbol) == SYMBOL_REF
18875 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18876 base = pc_rtx;
18879 if (!base && !index)
18881 /* Displacement only requires special attention. */
18882 if (CONST_INT_P (disp))
18884 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18885 fputs ("ds:", file);
18886 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18888 /* Load the external function address via the GOT slot to avoid PLT. */
18889 else if (GET_CODE (disp) == CONST
18890 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18891 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18892 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18893 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18894 output_pic_addr_const (file, disp, 0);
18895 else if (flag_pic)
18896 output_pic_addr_const (file, disp, 0);
18897 else
18898 output_addr_const (file, disp);
18900 else
18902 /* Print SImode register names to force addr32 prefix. */
18903 if (SImode_address_operand (addr, VOIDmode))
18905 if (flag_checking)
18907 gcc_assert (TARGET_64BIT);
18908 switch (GET_CODE (addr))
18910 case SUBREG:
18911 gcc_assert (GET_MODE (addr) == SImode);
18912 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18913 break;
18914 case ZERO_EXTEND:
18915 case AND:
18916 gcc_assert (GET_MODE (addr) == DImode);
18917 break;
18918 default:
18919 gcc_unreachable ();
18922 gcc_assert (!code);
18923 code = 'k';
18925 else if (code == 0
18926 && TARGET_X32
18927 && disp
18928 && CONST_INT_P (disp)
18929 && INTVAL (disp) < -16*1024*1024)
18931 /* X32 runs in 64-bit mode, where displacement, DISP, in
18932 address DISP(%r64), is encoded as 32-bit immediate sign-
18933 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18934 address is %r64 + 0xffffffffbffffd00. When %r64 <
18935 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18936 which is invalid for x32. The correct address is %r64
18937 - 0x40000300 == 0xf7ffdd64. To properly encode
18938 -0x40000300(%r64) for x32, we zero-extend negative
18939 displacement by forcing addr32 prefix which truncates
18940 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18941 zero-extend all negative displacements, including -1(%rsp).
18942 However, for small negative displacements, sign-extension
18943 won't cause overflow. We only zero-extend negative
18944 displacements if they are < -16*1024*1024, which is also used
18945 to check legitimate address displacements for PIC. */
18946 code = 'k';
18949 /* Since the upper 32 bits of RSP are always zero for x32,
18950 we can encode %esp as %rsp to avoid 0x67 prefix if
18951 there is no index register. */
18952 if (TARGET_X32 && Pmode == SImode
18953 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18954 code = 'q';
18956 if (ASSEMBLER_DIALECT == ASM_ATT)
18958 if (disp)
18960 if (flag_pic)
18961 output_pic_addr_const (file, disp, 0);
18962 else if (GET_CODE (disp) == LABEL_REF)
18963 output_asm_label (disp);
18964 else
18965 output_addr_const (file, disp);
18968 putc ('(', file);
18969 if (base)
18970 print_reg (base, code, file);
18971 if (index)
18973 putc (',', file);
18974 print_reg (index, vsib ? 0 : code, file);
18975 if (scale != 1 || vsib)
18976 fprintf (file, ",%d", scale);
18978 putc (')', file);
18980 else
18982 rtx offset = NULL_RTX;
18984 if (disp)
18986 /* Pull out the offset of a symbol; print any symbol itself. */
18987 if (GET_CODE (disp) == CONST
18988 && GET_CODE (XEXP (disp, 0)) == PLUS
18989 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18991 offset = XEXP (XEXP (disp, 0), 1);
18992 disp = gen_rtx_CONST (VOIDmode,
18993 XEXP (XEXP (disp, 0), 0));
18996 if (flag_pic)
18997 output_pic_addr_const (file, disp, 0);
18998 else if (GET_CODE (disp) == LABEL_REF)
18999 output_asm_label (disp);
19000 else if (CONST_INT_P (disp))
19001 offset = disp;
19002 else
19003 output_addr_const (file, disp);
19006 putc ('[', file);
19007 if (base)
19009 print_reg (base, code, file);
19010 if (offset)
19012 if (INTVAL (offset) >= 0)
19013 putc ('+', file);
19014 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19017 else if (offset)
19018 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19019 else
19020 putc ('0', file);
19022 if (index)
19024 putc ('+', file);
19025 print_reg (index, vsib ? 0 : code, file);
19026 if (scale != 1 || vsib)
19027 fprintf (file, "*%d", scale);
19029 putc (']', file);
19034 static void
19035 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19037 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19040 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19042 static bool
19043 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19045 rtx op;
19047 if (GET_CODE (x) != UNSPEC)
19048 return false;
19050 op = XVECEXP (x, 0, 0);
19051 switch (XINT (x, 1))
19053 case UNSPEC_GOTOFF:
19054 output_addr_const (file, op);
19055 fputs ("@gotoff", file);
19056 break;
19057 case UNSPEC_GOTTPOFF:
19058 output_addr_const (file, op);
19059 /* FIXME: This might be @TPOFF in Sun ld. */
19060 fputs ("@gottpoff", file);
19061 break;
19062 case UNSPEC_TPOFF:
19063 output_addr_const (file, op);
19064 fputs ("@tpoff", file);
19065 break;
19066 case UNSPEC_NTPOFF:
19067 output_addr_const (file, op);
19068 if (TARGET_64BIT)
19069 fputs ("@tpoff", file);
19070 else
19071 fputs ("@ntpoff", file);
19072 break;
19073 case UNSPEC_DTPOFF:
19074 output_addr_const (file, op);
19075 fputs ("@dtpoff", file);
19076 break;
19077 case UNSPEC_GOTNTPOFF:
19078 output_addr_const (file, op);
19079 if (TARGET_64BIT)
19080 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19081 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19082 else
19083 fputs ("@gotntpoff", file);
19084 break;
19085 case UNSPEC_INDNTPOFF:
19086 output_addr_const (file, op);
19087 fputs ("@indntpoff", file);
19088 break;
19089 #if TARGET_MACHO
19090 case UNSPEC_MACHOPIC_OFFSET:
19091 output_addr_const (file, op);
19092 putc ('-', file);
19093 machopic_output_function_base_name (file);
19094 break;
19095 #endif
19097 default:
19098 return false;
19101 return true;
19104 /* Split one or more double-mode RTL references into pairs of half-mode
19105 references. The RTL can be REG, offsettable MEM, integer constant, or
19106 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19107 split and "num" is its length. lo_half and hi_half are output arrays
19108 that parallel "operands". */
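/* For example, splitting a DImode operand yields two SImode halves at
   byte offsets 0 and 4, and a TImode operand yields DImode halves at
   offsets 0 and 8, for both MEMs and REGs.  */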
19110 void
19111 split_double_mode (machine_mode mode, rtx operands[],
19112 int num, rtx lo_half[], rtx hi_half[])
19114 machine_mode half_mode;
19115 unsigned int byte;
19117 switch (mode)
19119 case E_TImode:
19120 half_mode = DImode;
19121 break;
19122 case E_DImode:
19123 half_mode = SImode;
19124 break;
19125 default:
19126 gcc_unreachable ();
19129 byte = GET_MODE_SIZE (half_mode);
19131 while (num--)
19133 rtx op = operands[num];
19135 /* simplify_subreg refuses to split volatile memory addresses,
19136 but we still have to handle them. */
19137 if (MEM_P (op))
19139 lo_half[num] = adjust_address (op, half_mode, 0);
19140 hi_half[num] = adjust_address (op, half_mode, byte);
19142 else
19144 lo_half[num] = simplify_gen_subreg (half_mode, op,
19145 GET_MODE (op) == VOIDmode
19146 ? mode : GET_MODE (op), 0);
19147 hi_half[num] = simplify_gen_subreg (half_mode, op,
19148 GET_MODE (op) == VOIDmode
19149 ? mode : GET_MODE (op), byte);
19154 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19155 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19156 is the expression of the binary operation. The output may either be
19157 emitted here, or returned to the caller, like all output_* functions.
19159 There is no guarantee that the operands are the same mode, as they
19160 might be within FLOAT or FLOAT_EXTEND expressions. */
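/* For instance, for an SSE PLUS in SFmode with TARGET_AVX the code
   below assembles roughly "%vaddss\t{%2, %1, %0|%0, %1, %2}", while
   the x87 paths build "fadd"/"fiadd" style templates instead.  */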
19162 #ifndef SYSV386_COMPAT
19163 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19164 wants to fix the assemblers because that causes incompatibility
19165 with gcc. No-one wants to fix gcc because that causes
19166 incompatibility with assemblers... You can use the option of
19167 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19168 #define SYSV386_COMPAT 1
19169 #endif
19171 const char *
19172 output_387_binary_op (rtx_insn *insn, rtx *operands)
19174 static char buf[40];
19175 const char *p;
19176 bool is_sse
19177 = (SSE_REG_P (operands[0])
19178 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19180 if (is_sse)
19181 p = "%v";
19182 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19183 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19184 p = "fi";
19185 else
19186 p = "f";
19188 strcpy (buf, p);
19190 switch (GET_CODE (operands[3]))
19192 case PLUS:
19193 p = "add"; break;
19194 case MINUS:
19195 p = "sub"; break;
19196 case MULT:
19197 p = "mul"; break;
19198 case DIV:
19199 p = "div"; break;
19200 default:
19201 gcc_unreachable ();
19204 strcat (buf, p);
19206 if (is_sse)
19208 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19209 strcat (buf, p);
19211 if (TARGET_AVX)
19212 p = "\t{%2, %1, %0|%0, %1, %2}";
19213 else
19214 p = "\t{%2, %0|%0, %2}";
19216 strcat (buf, p);
19217 return buf;
19220 /* Even if we do not want to check the inputs, this documents the input
19221 constraints, which helps in understanding the following code. */
19222 if (flag_checking)
19224 if (STACK_REG_P (operands[0])
19225 && ((REG_P (operands[1])
19226 && REGNO (operands[0]) == REGNO (operands[1])
19227 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19228 || (REG_P (operands[2])
19229 && REGNO (operands[0]) == REGNO (operands[2])
19230 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19231 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19232 ; /* ok */
19233 else
19234 gcc_unreachable ();
19237 switch (GET_CODE (operands[3]))
19239 case MULT:
19240 case PLUS:
19241 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19242 std::swap (operands[1], operands[2]);
19244 /* We know operands[0] == operands[1]. */
19246 if (MEM_P (operands[2]))
19248 p = "%Z2\t%2";
19249 break;
19252 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19254 if (STACK_TOP_P (operands[0]))
19255 /* How is it that we are storing to a dead operand[2]?
19256 Well, presumably operands[1] is dead too. We can't
19257 store the result to st(0) as st(0) gets popped on this
19258 instruction. Instead store to operands[2] (which I
19259 think has to be st(1)). st(1) will be popped later.
19260 gcc <= 2.8.1 didn't have this check and generated
19261 assembly code that the Unixware assembler rejected. */
19262 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19263 else
19264 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19265 break;
19268 if (STACK_TOP_P (operands[0]))
19269 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19270 else
19271 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19272 break;
19274 case MINUS:
19275 case DIV:
19276 if (MEM_P (operands[1]))
19278 p = "r%Z1\t%1";
19279 break;
19282 if (MEM_P (operands[2]))
19284 p = "%Z2\t%2";
19285 break;
19288 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19290 #if SYSV386_COMPAT
19291 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19292 derived assemblers, confusingly reverse the direction of
19293 the operation for fsub{r} and fdiv{r} when the
19294 destination register is not st(0). The Intel assembler
19295 doesn't have this brain damage. Read !SYSV386_COMPAT to
19296 figure out what the hardware really does. */
19297 if (STACK_TOP_P (operands[0]))
19298 p = "{p\t%0, %2|rp\t%2, %0}";
19299 else
19300 p = "{rp\t%2, %0|p\t%0, %2}";
19301 #else
19302 if (STACK_TOP_P (operands[0]))
19303 /* As above for fmul/fadd, we can't store to st(0). */
19304 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19305 else
19306 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19307 #endif
19308 break;
19311 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19313 #if SYSV386_COMPAT
19314 if (STACK_TOP_P (operands[0]))
19315 p = "{rp\t%0, %1|p\t%1, %0}";
19316 else
19317 p = "{p\t%1, %0|rp\t%0, %1}";
19318 #else
19319 if (STACK_TOP_P (operands[0]))
19320 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19321 else
19322 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19323 #endif
19324 break;
19327 if (STACK_TOP_P (operands[0]))
19329 if (STACK_TOP_P (operands[1]))
19330 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19331 else
19332 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19333 break;
19335 else if (STACK_TOP_P (operands[1]))
19337 #if SYSV386_COMPAT
19338 p = "{\t%1, %0|r\t%0, %1}";
19339 #else
19340 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19341 #endif
19343 else
19345 #if SYSV386_COMPAT
19346 p = "{r\t%2, %0|\t%0, %2}";
19347 #else
19348 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19349 #endif
19351 break;
19353 default:
19354 gcc_unreachable ();
19357 strcat (buf, p);
19358 return buf;
19361 /* Return needed mode for entity in optimize_mode_switching pass. */
19363 static int
19364 ix86_dirflag_mode_needed (rtx_insn *insn)
19366 if (CALL_P (insn))
19368 if (cfun->machine->func_type == TYPE_NORMAL)
19369 return X86_DIRFLAG_ANY;
19370 else
19371 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19372 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19375 if (recog_memoized (insn) < 0)
19376 return X86_DIRFLAG_ANY;
19378 if (get_attr_type (insn) == TYPE_STR)
19380 /* Emit cld instruction if stringops are used in the function. */
19381 if (cfun->machine->func_type == TYPE_NORMAL)
19382 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19383 else
19384 return X86_DIRFLAG_RESET;
19387 return X86_DIRFLAG_ANY;
19390 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
19392 static bool
19393 ix86_check_avx_upper_register (const_rtx exp)
19395 if (SUBREG_P (exp))
19396 exp = SUBREG_REG (exp);
19398 return (REG_P (exp)
19399 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19400 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19403 /* Return needed mode for entity in optimize_mode_switching pass. */
19405 static int
19406 ix86_avx_u128_mode_needed (rtx_insn *insn)
19408 if (CALL_P (insn))
19410 rtx link;
19412 /* Needed mode is set to AVX_U128_CLEAN if there are
19413 no 256bit or 512bit modes used in function arguments. */
19414 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19415 link;
19416 link = XEXP (link, 1))
19418 if (GET_CODE (XEXP (link, 0)) == USE)
19420 rtx arg = XEXP (XEXP (link, 0), 0);
19422 if (ix86_check_avx_upper_register (arg))
19423 return AVX_U128_DIRTY;
19427 return AVX_U128_CLEAN;
19430 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19431 Hardware changes state only when a 256bit register is written to,
19432 but we need to prevent the compiler from moving the optimal insertion
19433 point above an eventual read from a 256bit or 512bit register. */
19434 subrtx_iterator::array_type array;
19435 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19436 if (ix86_check_avx_upper_register (*iter))
19437 return AVX_U128_DIRTY;
19439 return AVX_U128_ANY;
19442 /* Return mode that i387 must be switched into
19443 prior to the execution of insn. */
19445 static int
19446 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19448 enum attr_i387_cw mode;
19450 /* The mode UNINITIALIZED is used to store control word after a
19451 function call or ASM pattern. The mode ANY specifies that the function
19452 has no requirements on the control word and makes no changes in the
19453 bits we are interested in. */
19455 if (CALL_P (insn)
19456 || (NONJUMP_INSN_P (insn)
19457 && (asm_noperands (PATTERN (insn)) >= 0
19458 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19459 return I387_CW_UNINITIALIZED;
19461 if (recog_memoized (insn) < 0)
19462 return I387_CW_ANY;
19464 mode = get_attr_i387_cw (insn);
19466 switch (entity)
19468 case I387_TRUNC:
19469 if (mode == I387_CW_TRUNC)
19470 return mode;
19471 break;
19473 case I387_FLOOR:
19474 if (mode == I387_CW_FLOOR)
19475 return mode;
19476 break;
19478 case I387_CEIL:
19479 if (mode == I387_CW_CEIL)
19480 return mode;
19481 break;
19483 case I387_MASK_PM:
19484 if (mode == I387_CW_MASK_PM)
19485 return mode;
19486 break;
19488 default:
19489 gcc_unreachable ();
19492 return I387_CW_ANY;
19495 /* Return mode that entity must be switched into
19496 prior to the execution of insn. */
19498 static int
19499 ix86_mode_needed (int entity, rtx_insn *insn)
19501 switch (entity)
19503 case X86_DIRFLAG:
19504 return ix86_dirflag_mode_needed (insn);
19505 case AVX_U128:
19506 return ix86_avx_u128_mode_needed (insn);
19507 case I387_TRUNC:
19508 case I387_FLOOR:
19509 case I387_CEIL:
19510 case I387_MASK_PM:
19511 return ix86_i387_mode_needed (entity, insn);
19512 default:
19513 gcc_unreachable ();
19515 return 0;
19518 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19520 static void
19521 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19523 if (ix86_check_avx_upper_register (dest))
19525 bool *used = (bool *) data;
19526 *used = true;
19530 /* Calculate mode of upper 128bit AVX registers after the insn. */
19532 static int
19533 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19535 rtx pat = PATTERN (insn);
19537 if (vzeroupper_operation (pat, VOIDmode)
19538 || vzeroall_operation (pat, VOIDmode))
19539 return AVX_U128_CLEAN;
19541 /* We know that state is clean after CALL insn if there are no
19542 256bit or 512bit registers used in the function return register. */
19543 if (CALL_P (insn))
19545 bool avx_upper_reg_found = false;
19546 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19548 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19551 /* Otherwise, return current mode. Remember that if insn
19552 references AVX 256bit or 512bit registers, the mode was already
19553 changed to DIRTY from MODE_NEEDED. */
19554 return mode;
19557 /* Return the mode that an insn results in. */
19559 static int
19560 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19562 switch (entity)
19564 case X86_DIRFLAG:
19565 return mode;
19566 case AVX_U128:
19567 return ix86_avx_u128_mode_after (mode, insn);
19568 case I387_TRUNC:
19569 case I387_FLOOR:
19570 case I387_CEIL:
19571 case I387_MASK_PM:
19572 return mode;
19573 default:
19574 gcc_unreachable ();
19578 static int
19579 ix86_dirflag_mode_entry (void)
19581 /* For TARGET_CLD or in the interrupt handler we can't assume
19582 direction flag state at function entry. */
19583 if (TARGET_CLD
19584 || cfun->machine->func_type != TYPE_NORMAL)
19585 return X86_DIRFLAG_ANY;
19587 return X86_DIRFLAG_RESET;
19590 static int
19591 ix86_avx_u128_mode_entry (void)
19593 tree arg;
19595 /* Entry mode is set to AVX_U128_DIRTY if there are
19596 256bit or 512bit modes used in function arguments. */
19597 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19598 arg = TREE_CHAIN (arg))
19600 rtx incoming = DECL_INCOMING_RTL (arg);
19602 if (incoming && ix86_check_avx_upper_register (incoming))
19603 return AVX_U128_DIRTY;
19606 return AVX_U128_CLEAN;
19609 /* Return a mode that ENTITY is assumed to be
19610 switched to at function entry. */
19612 static int
19613 ix86_mode_entry (int entity)
19615 switch (entity)
19617 case X86_DIRFLAG:
19618 return ix86_dirflag_mode_entry ();
19619 case AVX_U128:
19620 return ix86_avx_u128_mode_entry ();
19621 case I387_TRUNC:
19622 case I387_FLOOR:
19623 case I387_CEIL:
19624 case I387_MASK_PM:
19625 return I387_CW_ANY;
19626 default:
19627 gcc_unreachable ();
19631 static int
19632 ix86_avx_u128_mode_exit (void)
19634 rtx reg = crtl->return_rtx;
19636 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19637 or 512bit modes used in the function return register. */
19638 if (reg && ix86_check_avx_upper_register (reg))
19639 return AVX_U128_DIRTY;
19641 return AVX_U128_CLEAN;
19644 /* Return a mode that ENTITY is assumed to be
19645 switched to at function exit. */
19647 static int
19648 ix86_mode_exit (int entity)
19650 switch (entity)
19652 case X86_DIRFLAG:
19653 return X86_DIRFLAG_ANY;
19654 case AVX_U128:
19655 return ix86_avx_u128_mode_exit ();
19656 case I387_TRUNC:
19657 case I387_FLOOR:
19658 case I387_CEIL:
19659 case I387_MASK_PM:
19660 return I387_CW_ANY;
19661 default:
19662 gcc_unreachable ();
19666 static int
19667 ix86_mode_priority (int, int n)
19669 return n;
19672 /* Output code to initialize control word copies used by trunc?f?i and
19673 rounding patterns. CURRENT_MODE is set to current control word,
19674 while NEW_MODE is set to new control word. */
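/* Rounding control is held in bits 10-11 of the x87 control word
   (00 = nearest, 01 = down, 10 = up, 11 = truncate), which is why the
   masks 0x0400, 0x0800 and 0x0c00 are used below; 0x0020 is the
   precision exception mask bit used for nearbyint.  */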
19676 static void
19677 emit_i387_cw_initialization (int mode)
19679 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19680 rtx new_mode;
19682 enum ix86_stack_slot slot;
19684 rtx reg = gen_reg_rtx (HImode);
19686 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19687 emit_move_insn (reg, copy_rtx (stored_mode));
19689 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19690 || optimize_insn_for_size_p ())
19692 switch (mode)
19694 case I387_CW_TRUNC:
19695 /* round toward zero (truncate) */
19696 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19697 slot = SLOT_CW_TRUNC;
19698 break;
19700 case I387_CW_FLOOR:
19701 /* round down toward -oo */
19702 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19703 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19704 slot = SLOT_CW_FLOOR;
19705 break;
19707 case I387_CW_CEIL:
19708 /* round up toward +oo */
19709 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19710 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19711 slot = SLOT_CW_CEIL;
19712 break;
19714 case I387_CW_MASK_PM:
19715 /* mask precision exception for nearbyint() */
19716 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19717 slot = SLOT_CW_MASK_PM;
19718 break;
19720 default:
19721 gcc_unreachable ();
19724 else
19726 switch (mode)
19728 case I387_CW_TRUNC:
19729 /* round toward zero (truncate) */
19730 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19731 slot = SLOT_CW_TRUNC;
19732 break;
19734 case I387_CW_FLOOR:
19735 /* round down toward -oo */
19736 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19737 slot = SLOT_CW_FLOOR;
19738 break;
19740 case I387_CW_CEIL:
19741 /* round up toward +oo */
19742 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19743 slot = SLOT_CW_CEIL;
19744 break;
19746 case I387_CW_MASK_PM:
19747 /* mask precision exception for nearbyint() */
19748 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19749 slot = SLOT_CW_MASK_PM;
19750 break;
19752 default:
19753 gcc_unreachable ();
19757 gcc_assert (slot < MAX_386_STACK_LOCALS);
19759 new_mode = assign_386_stack_local (HImode, slot);
19760 emit_move_insn (new_mode, reg);
19763 /* Emit vzeroupper. */
19765 void
19766 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19768 int i;
19770 /* Cancel automatic vzeroupper insertion if there are
19771 live call-saved SSE registers at the insertion point. */
19773 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19774 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19775 return;
19777 if (TARGET_64BIT)
19778 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19779 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19780 return;
19782 emit_insn (gen_avx_vzeroupper ());
19787 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19788 is the set of hard registers live at the point where the insn(s)
19789 are to be inserted. */
19791 static void
19792 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19793 HARD_REG_SET regs_live)
19795 switch (entity)
19797 case X86_DIRFLAG:
19798 if (mode == X86_DIRFLAG_RESET)
19799 emit_insn (gen_cld ());
19800 break;
19801 case AVX_U128:
19802 if (mode == AVX_U128_CLEAN)
19803 ix86_avx_emit_vzeroupper (regs_live);
19804 break;
19805 case I387_TRUNC:
19806 case I387_FLOOR:
19807 case I387_CEIL:
19808 case I387_MASK_PM:
19809 if (mode != I387_CW_ANY
19810 && mode != I387_CW_UNINITIALIZED)
19811 emit_i387_cw_initialization (mode);
19812 break;
19813 default:
19814 gcc_unreachable ();
19818 /* Output code for INSN to convert a float to a signed int. OPERANDS
19819 are the insn operands. The output may be [HSD]Imode and the input
19820 operand may be [SDX]Fmode. */
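/* For a DImode result with a live stack top this emits roughly:
   "fld %y1" to duplicate st(0), optionally "fldcw %3" to install the
   truncating control word, "fistp%Z0 %0", and finally "fldcw %2" to
   restore the original control word.  */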
19822 const char *
19823 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19825 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19826 bool dimode_p = GET_MODE (operands[0]) == DImode;
19827 int round_mode = get_attr_i387_cw (insn);
19829 static char buf[40];
19830 const char *p;
19832 /* Jump through a hoop or two for DImode, since the hardware has no
19833 non-popping instruction. We used to do this a different way, but
19834 that was somewhat fragile and broke with post-reload splitters. */
19835 if ((dimode_p || fisttp) && !stack_top_dies)
19836 output_asm_insn ("fld\t%y1", operands);
19838 gcc_assert (STACK_TOP_P (operands[1]));
19839 gcc_assert (MEM_P (operands[0]));
19840 gcc_assert (GET_MODE (operands[1]) != TFmode);
19842 if (fisttp)
19843 return "fisttp%Z0\t%0";
19845 strcpy (buf, "fist");
19847 if (round_mode != I387_CW_ANY)
19848 output_asm_insn ("fldcw\t%3", operands);
19850 p = "p%Z0\t%0";
19851 strcat (buf, p + !(stack_top_dies || dimode_p));
19853 output_asm_insn (buf, operands);
19855 if (round_mode != I387_CW_ANY)
19856 output_asm_insn ("fldcw\t%2", operands);
19858 return "";
19861 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19862 have the values zero or one, indicates the ffreep insn's operand
19863 from the OPERANDS array. */
19865 static const char *
19866 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19868 if (TARGET_USE_FFREEP)
19869 #ifdef HAVE_AS_IX86_FFREEP
19870 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19871 #else
19873 static char retval[32];
19874 int regno = REGNO (operands[opno]);
19876 gcc_assert (STACK_REGNO_P (regno));
19878 regno -= FIRST_STACK_REG;
19880 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19881 return retval;
19883 #endif
19885 return opno ? "fstp\t%y1" : "fstp\t%y0";
19889 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19890 should be used. UNORDERED_P is true when fucom should be used. */
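/* With EFLAGS_P the comparison result goes directly to the flags,
   e.g. "fcomip\t{%y1, %0|%0, %y1}" when the stack top dies; otherwise
   a fucom/fcom/ftst/ficom form is emitted followed by "fnstsw\t%0"
   to copy the FPU status word into the integer operand.  */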
19892 const char *
19893 output_fp_compare (rtx_insn *insn, rtx *operands,
19894 bool eflags_p, bool unordered_p)
19896 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19897 bool stack_top_dies;
19899 static char buf[40];
19900 const char *p;
19902 gcc_assert (STACK_TOP_P (xops[0]));
19904 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19906 if (eflags_p)
19908 p = unordered_p ? "fucomi" : "fcomi";
19909 strcpy (buf, p);
19911 p = "p\t{%y1, %0|%0, %y1}";
19912 strcat (buf, p + !stack_top_dies);
19914 return buf;
19917 if (STACK_REG_P (xops[1])
19918 && stack_top_dies
19919 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19921 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19923 /* If both the top of the 387 stack die, and the other operand
19924 is also a stack register that dies, then this must be a
19925 `fcompp' float compare. */
19926 p = unordered_p ? "fucompp" : "fcompp";
19927 strcpy (buf, p);
19929 else if (const0_operand (xops[1], VOIDmode))
19931 gcc_assert (!unordered_p);
19932 strcpy (buf, "ftst");
19934 else
19936 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19938 gcc_assert (!unordered_p);
19939 p = "ficom";
19941 else
19942 p = unordered_p ? "fucom" : "fcom";
19944 strcpy (buf, p);
19946 p = "p%Z2\t%y2";
19947 strcat (buf, p + !stack_top_dies);
19950 output_asm_insn (buf, operands);
19951 return "fnstsw\t%0";
19954 void
19955 ix86_output_addr_vec_elt (FILE *file, int value)
19957 const char *directive = ASM_LONG;
19959 #ifdef ASM_QUAD
19960 if (TARGET_LP64)
19961 directive = ASM_QUAD;
19962 #else
19963 gcc_assert (!TARGET_64BIT);
19964 #endif
19966 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19969 void
19970 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19972 const char *directive = ASM_LONG;
19974 #ifdef ASM_QUAD
19975 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19976 directive = ASM_QUAD;
19977 #else
19978 gcc_assert (!TARGET_64BIT);
19979 #endif
19980 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19981 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19982 fprintf (file, "%s%s%d-%s%d\n",
19983 directive, LPREFIX, value, LPREFIX, rel);
19984 else if (HAVE_AS_GOTOFF_IN_DATA)
19985 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19986 #if TARGET_MACHO
19987 else if (TARGET_MACHO)
19989 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19990 machopic_output_function_base_name (file);
19991 putc ('\n', file);
19993 #endif
19994 else
19995 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19996 GOT_SYMBOL_NAME, LPREFIX, value);
19999 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20000 for the target. */
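/* The xor form clobbers the flags, hence the PARALLEL with a CLOBBER
   of FLAGS_REG below; the plain "mov $0" form is used only when
   TARGET_USE_MOV0 is set and we are not optimizing for size.  */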
20002 void
20003 ix86_expand_clear (rtx dest)
20005 rtx tmp;
20007 /* We play register width games, which are only valid after reload. */
20008 gcc_assert (reload_completed);
20010 /* Avoid HImode and its attendant prefix byte. */
20011 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20012 dest = gen_rtx_REG (SImode, REGNO (dest));
20013 tmp = gen_rtx_SET (dest, const0_rtx);
20015 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20017 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20018 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20021 emit_insn (tmp);
20024 void
20025 ix86_expand_move (machine_mode mode, rtx operands[])
20027 rtx op0, op1;
20028 rtx tmp, addend = NULL_RTX;
20029 enum tls_model model;
20031 op0 = operands[0];
20032 op1 = operands[1];
20034 switch (GET_CODE (op1))
20036 case CONST:
20037 tmp = XEXP (op1, 0);
20039 if (GET_CODE (tmp) != PLUS
20040 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20041 break;
20043 op1 = XEXP (tmp, 0);
20044 addend = XEXP (tmp, 1);
20045 /* FALLTHRU */
20047 case SYMBOL_REF:
20048 model = SYMBOL_REF_TLS_MODEL (op1);
20050 if (model)
20051 op1 = legitimize_tls_address (op1, model, true);
20052 else if (ix86_force_load_from_GOT_p (op1))
20054 /* Load the external function address via GOT slot to avoid PLT. */
20055 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20056 (TARGET_64BIT
20057 ? UNSPEC_GOTPCREL
20058 : UNSPEC_GOT));
20059 op1 = gen_rtx_CONST (Pmode, op1);
20060 op1 = gen_const_mem (Pmode, op1);
20061 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20063 else
20065 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20066 if (tmp)
20068 op1 = tmp;
20069 if (!addend)
20070 break;
20072 else
20074 op1 = operands[1];
20075 break;
20079 if (addend)
20081 op1 = force_operand (op1, NULL_RTX);
20082 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20083 op0, 1, OPTAB_DIRECT);
20085 else
20086 op1 = force_operand (op1, op0);
20088 if (op1 == op0)
20089 return;
20091 op1 = convert_to_mode (mode, op1, 1);
20093 default:
20094 break;
20097 if ((flag_pic || MACHOPIC_INDIRECT)
20098 && symbolic_operand (op1, mode))
20100 if (TARGET_MACHO && !TARGET_64BIT)
20102 #if TARGET_MACHO
20103 /* dynamic-no-pic */
20104 if (MACHOPIC_INDIRECT)
20106 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20107 ? op0 : gen_reg_rtx (Pmode);
20108 op1 = machopic_indirect_data_reference (op1, temp);
20109 if (MACHOPIC_PURE)
20110 op1 = machopic_legitimize_pic_address (op1, mode,
20111 temp == op1 ? 0 : temp);
20113 if (op0 != op1 && GET_CODE (op0) != MEM)
20115 rtx insn = gen_rtx_SET (op0, op1);
20116 emit_insn (insn);
20117 return;
20119 if (GET_CODE (op0) == MEM)
20120 op1 = force_reg (Pmode, op1);
20121 else
20123 rtx temp = op0;
20124 if (GET_CODE (temp) != REG)
20125 temp = gen_reg_rtx (Pmode);
20126 temp = legitimize_pic_address (op1, temp);
20127 if (temp == op0)
20128 return;
20129 op1 = temp;
20131 /* dynamic-no-pic */
20132 #endif
20134 else
20136 if (MEM_P (op0))
20137 op1 = force_reg (mode, op1);
20138 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20140 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20141 op1 = legitimize_pic_address (op1, reg);
20142 if (op0 == op1)
20143 return;
20144 op1 = convert_to_mode (mode, op1, 1);
20148 else
20150 if (MEM_P (op0)
20151 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20152 || !push_operand (op0, mode))
20153 && MEM_P (op1))
20154 op1 = force_reg (mode, op1);
20156 if (push_operand (op0, mode)
20157 && ! general_no_elim_operand (op1, mode))
20158 op1 = copy_to_mode_reg (mode, op1);
20160 /* Force large constants in 64bit compilation into register
20161 to get them CSEed. */
20162 if (can_create_pseudo_p ()
20163 && (mode == DImode) && TARGET_64BIT
20164 && immediate_operand (op1, mode)
20165 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20166 && !register_operand (op0, mode)
20167 && optimize)
20168 op1 = copy_to_mode_reg (mode, op1);
20170 if (can_create_pseudo_p ()
20171 && CONST_DOUBLE_P (op1))
20173 /* If we are loading a floating point constant to a register,
20174 force the value to memory now, since we'll get better code
20175 out of the back end. */
20177 op1 = validize_mem (force_const_mem (mode, op1));
20178 if (!register_operand (op0, mode))
20180 rtx temp = gen_reg_rtx (mode);
20181 emit_insn (gen_rtx_SET (temp, op1));
20182 emit_move_insn (op0, temp);
20183 return;
20188 emit_insn (gen_rtx_SET (op0, op1));
20191 void
20192 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20194 rtx op0 = operands[0], op1 = operands[1];
20195 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20196 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
20197 unsigned int align = (TARGET_IAMCU
20198 ? GET_MODE_BITSIZE (mode)
20199 : GET_MODE_ALIGNMENT (mode));
20201 if (push_operand (op0, VOIDmode))
20202 op0 = emit_move_resolve_push (mode, op0);
20204 /* Force constants other than zero into memory. We do not know how
20205 the instructions used to build constants modify the upper 64 bits
20206 of the register; once we have that information we may be able
20207 to handle some of them more efficiently. */
20208 if (can_create_pseudo_p ()
20209 && (CONSTANT_P (op1)
20210 || (SUBREG_P (op1)
20211 && CONSTANT_P (SUBREG_REG (op1))))
20212 && ((register_operand (op0, mode)
20213 && !standard_sse_constant_p (op1, mode))
20214 /* ix86_expand_vector_move_misalign() does not like constants. */
20215 || (SSE_REG_MODE_P (mode)
20216 && MEM_P (op0)
20217 && MEM_ALIGN (op0) < align)))
20219 if (SUBREG_P (op1))
20221 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20222 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20223 if (r)
20224 r = validize_mem (r);
20225 else
20226 r = force_reg (imode, SUBREG_REG (op1));
20227 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20229 else
20230 op1 = validize_mem (force_const_mem (mode, op1));
20233 /* We need to check memory alignment for SSE mode since an attribute
20234 can make operands unaligned. */
20235 if (can_create_pseudo_p ()
20236 && SSE_REG_MODE_P (mode)
20237 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20238 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20240 rtx tmp[2];
20242 /* ix86_expand_vector_move_misalign() does not like both
20243 arguments in memory. */
20244 if (!register_operand (op0, mode)
20245 && !register_operand (op1, mode))
20246 op1 = force_reg (mode, op1);
20248 tmp[0] = op0; tmp[1] = op1;
20249 ix86_expand_vector_move_misalign (mode, tmp);
20250 return;
20253 /* Make operand1 a register if it isn't already. */
20254 if (can_create_pseudo_p ()
20255 && !register_operand (op0, mode)
20256 && !register_operand (op1, mode))
20258 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20259 return;
20262 emit_insn (gen_rtx_SET (op0, op1));
20265 /* Split 32-byte AVX unaligned load and store if needed. */
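/* When splitting is enabled, an unaligned 256-bit load is expanded
   roughly as a 128-bit load of the low half followed by a VEC_CONCAT
   with the high half from memory, and a store as two vextractf128
   stores of the low and high halves.  */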
20267 static void
20268 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20270 rtx m;
20271 rtx (*extract) (rtx, rtx, rtx);
20272 machine_mode mode;
20274 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20275 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20277 emit_insn (gen_rtx_SET (op0, op1));
20278 return;
20281 rtx orig_op0 = NULL_RTX;
20282 mode = GET_MODE (op0);
20283 switch (GET_MODE_CLASS (mode))
20285 case MODE_VECTOR_INT:
20286 case MODE_INT:
20287 if (mode != V32QImode)
20289 if (!MEM_P (op0))
20291 orig_op0 = op0;
20292 op0 = gen_reg_rtx (V32QImode);
20294 else
20295 op0 = gen_lowpart (V32QImode, op0);
20296 op1 = gen_lowpart (V32QImode, op1);
20297 mode = V32QImode;
20299 break;
20300 case MODE_VECTOR_FLOAT:
20301 break;
20302 default:
20303 gcc_unreachable ();
20306 switch (mode)
20308 default:
20309 gcc_unreachable ();
20310 case E_V32QImode:
20311 extract = gen_avx_vextractf128v32qi;
20312 mode = V16QImode;
20313 break;
20314 case E_V8SFmode:
20315 extract = gen_avx_vextractf128v8sf;
20316 mode = V4SFmode;
20317 break;
20318 case E_V4DFmode:
20319 extract = gen_avx_vextractf128v4df;
20320 mode = V2DFmode;
20321 break;
20324 if (MEM_P (op1))
20326 rtx r = gen_reg_rtx (mode);
20327 m = adjust_address (op1, mode, 0);
20328 emit_move_insn (r, m);
20329 m = adjust_address (op1, mode, 16);
20330 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20331 emit_move_insn (op0, r);
20333 else if (MEM_P (op0))
20335 m = adjust_address (op0, mode, 0);
20336 emit_insn (extract (m, op1, const0_rtx));
20337 m = adjust_address (op0, mode, 16);
20338 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20340 else
20341 gcc_unreachable ();
20343 if (orig_op0)
20344 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20347 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20348 straight to ix86_expand_vector_move. */
20349 /* Code generation for scalar reg-reg moves of single and double precision data:
20350 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20351 movaps reg, reg
20352 else
20353 movss reg, reg
20354 if (x86_sse_partial_reg_dependency == true)
20355 movapd reg, reg
20356 else
20357 movsd reg, reg
20359 Code generation for scalar loads of double precision data:
20360 if (x86_sse_split_regs == true)
20361 movlpd mem, reg (gas syntax)
20362 else
20363 movsd mem, reg
20365 Code generation for unaligned packed loads of single precision data
20366 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20367 if (x86_sse_unaligned_move_optimal)
20368 movups mem, reg
20370 if (x86_sse_partial_reg_dependency == true)
20372 xorps reg, reg
20373 movlps mem, reg
20374 movhps mem+8, reg
20376 else
20378 movlps mem, reg
20379 movhps mem+8, reg
20382 Code generation for unaligned packed loads of double precision data
20383 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20384 if (x86_sse_unaligned_move_optimal)
20385 movupd mem, reg
20387 if (x86_sse_split_regs == true)
20389 movlpd mem, reg
20390 movhpd mem+8, reg
20392 else
20394 movsd mem, reg
20395 movhpd mem+8, reg
20399 void
20400 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20402 rtx op0, op1, m;
20404 op0 = operands[0];
20405 op1 = operands[1];
20407 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20408 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20410 emit_insn (gen_rtx_SET (op0, op1));
20411 return;
20414 if (TARGET_AVX)
20416 if (GET_MODE_SIZE (mode) == 32)
20417 ix86_avx256_split_vector_move_misalign (op0, op1);
20418 else
20419 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20420 emit_insn (gen_rtx_SET (op0, op1));
20421 return;
20424 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20425 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20427 emit_insn (gen_rtx_SET (op0, op1));
20428 return;
20431 /* ??? If we have typed data, then it would appear that using
20432 movdqu is the only way to get unaligned data loaded with
20433 integer type. */
20434 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20436 emit_insn (gen_rtx_SET (op0, op1));
20437 return;
20440 if (MEM_P (op1))
20442 if (TARGET_SSE2 && mode == V2DFmode)
20444 rtx zero;
20446 /* When SSE registers are split into halves, we can avoid
20447 writing to the top half twice. */
20448 if (TARGET_SSE_SPLIT_REGS)
20450 emit_clobber (op0);
20451 zero = op0;
20453 else
20455 /* ??? Not sure about the best option for the Intel chips.
20456 The following would seem to satisfy; the register is
20457 entirely cleared, breaking the dependency chain. We
20458 then store to the upper half, with a dependency depth
20459 of one. A rumor has it that Intel recommends two movsd
20460 followed by an unpacklpd, but this is unconfirmed. And
20461 given that the dependency depth of the unpacklpd would
20462 still be one, I'm not sure why this would be better. */
20463 zero = CONST0_RTX (V2DFmode);
20466 m = adjust_address (op1, DFmode, 0);
20467 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20468 m = adjust_address (op1, DFmode, 8);
20469 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20471 else
20473 rtx t;
20475 if (mode != V4SFmode)
20476 t = gen_reg_rtx (V4SFmode);
20477 else
20478 t = op0;
20480 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20481 emit_move_insn (t, CONST0_RTX (V4SFmode));
20482 else
20483 emit_clobber (t);
20485 m = adjust_address (op1, V2SFmode, 0);
20486 emit_insn (gen_sse_loadlps (t, t, m));
20487 m = adjust_address (op1, V2SFmode, 8);
20488 emit_insn (gen_sse_loadhps (t, t, m));
20489 if (mode != V4SFmode)
20490 emit_move_insn (op0, gen_lowpart (mode, t));
20493 else if (MEM_P (op0))
20495 if (TARGET_SSE2 && mode == V2DFmode)
20497 m = adjust_address (op0, DFmode, 0);
20498 emit_insn (gen_sse2_storelpd (m, op1));
20499 m = adjust_address (op0, DFmode, 8);
20500 emit_insn (gen_sse2_storehpd (m, op1));
20502 else
20504 if (mode != V4SFmode)
20505 op1 = gen_lowpart (V4SFmode, op1);
20507 m = adjust_address (op0, V2SFmode, 0);
20508 emit_insn (gen_sse_storelps (m, op1));
20509 m = adjust_address (op0, V2SFmode, 8);
20510 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20513 else
20514 gcc_unreachable ();
20517 /* Helper function of ix86_fixup_binary_operands to canonicalize
20518 operand order. Returns true if the operands should be swapped. */
20520 static bool
20521 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20522 rtx operands[])
20524 rtx dst = operands[0];
20525 rtx src1 = operands[1];
20526 rtx src2 = operands[2];
20528 /* If the operation is not commutative, we can't do anything. */
20529 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20530 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20531 return false;
20533 /* Highest priority is that src1 should match dst. */
20534 if (rtx_equal_p (dst, src1))
20535 return false;
20536 if (rtx_equal_p (dst, src2))
20537 return true;
20539 /* Next highest priority is that immediate constants come second. */
20540 if (immediate_operand (src2, mode))
20541 return false;
20542 if (immediate_operand (src1, mode))
20543 return true;
20545 /* Lowest priority is that memory references should come second. */
20546 if (MEM_P (src2))
20547 return false;
20548 if (MEM_P (src1))
20549 return true;
20551 return false;
20555 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20556 destination to use for the operation. If different from the true
20557 destination in operands[0], a copy operation will be required. */
20560 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20561 rtx operands[])
20563 rtx dst = operands[0];
20564 rtx src1 = operands[1];
20565 rtx src2 = operands[2];
20567 /* Canonicalize operand order. */
20568 if (ix86_swap_binary_operands_p (code, mode, operands))
20570 /* It is invalid to swap operands of different modes. */
20571 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20573 std::swap (src1, src2);
20576 /* Both source operands cannot be in memory. */
20577 if (MEM_P (src1) && MEM_P (src2))
20579 /* Optimization: Only read from memory once. */
20580 if (rtx_equal_p (src1, src2))
20582 src2 = force_reg (mode, src2);
20583 src1 = src2;
20585 else if (rtx_equal_p (dst, src1))
20586 src2 = force_reg (mode, src2);
20587 else
20588 src1 = force_reg (mode, src1);
20591 /* If the destination is memory, and we do not have matching source
20592 operands, do things in registers. */
20593 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20594 dst = gen_reg_rtx (mode);
20596 /* Source 1 cannot be a constant. */
20597 if (CONSTANT_P (src1))
20598 src1 = force_reg (mode, src1);
20600 /* Source 1 cannot be a non-matching memory. */
20601 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20602 src1 = force_reg (mode, src1);
20604 /* Improve address combine. */
20605 if (code == PLUS
20606 && GET_MODE_CLASS (mode) == MODE_INT
20607 && MEM_P (src2))
20608 src2 = force_reg (mode, src2);
20610 operands[1] = src1;
20611 operands[2] = src2;
20612 return dst;
20615 /* Similarly, but assume that the destination has already been
20616 set up properly. */
20618 void
20619 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20620 machine_mode mode, rtx operands[])
20622 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20623 gcc_assert (dst == operands[0]);
20626 /* Attempt to expand a binary operator. Make the expansion closer to the
20627 actual machine, than just general_operand, which will allow 3 separate
20628 memory references (one output, two input) in a single insn. */
20630 void
20631 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20632 rtx operands[])
20634 rtx src1, src2, dst, op, clob;
20636 dst = ix86_fixup_binary_operands (code, mode, operands);
20637 src1 = operands[1];
20638 src2 = operands[2];
20640 /* Emit the instruction. */
20642 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20644 if (reload_completed
20645 && code == PLUS
20646 && !rtx_equal_p (dst, src1))
20648 /* This is going to be an LEA; avoid splitting it later. */
20649 emit_insn (op);
20651 else
20653 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20654 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20657 /* Fix up the destination if needed. */
20658 if (dst != operands[0])
20659 emit_move_insn (operands[0], dst);
20662 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20663 the given OPERANDS. */
20665 void
20666 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20667 rtx operands[])
20669 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20670 if (SUBREG_P (operands[1]))
20672 op1 = operands[1];
20673 op2 = operands[2];
20675 else if (SUBREG_P (operands[2]))
20677 op1 = operands[2];
20678 op2 = operands[1];
20680 /* Optimize (__m128i) d | (__m128i) e and similar code
20681 when d and e are float vectors into float vector logical
20682 insn. In C/C++ without using intrinsics there is no other way
20683 to express vector logical operation on float vectors than
20684 to cast them temporarily to integer vectors. */
20685 if (op1
20686 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20687 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20688 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20689 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20690 && SUBREG_BYTE (op1) == 0
20691 && (GET_CODE (op2) == CONST_VECTOR
20692 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20693 && SUBREG_BYTE (op2) == 0))
20694 && can_create_pseudo_p ())
20696 rtx dst;
20697 switch (GET_MODE (SUBREG_REG (op1)))
20699 case E_V4SFmode:
20700 case E_V8SFmode:
20701 case E_V16SFmode:
20702 case E_V2DFmode:
20703 case E_V4DFmode:
20704 case E_V8DFmode:
20705 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20706 if (GET_CODE (op2) == CONST_VECTOR)
20708 op2 = gen_lowpart (GET_MODE (dst), op2);
20709 op2 = force_reg (GET_MODE (dst), op2);
20711 else
20713 op1 = operands[1];
20714 op2 = SUBREG_REG (operands[2]);
20715 if (!vector_operand (op2, GET_MODE (dst)))
20716 op2 = force_reg (GET_MODE (dst), op2);
20718 op1 = SUBREG_REG (op1);
20719 if (!vector_operand (op1, GET_MODE (dst)))
20720 op1 = force_reg (GET_MODE (dst), op1);
20721 emit_insn (gen_rtx_SET (dst,
20722 gen_rtx_fmt_ee (code, GET_MODE (dst),
20723 op1, op2)));
20724 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20725 return;
20726 default:
20727 break;
20730 if (!vector_operand (operands[1], mode))
20731 operands[1] = force_reg (mode, operands[1]);
20732 if (!vector_operand (operands[2], mode))
20733 operands[2] = force_reg (mode, operands[2]);
20734 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20735 emit_insn (gen_rtx_SET (operands[0],
20736 gen_rtx_fmt_ee (code, mode, operands[1],
20737 operands[2])));
20740 /* Return TRUE or FALSE depending on whether the binary operator meets the
20741 appropriate constraints. */
20743 bool
20744 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20745 rtx operands[3])
20747 rtx dst = operands[0];
20748 rtx src1 = operands[1];
20749 rtx src2 = operands[2];
20751 /* Both source operands cannot be in memory. */
20752 if (MEM_P (src1) && MEM_P (src2))
20753 return false;
20755 /* Canonicalize operand order for commutative operators. */
20756 if (ix86_swap_binary_operands_p (code, mode, operands))
20757 std::swap (src1, src2);
20759 /* If the destination is memory, we must have a matching source operand. */
20760 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20761 return false;
20763 /* Source 1 cannot be a constant. */
20764 if (CONSTANT_P (src1))
20765 return false;
20767 /* Source 1 cannot be a non-matching memory. */
20768 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20769 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20770 return (code == AND
20771 && (mode == HImode
20772 || mode == SImode
20773 || (TARGET_64BIT && mode == DImode))
20774 && satisfies_constraint_L (src2));
20776 return true;
20779 /* Attempt to expand a unary operator. Make the expansion closer to the
20780 actual machine, than just general_operand, which will allow 2 separate
20781 memory references (one output, one input) in a single insn. */
20783 void
20784 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20785 rtx operands[])
20787 bool matching_memory = false;
20788 rtx src, dst, op, clob;
20790 dst = operands[0];
20791 src = operands[1];
20793 /* If the destination is memory, and we do not have matching source
20794 operands, do things in registers. */
20795 if (MEM_P (dst))
20797 if (rtx_equal_p (dst, src))
20798 matching_memory = true;
20799 else
20800 dst = gen_reg_rtx (mode);
20803 /* When source operand is memory, destination must match. */
20804 if (MEM_P (src) && !matching_memory)
20805 src = force_reg (mode, src);
20807 /* Emit the instruction. */
20809 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20811 if (code == NOT)
20812 emit_insn (op);
20813 else
20815 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20816 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20819 /* Fix up the destination if needed. */
20820 if (dst != operands[0])
20821 emit_move_insn (operands[0], dst);
20824 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20825 divisor are within the range [0-255]. */
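/* The generated sequence roughly ORs dividend and divisor into a
   scratch register and tests it against ~0xFF; if no bits above bit 7
   are set it branches to a path using the 8-bit udivmodhiqi3
   (quotient in AL, remainder in AH), otherwise it falls through to
   the full-width divide.  */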
20827 void
20828 ix86_split_idivmod (machine_mode mode, rtx operands[],
20829 bool signed_p)
20831 rtx_code_label *end_label, *qimode_label;
20832 rtx div, mod;
20833 rtx_insn *insn;
20834 rtx scratch, tmp0, tmp1, tmp2;
20835 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20836 rtx (*gen_zero_extend) (rtx, rtx);
20837 rtx (*gen_test_ccno_1) (rtx, rtx);
20839 switch (mode)
20841 case E_SImode:
20842 if (GET_MODE (operands[0]) == SImode)
20844 if (GET_MODE (operands[1]) == SImode)
20845 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20846 else
20847 gen_divmod4_1
20848 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20849 gen_zero_extend = gen_zero_extendqisi2;
20851 else
20853 gen_divmod4_1
20854 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20855 gen_zero_extend = gen_zero_extendqidi2;
20857 gen_test_ccno_1 = gen_testsi_ccno_1;
20858 break;
20859 case E_DImode:
20860 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20861 gen_test_ccno_1 = gen_testdi_ccno_1;
20862 gen_zero_extend = gen_zero_extendqidi2;
20863 break;
20864 default:
20865 gcc_unreachable ();
20868 end_label = gen_label_rtx ();
20869 qimode_label = gen_label_rtx ();
20871 scratch = gen_reg_rtx (mode);
20873 /* Use 8bit unsigned divmod if dividend and divisor are within
20874 the range [0-255]. */
20875 emit_move_insn (scratch, operands[2]);
20876 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20877 scratch, 1, OPTAB_DIRECT);
20878 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20879 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20880 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20881 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20882 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20883 pc_rtx);
20884 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20885 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20886 JUMP_LABEL (insn) = qimode_label;
20888 /* Generate the original signed/unsigned divmod. */
20889 div = gen_divmod4_1 (operands[0], operands[1],
20890 operands[2], operands[3]);
20891 emit_insn (div);
20893 /* Branch to the end. */
20894 emit_jump_insn (gen_jump (end_label));
20895 emit_barrier ();
20897 /* Generate 8bit unsigned divide. */
20898 emit_label (qimode_label);
20899 /* Don't use operands[0] for result of 8bit divide since not all
20900 registers support QImode ZERO_EXTRACT. */
20901 tmp0 = lowpart_subreg (HImode, scratch, mode);
20902 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20903 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20904 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20906 if (signed_p)
20908 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20909 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20911 else
20913 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20914 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20916 if (mode == SImode)
20918 if (GET_MODE (operands[0]) != SImode)
20919 div = gen_rtx_ZERO_EXTEND (DImode, div);
20920 if (GET_MODE (operands[1]) != SImode)
20921 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20924 /* Extract remainder from AH. */
20925 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20926 tmp0, GEN_INT (8), GEN_INT (8));
20927 if (REG_P (operands[1]))
20928 insn = emit_move_insn (operands[1], tmp1);
20929 else
20931 /* Need a new scratch register since the old one holds the result
20932 of the 8bit divide. */
20933 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20934 emit_move_insn (scratch, tmp1);
20935 insn = emit_move_insn (operands[1], scratch);
20937 set_unique_reg_note (insn, REG_EQUAL, mod);
20939 /* Zero extend quotient from AL. */
20940 tmp1 = gen_lowpart (QImode, tmp0);
20941 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20942 set_unique_reg_note (insn, REG_EQUAL, div);
20944 emit_label (end_label);
20947 #define LEA_MAX_STALL (3)
20948 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20950 /* Increase given DISTANCE in half-cycles according to
20951 dependencies between PREV and NEXT instructions.
20952 Add 1 half-cycle if there is no dependency and
20953 go to the next cycle if there is some dependency. */
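/* E.g. when PREV defines a register that NEXT uses, the distance is
   rounded up to the start of the next cycle (rounded to even, plus
   two half-cycles); otherwise only one half-cycle is added.  */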
20955 static unsigned int
20956 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20958 df_ref def, use;
20960 if (!prev || !next)
20961 return distance + (distance & 1) + 2;
20963 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20964 return distance + 1;
20966 FOR_EACH_INSN_USE (use, next)
20967 FOR_EACH_INSN_DEF (def, prev)
20968 if (!DF_REF_IS_ARTIFICIAL (def)
20969 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20970 return distance + (distance & 1) + 2;
20972 return distance + 1;
20975 /* Function checks if instruction INSN defines register number
20976 REGNO1 or REGNO2. */
20978 static bool
20979 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20980 rtx_insn *insn)
20982 df_ref def;
20984 FOR_EACH_INSN_DEF (def, insn)
20985 if (DF_REF_REG_DEF_P (def)
20986 && !DF_REF_IS_ARTIFICIAL (def)
20987 && (regno1 == DF_REF_REGNO (def)
20988 || regno2 == DF_REF_REGNO (def)))
20989 return true;
20991 return false;
20994 /* Function checks if instruction INSN uses register number
20995 REGNO as part of an address expression. */
20997 static bool
20998 insn_uses_reg_mem (unsigned int regno, rtx insn)
21000 df_ref use;
21002 FOR_EACH_INSN_USE (use, insn)
21003 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21004 return true;
21006 return false;
21009 /* Search backward for a non-agu definition of register number REGNO1
21010 or register number REGNO2 in the basic block, starting from instruction
21011 START up to the head of the basic block or instruction INSN.
21013 Set *FOUND to true if a definition was found
21014 and to false otherwise.
21016 The distance in half-cycles between START and the found instruction
21017 or the head of the BB is added to DISTANCE and returned. */
21019 static int
21020 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21021 rtx_insn *insn, int distance,
21022 rtx_insn *start, bool *found)
21024 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21025 rtx_insn *prev = start;
21026 rtx_insn *next = NULL;
21028 *found = false;
21030 while (prev
21031 && prev != insn
21032 && distance < LEA_SEARCH_THRESHOLD)
21034 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21036 distance = increase_distance (prev, next, distance);
21037 if (insn_defines_reg (regno1, regno2, prev))
21039 if (recog_memoized (prev) < 0
21040 || get_attr_type (prev) != TYPE_LEA)
21042 *found = true;
21043 return distance;
21047 next = prev;
21049 if (prev == BB_HEAD (bb))
21050 break;
21052 prev = PREV_INSN (prev);
21055 return distance;
21058 /* Search backward for non-agu definition of register number REGNO1
21059 or register number REGNO2 in INSN's basic block until
21060 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21061 2. Reach neighbor BBs boundary, or
21062 3. Reach agu definition.
21063 Returns the distance between the non-agu definition point and INSN.
21064 If no definition point, returns -1. */
21066 static int
21067 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21068 rtx_insn *insn)
21070 basic_block bb = BLOCK_FOR_INSN (insn);
21071 int distance = 0;
21072 bool found = false;
21074 if (insn != BB_HEAD (bb))
21075 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21076 distance, PREV_INSN (insn),
21077 &found);
21079 if (!found && distance < LEA_SEARCH_THRESHOLD)
21081 edge e;
21082 edge_iterator ei;
21083 bool simple_loop = false;
21085 FOR_EACH_EDGE (e, ei, bb->preds)
21086 if (e->src == bb)
21088 simple_loop = true;
21089 break;
21092 if (simple_loop)
21093 distance = distance_non_agu_define_in_bb (regno1, regno2,
21094 insn, distance,
21095 BB_END (bb), &found);
21096 else
21098 int shortest_dist = -1;
21099 bool found_in_bb = false;
21101 FOR_EACH_EDGE (e, ei, bb->preds)
21103 int bb_dist
21104 = distance_non_agu_define_in_bb (regno1, regno2,
21105 insn, distance,
21106 BB_END (e->src),
21107 &found_in_bb);
21108 if (found_in_bb)
21110 if (shortest_dist < 0)
21111 shortest_dist = bb_dist;
21112 else if (bb_dist > 0)
21113 shortest_dist = MIN (bb_dist, shortest_dist);
21115 found = true;
21119 distance = shortest_dist;
21123 /* get_attr_type may modify recog data. We want to make sure
21124 that recog data is valid for instruction INSN, on which
21125 distance_non_agu_define is called. INSN is unchanged here. */
21126 extract_insn_cached (insn);
21128 if (!found)
21129 return -1;
21131 return distance >> 1;
21134 /* Return the distance in half-cycles between INSN and the next
21135 insn that uses register number REGNO in a memory address, added
21136 to DISTANCE. Return -1 if REGNO is set.
21138 Set *FOUND to true if a register use was found
21139 and to false otherwise.
21140 Set *REDEFINED to true if a register redefinition was
21141 found and to false otherwise. */
21143 static int
21144 distance_agu_use_in_bb (unsigned int regno,
21145 rtx_insn *insn, int distance, rtx_insn *start,
21146 bool *found, bool *redefined)
21148 basic_block bb = NULL;
21149 rtx_insn *next = start;
21150 rtx_insn *prev = NULL;
21152 *found = false;
21153 *redefined = false;
21155 if (start != NULL_RTX)
21157 bb = BLOCK_FOR_INSN (start);
21158 if (start != BB_HEAD (bb))
21159 /* If insn and start belong to the same bb, set prev to insn,
21160 so the call to increase_distance will increase the distance
21161 between insns by 1. */
21162 prev = insn;
21165 while (next
21166 && next != insn
21167 && distance < LEA_SEARCH_THRESHOLD)
21169 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21171 distance = increase_distance (prev, next, distance);
21172 if (insn_uses_reg_mem (regno, next))
21174 /* Return DISTANCE if REGNO is used in a memory
21175 address in NEXT. */
21176 *found = true;
21177 return distance;
21180 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21182 /* Return -1 if REGNO is set in NEXT. */
21183 *redefined = true;
21184 return -1;
21187 prev = next;
21190 if (next == BB_END (bb))
21191 break;
21193 next = NEXT_INSN (next);
21196 return distance;
21199 /* Return the distance between INSN and the next insn that uses
21200 register number REGNO0 in a memory address. Return -1 if no such
21201 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21203 static int
21204 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21206 basic_block bb = BLOCK_FOR_INSN (insn);
21207 int distance = 0;
21208 bool found = false;
21209 bool redefined = false;
21211 if (insn != BB_END (bb))
21212 distance = distance_agu_use_in_bb (regno0, insn, distance,
21213 NEXT_INSN (insn),
21214 &found, &redefined);
21216 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21218 edge e;
21219 edge_iterator ei;
21220 bool simple_loop = false;
21222 FOR_EACH_EDGE (e, ei, bb->succs)
21223 if (e->dest == bb)
21225 simple_loop = true;
21226 break;
21229 if (simple_loop)
21230 distance = distance_agu_use_in_bb (regno0, insn,
21231 distance, BB_HEAD (bb),
21232 &found, &redefined);
21233 else
21235 int shortest_dist = -1;
21236 bool found_in_bb = false;
21237 bool redefined_in_bb = false;
21239 FOR_EACH_EDGE (e, ei, bb->succs)
21241 int bb_dist
21242 = distance_agu_use_in_bb (regno0, insn,
21243 distance, BB_HEAD (e->dest),
21244 &found_in_bb, &redefined_in_bb);
21245 if (found_in_bb)
21247 if (shortest_dist < 0)
21248 shortest_dist = bb_dist;
21249 else if (bb_dist > 0)
21250 shortest_dist = MIN (bb_dist, shortest_dist);
21252 found = true;
21256 distance = shortest_dist;
21260 if (!found || redefined)
21261 return -1;
21263 return distance >> 1;
21266 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21267 there is a choice between LEA and ADD:
21268 Negative value: ADD is preferred over LEA
21269 Zero: Neutral
21270 Positive value: LEA is preferred over ADD */
21271 #define IX86_LEA_PRIORITY 0
21273 /* Return true if using the lea INSN has a performance advantage
21274 over a sequence of instructions. The instruction sequence has
21275 SPLIT_COST cycles higher latency than the lea itself. */
21277 static bool
21278 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21279 unsigned int regno2, int split_cost, bool has_scale)
21281 int dist_define, dist_use;
21283 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21284 non-destructive destination, or for the ability to use
21285 SCALE, the use of LEA is justified. */
21286 if (TARGET_SILVERMONT || TARGET_INTEL)
21288 if (has_scale)
21289 return true;
21290 if (split_cost < 1)
21291 return false;
21292 if (regno0 == regno1 || regno0 == regno2)
21293 return false;
21294 return true;
21297 dist_define = distance_non_agu_define (regno1, regno2, insn);
21298 dist_use = distance_agu_use (regno0, insn);
21300 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21302 /* If there is no non-AGU operand definition, no AGU
21303 operand usage and the split cost is 0, then both the lea
21304 and non-lea variants have the same priority. Currently
21305 we prefer lea for 64-bit code and non-lea for 32-bit
21306 code. */
21307 if (dist_use < 0 && split_cost == 0)
21308 return TARGET_64BIT || IX86_LEA_PRIORITY;
21309 else
21310 return true;
21313 /* With a longer definition distance, lea is preferable.
21314 Here we adjust it to take the splitting cost and
21315 lea priority into account. */
21316 dist_define += split_cost + IX86_LEA_PRIORITY;
21318 /* If there is no use in a memory address then we just check
21319 that the split cost exceeds the AGU stall. */
21320 if (dist_use < 0)
21321 return dist_define > LEA_MAX_STALL;
21323 /* If this insn has both a backward non-agu dependence and a forward
21324 agu dependence, the one with the shorter distance takes effect. */
21325 return dist_define >= dist_use;
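/* A rough worked example of the heuristic above: with dist_define == 2,
   split_cost == 1 and IX86_LEA_PRIORITY == 0, the adjusted definition
   distance is 3; if the nearest AGU use is at dist_use == 2 the lea is
   kept (3 >= 2), while with dist_use == 4 the split sequence wins.  */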
21328 /* Return true if it is legal to clobber flags by INSN and
21329 false otherwise. */
21331 static bool
21332 ix86_ok_to_clobber_flags (rtx_insn *insn)
21334 basic_block bb = BLOCK_FOR_INSN (insn);
21335 df_ref use;
21336 bitmap live;
21338 while (insn)
21340 if (NONDEBUG_INSN_P (insn))
21342 FOR_EACH_INSN_USE (use, insn)
21343 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21344 return false;
21346 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21347 return true;
21350 if (insn == BB_END (bb))
21351 break;
21353 insn = NEXT_INSN (insn);
21356 live = df_get_live_out (bb);
21357 return !REGNO_REG_SET_P (live, FLAGS_REG);
21360 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21361 move and add to avoid AGU stalls. */
21363 bool
21364 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21366 unsigned int regno0, regno1, regno2;
21368 /* Check if we need to optimize. */
21369 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21370 return false;
21372 /* Check that it is correct to split here. */
21373 if (!ix86_ok_to_clobber_flags (insn))
21374 return false;
21376 regno0 = true_regnum (operands[0]);
21377 regno1 = true_regnum (operands[1]);
21378 regno2 = true_regnum (operands[2]);
21380 /* We need to split only adds with a non-destructive
21381 destination operand. */
21382 if (regno0 == regno1 || regno0 == regno2)
21383 return false;
21384 else
21385 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
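/* For example (register names are only illustrative), "lea (%rsi,%rdi), %rax"
   may be replaced by "mov %rsi, %rax" + "add %rdi, %rax" when the lea does
   not outperform that pair with a split cost of 1; destructive forms such as
   "lea (%rax,%rdi), %rax" are never split here.  */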
21388 /* Return true if we should emit an lea instruction instead of a mov
21389 instruction. */
21391 bool
21392 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21394 unsigned int regno0, regno1;
21396 /* Check if we need to optimize. */
21397 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21398 return false;
21400 /* Use lea for reg to reg moves only. */
21401 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21402 return false;
21404 regno0 = true_regnum (operands[0]);
21405 regno1 = true_regnum (operands[1]);
21407 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21410 /* Return true if we need to split lea into a sequence of
21411 instructions to avoid AGU stalls. */
21413 bool
21414 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21416 unsigned int regno0, regno1, regno2;
21417 int split_cost;
21418 struct ix86_address parts;
21419 int ok;
21421 /* Check whether we need to optimize. */
21422 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21423 return false;
21425 /* The "at least two components" test below might not catch simple
21426 move or zero extension insns if parts.base is non-NULL and parts.disp
21427 is const0_rtx as the only components in the address, e.g. if the
21428 register is %rbp or %r13. As this test is much cheaper and moves or
21429 zero extensions are the common case, do this check first. */
21430 if (REG_P (operands[1])
21431 || (SImode_address_operand (operands[1], VOIDmode)
21432 && REG_P (XEXP (operands[1], 0))))
21433 return false;
21435 /* Check if it is OK to split here. */
21436 if (!ix86_ok_to_clobber_flags (insn))
21437 return false;
21439 ok = ix86_decompose_address (operands[1], &parts);
21440 gcc_assert (ok);
21442 /* There should be at least two components in the address. */
21443 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21444 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21445 return false;
21447 /* We should not split into an add if a non-legitimate PIC
21448 operand is used as the displacement. */
21449 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21450 return false;
21452 regno0 = true_regnum (operands[0]);
21453 regno1 = INVALID_REGNUM;
21454 regno2 = INVALID_REGNUM;
21456 if (parts.base)
21457 regno1 = true_regnum (parts.base);
21458 if (parts.index)
21459 regno2 = true_regnum (parts.index);
21461 split_cost = 0;
21463 /* Compute how many cycles we will add to the execution time
21464 if we split the lea into a sequence of instructions. */
21465 if (parts.base || parts.index)
21467 /* Have to use a mov instruction if the non-destructive
21468 destination form is used. */
21469 if (regno1 != regno0 && regno2 != regno0)
21470 split_cost += 1;
21472 /* Have to add index to base if both exist. */
21473 if (parts.base && parts.index)
21474 split_cost += 1;
21476 /* Have to use shift and adds if scale is 2 or greater. */
21477 if (parts.scale > 1)
21479 if (regno0 != regno1)
21480 split_cost += 1;
21481 else if (regno2 == regno0)
21482 split_cost += 4;
21483 else
21484 split_cost += parts.scale;
21487 /* Have to use an add instruction with an immediate if
21488 disp is non-zero. */
21489 if (parts.disp && parts.disp != const0_rtx)
21490 split_cost += 1;
21492 /* Subtract the price of lea. */
21493 split_cost -= 1;
21496 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21497 parts.scale > 1);
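/* A worked example of the cost above, assuming all three registers are
   distinct: for "lea 8(%rbx,%rcx,4), %rax" the split needs a mov of the
   index, a shift for the scale, an add of the base and an add of the
   displacement; minus the lea itself this gives split_cost == 3.  */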
21500 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21501 matches destination. RTX includes clobber of FLAGS_REG. */
21503 static void
21504 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21505 rtx dst, rtx src)
21507 rtx op, clob;
21509 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21510 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21512 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
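/* The emitted RTL has the shape
     (parallel [(set DST (CODE:MODE DST SRC))
                (clobber (reg:CC flags))])
   matching the usual two-address x86 ALU patterns.  */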
21515 /* Return true if REGNO1's definition is nearer to INSN than REGNO2's. */
21517 static bool
21518 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21520 rtx_insn *prev = insn;
21521 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21523 if (insn == start)
21524 return false;
21525 while (prev && prev != start)
21527 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21529 prev = PREV_INSN (prev);
21530 continue;
21532 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21533 return true;
21534 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21535 return false;
21536 prev = PREV_INSN (prev);
21539 /* None of the regs is defined in the bb. */
21540 return false;
21543 /* Split lea instructions into a sequence of instructions
21544 which are executed on the ALU to avoid AGU stalls.
21545 It is assumed that it is allowed to clobber the flags register
21546 at the lea position. */
21548 void
21549 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21551 unsigned int regno0, regno1, regno2;
21552 struct ix86_address parts;
21553 rtx target, tmp;
21554 int ok, adds;
21556 ok = ix86_decompose_address (operands[1], &parts);
21557 gcc_assert (ok);
21559 target = gen_lowpart (mode, operands[0]);
21561 regno0 = true_regnum (target);
21562 regno1 = INVALID_REGNUM;
21563 regno2 = INVALID_REGNUM;
21565 if (parts.base)
21567 parts.base = gen_lowpart (mode, parts.base);
21568 regno1 = true_regnum (parts.base);
21571 if (parts.index)
21573 parts.index = gen_lowpart (mode, parts.index);
21574 regno2 = true_regnum (parts.index);
21577 if (parts.disp)
21578 parts.disp = gen_lowpart (mode, parts.disp);
21580 if (parts.scale > 1)
21582 /* Case r1 = r1 + ... */
21583 if (regno1 == regno0)
21585 /* If we have the case r1 = r1 + C * r2 then we
21586 would have to use multiplication, which is very
21587 expensive. Assume the cost model is wrong if we
21588 get such a case here. */
21589 gcc_assert (regno2 != regno0);
21591 for (adds = parts.scale; adds > 0; adds--)
21592 ix86_emit_binop (PLUS, mode, target, parts.index);
21594 else
21596 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21597 if (regno0 != regno2)
21598 emit_insn (gen_rtx_SET (target, parts.index));
21600 /* Use shift for scaling. */
21601 ix86_emit_binop (ASHIFT, mode, target,
21602 GEN_INT (exact_log2 (parts.scale)));
21604 if (parts.base)
21605 ix86_emit_binop (PLUS, mode, target, parts.base);
21607 if (parts.disp && parts.disp != const0_rtx)
21608 ix86_emit_binop (PLUS, mode, target, parts.disp);
21611 else if (!parts.base && !parts.index)
21613 gcc_assert (parts.disp);
21614 emit_insn (gen_rtx_SET (target, parts.disp));
21616 else
21618 if (!parts.base)
21620 if (regno0 != regno2)
21621 emit_insn (gen_rtx_SET (target, parts.index));
21623 else if (!parts.index)
21625 if (regno0 != regno1)
21626 emit_insn (gen_rtx_SET (target, parts.base));
21628 else
21630 if (regno0 == regno1)
21631 tmp = parts.index;
21632 else if (regno0 == regno2)
21633 tmp = parts.base;
21634 else
21636 rtx tmp1;
21638 /* Find better operand for SET instruction, depending
21639 on which definition is farther from the insn. */
21640 if (find_nearest_reg_def (insn, regno1, regno2))
21641 tmp = parts.index, tmp1 = parts.base;
21642 else
21643 tmp = parts.base, tmp1 = parts.index;
21645 emit_insn (gen_rtx_SET (target, tmp));
21647 if (parts.disp && parts.disp != const0_rtx)
21648 ix86_emit_binop (PLUS, mode, target, parts.disp);
21650 ix86_emit_binop (PLUS, mode, target, tmp1);
21651 return;
21654 ix86_emit_binop (PLUS, mode, target, tmp);
21657 if (parts.disp && parts.disp != const0_rtx)
21658 ix86_emit_binop (PLUS, mode, target, parts.disp);
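/* For example (illustrative registers only), "lea 8(%rbx,%rcx,4), %rax"
   is emitted as roughly
     mov %rcx, %rax; shl $2, %rax; add %rbx, %rax; add $8, %rax
   whereas a destructive form such as "lea (%rax,%rcx,2), %rax" becomes
   repeated adds of the index rather than a multiply.  */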
21662 /* Return true if it is ok to optimize an ADD operation to an LEA
21663 operation to avoid flag register consumption. For most processors,
21664 ADD is faster than LEA. For processors like BONNELL, if the
21665 destination register of the LEA holds an actual address which will be
21666 used soon, LEA is better; otherwise ADD is better. */
21668 bool
21669 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21671 unsigned int regno0 = true_regnum (operands[0]);
21672 unsigned int regno1 = true_regnum (operands[1]);
21673 unsigned int regno2 = true_regnum (operands[2]);
21675 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21676 if (regno0 != regno1 && regno0 != regno2)
21677 return true;
21679 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21680 return false;
21682 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21685 /* Return true if destination reg of SET_BODY is shift count of
21686 USE_BODY. */
21688 static bool
21689 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21691 rtx set_dest;
21692 rtx shift_rtx;
21693 int i;
21695 /* Retrieve destination of SET_BODY. */
21696 switch (GET_CODE (set_body))
21698 case SET:
21699 set_dest = SET_DEST (set_body);
21700 if (!set_dest || !REG_P (set_dest))
21701 return false;
21702 break;
21703 case PARALLEL:
21704 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21705 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21706 use_body))
21707 return true;
21708 /* FALLTHROUGH */
21709 default:
21710 return false;
21713 /* Retrieve shift count of USE_BODY. */
21714 switch (GET_CODE (use_body))
21716 case SET:
21717 shift_rtx = XEXP (use_body, 1);
21718 break;
21719 case PARALLEL:
21720 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21721 if (ix86_dep_by_shift_count_body (set_body,
21722 XVECEXP (use_body, 0, i)))
21723 return true;
21724 /* FALLTHROUGH */
21725 default:
21726 return false;
21729 if (shift_rtx
21730 && (GET_CODE (shift_rtx) == ASHIFT
21731 || GET_CODE (shift_rtx) == LSHIFTRT
21732 || GET_CODE (shift_rtx) == ASHIFTRT
21733 || GET_CODE (shift_rtx) == ROTATE
21734 || GET_CODE (shift_rtx) == ROTATERT))
21736 rtx shift_count = XEXP (shift_rtx, 1);
21738 /* Return true if shift count is dest of SET_BODY. */
21739 if (REG_P (shift_count))
21741 /* Add this check since it can be invoked before register
21742 allocation in the pre-reload scheduler. */
21743 if (reload_completed
21744 && true_regnum (set_dest) == true_regnum (shift_count))
21745 return true;
21746 else if (REGNO (set_dest) == REGNO (shift_count))
21747 return true;
21751 return false;
21754 /* Return true if destination reg of SET_INSN is shift count of
21755 USE_INSN. */
21757 bool
21758 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21760 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21761 PATTERN (use_insn));
21764 /* Return TRUE or FALSE depending on whether the unary operator meets the
21765 appropriate constraints. */
21767 bool
21768 ix86_unary_operator_ok (enum rtx_code,
21769 machine_mode,
21770 rtx operands[2])
21772 /* If one of operands is memory, source and destination must match. */
21773 if ((MEM_P (operands[0])
21774 || MEM_P (operands[1]))
21775 && ! rtx_equal_p (operands[0], operands[1]))
21776 return false;
21777 return true;
21780 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21781 are ok, keeping in mind the possible movddup alternative. */
21783 bool
21784 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21786 if (MEM_P (operands[0]))
21787 return rtx_equal_p (operands[0], operands[1 + high]);
21788 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21789 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21790 return true;
21793 /* Post-reload splitter for converting an SF or DFmode value in an
21794 SSE register into an unsigned SImode. */
21796 void
21797 ix86_split_convert_uns_si_sse (rtx operands[])
21799 machine_mode vecmode;
21800 rtx value, large, zero_or_two31, input, two31, x;
21802 large = operands[1];
21803 zero_or_two31 = operands[2];
21804 input = operands[3];
21805 two31 = operands[4];
21806 vecmode = GET_MODE (large);
21807 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21809 /* Load up the value into the low element. We must ensure that the other
21810 elements are valid floats -- zero is the easiest such value. */
21811 if (MEM_P (input))
21813 if (vecmode == V4SFmode)
21814 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21815 else
21816 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21818 else
21820 input = gen_rtx_REG (vecmode, REGNO (input));
21821 emit_move_insn (value, CONST0_RTX (vecmode));
21822 if (vecmode == V4SFmode)
21823 emit_insn (gen_sse_movss (value, value, input));
21824 else
21825 emit_insn (gen_sse2_movsd (value, value, input));
21828 emit_move_insn (large, two31);
21829 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21831 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21832 emit_insn (gen_rtx_SET (large, x));
21834 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21835 emit_insn (gen_rtx_SET (zero_or_two31, x));
21837 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21838 emit_insn (gen_rtx_SET (value, x));
21840 large = gen_rtx_REG (V4SImode, REGNO (large));
21841 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21843 x = gen_rtx_REG (V4SImode, REGNO (value));
21844 if (vecmode == V4SFmode)
21845 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21846 else
21847 emit_insn (gen_sse2_cvttpd2dq (x, value));
21848 value = x;
21850 emit_insn (gen_xorv4si3 (value, value, large));
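/* A rough numeric example of the flow above, for a scalar input of
   3000000000.0 (which exceeds 0x80000000): the LE comparison mask is
   all-ones, so zero_or_two31 keeps 2^31 and the subtraction leaves
   852516352.0; the truncating convert gives 852516352, and xoring with
   the mask shifted into bit 31 (0x80000000) restores 3000000000.  */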
21853 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21854 Expects the 64-bit DImode to be supplied in a pair of integral
21855 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21856 -mfpmath=sse, !optimize_size only. */
21858 void
21859 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21861 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21862 rtx int_xmm, fp_xmm;
21863 rtx biases, exponents;
21864 rtx x;
21866 int_xmm = gen_reg_rtx (V4SImode);
21867 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21868 emit_insn (gen_movdi_to_sse (int_xmm, input));
21869 else if (TARGET_SSE_SPLIT_REGS)
21871 emit_clobber (int_xmm);
21872 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21874 else
21876 x = gen_reg_rtx (V2DImode);
21877 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21878 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21881 x = gen_rtx_CONST_VECTOR (V4SImode,
21882 gen_rtvec (4, GEN_INT (0x43300000UL),
21883 GEN_INT (0x45300000UL),
21884 const0_rtx, const0_rtx));
21885 exponents = validize_mem (force_const_mem (V4SImode, x));
21887 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21888 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21890 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21891 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21892 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21893 (0x1.0p84 + double(fp_value_hi_xmm)).
21894 Note these exponents differ by 32. */
21896 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21898 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21899 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21900 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21901 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21902 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21903 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21904 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21905 biases = validize_mem (force_const_mem (V2DFmode, biases));
21906 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21908 /* Add the upper and lower DFmode values together. */
21909 if (TARGET_SSE3)
21910 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21911 else
21913 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21914 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21915 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21918 ix86_expand_vector_extract (false, target, fp_xmm, 0);
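/* A worked example of the bias trick, assuming the input has high word 1
   and low word 3 (i.e. the value 2^32 + 3): the interleave produces the
   doubles 2^52 + 3 and 2^84 + 1*2^32; subtracting the 0x1.0p52 and
   0x1.0p84 biases leaves 3.0 and 4294967296.0, and the final add gives
   4294967299.0.  */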
21921 /* Not used, but eases macroization of patterns. */
21922 void
21923 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21925 gcc_unreachable ();
21928 /* Convert an unsigned SImode value into a DFmode. Only currently used
21929 for SSE, but applicable anywhere. */
21931 void
21932 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21934 REAL_VALUE_TYPE TWO31r;
21935 rtx x, fp;
21937 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21938 NULL, 1, OPTAB_DIRECT);
21940 fp = gen_reg_rtx (DFmode);
21941 emit_insn (gen_floatsidf2 (fp, x));
21943 real_ldexp (&TWO31r, &dconst1, 31);
21944 x = const_double_from_real_value (TWO31r, DFmode);
21946 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21947 if (x != target)
21948 emit_move_insn (target, x);
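/* For illustration: the SImode (wrapping) add of -2^31 maps e.g.
   input = 0xffffffff to 0x7fffffff; the signed int->double convert then
   gives 2147483647.0, and adding back 0x1.0p31 yields 4294967295.0, the
   unsigned value of the input.  */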
21951 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21952 32-bit mode; otherwise we have a direct convert instruction. */
21954 void
21955 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21957 REAL_VALUE_TYPE TWO32r;
21958 rtx fp_lo, fp_hi, x;
21960 fp_lo = gen_reg_rtx (DFmode);
21961 fp_hi = gen_reg_rtx (DFmode);
21963 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21965 real_ldexp (&TWO32r, &dconst1, 32);
21966 x = const_double_from_real_value (TWO32r, DFmode);
21967 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21969 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21971 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21972 0, OPTAB_DIRECT);
21973 if (x != target)
21974 emit_move_insn (target, x);
21977 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21978 For x86_32, -mfpmath=sse, !optimize_size only. */
21979 void
21980 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21982 REAL_VALUE_TYPE ONE16r;
21983 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21985 real_ldexp (&ONE16r, &dconst1, 16);
21986 x = const_double_from_real_value (ONE16r, SFmode);
21987 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21988 NULL, 0, OPTAB_DIRECT);
21989 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21990 NULL, 0, OPTAB_DIRECT);
21991 fp_hi = gen_reg_rtx (SFmode);
21992 fp_lo = gen_reg_rtx (SFmode);
21993 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21994 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21995 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21996 0, OPTAB_DIRECT);
21997 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21998 0, OPTAB_DIRECT);
21999 if (!rtx_equal_p (target, fp_hi))
22000 emit_move_insn (target, fp_hi);
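/* For illustration: input = 0x00030005 is split into int_hi = 3 and
   int_lo = 5, converted to 3.0f and 5.0f, and recombined as
   3.0f * 0x1.0p16 + 5.0f = 196613.0f.  */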
22003 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22004 a vector of unsigned ints VAL to vector of floats TARGET. */
22006 void
22007 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22009 rtx tmp[8];
22010 REAL_VALUE_TYPE TWO16r;
22011 machine_mode intmode = GET_MODE (val);
22012 machine_mode fltmode = GET_MODE (target);
22013 rtx (*cvt) (rtx, rtx);
22015 if (intmode == V4SImode)
22016 cvt = gen_floatv4siv4sf2;
22017 else
22018 cvt = gen_floatv8siv8sf2;
22019 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22020 tmp[0] = force_reg (intmode, tmp[0]);
22021 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22022 OPTAB_DIRECT);
22023 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22024 NULL_RTX, 1, OPTAB_DIRECT);
22025 tmp[3] = gen_reg_rtx (fltmode);
22026 emit_insn (cvt (tmp[3], tmp[1]));
22027 tmp[4] = gen_reg_rtx (fltmode);
22028 emit_insn (cvt (tmp[4], tmp[2]));
22029 real_ldexp (&TWO16r, &dconst1, 16);
22030 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22031 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22032 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22033 OPTAB_DIRECT);
22034 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22035 OPTAB_DIRECT);
22036 if (tmp[7] != target)
22037 emit_move_insn (target, tmp[7]);
22040 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22041 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22042 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22043 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
22046 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22048 REAL_VALUE_TYPE TWO31r;
22049 rtx two31r, tmp[4];
22050 machine_mode mode = GET_MODE (val);
22051 machine_mode scalarmode = GET_MODE_INNER (mode);
22052 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22053 rtx (*cmp) (rtx, rtx, rtx, rtx);
22054 int i;
22056 for (i = 0; i < 3; i++)
22057 tmp[i] = gen_reg_rtx (mode);
22058 real_ldexp (&TWO31r, &dconst1, 31);
22059 two31r = const_double_from_real_value (TWO31r, scalarmode);
22060 two31r = ix86_build_const_vector (mode, 1, two31r);
22061 two31r = force_reg (mode, two31r);
22062 switch (mode)
22064 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22065 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22066 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22067 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22068 default: gcc_unreachable ();
22070 tmp[3] = gen_rtx_LE (mode, two31r, val);
22071 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22072 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22073 0, OPTAB_DIRECT);
22074 if (intmode == V4SImode || TARGET_AVX2)
22075 *xorp = expand_simple_binop (intmode, ASHIFT,
22076 gen_lowpart (intmode, tmp[0]),
22077 GEN_INT (31), NULL_RTX, 0,
22078 OPTAB_DIRECT);
22079 else
22081 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22082 two31 = ix86_build_const_vector (intmode, 1, two31);
22083 *xorp = expand_simple_binop (intmode, AND,
22084 gen_lowpart (intmode, tmp[0]),
22085 two31, NULL_RTX, 0,
22086 OPTAB_DIRECT);
22088 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22089 0, OPTAB_DIRECT);
22092 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22093 then replicate the value for all elements of the vector
22094 register. */
22097 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22099 int i, n_elt;
22100 rtvec v;
22101 machine_mode scalar_mode;
22103 switch (mode)
22105 case E_V64QImode:
22106 case E_V32QImode:
22107 case E_V16QImode:
22108 case E_V32HImode:
22109 case E_V16HImode:
22110 case E_V8HImode:
22111 case E_V16SImode:
22112 case E_V8SImode:
22113 case E_V4SImode:
22114 case E_V8DImode:
22115 case E_V4DImode:
22116 case E_V2DImode:
22117 gcc_assert (vect);
22118 /* FALLTHRU */
22119 case E_V16SFmode:
22120 case E_V8SFmode:
22121 case E_V4SFmode:
22122 case E_V8DFmode:
22123 case E_V4DFmode:
22124 case E_V2DFmode:
22125 n_elt = GET_MODE_NUNITS (mode);
22126 v = rtvec_alloc (n_elt);
22127 scalar_mode = GET_MODE_INNER (mode);
22129 RTVEC_ELT (v, 0) = value;
22131 for (i = 1; i < n_elt; ++i)
22132 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22134 return gen_rtx_CONST_VECTOR (mode, v);
22136 default:
22137 gcc_unreachable ();
22141 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22142 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22143 for an SSE register. If VECT is true, then replicate the mask for
22144 all elements of the vector register. If INVERT is true, then create
22145 a mask excluding the sign bit. */
22148 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22150 machine_mode vec_mode, imode;
22151 wide_int w;
22152 rtx mask, v;
22154 switch (mode)
22156 case E_V16SImode:
22157 case E_V16SFmode:
22158 case E_V8SImode:
22159 case E_V4SImode:
22160 case E_V8SFmode:
22161 case E_V4SFmode:
22162 vec_mode = mode;
22163 imode = SImode;
22164 break;
22166 case E_V8DImode:
22167 case E_V4DImode:
22168 case E_V2DImode:
22169 case E_V8DFmode:
22170 case E_V4DFmode:
22171 case E_V2DFmode:
22172 vec_mode = mode;
22173 imode = DImode;
22174 break;
22176 case E_TImode:
22177 case E_TFmode:
22178 vec_mode = VOIDmode;
22179 imode = TImode;
22180 break;
22182 default:
22183 gcc_unreachable ();
22186 machine_mode inner_mode = GET_MODE_INNER (mode);
22187 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22188 GET_MODE_BITSIZE (inner_mode));
22189 if (invert)
22190 w = wi::bit_not (w);
22192 /* Force this value into the low part of a fp vector constant. */
22193 mask = immed_wide_int_const (w, imode);
22194 mask = gen_lowpart (inner_mode, mask);
22196 if (vec_mode == VOIDmode)
22197 return force_reg (inner_mode, mask);
22199 v = ix86_build_const_vector (vec_mode, vect, mask);
22200 return force_reg (vec_mode, v);
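/* For example, with MODE == V4SFmode and VECT true this yields the vector
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } reinterpreted as
   floats (only the sign bits set); with INVERT it is { 0x7fffffff, ... },
   i.e. everything but the sign bits.  */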
22203 /* Generate code for floating point ABS or NEG. */
22205 void
22206 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22207 rtx operands[])
22209 rtx mask, set, dst, src;
22210 bool use_sse = false;
22211 bool vector_mode = VECTOR_MODE_P (mode);
22212 machine_mode vmode = mode;
22214 if (vector_mode)
22215 use_sse = true;
22216 else if (mode == TFmode)
22217 use_sse = true;
22218 else if (TARGET_SSE_MATH)
22220 use_sse = SSE_FLOAT_MODE_P (mode);
22221 if (mode == SFmode)
22222 vmode = V4SFmode;
22223 else if (mode == DFmode)
22224 vmode = V2DFmode;
22227 /* NEG and ABS performed with SSE use bitwise mask operations.
22228 Create the appropriate mask now. */
22229 if (use_sse)
22230 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22231 else
22232 mask = NULL_RTX;
22234 dst = operands[0];
22235 src = operands[1];
22237 set = gen_rtx_fmt_e (code, mode, src);
22238 set = gen_rtx_SET (dst, set);
22240 if (mask)
22242 rtx use, clob;
22243 rtvec par;
22245 use = gen_rtx_USE (VOIDmode, mask);
22246 if (vector_mode)
22247 par = gen_rtvec (2, set, use);
22248 else
22250 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22251 par = gen_rtvec (3, set, use, clob);
22253 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22255 else
22256 emit_insn (set);
22259 /* Expand a copysign operation. Special case operand 0 being a constant. */
22261 void
22262 ix86_expand_copysign (rtx operands[])
22264 machine_mode mode, vmode;
22265 rtx dest, op0, op1, mask, nmask;
22267 dest = operands[0];
22268 op0 = operands[1];
22269 op1 = operands[2];
22271 mode = GET_MODE (dest);
22273 if (mode == SFmode)
22274 vmode = V4SFmode;
22275 else if (mode == DFmode)
22276 vmode = V2DFmode;
22277 else
22278 vmode = mode;
22280 if (CONST_DOUBLE_P (op0))
22282 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22284 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22285 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22287 if (mode == SFmode || mode == DFmode)
22289 if (op0 == CONST0_RTX (mode))
22290 op0 = CONST0_RTX (vmode);
22291 else
22293 rtx v = ix86_build_const_vector (vmode, false, op0);
22295 op0 = force_reg (vmode, v);
22298 else if (op0 != CONST0_RTX (mode))
22299 op0 = force_reg (mode, op0);
22301 mask = ix86_build_signbit_mask (vmode, 0, 0);
22303 if (mode == SFmode)
22304 copysign_insn = gen_copysignsf3_const;
22305 else if (mode == DFmode)
22306 copysign_insn = gen_copysigndf3_const;
22307 else
22308 copysign_insn = gen_copysigntf3_const;
22310 emit_insn (copysign_insn (dest, op0, op1, mask));
22312 else
22314 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22316 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22317 mask = ix86_build_signbit_mask (vmode, 0, 0);
22319 if (mode == SFmode)
22320 copysign_insn = gen_copysignsf3_var;
22321 else if (mode == DFmode)
22322 copysign_insn = gen_copysigndf3_var;
22323 else
22324 copysign_insn = gen_copysigntf3_var;
22326 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22330 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22331 be a constant, and so has already been expanded into a vector constant. */
22333 void
22334 ix86_split_copysign_const (rtx operands[])
22336 machine_mode mode, vmode;
22337 rtx dest, op0, mask, x;
22339 dest = operands[0];
22340 op0 = operands[1];
22341 mask = operands[3];
22343 mode = GET_MODE (dest);
22344 vmode = GET_MODE (mask);
22346 dest = lowpart_subreg (vmode, dest, mode);
22347 x = gen_rtx_AND (vmode, dest, mask);
22348 emit_insn (gen_rtx_SET (dest, x));
22350 if (op0 != CONST0_RTX (vmode))
22352 x = gen_rtx_IOR (vmode, dest, op0);
22353 emit_insn (gen_rtx_SET (dest, x));
22357 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22358 so we have to do two masks. */
22360 void
22361 ix86_split_copysign_var (rtx operands[])
22363 machine_mode mode, vmode;
22364 rtx dest, scratch, op0, op1, mask, nmask, x;
22366 dest = operands[0];
22367 scratch = operands[1];
22368 op0 = operands[2];
22369 op1 = operands[3];
22370 nmask = operands[4];
22371 mask = operands[5];
22373 mode = GET_MODE (dest);
22374 vmode = GET_MODE (mask);
22376 if (rtx_equal_p (op0, op1))
22378 /* Shouldn't happen often (it's useless, obviously), but when it does
22379 we'd generate incorrect code if we continue below. */
22380 emit_move_insn (dest, op0);
22381 return;
22384 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22386 gcc_assert (REGNO (op1) == REGNO (scratch));
22388 x = gen_rtx_AND (vmode, scratch, mask);
22389 emit_insn (gen_rtx_SET (scratch, x));
22391 dest = mask;
22392 op0 = lowpart_subreg (vmode, op0, mode);
22393 x = gen_rtx_NOT (vmode, dest);
22394 x = gen_rtx_AND (vmode, x, op0);
22395 emit_insn (gen_rtx_SET (dest, x));
22397 else
22399 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22401 x = gen_rtx_AND (vmode, scratch, mask);
22403 else /* alternative 2,4 */
22405 gcc_assert (REGNO (mask) == REGNO (scratch));
22406 op1 = lowpart_subreg (vmode, op1, mode);
22407 x = gen_rtx_AND (vmode, scratch, op1);
22409 emit_insn (gen_rtx_SET (scratch, x));
22411 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22413 dest = lowpart_subreg (vmode, op0, mode);
22414 x = gen_rtx_AND (vmode, dest, nmask);
22416 else /* alternative 3,4 */
22418 gcc_assert (REGNO (nmask) == REGNO (dest));
22419 dest = nmask;
22420 op0 = lowpart_subreg (vmode, op0, mode);
22421 x = gen_rtx_AND (vmode, dest, op0);
22423 emit_insn (gen_rtx_SET (dest, x));
22426 x = gen_rtx_IOR (vmode, dest, scratch);
22427 emit_insn (gen_rtx_SET (dest, x));
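/* These splitters implement the usual bit-mask identity
     copysign (x, y) = (x & ~SIGNMASK) | (y & SIGNMASK)
   with MASK holding the sign bit(s) and NMASK its complement; in the
   constant case x has already been reduced to |x|, so only the
   y & SIGNMASK half needs computing.  */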
22430 /* Return TRUE or FALSE depending on whether the first SET in INSN
22431 has source and destination with matching CC modes, and that the
22432 CC mode is at least as constrained as REQ_MODE. */
22434 bool
22435 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22437 rtx set;
22438 machine_mode set_mode;
22440 set = PATTERN (insn);
22441 if (GET_CODE (set) == PARALLEL)
22442 set = XVECEXP (set, 0, 0);
22443 gcc_assert (GET_CODE (set) == SET);
22444 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22446 set_mode = GET_MODE (SET_DEST (set));
22447 switch (set_mode)
22449 case E_CCNOmode:
22450 if (req_mode != CCNOmode
22451 && (req_mode != CCmode
22452 || XEXP (SET_SRC (set), 1) != const0_rtx))
22453 return false;
22454 break;
22455 case E_CCmode:
22456 if (req_mode == CCGCmode)
22457 return false;
22458 /* FALLTHRU */
22459 case E_CCGCmode:
22460 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22461 return false;
22462 /* FALLTHRU */
22463 case E_CCGOCmode:
22464 if (req_mode == CCZmode)
22465 return false;
22466 /* FALLTHRU */
22467 case E_CCZmode:
22468 break;
22470 case E_CCGZmode:
22472 case E_CCAmode:
22473 case E_CCCmode:
22474 case E_CCOmode:
22475 case E_CCPmode:
22476 case E_CCSmode:
22477 if (set_mode != req_mode)
22478 return false;
22479 break;
22481 default:
22482 gcc_unreachable ();
22485 return GET_MODE (SET_SRC (set)) == set_mode;
22488 /* Generate insn patterns to do an integer compare of OPERANDS. */
22490 static rtx
22491 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22493 machine_mode cmpmode;
22494 rtx tmp, flags;
22496 cmpmode = SELECT_CC_MODE (code, op0, op1);
22497 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22499 /* This is very simple, but making the interface the same as in the
22500 FP case makes the rest of the code easier. */
22501 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22502 emit_insn (gen_rtx_SET (flags, tmp));
22504 /* Return the test that should be put into the flags user, i.e.
22505 the bcc, scc, or cmov instruction. */
22506 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22509 /* Figure out whether to use unordered fp comparisons. */
22511 static bool
22512 ix86_unordered_fp_compare (enum rtx_code code)
22514 if (!TARGET_IEEE_FP)
22515 return false;
22517 switch (code)
22519 case GT:
22520 case GE:
22521 case LT:
22522 case LE:
22523 return false;
22525 case EQ:
22526 case NE:
22528 case LTGT:
22529 case UNORDERED:
22530 case ORDERED:
22531 case UNLT:
22532 case UNLE:
22533 case UNGT:
22534 case UNGE:
22535 case UNEQ:
22536 return true;
22538 default:
22539 gcc_unreachable ();
22543 machine_mode
22544 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22546 machine_mode mode = GET_MODE (op0);
22548 if (SCALAR_FLOAT_MODE_P (mode))
22550 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22551 return CCFPmode;
22554 switch (code)
22556 /* Only zero flag is needed. */
22557 case EQ: /* ZF=0 */
22558 case NE: /* ZF!=0 */
22559 return CCZmode;
22560 /* Codes needing carry flag. */
22561 case GEU: /* CF=0 */
22562 case LTU: /* CF=1 */
22563 /* Detect overflow checks. They need just the carry flag. */
22564 if (GET_CODE (op0) == PLUS
22565 && (rtx_equal_p (op1, XEXP (op0, 0))
22566 || rtx_equal_p (op1, XEXP (op0, 1))))
22567 return CCCmode;
22568 else
22569 return CCmode;
22570 case GTU: /* CF=0 & ZF=0 */
22571 case LEU: /* CF=1 | ZF=1 */
22572 return CCmode;
22573 /* Codes possibly doable only with sign flag when
22574 comparing against zero. */
22575 case GE: /* SF=OF or SF=0 */
22576 case LT: /* SF<>OF or SF=1 */
22577 if (op1 == const0_rtx)
22578 return CCGOCmode;
22579 else
22580 /* For other cases Carry flag is not required. */
22581 return CCGCmode;
22582 /* Codes doable only with the sign flag when comparing
22583 against zero, but for which we miss the jump instruction,
22584 so we need to use relational tests against the overflow
22585 flag, which thus needs to be zero. */
22586 case GT: /* ZF=0 & SF=OF */
22587 case LE: /* ZF=1 | SF<>OF */
22588 if (op1 == const0_rtx)
22589 return CCNOmode;
22590 else
22591 return CCGCmode;
22592 /* The strcmp pattern does (use flags) and combine may ask us for the
22593 proper mode. */
22594 case USE:
22595 return CCmode;
22596 default:
22597 gcc_unreachable ();
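/* For instance, the overflow check "a + b < a" reaches this function as
   LTU with op0 = (plus a b) and op1 = a and gets CCCmode, so only the
   carry flag needs to be tracked, while a plain EQ/NE comparison only
   needs CCZmode.  */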
22601 /* Return the fixed registers used for condition codes. */
22603 static bool
22604 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22606 *p1 = FLAGS_REG;
22607 *p2 = FPSR_REG;
22608 return true;
22611 /* If two condition code modes are compatible, return a condition code
22612 mode which is compatible with both. Otherwise, return
22613 VOIDmode. */
22615 static machine_mode
22616 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22618 if (m1 == m2)
22619 return m1;
22621 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22622 return VOIDmode;
22624 if ((m1 == CCGCmode && m2 == CCGOCmode)
22625 || (m1 == CCGOCmode && m2 == CCGCmode))
22626 return CCGCmode;
22628 if ((m1 == CCNOmode && m2 == CCGOCmode)
22629 || (m1 == CCGOCmode && m2 == CCNOmode))
22630 return CCNOmode;
22632 if (m1 == CCZmode
22633 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22634 return m2;
22635 else if (m2 == CCZmode
22636 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22637 return m1;
22639 switch (m1)
22641 default:
22642 gcc_unreachable ();
22644 case E_CCmode:
22645 case E_CCGCmode:
22646 case E_CCGOCmode:
22647 case E_CCNOmode:
22648 case E_CCAmode:
22649 case E_CCCmode:
22650 case E_CCOmode:
22651 case E_CCPmode:
22652 case E_CCSmode:
22653 case E_CCZmode:
22654 switch (m2)
22656 default:
22657 return VOIDmode;
22659 case E_CCmode:
22660 case E_CCGCmode:
22661 case E_CCGOCmode:
22662 case E_CCNOmode:
22663 case E_CCAmode:
22664 case E_CCCmode:
22665 case E_CCOmode:
22666 case E_CCPmode:
22667 case E_CCSmode:
22668 case E_CCZmode:
22669 return CCmode;
22672 case E_CCFPmode:
22673 /* These are only compatible with themselves, which we already
22674 checked above. */
22675 return VOIDmode;
22680 /* Return a comparison we can do that is equivalent to
22681 swap_condition (code), apart possibly from orderedness.
22682 But never change orderedness if TARGET_IEEE_FP, returning
22683 UNKNOWN in that case if necessary. */
22685 static enum rtx_code
22686 ix86_fp_swap_condition (enum rtx_code code)
22688 switch (code)
22690 case GT: /* GTU - CF=0 & ZF=0 */
22691 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22692 case GE: /* GEU - CF=0 */
22693 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22694 case UNLT: /* LTU - CF=1 */
22695 return TARGET_IEEE_FP ? UNKNOWN : GT;
22696 case UNLE: /* LEU - CF=1 | ZF=1 */
22697 return TARGET_IEEE_FP ? UNKNOWN : GE;
22698 default:
22699 return swap_condition (code);
22703 /* Return the cost of comparison CODE using the best strategy for performance.
22704 All of the following functions use the number of instructions as the cost metric.
22705 In the future this should be tweaked to compute bytes for optimize_size and
22706 to take into account the performance of various instructions on various CPUs. */
22708 static int
22709 ix86_fp_comparison_cost (enum rtx_code code)
22711 int arith_cost;
22713 /* The cost of code using bit-twiddling on %ah. */
22714 switch (code)
22716 case UNLE:
22717 case UNLT:
22718 case LTGT:
22719 case GT:
22720 case GE:
22721 case UNORDERED:
22722 case ORDERED:
22723 case UNEQ:
22724 arith_cost = 4;
22725 break;
22726 case LT:
22727 case NE:
22728 case EQ:
22729 case UNGE:
22730 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22731 break;
22732 case LE:
22733 case UNGT:
22734 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22735 break;
22736 default:
22737 gcc_unreachable ();
22740 switch (ix86_fp_comparison_strategy (code))
22742 case IX86_FPCMP_COMI:
22743 return arith_cost > 4 ? 3 : 2;
22744 case IX86_FPCMP_SAHF:
22745 return arith_cost > 4 ? 4 : 3;
22746 default:
22747 return arith_cost;
22751 /* Return the strategy to use for floating-point. We assume that fcomi is always
22752 preferable where available, since that is also true when looking at size
22753 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22755 enum ix86_fpcmp_strategy
22756 ix86_fp_comparison_strategy (enum rtx_code)
22758 /* Do fcomi/sahf based test when profitable. */
22760 if (TARGET_CMOVE)
22761 return IX86_FPCMP_COMI;
22763 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22764 return IX86_FPCMP_SAHF;
22766 return IX86_FPCMP_ARITH;
22769 /* Swap, force into registers, or otherwise massage the two operands
22770 to a fp comparison. The operands are updated in place; the new
22771 comparison code is returned. */
22773 static enum rtx_code
22774 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22776 bool unordered_compare = ix86_unordered_fp_compare (code);
22777 rtx op0 = *pop0, op1 = *pop1;
22778 machine_mode op_mode = GET_MODE (op0);
22779 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22781 /* All of the unordered compare instructions only work on registers.
22782 The same is true of the fcomi compare instructions. The XFmode
22783 compare instructions require registers except when comparing
22784 against zero or when converting operand 1 from fixed point to
22785 floating point. */
22787 if (!is_sse
22788 && (unordered_compare
22789 || (op_mode == XFmode
22790 && ! (standard_80387_constant_p (op0) == 1
22791 || standard_80387_constant_p (op1) == 1)
22792 && GET_CODE (op1) != FLOAT)
22793 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22795 op0 = force_reg (op_mode, op0);
22796 op1 = force_reg (op_mode, op1);
22798 else
22800 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22801 things around if they appear profitable, otherwise force op0
22802 into a register. */
22804 if (standard_80387_constant_p (op0) == 0
22805 || (MEM_P (op0)
22806 && ! (standard_80387_constant_p (op1) == 0
22807 || MEM_P (op1))))
22809 enum rtx_code new_code = ix86_fp_swap_condition (code);
22810 if (new_code != UNKNOWN)
22812 std::swap (op0, op1);
22813 code = new_code;
22817 if (!REG_P (op0))
22818 op0 = force_reg (op_mode, op0);
22820 if (CONSTANT_P (op1))
22822 int tmp = standard_80387_constant_p (op1);
22823 if (tmp == 0)
22824 op1 = validize_mem (force_const_mem (op_mode, op1));
22825 else if (tmp == 1)
22827 if (TARGET_CMOVE)
22828 op1 = force_reg (op_mode, op1);
22830 else
22831 op1 = force_reg (op_mode, op1);
22835 /* Try to rearrange the comparison to make it cheaper. */
22836 if (ix86_fp_comparison_cost (code)
22837 > ix86_fp_comparison_cost (swap_condition (code))
22838 && (REG_P (op1) || can_create_pseudo_p ()))
22840 std::swap (op0, op1);
22841 code = swap_condition (code);
22842 if (!REG_P (op0))
22843 op0 = force_reg (op_mode, op0);
22846 *pop0 = op0;
22847 *pop1 = op1;
22848 return code;
22851 /* Convert the comparison codes we use to represent an FP comparison to the
22852 integer code that will result in a proper branch. Return UNKNOWN if no such
22853 code is available. */
22855 enum rtx_code
22856 ix86_fp_compare_code_to_integer (enum rtx_code code)
22858 switch (code)
22860 case GT:
22861 return GTU;
22862 case GE:
22863 return GEU;
22864 case ORDERED:
22865 case UNORDERED:
22866 return code;
22867 case UNEQ:
22868 return EQ;
22869 case UNLT:
22870 return LTU;
22871 case UNLE:
22872 return LEU;
22873 case LTGT:
22874 return NE;
22875 default:
22876 return UNKNOWN;
22880 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22882 static rtx
22883 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22885 bool unordered_compare = ix86_unordered_fp_compare (code);
22886 machine_mode intcmp_mode;
22887 rtx tmp, tmp2;
22889 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22891 /* Do fcomi/sahf based test when profitable. */
22892 switch (ix86_fp_comparison_strategy (code))
22894 case IX86_FPCMP_COMI:
22895 intcmp_mode = CCFPmode;
22896 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22897 if (unordered_compare)
22898 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22899 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22900 break;
22902 case IX86_FPCMP_SAHF:
22903 intcmp_mode = CCFPmode;
22904 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22905 if (unordered_compare)
22906 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22907 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22908 if (!scratch)
22909 scratch = gen_reg_rtx (HImode);
22910 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22911 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22912 break;
22914 case IX86_FPCMP_ARITH:
22915 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22916 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22917 if (unordered_compare)
22918 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22919 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22920 if (!scratch)
22921 scratch = gen_reg_rtx (HImode);
22922 emit_insn (gen_rtx_SET (scratch, tmp));
22924 /* In the unordered case, we have to check C2 for NaN's, which
22925 doesn't happen to work out to anything nice combination-wise.
22926 So do some bit twiddling on the value we've got in AH to come
22927 up with an appropriate set of condition codes. */
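/* After fnstsw the relevant status-word condition bits land in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40; an unordered result (NaN) sets all
   three, which is why the masks 0x45, 0x44, 0x40, 0x05, 0x04 and 0x01
   below test various combinations of them.  */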
22929 intcmp_mode = CCNOmode;
22930 switch (code)
22932 case GT:
22933 case UNGT:
22934 if (code == GT || !TARGET_IEEE_FP)
22936 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22937 code = EQ;
22939 else
22941 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22942 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22943 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22944 intcmp_mode = CCmode;
22945 code = GEU;
22947 break;
22948 case LT:
22949 case UNLT:
22950 if (code == LT && TARGET_IEEE_FP)
22952 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22953 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22954 intcmp_mode = CCmode;
22955 code = EQ;
22957 else
22959 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22960 code = NE;
22962 break;
22963 case GE:
22964 case UNGE:
22965 if (code == GE || !TARGET_IEEE_FP)
22967 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22968 code = EQ;
22970 else
22972 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22973 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22974 code = NE;
22976 break;
22977 case LE:
22978 case UNLE:
22979 if (code == LE && TARGET_IEEE_FP)
22981 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22982 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22983 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22984 intcmp_mode = CCmode;
22985 code = LTU;
22987 else
22989 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22990 code = NE;
22992 break;
22993 case EQ:
22994 case UNEQ:
22995 if (code == EQ && TARGET_IEEE_FP)
22997 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22998 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22999 intcmp_mode = CCmode;
23000 code = EQ;
23002 else
23004 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23005 code = NE;
23007 break;
23008 case NE:
23009 case LTGT:
23010 if (code == NE && TARGET_IEEE_FP)
23012 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23013 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23014 GEN_INT (0x40)));
23015 code = NE;
23017 else
23019 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23020 code = EQ;
23022 break;
23024 case UNORDERED:
23025 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23026 code = NE;
23027 break;
23028 case ORDERED:
23029 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23030 code = EQ;
23031 break;
23033 default:
23034 gcc_unreachable ();
23036 break;
23038 default:
23039 gcc_unreachable ();
23042 /* Return the test that should be put into the flags user, i.e.
23043 the bcc, scc, or cmov instruction. */
23044 return gen_rtx_fmt_ee (code, VOIDmode,
23045 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23046 const0_rtx);
23049 static rtx
23050 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23052 rtx ret;
23054 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23055 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23057 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23059 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23060 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23062 else
23063 ret = ix86_expand_int_compare (code, op0, op1);
23065 return ret;
23068 void
23069 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23071 machine_mode mode = GET_MODE (op0);
23072 rtx tmp;
23074 /* Handle the special case of a vector comparison with a boolean result:
23075 transform it using the ptest instruction. */
23076 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23078 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23079 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23081 gcc_assert (code == EQ || code == NE);
23082 /* Generate an XOR since we can't check that one operand is a zero vector. */
23083 tmp = gen_reg_rtx (mode);
23084 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23085 tmp = gen_lowpart (p_mode, tmp);
23086 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23087 gen_rtx_UNSPEC (CCmode,
23088 gen_rtvec (2, tmp, tmp),
23089 UNSPEC_PTEST)));
23090 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23091 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23092 gen_rtx_LABEL_REF (VOIDmode, label),
23093 pc_rtx);
23094 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23095 return;
23098 switch (mode)
23100 case E_SFmode:
23101 case E_DFmode:
23102 case E_XFmode:
23103 case E_QImode:
23104 case E_HImode:
23105 case E_SImode:
23106 simple:
23107 tmp = ix86_expand_compare (code, op0, op1);
23108 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23109 gen_rtx_LABEL_REF (VOIDmode, label),
23110 pc_rtx);
23111 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23112 return;
23114 case E_DImode:
23115 if (TARGET_64BIT)
23116 goto simple;
23117 /* For 32-bit targets a DImode comparison may be performed in
23118 SSE registers. To allow this we must avoid splitting into
23119 SImode, which is achieved by doing the xor in DImode and
23120 then comparing against zero (a pattern recognized by the
23121 STV pass). Don't use the xor form when optimizing
23122 for size. */
23123 if (!optimize_insn_for_size_p ()
23124 && TARGET_STV
23125 && (code == EQ || code == NE))
23127 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23128 op1 = const0_rtx;
23130 /* FALLTHRU */
23131 case E_TImode:
23132 /* Expand a double-word branch into multiple compare+branch sequences. */
23134 rtx lo[2], hi[2];
23135 rtx_code_label *label2;
23136 enum rtx_code code1, code2, code3;
23137 machine_mode submode;
23139 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23141 std::swap (op0, op1);
23142 code = swap_condition (code);
23145 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23146 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23148 submode = mode == DImode ? SImode : DImode;
23150 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23151 avoid two branches. This costs one extra insn, so disable when
23152 optimizing for size. */
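/* For example, a 32-bit DImode (a == b) becomes
   ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0, which needs only a single
   conditional branch; XORs against constant-zero halves are skipped below.  */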
23154 if ((code == EQ || code == NE)
23155 && (!optimize_insn_for_size_p ()
23156 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23158 rtx xor0, xor1;
23160 xor1 = hi[0];
23161 if (hi[1] != const0_rtx)
23162 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23163 NULL_RTX, 0, OPTAB_WIDEN);
23165 xor0 = lo[0];
23166 if (lo[1] != const0_rtx)
23167 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23168 NULL_RTX, 0, OPTAB_WIDEN);
23170 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23171 NULL_RTX, 0, OPTAB_WIDEN);
23173 ix86_expand_branch (code, tmp, const0_rtx, label);
23174 return;
23177 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
23178 op1 is a constant, and its low word is zero, then we can just
23179 examine the high word. Similarly for a low word of -1 and
23180 less-or-equal or greater-than. */
23182 if (CONST_INT_P (hi[1]))
23183 switch (code)
23185 case LT: case LTU: case GE: case GEU:
23186 if (lo[1] == const0_rtx)
23188 ix86_expand_branch (code, hi[0], hi[1], label);
23189 return;
23191 break;
23192 case LE: case LEU: case GT: case GTU:
23193 if (lo[1] == constm1_rtx)
23195 ix86_expand_branch (code, hi[0], hi[1], label);
23196 return;
23198 break;
23199 default:
23200 break;
23203 /* Emulate comparisons that do not depend on the Zero flag with a
23204 double-word subtraction. Note that only the Overflow, Sign
23205 and Carry flags are valid, so swap the arguments and condition
23206 of comparisons that would otherwise test the Zero flag. */
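/* Concretely: the cmp on the low words sets the carry, and the following
   sbb computes hi[0] - hi[1] - CF, i.e. the high half of the full
   double-word subtraction; its Sign/Overflow (CCGZmode) or Carry
   (CCCmode) output then decides the branch.  */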
23208 switch (code)
23210 case LE: case LEU: case GT: case GTU:
23211 std::swap (lo[0], lo[1]);
23212 std::swap (hi[0], hi[1]);
23213 code = swap_condition (code);
23214 /* FALLTHRU */
23216 case LT: case LTU: case GE: case GEU:
23218 rtx (*cmp_insn) (rtx, rtx);
23219 rtx (*sbb_insn) (rtx, rtx, rtx);
23220 bool uns = (code == LTU || code == GEU);
23222 if (TARGET_64BIT)
23224 cmp_insn = gen_cmpdi_1;
23225 sbb_insn
23226 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23228 else
23230 cmp_insn = gen_cmpsi_1;
23231 sbb_insn
23232 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23235 if (!nonimmediate_operand (lo[0], submode))
23236 lo[0] = force_reg (submode, lo[0]);
23237 if (!x86_64_general_operand (lo[1], submode))
23238 lo[1] = force_reg (submode, lo[1]);
23240 if (!register_operand (hi[0], submode))
23241 hi[0] = force_reg (submode, hi[0]);
23242 if ((uns && !nonimmediate_operand (hi[1], submode))
23243 || (!uns && !x86_64_general_operand (hi[1], submode)))
23244 hi[1] = force_reg (submode, hi[1]);
23246 emit_insn (cmp_insn (lo[0], lo[1]));
23247 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23249 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23251 ix86_expand_branch (code, tmp, const0_rtx, label);
23252 return;
23255 default:
23256 break;
23259 /* Otherwise, we need two or three jumps. */
23261 label2 = gen_label_rtx ();
23263 code1 = code;
23264 code2 = swap_condition (code);
23265 code3 = unsigned_condition (code);
23267 switch (code)
23269 case LT: case GT: case LTU: case GTU:
23270 break;
23272 case LE: code1 = LT; code2 = GT; break;
23273 case GE: code1 = GT; code2 = LT; break;
23274 case LEU: code1 = LTU; code2 = GTU; break;
23275 case GEU: code1 = GTU; code2 = LTU; break;
23277 case EQ: code1 = UNKNOWN; code2 = NE; break;
23278 case NE: code2 = UNKNOWN; break;
23280 default:
23281 gcc_unreachable ();
23285 * a < b =>
23286 * if (hi(a) < hi(b)) goto true;
23287 * if (hi(a) > hi(b)) goto false;
23288 * if (lo(a) < lo(b)) goto true;
23289 * false:
23292 if (code1 != UNKNOWN)
23293 ix86_expand_branch (code1, hi[0], hi[1], label);
23294 if (code2 != UNKNOWN)
23295 ix86_expand_branch (code2, hi[0], hi[1], label2);
23297 ix86_expand_branch (code3, lo[0], lo[1], label);
23299 if (code2 != UNKNOWN)
23300 emit_label (label2);
23301 return;
23304 default:
23305 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23306 goto simple;
23310 void
23311 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23313 rtx ret;
23315 gcc_assert (GET_MODE (dest) == QImode);
23317 ret = ix86_expand_compare (code, op0, op1);
23318 PUT_MODE (ret, QImode);
23319 emit_insn (gen_rtx_SET (dest, ret));
23322 /* Expand a comparison setting or clearing the carry flag. Return true
23323 when successful and store the comparison in *POP. */
23324 static bool
23325 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23327 machine_mode mode =
23328 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23330 /* Do not handle double-mode compares that go through the special path. */
23331 if (mode == (TARGET_64BIT ? TImode : DImode))
23332 return false;
23334 if (SCALAR_FLOAT_MODE_P (mode))
23336 rtx compare_op;
23337 rtx_insn *compare_seq;
23339 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23341 /* Shortcut: the following common codes never translate
23342 into carry-flag compares. */
23343 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23344 || code == ORDERED || code == UNORDERED)
23345 return false;
23347 /* These comparisons require the zero flag; swap the operands so they don't. */
23348 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23349 && !TARGET_IEEE_FP)
23351 std::swap (op0, op1);
23352 code = swap_condition (code);
23355 /* Try to expand the comparison and verify that we end up with a
23356 carry-flag-based comparison. This fails only when we decide to
23357 expand the comparison using arithmetic, which is not a common
23358 scenario. */
23359 start_sequence ();
23360 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23361 compare_seq = get_insns ();
23362 end_sequence ();
23364 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23365 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23366 else
23367 code = GET_CODE (compare_op);
23369 if (code != LTU && code != GEU)
23370 return false;
23372 emit_insn (compare_seq);
23373 *pop = compare_op;
23374 return true;
23377 if (!INTEGRAL_MODE_P (mode))
23378 return false;
23380 switch (code)
23382 case LTU:
23383 case GEU:
23384 break;
23386 /* Convert a==0 into (unsigned)a<1. */
23387 case EQ:
23388 case NE:
23389 if (op1 != const0_rtx)
23390 return false;
23391 op1 = const1_rtx;
23392 code = (code == EQ ? LTU : GEU);
23393 break;
23395 /* Convert a>b into b<a or a>=b+1. */
23396 case GTU:
23397 case LEU:
23398 if (CONST_INT_P (op1))
23400 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23401 /* Bail out on overflow. We could still swap the operands, but that
23402 would force loading the constant into a register. */
23403 if (op1 == const0_rtx
23404 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23405 return false;
23406 code = (code == GTU ? GEU : LTU);
23408 else
23410 std::swap (op0, op1);
23411 code = (code == GTU ? LTU : GEU);
23413 break;
23415 /* Convert a>=0 into (unsigned)a<0x80000000. */
23416 case LT:
23417 case GE:
23418 if (mode == DImode || op1 != const0_rtx)
23419 return false;
23420 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23421 code = (code == LT ? GEU : LTU);
23422 break;
23423 case LE:
23424 case GT:
23425 if (mode == DImode || op1 != constm1_rtx)
23426 return false;
23427 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23428 code = (code == LE ? GEU : LTU);
23429 break;
23431 default:
23432 return false;
23434 /* Swapping operands may cause a constant to appear as the first operand. */
23435 if (!nonimmediate_operand (op0, VOIDmode))
23437 if (!can_create_pseudo_p ())
23438 return false;
23439 op0 = force_reg (mode, op0);
23441 *pop = ix86_expand_compare (code, op0, op1);
23442 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23443 return true;
23446 bool
23447 ix86_expand_int_movcc (rtx operands[])
23449 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23450 rtx_insn *compare_seq;
23451 rtx compare_op;
23452 machine_mode mode = GET_MODE (operands[0]);
23453 bool sign_bit_compare_p = false;
23454 rtx op0 = XEXP (operands[1], 0);
23455 rtx op1 = XEXP (operands[1], 1);
23457 if (GET_MODE (op0) == TImode
23458 || (GET_MODE (op0) == DImode
23459 && !TARGET_64BIT))
23460 return false;
23462 start_sequence ();
23463 compare_op = ix86_expand_compare (code, op0, op1);
23464 compare_seq = get_insns ();
23465 end_sequence ();
23467 compare_code = GET_CODE (compare_op);
23469 if ((op1 == const0_rtx && (code == GE || code == LT))
23470 || (op1 == constm1_rtx && (code == GT || code == LE)))
23471 sign_bit_compare_p = true;
23473 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23474 HImode insns, we'd be swallowed in word prefix ops. */
23476 if ((mode != HImode || TARGET_FAST_PREFIX)
23477 && (mode != (TARGET_64BIT ? TImode : DImode))
23478 && CONST_INT_P (operands[2])
23479 && CONST_INT_P (operands[3]))
23481 rtx out = operands[0];
23482 HOST_WIDE_INT ct = INTVAL (operands[2]);
23483 HOST_WIDE_INT cf = INTVAL (operands[3]);
23484 HOST_WIDE_INT diff;
23486 diff = ct - cf;
23487 /* Sign-bit compares are better done using shifts than by using
23488 sbb. */
23489 if (sign_bit_compare_p
23490 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23492 /* Detect overlap between destination and compare sources. */
23493 rtx tmp = out;
23495 if (!sign_bit_compare_p)
23497 rtx flags;
23498 bool fpcmp = false;
23500 compare_code = GET_CODE (compare_op);
23502 flags = XEXP (compare_op, 0);
23504 if (GET_MODE (flags) == CCFPmode)
23506 fpcmp = true;
23507 compare_code
23508 = ix86_fp_compare_code_to_integer (compare_code);
23511 /* To simplify the rest of the code, restrict to the GEU case. */
23512 if (compare_code == LTU)
23514 std::swap (ct, cf);
23515 compare_code = reverse_condition (compare_code);
23516 code = reverse_condition (code);
23518 else
23520 if (fpcmp)
23521 PUT_CODE (compare_op,
23522 reverse_condition_maybe_unordered
23523 (GET_CODE (compare_op)));
23524 else
23525 PUT_CODE (compare_op,
23526 reverse_condition (GET_CODE (compare_op)));
23528 diff = ct - cf;
23530 if (reg_overlap_mentioned_p (out, op0)
23531 || reg_overlap_mentioned_p (out, op1))
23532 tmp = gen_reg_rtx (mode);
23534 if (mode == DImode)
23535 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23536 else
23537 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23538 flags, compare_op));
23540 else
23542 if (code == GT || code == GE)
23543 code = reverse_condition (code);
23544 else
23546 std::swap (ct, cf);
23547 diff = ct - cf;
23549 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23552 if (diff == 1)
23555 * cmpl op0,op1
23556 * sbbl dest,dest
23557 * [addl dest, ct]
23559 * Size 5 - 8.
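/* The value computed above is 0 or -1, so adding ct yields either ct
   or ct - 1 == cf (here diff == ct - cf == 1).  */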
23561 if (ct)
23562 tmp = expand_simple_binop (mode, PLUS,
23563 tmp, GEN_INT (ct),
23564 copy_rtx (tmp), 1, OPTAB_DIRECT);
23566 else if (cf == -1)
23569 * cmpl op0,op1
23570 * sbbl dest,dest
23571 * orl $ct, dest
23573 * Size 8.
23575 tmp = expand_simple_binop (mode, IOR,
23576 tmp, GEN_INT (ct),
23577 copy_rtx (tmp), 1, OPTAB_DIRECT);
23579 else if (diff == -1 && ct)
23582 * cmpl op0,op1
23583 * sbbl dest,dest
23584 * notl dest
23585 * [addl dest, cf]
23587 * Size 8 - 11.
23589 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23590 if (cf)
23591 tmp = expand_simple_binop (mode, PLUS,
23592 copy_rtx (tmp), GEN_INT (cf),
23593 copy_rtx (tmp), 1, OPTAB_DIRECT);
23595 else
23598 * cmpl op0,op1
23599 * sbbl dest,dest
23600 * [notl dest]
23601 * andl cf - ct, dest
23602 * [addl dest, ct]
23604 * Size 8 - 11.
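/* Starting from the 0 / -1 value (optionally complemented), ANDing
   with cf - ct leaves 0 or cf - ct, and the final addition of ct turns
   that into ct or cf.  */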
23607 if (cf == 0)
23609 cf = ct;
23610 ct = 0;
23611 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23614 tmp = expand_simple_binop (mode, AND,
23615 copy_rtx (tmp),
23616 gen_int_mode (cf - ct, mode),
23617 copy_rtx (tmp), 1, OPTAB_DIRECT);
23618 if (ct)
23619 tmp = expand_simple_binop (mode, PLUS,
23620 copy_rtx (tmp), GEN_INT (ct),
23621 copy_rtx (tmp), 1, OPTAB_DIRECT);
23624 if (!rtx_equal_p (tmp, out))
23625 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23627 return true;
23630 if (diff < 0)
23632 machine_mode cmp_mode = GET_MODE (op0);
23633 enum rtx_code new_code;
23635 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23637 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23639 /* We may be reversing an unordered compare to a normal compare; that
23640 is not valid in general (we may convert a non-trapping condition
23641 into a trapping one), but on i386 we currently emit all
23642 comparisons unordered. */
23643 new_code = reverse_condition_maybe_unordered (code);
23645 else
23646 new_code = ix86_reverse_condition (code, cmp_mode);
23647 if (new_code != UNKNOWN)
23649 std::swap (ct, cf);
23650 diff = -diff;
23651 code = new_code;
23655 compare_code = UNKNOWN;
23656 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23657 && CONST_INT_P (op1))
23659 if (op1 == const0_rtx
23660 && (code == LT || code == GE))
23661 compare_code = code;
23662 else if (op1 == constm1_rtx)
23664 if (code == LE)
23665 compare_code = LT;
23666 else if (code == GT)
23667 compare_code = GE;
23671 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23672 if (compare_code != UNKNOWN
23673 && GET_MODE (op0) == GET_MODE (out)
23674 && (cf == -1 || ct == -1))
23676 /* If lea code below could be used, only optimize
23677 if it results in a 2 insn sequence. */
23679 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23680 || diff == 3 || diff == 5 || diff == 9)
23681 || (compare_code == LT && ct == -1)
23682 || (compare_code == GE && cf == -1))
23685 * notl op1 (if necessary)
23686 * sarl $31, op1
23687 * orl cf, op1
23689 if (ct != -1)
23691 cf = ct;
23692 ct = -1;
23693 code = reverse_condition (code);
23696 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23698 out = expand_simple_binop (mode, IOR,
23699 out, GEN_INT (cf),
23700 out, 1, OPTAB_DIRECT);
23701 if (out != operands[0])
23702 emit_move_insn (operands[0], out);
23704 return true;
23709 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23710 || diff == 3 || diff == 5 || diff == 9)
23711 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23712 && (mode != DImode
23713 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23716 * xorl dest,dest
23717 * cmpl op1,op2
23718 * setcc dest
23719 * lea cf(dest*(ct-cf)),dest
23721 * Size 14.
23723 * This also catches the degenerate setcc-only case.
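/* Here dest is the 0/1 setcc result and the lea computes
   dest * diff + cf, i.e. cf when the condition is false and ct when it
   is true; diff is restricted to values an lea addressing mode can scale.  */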
23726 rtx tmp;
23727 int nops;
23729 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23731 nops = 0;
23732 /* On x86_64 the lea instruction operates on Pmode, so we need
23733 the arithmetic done in the proper mode to match. */
23734 if (diff == 1)
23735 tmp = copy_rtx (out);
23736 else
23738 rtx out1;
23739 out1 = copy_rtx (out);
23740 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23741 nops++;
23742 if (diff & 1)
23744 tmp = gen_rtx_PLUS (mode, tmp, out1);
23745 nops++;
23748 if (cf != 0)
23750 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23751 nops++;
23753 if (!rtx_equal_p (tmp, out))
23755 if (nops == 1)
23756 out = force_operand (tmp, copy_rtx (out));
23757 else
23758 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23760 if (!rtx_equal_p (out, operands[0]))
23761 emit_move_insn (operands[0], copy_rtx (out));
23763 return true;
23767 * General case: Jumpful:
23768 * xorl dest,dest cmpl op1, op2
23769 * cmpl op1, op2 movl ct, dest
23770 * setcc dest jcc 1f
23771 * decl dest movl cf, dest
23772 * andl (cf-ct),dest 1:
23773 * addl ct,dest
23775 * Size 20. Size 14.
23777 * This is reasonably steep, but branch mispredict costs are
23778 * high on modern cpus, so consider failing only if optimizing
23779 * for space.
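/* In the jumpless sequence a true condition gives a store-flag value of
   1, which the decrement turns into 0, the AND keeps at 0, and the final
   add makes ct; a false condition gives 0 -> -1 -> cf - ct -> cf.  */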
23782 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23783 && BRANCH_COST (optimize_insn_for_speed_p (),
23784 false) >= 2)
23786 if (cf == 0)
23788 machine_mode cmp_mode = GET_MODE (op0);
23789 enum rtx_code new_code;
23791 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23793 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23795 /* We may be reversing an unordered compare to a normal compare;
23796 that is not valid in general (we may convert a non-trapping
23797 condition into a trapping one), but on i386 we currently
23798 emit all comparisons unordered. */
23799 new_code = reverse_condition_maybe_unordered (code);
23801 else
23803 new_code = ix86_reverse_condition (code, cmp_mode);
23804 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23805 compare_code = reverse_condition (compare_code);
23808 if (new_code != UNKNOWN)
23810 cf = ct;
23811 ct = 0;
23812 code = new_code;
23816 if (compare_code != UNKNOWN)
23818 /* notl op1 (if needed)
23819 sarl $31, op1
23820 andl (cf-ct), op1
23821 addl ct, op1
23823 For x < 0 (resp. x <= -1) there will be no notl,
23824 so if possible swap the constants to get rid of the
23825 complement.
23826 True/false will be -1/0 while code below (store flag
23827 followed by decrement) is 0/-1, so the constants need
23828 to be exchanged once more. */
23830 if (compare_code == GE || !cf)
23832 code = reverse_condition (code);
23833 compare_code = LT;
23835 else
23836 std::swap (ct, cf);
23838 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23840 else
23842 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23844 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23845 constm1_rtx,
23846 copy_rtx (out), 1, OPTAB_DIRECT);
23849 out = expand_simple_binop (mode, AND, copy_rtx (out),
23850 gen_int_mode (cf - ct, mode),
23851 copy_rtx (out), 1, OPTAB_DIRECT);
23852 if (ct)
23853 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23854 copy_rtx (out), 1, OPTAB_DIRECT);
23855 if (!rtx_equal_p (out, operands[0]))
23856 emit_move_insn (operands[0], copy_rtx (out));
23858 return true;
23862 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23864 /* Try a few things more with specific constants and a variable. */
23866 optab op;
23867 rtx var, orig_out, out, tmp;
23869 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23870 return false;
23872 /* If one of the two operands is an interesting constant, load a 0/-1
23873 mask via the recursive call below and mask the variable in with a logical operation. */
23875 if (CONST_INT_P (operands[2]))
23877 var = operands[3];
23878 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23879 operands[3] = constm1_rtx, op = and_optab;
23880 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23881 operands[3] = const0_rtx, op = ior_optab;
23882 else
23883 return false;
23885 else if (CONST_INT_P (operands[3]))
23887 var = operands[2];
23888 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23889 operands[2] = constm1_rtx, op = and_optab;
23890 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23891 operands[2] = const0_rtx, op = ior_optab;
23892 else
23893 return false;
23895 else
23896 return false;
23898 orig_out = operands[0];
23899 tmp = gen_reg_rtx (mode);
23900 operands[0] = tmp;
23902 /* Recurse to get the constant loaded. */
23903 if (!ix86_expand_int_movcc (operands))
23904 return false;
23906 /* Mask in the interesting variable. */
23907 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23908 OPTAB_WIDEN);
23909 if (!rtx_equal_p (out, orig_out))
23910 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23912 return true;
23916 * For comparison with above,
23918 * movl cf,dest
23919 * movl ct,tmp
23920 * cmpl op1,op2
23921 * cmovcc tmp,dest
23923 * Size 15.
23926 if (! nonimmediate_operand (operands[2], mode))
23927 operands[2] = force_reg (mode, operands[2]);
23928 if (! nonimmediate_operand (operands[3], mode))
23929 operands[3] = force_reg (mode, operands[3]);
23931 if (! register_operand (operands[2], VOIDmode)
23932 && (mode == QImode
23933 || ! register_operand (operands[3], VOIDmode)))
23934 operands[2] = force_reg (mode, operands[2]);
23936 if (mode == QImode
23937 && ! register_operand (operands[3], VOIDmode))
23938 operands[3] = force_reg (mode, operands[3]);
23940 emit_insn (compare_seq);
23941 emit_insn (gen_rtx_SET (operands[0],
23942 gen_rtx_IF_THEN_ELSE (mode,
23943 compare_op, operands[2],
23944 operands[3])));
23945 return true;
23948 /* Swap, force into registers, or otherwise massage the two operands
23949 to an sse comparison with a mask result. Thus we differ a bit from
23950 ix86_prepare_fp_compare_args which expects to produce a flags result.
23952 The DEST operand exists to help determine whether to commute commutative
23953 operators. The POP0/POP1 operands are updated in place. The new
23954 comparison code is returned, or UNKNOWN if not implementable. */
23956 static enum rtx_code
23957 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23958 rtx *pop0, rtx *pop1)
23960 switch (code)
23962 case LTGT:
23963 case UNEQ:
23964 /* AVX supports all the needed comparisons. */
23965 if (TARGET_AVX)
23966 break;
23967 /* We have no LTGT as an operator. We could implement it with
23968 NE & ORDERED, but this requires an extra temporary. It's
23969 not clear that it's worth it. */
23970 return UNKNOWN;
23972 case LT:
23973 case LE:
23974 case UNGT:
23975 case UNGE:
23976 /* These are supported directly. */
23977 break;
23979 case EQ:
23980 case NE:
23981 case UNORDERED:
23982 case ORDERED:
23983 /* AVX has 3 operand comparisons, no need to swap anything. */
23984 if (TARGET_AVX)
23985 break;
23986 /* For commutative operators, try to canonicalize the destination
23987 operand to be first in the comparison - this helps reload to
23988 avoid extra moves. */
23989 if (!dest || !rtx_equal_p (dest, *pop1))
23990 break;
23991 /* FALLTHRU */
23993 case GE:
23994 case GT:
23995 case UNLE:
23996 case UNLT:
23997 /* These are not supported directly before AVX, and furthermore
23998 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23999 comparison operands to transform into something that is
24000 supported. */
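/* E.g. GT a,b is rewritten as LT b,a; the pre-AVX cmpps/cmppd
   predicates only encode eq/lt/le/unord/neq/nlt/nle/ord, so the
   greater-than forms have to be expressed via their swapped
   counterparts.  */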
24001 std::swap (*pop0, *pop1);
24002 code = swap_condition (code);
24003 break;
24005 default:
24006 gcc_unreachable ();
24009 return code;
24012 /* Detect conditional moves that exactly match min/max operational
24013 semantics. Note that this is IEEE safe, as long as we don't
24014 interchange the operands.
24016 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24017 and TRUE if the operation is successful and instructions are emitted. */
24019 static bool
24020 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24021 rtx cmp_op1, rtx if_true, rtx if_false)
24023 machine_mode mode;
24024 bool is_min;
24025 rtx tmp;
24027 if (code == LT)
24029 else if (code == UNGE)
24030 std::swap (if_true, if_false);
24031 else
24032 return false;
24034 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24035 is_min = true;
24036 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24037 is_min = false;
24038 else
24039 return false;
24041 mode = GET_MODE (dest);
24043 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24044 but MODE may be a vector mode and thus not appropriate. */
24045 if (!flag_finite_math_only || flag_signed_zeros)
24047 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24048 rtvec v;
24050 if_true = force_reg (mode, if_true);
24051 v = gen_rtvec (2, if_true, if_false);
24052 tmp = gen_rtx_UNSPEC (mode, v, u);
24054 else
24056 code = is_min ? SMIN : SMAX;
24057 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24060 emit_insn (gen_rtx_SET (dest, tmp));
24061 return true;
24064 /* Expand an sse vector comparison. Return the register with the result. */
24066 static rtx
24067 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24068 rtx op_true, rtx op_false)
24070 machine_mode mode = GET_MODE (dest);
24071 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24073 /* In the general case the result of the comparison can have a different mode from the operands. */
24074 machine_mode cmp_mode;
24076 /* In AVX512F the result of comparison is an integer mask. */
24077 bool maskcmp = false;
24078 rtx x;
24080 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24082 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
24083 cmp_mode = int_mode_for_size (nbits, 0).require ();
24084 maskcmp = true;
24086 else
24087 cmp_mode = cmp_ops_mode;
24090 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24091 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24092 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24094 if (optimize
24095 || (maskcmp && cmp_mode != mode)
24096 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24097 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24098 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24100 /* Compare patterns for int modes are unspec in AVX512F only. */
24101 if (maskcmp && (code == GT || code == EQ))
24103 rtx (*gen)(rtx, rtx, rtx);
24105 switch (cmp_ops_mode)
24107 case E_V64QImode:
24108 gcc_assert (TARGET_AVX512BW);
24109 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24110 break;
24111 case E_V32HImode:
24112 gcc_assert (TARGET_AVX512BW);
24113 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24114 break;
24115 case E_V16SImode:
24116 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24117 break;
24118 case E_V8DImode:
24119 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24120 break;
24121 default:
24122 gen = NULL;
24125 if (gen)
24127 emit_insn (gen (dest, cmp_op0, cmp_op1));
24128 return dest;
24131 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24133 if (cmp_mode != mode && !maskcmp)
24135 x = force_reg (cmp_ops_mode, x);
24136 convert_move (dest, x, false);
24138 else
24139 emit_insn (gen_rtx_SET (dest, x));
24141 return dest;
24144 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24145 operations. This is used for both scalar and vector conditional moves. */
24147 void
24148 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24150 machine_mode mode = GET_MODE (dest);
24151 machine_mode cmpmode = GET_MODE (cmp);
24153 /* In AVX512F the result of comparison is an integer mask. */
24154 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24156 rtx t2, t3, x;
24158 /* If we have an integer mask and FP value then we need
24159 to cast mask to FP mode. */
24160 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24162 cmp = force_reg (cmpmode, cmp);
24163 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24166 if (vector_all_ones_operand (op_true, mode)
24167 && rtx_equal_p (op_false, CONST0_RTX (mode))
24168 && !maskcmp)
24170 emit_insn (gen_rtx_SET (dest, cmp));
24172 else if (op_false == CONST0_RTX (mode)
24173 && !maskcmp)
24175 op_true = force_reg (mode, op_true);
24176 x = gen_rtx_AND (mode, cmp, op_true);
24177 emit_insn (gen_rtx_SET (dest, x));
24179 else if (op_true == CONST0_RTX (mode)
24180 && !maskcmp)
24182 op_false = force_reg (mode, op_false);
24183 x = gen_rtx_NOT (mode, cmp);
24184 x = gen_rtx_AND (mode, x, op_false);
24185 emit_insn (gen_rtx_SET (dest, x));
24187 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24188 && !maskcmp)
24190 op_false = force_reg (mode, op_false);
24191 x = gen_rtx_IOR (mode, cmp, op_false);
24192 emit_insn (gen_rtx_SET (dest, x));
24194 else if (TARGET_XOP
24195 && !maskcmp)
24197 op_true = force_reg (mode, op_true);
24199 if (!nonimmediate_operand (op_false, mode))
24200 op_false = force_reg (mode, op_false);
24202 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24203 op_true,
24204 op_false)));
24206 else
24208 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24209 rtx d = dest;
24211 if (!nonimmediate_operand (op_true, mode))
24212 op_true = force_reg (mode, op_true);
24214 op_false = force_reg (mode, op_false);
24216 switch (mode)
24218 case E_V4SFmode:
24219 if (TARGET_SSE4_1)
24220 gen = gen_sse4_1_blendvps;
24221 break;
24222 case E_V2DFmode:
24223 if (TARGET_SSE4_1)
24224 gen = gen_sse4_1_blendvpd;
24225 break;
24226 case E_V16QImode:
24227 case E_V8HImode:
24228 case E_V4SImode:
24229 case E_V2DImode:
24230 if (TARGET_SSE4_1)
24232 gen = gen_sse4_1_pblendvb;
24233 if (mode != V16QImode)
24234 d = gen_reg_rtx (V16QImode);
24235 op_false = gen_lowpart (V16QImode, op_false);
24236 op_true = gen_lowpart (V16QImode, op_true);
24237 cmp = gen_lowpart (V16QImode, cmp);
24239 break;
24240 case E_V8SFmode:
24241 if (TARGET_AVX)
24242 gen = gen_avx_blendvps256;
24243 break;
24244 case E_V4DFmode:
24245 if (TARGET_AVX)
24246 gen = gen_avx_blendvpd256;
24247 break;
24248 case E_V32QImode:
24249 case E_V16HImode:
24250 case E_V8SImode:
24251 case E_V4DImode:
24252 if (TARGET_AVX2)
24254 gen = gen_avx2_pblendvb;
24255 if (mode != V32QImode)
24256 d = gen_reg_rtx (V32QImode);
24257 op_false = gen_lowpart (V32QImode, op_false);
24258 op_true = gen_lowpart (V32QImode, op_true);
24259 cmp = gen_lowpart (V32QImode, cmp);
24261 break;
24263 case E_V64QImode:
24264 gen = gen_avx512bw_blendmv64qi;
24265 break;
24266 case E_V32HImode:
24267 gen = gen_avx512bw_blendmv32hi;
24268 break;
24269 case E_V16SImode:
24270 gen = gen_avx512f_blendmv16si;
24271 break;
24272 case E_V8DImode:
24273 gen = gen_avx512f_blendmv8di;
24274 break;
24275 case E_V8DFmode:
24276 gen = gen_avx512f_blendmv8df;
24277 break;
24278 case E_V16SFmode:
24279 gen = gen_avx512f_blendmv16sf;
24280 break;
24282 default:
24283 break;
24286 if (gen != NULL)
24288 emit_insn (gen (d, op_false, op_true, cmp));
24289 if (d != dest)
24290 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24292 else
24294 op_true = force_reg (mode, op_true);
24296 t2 = gen_reg_rtx (mode);
24297 if (optimize)
24298 t3 = gen_reg_rtx (mode);
24299 else
24300 t3 = dest;
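/* No blend instruction is available here, so emulate the element
   select bitwise as (cmp & op_true) | (~cmp & op_false); cmp is
   all-ones or all-zeros within each element, so whole elements are
   picked.  */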
24302 x = gen_rtx_AND (mode, op_true, cmp);
24303 emit_insn (gen_rtx_SET (t2, x));
24305 x = gen_rtx_NOT (mode, cmp);
24306 x = gen_rtx_AND (mode, x, op_false);
24307 emit_insn (gen_rtx_SET (t3, x));
24309 x = gen_rtx_IOR (mode, t3, t2);
24310 emit_insn (gen_rtx_SET (dest, x));
24315 /* Expand a floating-point conditional move. Return true if successful. */
24317 bool
24318 ix86_expand_fp_movcc (rtx operands[])
24320 machine_mode mode = GET_MODE (operands[0]);
24321 enum rtx_code code = GET_CODE (operands[1]);
24322 rtx tmp, compare_op;
24323 rtx op0 = XEXP (operands[1], 0);
24324 rtx op1 = XEXP (operands[1], 1);
24326 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24328 machine_mode cmode;
24330 /* Since we've no cmove for sse registers, don't force bad register
24331 allocation just to gain access to it. Deny movcc when the
24332 comparison mode doesn't match the move mode. */
24333 cmode = GET_MODE (op0);
24334 if (cmode == VOIDmode)
24335 cmode = GET_MODE (op1);
24336 if (cmode != mode)
24337 return false;
24339 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24340 if (code == UNKNOWN)
24341 return false;
24343 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24344 operands[2], operands[3]))
24345 return true;
24347 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24348 operands[2], operands[3]);
24349 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24350 return true;
24353 if (GET_MODE (op0) == TImode
24354 || (GET_MODE (op0) == DImode
24355 && !TARGET_64BIT))
24356 return false;
24358 /* The floating point conditional move instructions don't directly
24359 support conditions resulting from a signed integer comparison. */
24361 compare_op = ix86_expand_compare (code, op0, op1);
24362 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24364 tmp = gen_reg_rtx (QImode);
24365 ix86_expand_setcc (tmp, code, op0, op1);
24367 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24370 emit_insn (gen_rtx_SET (operands[0],
24371 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24372 operands[2], operands[3])));
24374 return true;
24377 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24379 static int
24380 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24382 switch (code)
24384 case EQ:
24385 return 0;
24386 case LT:
24387 case LTU:
24388 return 1;
24389 case LE:
24390 case LEU:
24391 return 2;
24392 case NE:
24393 return 4;
24394 case GE:
24395 case GEU:
24396 return 5;
24397 case GT:
24398 case GTU:
24399 return 6;
24400 default:
24401 gcc_unreachable ();
24405 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24407 static int
24408 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24410 switch (code)
24412 case EQ:
24413 return 0x00;
24414 case NE:
24415 return 0x04;
24416 case GT:
24417 return 0x0e;
24418 case LE:
24419 return 0x02;
24420 case GE:
24421 return 0x0d;
24422 case LT:
24423 return 0x01;
24424 case UNLE:
24425 return 0x0a;
24426 case UNLT:
24427 return 0x09;
24428 case UNGE:
24429 return 0x05;
24430 case UNGT:
24431 return 0x06;
24432 case UNEQ:
24433 return 0x18;
24434 case LTGT:
24435 return 0x0c;
24436 case ORDERED:
24437 return 0x07;
24438 case UNORDERED:
24439 return 0x03;
24440 default:
24441 gcc_unreachable ();
24445 /* Return immediate value to be used in UNSPEC_PCMP
24446 for comparison CODE in MODE. */
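/* For integer modes these are the VPCMP/VPCMPU predicate encodings
   (0 = eq, 1 = lt, 2 = le, 4 = neq, 5 = nlt, 6 = nle); for FP modes
   they are the 5-bit VCMPPS/VCMPPD predicate immediates.  */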
24448 static int
24449 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24451 if (FLOAT_MODE_P (mode))
24452 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24453 return ix86_int_cmp_code_to_pcmp_immediate (code);
24456 /* Expand AVX-512 vector comparison. */
24458 bool
24459 ix86_expand_mask_vec_cmp (rtx operands[])
24461 machine_mode mask_mode = GET_MODE (operands[0]);
24462 machine_mode cmp_mode = GET_MODE (operands[2]);
24463 enum rtx_code code = GET_CODE (operands[1]);
24464 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24465 int unspec_code;
24466 rtx unspec;
24468 switch (code)
24470 case LEU:
24471 case GTU:
24472 case GEU:
24473 case LTU:
24474 unspec_code = UNSPEC_UNSIGNED_PCMP;
24475 break;
24477 default:
24478 unspec_code = UNSPEC_PCMP;
24481 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24482 operands[3], imm),
24483 unspec_code);
24484 emit_insn (gen_rtx_SET (operands[0], unspec));
24486 return true;
24489 /* Expand fp vector comparison. */
24491 bool
24492 ix86_expand_fp_vec_cmp (rtx operands[])
24494 enum rtx_code code = GET_CODE (operands[1]);
24495 rtx cmp;
24497 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24498 &operands[2], &operands[3]);
24499 if (code == UNKNOWN)
24501 rtx temp;
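/* LTGT and UNEQ could not be prepared above; build them from two
   comparisons: LTGT = ORDERED & NE and UNEQ = UNORDERED | EQ.  */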
24502 switch (GET_CODE (operands[1]))
24504 case LTGT:
24505 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24506 operands[3], NULL, NULL);
24507 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24508 operands[3], NULL, NULL);
24509 code = AND;
24510 break;
24511 case UNEQ:
24512 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24513 operands[3], NULL, NULL);
24514 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24515 operands[3], NULL, NULL);
24516 code = IOR;
24517 break;
24518 default:
24519 gcc_unreachable ();
24521 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24522 OPTAB_DIRECT);
24524 else
24525 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24526 operands[1], operands[2]);
24528 if (operands[0] != cmp)
24529 emit_move_insn (operands[0], cmp);
24531 return true;
24534 static rtx
24535 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24536 rtx op_true, rtx op_false, bool *negate)
24538 machine_mode data_mode = GET_MODE (dest);
24539 machine_mode mode = GET_MODE (cop0);
24540 rtx x;
24542 *negate = false;
24544 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24545 if (TARGET_XOP
24546 && (mode == V16QImode || mode == V8HImode
24547 || mode == V4SImode || mode == V2DImode))
24549 else
24551 /* Canonicalize the comparison to EQ, GT, GTU. */
24552 switch (code)
24554 case EQ:
24555 case GT:
24556 case GTU:
24557 break;
24559 case NE:
24560 case LE:
24561 case LEU:
24562 code = reverse_condition (code);
24563 *negate = true;
24564 break;
24566 case GE:
24567 case GEU:
24568 code = reverse_condition (code);
24569 *negate = true;
24570 /* FALLTHRU */
24572 case LT:
24573 case LTU:
24574 std::swap (cop0, cop1);
24575 code = swap_condition (code);
24576 break;
24578 default:
24579 gcc_unreachable ();
24582 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24583 if (mode == V2DImode)
24585 switch (code)
24587 case EQ:
24588 /* SSE4.1 supports EQ. */
24589 if (!TARGET_SSE4_1)
24590 return NULL;
24591 break;
24593 case GT:
24594 case GTU:
24595 /* SSE4.2 supports GT/GTU. */
24596 if (!TARGET_SSE4_2)
24597 return NULL;
24598 break;
24600 default:
24601 gcc_unreachable ();
24605 /* Unsigned parallel compare is not supported by the hardware.
24606 Play some tricks to turn this into a signed comparison
24607 against 0. */
24608 if (code == GTU)
24610 cop0 = force_reg (mode, cop0);
24612 switch (mode)
24614 case E_V16SImode:
24615 case E_V8DImode:
24616 case E_V8SImode:
24617 case E_V4DImode:
24618 case E_V4SImode:
24619 case E_V2DImode:
24621 rtx t1, t2, mask;
24622 rtx (*gen_sub3) (rtx, rtx, rtx);
24624 switch (mode)
24626 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24627 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24628 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24629 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24630 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24631 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24632 default:
24633 gcc_unreachable ();
24635 /* Subtract (-(INT MAX) - 1) from both operands to make
24636 them signed. */
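/* E.g. for 32-bit elements, (unsigned) a > b is equivalent to
   (signed) (a - 0x80000000) > (b - 0x80000000): subtracting the
   sign-bit mask maps the unsigned range onto the signed range while
   preserving order.  */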
24637 mask = ix86_build_signbit_mask (mode, true, false);
24638 t1 = gen_reg_rtx (mode);
24639 emit_insn (gen_sub3 (t1, cop0, mask));
24641 t2 = gen_reg_rtx (mode);
24642 emit_insn (gen_sub3 (t2, cop1, mask));
24644 cop0 = t1;
24645 cop1 = t2;
24646 code = GT;
24648 break;
24650 case E_V64QImode:
24651 case E_V32HImode:
24652 case E_V32QImode:
24653 case E_V16HImode:
24654 case E_V16QImode:
24655 case E_V8HImode:
24656 /* Perform a parallel unsigned saturating subtraction. */
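/* There is no unsigned pcmpgt for QI/HI elements, but a >u b is
   equivalent to (a -us b) != 0 with saturating subtraction; the code
   below tests EQ against zero and toggles *negate to obtain the NE.  */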
24657 x = gen_reg_rtx (mode);
24658 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24659 cop1)));
24661 cop0 = x;
24662 cop1 = CONST0_RTX (mode);
24663 code = EQ;
24664 *negate = !*negate;
24665 break;
24667 default:
24668 gcc_unreachable ();
24673 if (*negate)
24674 std::swap (op_true, op_false);
24676 /* Allow the comparison to be done in one mode, but the movcc to
24677 happen in another mode. */
24678 if (data_mode == mode)
24680 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24681 op_true, op_false);
24683 else
24685 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24686 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24687 op_true, op_false);
24688 if (GET_MODE (x) == mode)
24689 x = gen_lowpart (data_mode, x);
24692 return x;
24695 /* Expand integer vector comparison. */
24697 bool
24698 ix86_expand_int_vec_cmp (rtx operands[])
24700 rtx_code code = GET_CODE (operands[1]);
24701 bool negate = false;
24702 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24703 operands[3], NULL, NULL, &negate);
24705 if (!cmp)
24706 return false;
24708 if (negate)
24709 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24710 CONST0_RTX (GET_MODE (cmp)),
24711 NULL, NULL, &negate);
24713 gcc_assert (!negate);
24715 if (operands[0] != cmp)
24716 emit_move_insn (operands[0], cmp);
24718 return true;
24721 /* Expand a floating-point vector conditional move; a vcond operation
24722 rather than a movcc operation. */
24724 bool
24725 ix86_expand_fp_vcond (rtx operands[])
24727 enum rtx_code code = GET_CODE (operands[3]);
24728 rtx cmp;
24730 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24731 &operands[4], &operands[5]);
24732 if (code == UNKNOWN)
24734 rtx temp;
24735 switch (GET_CODE (operands[3]))
24737 case LTGT:
24738 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24739 operands[5], operands[0], operands[0]);
24740 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24741 operands[5], operands[1], operands[2]);
24742 code = AND;
24743 break;
24744 case UNEQ:
24745 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24746 operands[5], operands[0], operands[0]);
24747 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24748 operands[5], operands[1], operands[2]);
24749 code = IOR;
24750 break;
24751 default:
24752 gcc_unreachable ();
24754 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24755 OPTAB_DIRECT);
24756 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24757 return true;
24760 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24761 operands[5], operands[1], operands[2]))
24762 return true;
24764 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24765 operands[1], operands[2]);
24766 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24767 return true;
24770 /* Expand a signed/unsigned integral vector conditional move. */
24772 bool
24773 ix86_expand_int_vcond (rtx operands[])
24775 machine_mode data_mode = GET_MODE (operands[0]);
24776 machine_mode mode = GET_MODE (operands[4]);
24777 enum rtx_code code = GET_CODE (operands[3]);
24778 bool negate = false;
24779 rtx x, cop0, cop1;
24781 cop0 = operands[4];
24782 cop1 = operands[5];
24784 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24785 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24786 if ((code == LT || code == GE)
24787 && data_mode == mode
24788 && cop1 == CONST0_RTX (mode)
24789 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24790 && GET_MODE_UNIT_SIZE (data_mode) > 1
24791 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24792 && (GET_MODE_SIZE (data_mode) == 16
24793 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24795 rtx negop = operands[2 - (code == LT)];
24796 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24797 if (negop == CONST1_RTX (data_mode))
24799 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24800 operands[0], 1, OPTAB_DIRECT);
24801 if (res != operands[0])
24802 emit_move_insn (operands[0], res);
24803 return true;
24805 else if (GET_MODE_INNER (data_mode) != DImode
24806 && vector_all_ones_operand (negop, data_mode))
24808 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24809 operands[0], 0, OPTAB_DIRECT);
24810 if (res != operands[0])
24811 emit_move_insn (operands[0], res);
24812 return true;
24816 if (!nonimmediate_operand (cop1, mode))
24817 cop1 = force_reg (mode, cop1);
24818 if (!general_operand (operands[1], data_mode))
24819 operands[1] = force_reg (data_mode, operands[1]);
24820 if (!general_operand (operands[2], data_mode))
24821 operands[2] = force_reg (data_mode, operands[2]);
24823 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24824 operands[1], operands[2], &negate);
24826 if (!x)
24827 return false;
24829 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24830 operands[2-negate]);
24831 return true;
24834 /* AVX512F does support 64-byte integer vector operations,
24835 thus the longest vector we are faced with is V64QImode. */
24836 #define MAX_VECT_LEN 64
24838 struct expand_vec_perm_d
24840 rtx target, op0, op1;
24841 unsigned char perm[MAX_VECT_LEN];
24842 machine_mode vmode;
24843 unsigned char nelt;
24844 bool one_operand_p;
24845 bool testing_p;
24848 static bool
24849 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24850 struct expand_vec_perm_d *d)
24852 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24853 expander, so args are either in d, or in op0, op1 etc. */
24854 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24855 machine_mode maskmode = mode;
24856 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24858 switch (mode)
24860 case E_V8HImode:
24861 if (TARGET_AVX512VL && TARGET_AVX512BW)
24862 gen = gen_avx512vl_vpermt2varv8hi3;
24863 break;
24864 case E_V16HImode:
24865 if (TARGET_AVX512VL && TARGET_AVX512BW)
24866 gen = gen_avx512vl_vpermt2varv16hi3;
24867 break;
24868 case E_V64QImode:
24869 if (TARGET_AVX512VBMI)
24870 gen = gen_avx512bw_vpermt2varv64qi3;
24871 break;
24872 case E_V32HImode:
24873 if (TARGET_AVX512BW)
24874 gen = gen_avx512bw_vpermt2varv32hi3;
24875 break;
24876 case E_V4SImode:
24877 if (TARGET_AVX512VL)
24878 gen = gen_avx512vl_vpermt2varv4si3;
24879 break;
24880 case E_V8SImode:
24881 if (TARGET_AVX512VL)
24882 gen = gen_avx512vl_vpermt2varv8si3;
24883 break;
24884 case E_V16SImode:
24885 if (TARGET_AVX512F)
24886 gen = gen_avx512f_vpermt2varv16si3;
24887 break;
24888 case E_V4SFmode:
24889 if (TARGET_AVX512VL)
24891 gen = gen_avx512vl_vpermt2varv4sf3;
24892 maskmode = V4SImode;
24894 break;
24895 case E_V8SFmode:
24896 if (TARGET_AVX512VL)
24898 gen = gen_avx512vl_vpermt2varv8sf3;
24899 maskmode = V8SImode;
24901 break;
24902 case E_V16SFmode:
24903 if (TARGET_AVX512F)
24905 gen = gen_avx512f_vpermt2varv16sf3;
24906 maskmode = V16SImode;
24908 break;
24909 case E_V2DImode:
24910 if (TARGET_AVX512VL)
24911 gen = gen_avx512vl_vpermt2varv2di3;
24912 break;
24913 case E_V4DImode:
24914 if (TARGET_AVX512VL)
24915 gen = gen_avx512vl_vpermt2varv4di3;
24916 break;
24917 case E_V8DImode:
24918 if (TARGET_AVX512F)
24919 gen = gen_avx512f_vpermt2varv8di3;
24920 break;
24921 case E_V2DFmode:
24922 if (TARGET_AVX512VL)
24924 gen = gen_avx512vl_vpermt2varv2df3;
24925 maskmode = V2DImode;
24927 break;
24928 case E_V4DFmode:
24929 if (TARGET_AVX512VL)
24931 gen = gen_avx512vl_vpermt2varv4df3;
24932 maskmode = V4DImode;
24934 break;
24935 case E_V8DFmode:
24936 if (TARGET_AVX512F)
24938 gen = gen_avx512f_vpermt2varv8df3;
24939 maskmode = V8DImode;
24941 break;
24942 default:
24943 break;
24946 if (gen == NULL)
24947 return false;
24949 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24950 expander, so args are either in d, or in op0, op1 etc. */
24951 if (d)
24953 rtx vec[64];
24954 target = d->target;
24955 op0 = d->op0;
24956 op1 = d->op1;
24957 for (int i = 0; i < d->nelt; ++i)
24958 vec[i] = GEN_INT (d->perm[i]);
24959 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24962 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24963 return true;
24966 /* Expand a variable vector permutation. */
24968 void
24969 ix86_expand_vec_perm (rtx operands[])
24971 rtx target = operands[0];
24972 rtx op0 = operands[1];
24973 rtx op1 = operands[2];
24974 rtx mask = operands[3];
24975 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24976 machine_mode mode = GET_MODE (op0);
24977 machine_mode maskmode = GET_MODE (mask);
24978 int w, e, i;
24979 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24981 /* Number of elements in the vector. */
24982 w = GET_MODE_NUNITS (mode);
24983 e = GET_MODE_UNIT_SIZE (mode);
24984 gcc_assert (w <= 64);
24986 if (TARGET_AVX512F && one_operand_shuffle)
24988 rtx (*gen) (rtx, rtx, rtx) = NULL;
24989 switch (mode)
24991 case E_V16SImode:
24992 gen = gen_avx512f_permvarv16si;
24993 break;
24994 case E_V16SFmode:
24995 gen = gen_avx512f_permvarv16sf;
24996 break;
24997 case E_V8DImode:
24998 gen = gen_avx512f_permvarv8di;
24999 break;
25000 case E_V8DFmode:
25001 gen = gen_avx512f_permvarv8df;
25002 break;
25003 default:
25004 break;
25006 if (gen != NULL)
25008 emit_insn (gen (target, op0, mask));
25009 return;
25013 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
25014 return;
25016 if (TARGET_AVX2)
25018 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25020 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25021 a constant shuffle operand. With a tiny bit of effort we can
25022 use VPERMD instead. A re-interpretation stall for V4DFmode is
25023 unfortunate but there's no avoiding it.
25024 Similarly, for V16HImode we don't have instructions for variable
25025 shuffling, while for V32QImode we can, after preparing suitable
25026 masks, use vpshufb; vpshufb; vpermq; vpor. */
25028 if (mode == V16HImode)
25030 maskmode = mode = V32QImode;
25031 w = 32;
25032 e = 1;
25034 else
25036 maskmode = mode = V8SImode;
25037 w = 8;
25038 e = 4;
25040 t1 = gen_reg_rtx (maskmode);
25042 /* Replicate the low bits of the V4DImode mask into V8SImode:
25043 mask = { A B C D }
25044 t1 = { A A B B C C D D }. */
25045 for (i = 0; i < w / 2; ++i)
25046 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25047 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25048 vt = force_reg (maskmode, vt);
25049 mask = gen_lowpart (maskmode, mask);
25050 if (maskmode == V8SImode)
25051 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25052 else
25053 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25055 /* Multiply the shuffle indices by two. */
25056 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25057 OPTAB_DIRECT);
25059 /* Add one to the odd shuffle indices:
25060 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25061 for (i = 0; i < w / 2; ++i)
25063 vec[i * 2] = const0_rtx;
25064 vec[i * 2 + 1] = const1_rtx;
25066 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25067 vt = validize_mem (force_const_mem (maskmode, vt));
25068 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25069 OPTAB_DIRECT);
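/* Each original wide index i has now been expanded into the pair
   { 2*i, 2*i + 1 }, selecting the two halves of the corresponding
   64-bit (resp. 16-bit) element, so the shuffle can proceed in
   V8SImode (resp. V32QImode).  */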
25071 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25072 operands[3] = mask = t1;
25073 target = gen_reg_rtx (mode);
25074 op0 = gen_lowpart (mode, op0);
25075 op1 = gen_lowpart (mode, op1);
25078 switch (mode)
25080 case E_V8SImode:
25081 /* The VPERMD and VPERMPS instructions already properly ignore
25082 the high bits of the shuffle elements. No need for us to
25083 perform an AND ourselves. */
25084 if (one_operand_shuffle)
25086 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25087 if (target != operands[0])
25088 emit_move_insn (operands[0],
25089 gen_lowpart (GET_MODE (operands[0]), target));
25091 else
25093 t1 = gen_reg_rtx (V8SImode);
25094 t2 = gen_reg_rtx (V8SImode);
25095 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25096 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25097 goto merge_two;
25099 return;
25101 case E_V8SFmode:
25102 mask = gen_lowpart (V8SImode, mask);
25103 if (one_operand_shuffle)
25104 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25105 else
25107 t1 = gen_reg_rtx (V8SFmode);
25108 t2 = gen_reg_rtx (V8SFmode);
25109 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25110 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25111 goto merge_two;
25113 return;
25115 case E_V4SImode:
25116 /* By combining the two 128-bit input vectors into one 256-bit
25117 input vector, we can use VPERMD and VPERMPS for the full
25118 two-operand shuffle. */
25119 t1 = gen_reg_rtx (V8SImode);
25120 t2 = gen_reg_rtx (V8SImode);
25121 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25122 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25123 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25124 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25125 return;
25127 case E_V4SFmode:
25128 t1 = gen_reg_rtx (V8SFmode);
25129 t2 = gen_reg_rtx (V8SImode);
25130 mask = gen_lowpart (V4SImode, mask);
25131 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25132 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25133 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25134 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25135 return;
25137 case E_V32QImode:
25138 t1 = gen_reg_rtx (V32QImode);
25139 t2 = gen_reg_rtx (V32QImode);
25140 t3 = gen_reg_rtx (V32QImode);
25141 vt2 = GEN_INT (-128);
25142 vt = gen_const_vec_duplicate (V32QImode, vt2);
25143 vt = force_reg (V32QImode, vt);
25144 for (i = 0; i < 32; i++)
25145 vec[i] = i < 16 ? vt2 : const0_rtx;
25146 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25147 vt2 = force_reg (V32QImode, vt2);
25148 /* From mask create two adjusted masks, which contain the same
25149 bits as mask in the low 7 bits of each vector element.
25150 The first mask will have the most significant bit clear
25151 if it requests element from the same 128-bit lane
25152 and MSB set if it requests element from the other 128-bit lane.
25153 The second mask will have the opposite values of the MSB,
25154 and additionally will have its 128-bit lanes swapped.
25155 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25156 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25157 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25158 stands for other 12 bytes. */
25159 /* The bit that says whether an element is from the same lane or the
25160 other lane is bit 4, so shift it up by 3 to the MSB position. */
25161 t5 = gen_reg_rtx (V4DImode);
25162 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25163 GEN_INT (3)));
25164 /* Clear MSB bits from the mask just in case it had them set. */
25165 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25166 /* After this t1 will have MSB set for elements from other lane. */
25167 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25168 /* Clear bits other than MSB. */
25169 emit_insn (gen_andv32qi3 (t1, t1, vt));
25170 /* Or in the lower bits from mask into t3. */
25171 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25172 /* And invert MSB bits in t1, so MSB is set for elements from the same
25173 lane. */
25174 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25175 /* Swap 128-bit lanes in t3. */
25176 t6 = gen_reg_rtx (V4DImode);
25177 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25178 const2_rtx, GEN_INT (3),
25179 const0_rtx, const1_rtx));
25180 /* And or in the lower bits from mask into t1. */
25181 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25182 if (one_operand_shuffle)
25184 /* Each of these shuffles will put 0s in places where
25185 element from the other 128-bit lane is needed, otherwise
25186 will shuffle in the requested value. */
25187 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25188 gen_lowpart (V32QImode, t6)));
25189 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25190 /* For t3 the 128-bit lanes are swapped again. */
25191 t7 = gen_reg_rtx (V4DImode);
25192 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25193 const2_rtx, GEN_INT (3),
25194 const0_rtx, const1_rtx));
25195 /* And oring both together leads to the result. */
25196 emit_insn (gen_iorv32qi3 (target, t1,
25197 gen_lowpart (V32QImode, t7)));
25198 if (target != operands[0])
25199 emit_move_insn (operands[0],
25200 gen_lowpart (GET_MODE (operands[0]), target));
25201 return;
25204 t4 = gen_reg_rtx (V32QImode);
25205 /* Similar to the one_operand_shuffle code above, just
25206 repeated twice, once for each operand. The merge_two:
25207 code below will merge the two results together. */
25208 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25209 gen_lowpart (V32QImode, t6)));
25210 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25211 gen_lowpart (V32QImode, t6)));
25212 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25213 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25214 t7 = gen_reg_rtx (V4DImode);
25215 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25216 const2_rtx, GEN_INT (3),
25217 const0_rtx, const1_rtx));
25218 t8 = gen_reg_rtx (V4DImode);
25219 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25220 const2_rtx, GEN_INT (3),
25221 const0_rtx, const1_rtx));
25222 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25223 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25224 t1 = t4;
25225 t2 = t3;
25226 goto merge_two;
25228 default:
25229 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25230 break;
25234 if (TARGET_XOP)
25236 /* The XOP VPPERM insn supports three inputs. By ignoring the
25237 one_operand_shuffle special case, we avoid creating another
25238 set of constant vectors in memory. */
25239 one_operand_shuffle = false;
25241 /* mask = mask & {2*w-1, ...} */
25242 vt = GEN_INT (2*w - 1);
25244 else
25246 /* mask = mask & {w-1, ...} */
25247 vt = GEN_INT (w - 1);
25250 vt = gen_const_vec_duplicate (maskmode, vt);
25251 mask = expand_simple_binop (maskmode, AND, mask, vt,
25252 NULL_RTX, 0, OPTAB_DIRECT);
25254 /* For non-QImode operations, convert the word permutation control
25255 into a byte permutation control. */
25256 if (mode != V16QImode)
25258 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25259 GEN_INT (exact_log2 (e)),
25260 NULL_RTX, 0, OPTAB_DIRECT);
25262 /* Convert mask to vector of chars. */
25263 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25265 /* Replicate each of the input bytes into byte positions:
25266 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25267 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25268 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25269 for (i = 0; i < 16; ++i)
25270 vec[i] = GEN_INT (i/e * e);
25271 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25272 vt = validize_mem (force_const_mem (V16QImode, vt));
25273 if (TARGET_XOP)
25274 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25275 else
25276 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25278 /* Convert it into the byte positions by doing
25279 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25280 for (i = 0; i < 16; ++i)
25281 vec[i] = GEN_INT (i % e);
25282 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25283 vt = validize_mem (force_const_mem (V16QImode, vt));
25284 emit_insn (gen_addv16qi3 (mask, mask, vt));
25287 /* The actual shuffle operations all operate on V16QImode. */
25288 op0 = gen_lowpart (V16QImode, op0);
25289 op1 = gen_lowpart (V16QImode, op1);
25291 if (TARGET_XOP)
25293 if (GET_MODE (target) != V16QImode)
25294 target = gen_reg_rtx (V16QImode);
25295 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25296 if (target != operands[0])
25297 emit_move_insn (operands[0],
25298 gen_lowpart (GET_MODE (operands[0]), target));
25300 else if (one_operand_shuffle)
25302 if (GET_MODE (target) != V16QImode)
25303 target = gen_reg_rtx (V16QImode);
25304 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25305 if (target != operands[0])
25306 emit_move_insn (operands[0],
25307 gen_lowpart (GET_MODE (operands[0]), target));
25309 else
25311 rtx xops[6];
25312 bool ok;
25314 /* Shuffle the two input vectors independently. */
25315 t1 = gen_reg_rtx (V16QImode);
25316 t2 = gen_reg_rtx (V16QImode);
25317 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25318 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25320 merge_two:
25321 /* Then merge them together. The key is whether any given control
25322 element contained a bit set that indicates the second word. */
25323 mask = operands[3];
25324 vt = GEN_INT (w);
25325 if (maskmode == V2DImode && !TARGET_SSE4_1)
25327 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25328 more shuffle to convert the V2DI input mask into a V4SI
25329 input mask. At that point the masking that expand_int_vcond
25330 performs will work as desired. */
25331 rtx t3 = gen_reg_rtx (V4SImode);
25332 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25333 const0_rtx, const0_rtx,
25334 const2_rtx, const2_rtx));
25335 mask = t3;
25336 maskmode = V4SImode;
25337 e = w = 4;
25340 vt = gen_const_vec_duplicate (maskmode, vt);
25341 vt = force_reg (maskmode, vt);
25342 mask = expand_simple_binop (maskmode, AND, mask, vt,
25343 NULL_RTX, 0, OPTAB_DIRECT);
25345 if (GET_MODE (target) != mode)
25346 target = gen_reg_rtx (mode);
25347 xops[0] = target;
25348 xops[1] = gen_lowpart (mode, t2);
25349 xops[2] = gen_lowpart (mode, t1);
25350 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25351 xops[4] = mask;
25352 xops[5] = vt;
25353 ok = ix86_expand_int_vcond (xops);
25354 gcc_assert (ok);
25355 if (target != operands[0])
25356 emit_move_insn (operands[0],
25357 gen_lowpart (GET_MODE (operands[0]), target));
25361 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25362 true if we should do zero extension, else sign extension. HIGH_P is
25363 true if we want the N/2 high elements, else the low elements. */
25365 void
25366 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25368 machine_mode imode = GET_MODE (src);
25369 rtx tmp;
25371 if (TARGET_SSE4_1)
25373 rtx (*unpack)(rtx, rtx);
25374 rtx (*extract)(rtx, rtx) = NULL;
25375 machine_mode halfmode = BLKmode;
25377 switch (imode)
25379 case E_V64QImode:
25380 if (unsigned_p)
25381 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25382 else
25383 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25384 halfmode = V32QImode;
25385 extract
25386 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25387 break;
25388 case E_V32QImode:
25389 if (unsigned_p)
25390 unpack = gen_avx2_zero_extendv16qiv16hi2;
25391 else
25392 unpack = gen_avx2_sign_extendv16qiv16hi2;
25393 halfmode = V16QImode;
25394 extract
25395 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25396 break;
25397 case E_V32HImode:
25398 if (unsigned_p)
25399 unpack = gen_avx512f_zero_extendv16hiv16si2;
25400 else
25401 unpack = gen_avx512f_sign_extendv16hiv16si2;
25402 halfmode = V16HImode;
25403 extract
25404 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25405 break;
25406 case E_V16HImode:
25407 if (unsigned_p)
25408 unpack = gen_avx2_zero_extendv8hiv8si2;
25409 else
25410 unpack = gen_avx2_sign_extendv8hiv8si2;
25411 halfmode = V8HImode;
25412 extract
25413 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25414 break;
25415 case E_V16SImode:
25416 if (unsigned_p)
25417 unpack = gen_avx512f_zero_extendv8siv8di2;
25418 else
25419 unpack = gen_avx512f_sign_extendv8siv8di2;
25420 halfmode = V8SImode;
25421 extract
25422 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25423 break;
25424 case E_V8SImode:
25425 if (unsigned_p)
25426 unpack = gen_avx2_zero_extendv4siv4di2;
25427 else
25428 unpack = gen_avx2_sign_extendv4siv4di2;
25429 halfmode = V4SImode;
25430 extract
25431 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25432 break;
25433 case E_V16QImode:
25434 if (unsigned_p)
25435 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25436 else
25437 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25438 break;
25439 case E_V8HImode:
25440 if (unsigned_p)
25441 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25442 else
25443 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25444 break;
25445 case E_V4SImode:
25446 if (unsigned_p)
25447 unpack = gen_sse4_1_zero_extendv2siv2di2;
25448 else
25449 unpack = gen_sse4_1_sign_extendv2siv2di2;
25450 break;
25451 default:
25452 gcc_unreachable ();
25455 if (GET_MODE_SIZE (imode) >= 32)
25457 tmp = gen_reg_rtx (halfmode);
25458 emit_insn (extract (tmp, src));
25460 else if (high_p)
25462 /* Shift higher 8 bytes to lower 8 bytes. */
25463 tmp = gen_reg_rtx (V1TImode);
25464 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25465 GEN_INT (64)));
25466 tmp = gen_lowpart (imode, tmp);
25468 else
25469 tmp = src;
25471 emit_insn (unpack (dest, tmp));
25473 else
25475 rtx (*unpack)(rtx, rtx, rtx);
25477 switch (imode)
25479 case E_V16QImode:
25480 if (high_p)
25481 unpack = gen_vec_interleave_highv16qi;
25482 else
25483 unpack = gen_vec_interleave_lowv16qi;
25484 break;
25485 case E_V8HImode:
25486 if (high_p)
25487 unpack = gen_vec_interleave_highv8hi;
25488 else
25489 unpack = gen_vec_interleave_lowv8hi;
25490 break;
25491 case E_V4SImode:
25492 if (high_p)
25493 unpack = gen_vec_interleave_highv4si;
25494 else
25495 unpack = gen_vec_interleave_lowv4si;
25496 break;
25497 default:
25498 gcc_unreachable ();
25501 if (unsigned_p)
25502 tmp = force_reg (imode, CONST0_RTX (imode));
25503 else
25504 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25505 src, pc_rtx, pc_rtx);
25507 rtx tmp2 = gen_reg_rtx (imode);
25508 emit_insn (unpack (tmp2, src, tmp));
25509 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25513 /* Expand conditional increment or decrement using adc/sbb instructions.
25514 The default case using setcc followed by the conditional move can be
25515 done by generic code. */
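/* For illustration, a typical resulting sequence: for unsigned operands,
   "x = y + (a < b)" first emits a compare that leaves the carry flag set
   exactly when a < b, followed by a single add-with-carry ("adc") of Y
   and 0 into X; the decrement form uses "sbb" instead.  */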
25516 bool
25517 ix86_expand_int_addcc (rtx operands[])
25519 enum rtx_code code = GET_CODE (operands[1]);
25520 rtx flags;
25521 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25522 rtx compare_op;
25523 rtx val = const0_rtx;
25524 bool fpcmp = false;
25525 machine_mode mode;
25526 rtx op0 = XEXP (operands[1], 0);
25527 rtx op1 = XEXP (operands[1], 1);
25529 if (operands[3] != const1_rtx
25530 && operands[3] != constm1_rtx)
25531 return false;
25532 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25533 return false;
25534 code = GET_CODE (compare_op);
25536 flags = XEXP (compare_op, 0);
25538 if (GET_MODE (flags) == CCFPmode)
25540 fpcmp = true;
25541 code = ix86_fp_compare_code_to_integer (code);
25544 if (code != LTU)
25546 val = constm1_rtx;
25547 if (fpcmp)
25548 PUT_CODE (compare_op,
25549 reverse_condition_maybe_unordered
25550 (GET_CODE (compare_op)));
25551 else
25552 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25555 mode = GET_MODE (operands[0]);
25557 /* Construct either adc or sbb insn. */
25558 if ((code == LTU) == (operands[3] == constm1_rtx))
25560 switch (mode)
25562 case E_QImode:
25563 insn = gen_subqi3_carry;
25564 break;
25565 case E_HImode:
25566 insn = gen_subhi3_carry;
25567 break;
25568 case E_SImode:
25569 insn = gen_subsi3_carry;
25570 break;
25571 case E_DImode:
25572 insn = gen_subdi3_carry;
25573 break;
25574 default:
25575 gcc_unreachable ();
25578 else
25580 switch (mode)
25582 case E_QImode:
25583 insn = gen_addqi3_carry;
25584 break;
25585 case E_HImode:
25586 insn = gen_addhi3_carry;
25587 break;
25588 case E_SImode:
25589 insn = gen_addsi3_carry;
25590 break;
25591 case E_DImode:
25592 insn = gen_adddi3_carry;
25593 break;
25594 default:
25595 gcc_unreachable ();
25598 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25600 return true;
25604 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25605 but works for floating point parameters and non-offsettable memories.
25606 For pushes, it returns just stack offsets; the values will be saved
25607 in the right order. Maximally four parts are generated. */
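/* For illustration: on ia32 a DFmode operand is split into two SImode
   parts (8 / 4 == 2), an XFmode operand into three, and a TFmode operand
   into four; on x86-64 an XFmode operand becomes a DImode part plus an
   SImode upper part.  */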
25609 static int
25610 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25612 int size;
25614 if (!TARGET_64BIT)
25615 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25616 else
25617 size = (GET_MODE_SIZE (mode) + 4) / 8;
25619 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25620 gcc_assert (size >= 2 && size <= 4);
25622 /* Optimize constant pool reference to immediates. This is used by fp
25623 moves, that force all constants to memory to allow combining. */
25624 if (MEM_P (operand) && MEM_READONLY_P (operand))
25625 operand = avoid_constant_pool_reference (operand);
25627 if (MEM_P (operand) && !offsettable_memref_p (operand))
25629 /* The only non-offsettable memories we handle are pushes. */
25630 int ok = push_operand (operand, VOIDmode);
25632 gcc_assert (ok);
25634 operand = copy_rtx (operand);
25635 PUT_MODE (operand, word_mode);
25636 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25637 return size;
25640 if (GET_CODE (operand) == CONST_VECTOR)
25642 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25643 /* Caution: if we looked through a constant pool memory above,
25644 the operand may actually have a different mode now. That's
25645 ok, since we want to pun this all the way back to an integer. */
25646 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25647 gcc_assert (operand != NULL);
25648 mode = imode;
25651 if (!TARGET_64BIT)
25653 if (mode == DImode)
25654 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25655 else
25657 int i;
25659 if (REG_P (operand))
25661 gcc_assert (reload_completed);
25662 for (i = 0; i < size; i++)
25663 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25665 else if (offsettable_memref_p (operand))
25667 operand = adjust_address (operand, SImode, 0);
25668 parts[0] = operand;
25669 for (i = 1; i < size; i++)
25670 parts[i] = adjust_address (operand, SImode, 4 * i);
25672 else if (CONST_DOUBLE_P (operand))
25674 const REAL_VALUE_TYPE *r;
25675 long l[4];
25677 r = CONST_DOUBLE_REAL_VALUE (operand);
25678 switch (mode)
25680 case E_TFmode:
25681 real_to_target (l, r, mode);
25682 parts[3] = gen_int_mode (l[3], SImode);
25683 parts[2] = gen_int_mode (l[2], SImode);
25684 break;
25685 case E_XFmode:
25686 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25687 long double may not be 80-bit. */
25688 real_to_target (l, r, mode);
25689 parts[2] = gen_int_mode (l[2], SImode);
25690 break;
25691 case E_DFmode:
25692 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25693 break;
25694 default:
25695 gcc_unreachable ();
25697 parts[1] = gen_int_mode (l[1], SImode);
25698 parts[0] = gen_int_mode (l[0], SImode);
25700 else
25701 gcc_unreachable ();
25704 else
25706 if (mode == TImode)
25707 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25708 if (mode == XFmode || mode == TFmode)
25710 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25711 if (REG_P (operand))
25713 gcc_assert (reload_completed);
25714 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25715 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25717 else if (offsettable_memref_p (operand))
25719 operand = adjust_address (operand, DImode, 0);
25720 parts[0] = operand;
25721 parts[1] = adjust_address (operand, upper_mode, 8);
25723 else if (CONST_DOUBLE_P (operand))
25725 long l[4];
25727 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25729 /* real_to_target puts 32-bit pieces in each long. */
25730 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25731 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25732 << 32), DImode);
25734 if (upper_mode == SImode)
25735 parts[1] = gen_int_mode (l[2], SImode);
25736 else
25737 parts[1]
25738 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25739 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25740 << 32), DImode);
25742 else
25743 gcc_unreachable ();
25747 return size;
25750 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25751 The value is split into word-sized parts: operands 2-5 receive the
25752 destination parts and operands 6-9 the corresponding source parts,
25753 in the correct order, and the moves are then emitted directly. */
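/* For illustration: on ia32 a DImode register <-> memory move is split
   into two SImode moves, ordered so that an address register used by the
   source is not clobbered before it has been read; an XFmode push becomes
   three word-sized pushes, preceded by an extra 4-byte stack adjustment
   when TARGET_128BIT_LONG_DOUBLE padding is in effect.  */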
25755 void
25756 ix86_split_long_move (rtx operands[])
25758 rtx part[2][4];
25759 int nparts, i, j;
25760 int push = 0;
25761 int collisions = 0;
25762 machine_mode mode = GET_MODE (operands[0]);
25763 bool collisionparts[4];
25765 /* The DFmode expanders may ask us to move a double.
25766 For a 64-bit target this is a single move. By hiding that fact
25767 here we simplify the i386.md splitters. */
25768 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25770 /* Optimize constant pool reference to immediates. This is used by
25771 fp moves, that force all constants to memory to allow combining. */
25773 if (MEM_P (operands[1])
25774 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25775 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25776 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25777 if (push_operand (operands[0], VOIDmode))
25779 operands[0] = copy_rtx (operands[0]);
25780 PUT_MODE (operands[0], word_mode);
25782 else
25783 operands[0] = gen_lowpart (DImode, operands[0]);
25784 operands[1] = gen_lowpart (DImode, operands[1]);
25785 emit_move_insn (operands[0], operands[1]);
25786 return;
25789 /* The only non-offsettable memory we handle is push. */
25790 if (push_operand (operands[0], VOIDmode))
25791 push = 1;
25792 else
25793 gcc_assert (!MEM_P (operands[0])
25794 || offsettable_memref_p (operands[0]));
25796 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25797 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25799 /* When emitting push, take care for source operands on the stack. */
25800 if (push && MEM_P (operands[1])
25801 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25803 rtx src_base = XEXP (part[1][nparts - 1], 0);
25805 /* Compensate for the stack decrement by 4. */
25806 if (!TARGET_64BIT && nparts == 3
25807 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25808 src_base = plus_constant (Pmode, src_base, 4);
25810 /* src_base refers to the stack pointer and is
25811 automatically decreased by emitted push. */
25812 for (i = 0; i < nparts; i++)
25813 part[1][i] = change_address (part[1][i],
25814 GET_MODE (part[1][i]), src_base);
25817 /* We need to do copy in the right order in case an address register
25818 of the source overlaps the destination. */
25819 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25821 rtx tmp;
25823 for (i = 0; i < nparts; i++)
25825 collisionparts[i]
25826 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25827 if (collisionparts[i])
25828 collisions++;
25831 /* Collision in the middle part can be handled by reordering. */
25832 if (collisions == 1 && nparts == 3 && collisionparts [1])
25834 std::swap (part[0][1], part[0][2]);
25835 std::swap (part[1][1], part[1][2]);
25837 else if (collisions == 1
25838 && nparts == 4
25839 && (collisionparts [1] || collisionparts [2]))
25841 if (collisionparts [1])
25843 std::swap (part[0][1], part[0][2]);
25844 std::swap (part[1][1], part[1][2]);
25846 else
25848 std::swap (part[0][2], part[0][3]);
25849 std::swap (part[1][2], part[1][3]);
25853 /* If there are more collisions, we can't handle it by reordering.
25854 Do an lea to the last part and use only one colliding move. */
25855 else if (collisions > 1)
25857 rtx base, addr;
25859 collisions = 1;
25861 base = part[0][nparts - 1];
25863 /* Handle the case when the last part isn't valid for lea.
25864 Happens in 64-bit mode storing the 12-byte XFmode. */
25865 if (GET_MODE (base) != Pmode)
25866 base = gen_rtx_REG (Pmode, REGNO (base));
25868 addr = XEXP (part[1][0], 0);
25869 if (TARGET_TLS_DIRECT_SEG_REFS)
25871 struct ix86_address parts;
25872 int ok = ix86_decompose_address (addr, &parts);
25873 gcc_assert (ok);
25874 /* It is not valid to use %gs: or %fs: in lea. */
25875 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25877 emit_insn (gen_rtx_SET (base, addr));
25878 part[1][0] = replace_equiv_address (part[1][0], base);
25879 for (i = 1; i < nparts; i++)
25881 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25882 part[1][i] = replace_equiv_address (part[1][i], tmp);
25887 if (push)
25889 if (!TARGET_64BIT)
25891 if (nparts == 3)
25893 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25894 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25895 stack_pointer_rtx, GEN_INT (-4)));
25896 emit_move_insn (part[0][2], part[1][2]);
25898 else if (nparts == 4)
25900 emit_move_insn (part[0][3], part[1][3]);
25901 emit_move_insn (part[0][2], part[1][2]);
25904 else
25906 /* In 64-bit mode we don't have a 32-bit push available. If the part is
25907 a register, that is OK - we will just use the larger counterpart. We
25908 also retype memory - this comes from an attempt to avoid the REX
25909 prefix when moving the second half of a TFmode value. */
25910 if (GET_MODE (part[1][1]) == SImode)
25912 switch (GET_CODE (part[1][1]))
25914 case MEM:
25915 part[1][1] = adjust_address (part[1][1], DImode, 0);
25916 break;
25918 case REG:
25919 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25920 break;
25922 default:
25923 gcc_unreachable ();
25926 if (GET_MODE (part[1][0]) == SImode)
25927 part[1][0] = part[1][1];
25930 emit_move_insn (part[0][1], part[1][1]);
25931 emit_move_insn (part[0][0], part[1][0]);
25932 return;
25935 /* Choose correct order to not overwrite the source before it is copied. */
25936 if ((REG_P (part[0][0])
25937 && REG_P (part[1][1])
25938 && (REGNO (part[0][0]) == REGNO (part[1][1])
25939 || (nparts == 3
25940 && REGNO (part[0][0]) == REGNO (part[1][2]))
25941 || (nparts == 4
25942 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25943 || (collisions > 0
25944 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25946 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25948 operands[2 + i] = part[0][j];
25949 operands[6 + i] = part[1][j];
25952 else
25954 for (i = 0; i < nparts; i++)
25956 operands[2 + i] = part[0][i];
25957 operands[6 + i] = part[1][i];
25961 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25962 if (optimize_insn_for_size_p ())
25964 for (j = 0; j < nparts - 1; j++)
25965 if (CONST_INT_P (operands[6 + j])
25966 && operands[6 + j] != const0_rtx
25967 && REG_P (operands[2 + j]))
25968 for (i = j; i < nparts - 1; i++)
25969 if (CONST_INT_P (operands[7 + i])
25970 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25971 operands[7 + i] = operands[2 + j];
25974 for (i = 0; i < nparts; i++)
25975 emit_move_insn (operands[2 + i], operands[6 + i]);
25977 return;
25980 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25981 left shift by a constant, either using a single shift or
25982 a sequence of add instructions. */
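/* For illustration: a left shift by 1 is always emitted as a single
   self-add (ADD reg, reg); a shift by 2 becomes two self-adds when two
   adds are no more expensive than one shift by a constant according to
   ix86_cost and we are not optimizing for size; otherwise a plain shift
   by the constant is used.  */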
25984 static void
25985 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25987 rtx (*insn)(rtx, rtx, rtx);
25989 if (count == 1
25990 || (count * ix86_cost->add <= ix86_cost->shift_const
25991 && !optimize_insn_for_size_p ()))
25993 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25994 while (count-- > 0)
25995 emit_insn (insn (operand, operand, operand));
25997 else
25999 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26000 emit_insn (insn (operand, operand, GEN_INT (count)));
26004 void
26005 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26007 rtx (*gen_ashl3)(rtx, rtx, rtx);
26008 rtx (*gen_shld)(rtx, rtx, rtx);
26009 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26011 rtx low[2], high[2];
26012 int count;
26014 if (CONST_INT_P (operands[2]))
26016 split_double_mode (mode, operands, 2, low, high);
26017 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26019 if (count >= half_width)
26021 emit_move_insn (high[0], low[1]);
26022 emit_move_insn (low[0], const0_rtx);
26024 if (count > half_width)
26025 ix86_expand_ashl_const (high[0], count - half_width, mode);
26027 else
26029 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26031 if (!rtx_equal_p (operands[0], operands[1]))
26032 emit_move_insn (operands[0], operands[1]);
26034 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26035 ix86_expand_ashl_const (low[0], count, mode);
26037 return;
26040 split_double_mode (mode, operands, 1, low, high);
26042 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26044 if (operands[1] == const1_rtx)
26046 /* Assuming we've chosen QImode-capable registers, then 1 << N
26047 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26048 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26050 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26052 ix86_expand_clear (low[0]);
26053 ix86_expand_clear (high[0]);
26054 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26056 d = gen_lowpart (QImode, low[0]);
26057 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26058 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26059 emit_insn (gen_rtx_SET (d, s));
26061 d = gen_lowpart (QImode, high[0]);
26062 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26063 s = gen_rtx_NE (QImode, flags, const0_rtx);
26064 emit_insn (gen_rtx_SET (d, s));
26067 /* Otherwise, we can get the same results by manually performing
26068 a bit extract operation on bit 5/6, and then performing the two
26069 shifts. The two methods of getting 0/1 into low/high are exactly
26070 the same size. Avoiding the shift in the bit extract case helps
26071 pentium4 a bit; no one else seems to care much either way. */
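/* Illustrative sketch for the DImode case (bits == 5): high = (N >> 5) & 1
   is 1 exactly when the single set bit belongs in the upper half (N >= 32),
   low = high ^ 1 selects the lower half, and the final pair of 32-bit
   shifts by N then moves the bit into position within the selected half.  */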
26072 else
26074 machine_mode half_mode;
26075 rtx (*gen_lshr3)(rtx, rtx, rtx);
26076 rtx (*gen_and3)(rtx, rtx, rtx);
26077 rtx (*gen_xor3)(rtx, rtx, rtx);
26078 HOST_WIDE_INT bits;
26079 rtx x;
26081 if (mode == DImode)
26083 half_mode = SImode;
26084 gen_lshr3 = gen_lshrsi3;
26085 gen_and3 = gen_andsi3;
26086 gen_xor3 = gen_xorsi3;
26087 bits = 5;
26089 else
26091 half_mode = DImode;
26092 gen_lshr3 = gen_lshrdi3;
26093 gen_and3 = gen_anddi3;
26094 gen_xor3 = gen_xordi3;
26095 bits = 6;
26098 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26099 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26100 else
26101 x = gen_lowpart (half_mode, operands[2]);
26102 emit_insn (gen_rtx_SET (high[0], x));
26104 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26105 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26106 emit_move_insn (low[0], high[0]);
26107 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26110 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26111 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26112 return;
26115 if (operands[1] == constm1_rtx)
26117 /* For -1 << N, we can avoid the shld instruction, because we
26118 know that we're shifting 0...31/63 ones into a -1. */
26119 emit_move_insn (low[0], constm1_rtx);
26120 if (optimize_insn_for_size_p ())
26121 emit_move_insn (high[0], low[0]);
26122 else
26123 emit_move_insn (high[0], constm1_rtx);
26125 else
26127 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26129 if (!rtx_equal_p (operands[0], operands[1]))
26130 emit_move_insn (operands[0], operands[1]);
26132 split_double_mode (mode, operands, 1, low, high);
26133 emit_insn (gen_shld (high[0], low[0], operands[2]));
26136 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26138 if (TARGET_CMOVE && scratch)
26140 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26141 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26143 ix86_expand_clear (scratch);
26144 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26146 else
26148 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26149 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26151 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
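/* Split a double-word arithmetic shift right (DImode on 32-bit targets,
   TImode on 64-bit targets) into operations on the two half-word
   registers.  For variable counts, SCRATCH together with TARGET_CMOVE is
   used to fix up the result when the count reaches or exceeds the half
   width; the high half is sign-filled in that case.  */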
26155 void
26156 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26158 rtx (*gen_ashr3)(rtx, rtx, rtx)
26159 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26160 rtx (*gen_shrd)(rtx, rtx, rtx);
26161 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26163 rtx low[2], high[2];
26164 int count;
26166 if (CONST_INT_P (operands[2]))
26168 split_double_mode (mode, operands, 2, low, high);
26169 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26171 if (count == GET_MODE_BITSIZE (mode) - 1)
26173 emit_move_insn (high[0], high[1]);
26174 emit_insn (gen_ashr3 (high[0], high[0],
26175 GEN_INT (half_width - 1)));
26176 emit_move_insn (low[0], high[0]);
26179 else if (count >= half_width)
26181 emit_move_insn (low[0], high[1]);
26182 emit_move_insn (high[0], low[0]);
26183 emit_insn (gen_ashr3 (high[0], high[0],
26184 GEN_INT (half_width - 1)));
26186 if (count > half_width)
26187 emit_insn (gen_ashr3 (low[0], low[0],
26188 GEN_INT (count - half_width)));
26190 else
26192 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26194 if (!rtx_equal_p (operands[0], operands[1]))
26195 emit_move_insn (operands[0], operands[1]);
26197 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26198 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26201 else
26203 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26205 if (!rtx_equal_p (operands[0], operands[1]))
26206 emit_move_insn (operands[0], operands[1]);
26208 split_double_mode (mode, operands, 1, low, high);
26210 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26211 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26213 if (TARGET_CMOVE && scratch)
26215 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26216 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26218 emit_move_insn (scratch, high[0]);
26219 emit_insn (gen_ashr3 (scratch, scratch,
26220 GEN_INT (half_width - 1)));
26221 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26222 scratch));
26224 else
26226 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26227 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26229 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
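/* Split a double-word logical shift right into operations on the two
   half-word registers.  Unlike the arithmetic variant, the vacated high
   half is simply cleared when the count reaches or exceeds the half
   width.  */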
26234 void
26235 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26237 rtx (*gen_lshr3)(rtx, rtx, rtx)
26238 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26239 rtx (*gen_shrd)(rtx, rtx, rtx);
26240 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26242 rtx low[2], high[2];
26243 int count;
26245 if (CONST_INT_P (operands[2]))
26247 split_double_mode (mode, operands, 2, low, high);
26248 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26250 if (count >= half_width)
26252 emit_move_insn (low[0], high[1]);
26253 ix86_expand_clear (high[0]);
26255 if (count > half_width)
26256 emit_insn (gen_lshr3 (low[0], low[0],
26257 GEN_INT (count - half_width)));
26259 else
26261 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26263 if (!rtx_equal_p (operands[0], operands[1]))
26264 emit_move_insn (operands[0], operands[1]);
26266 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26267 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26270 else
26272 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26274 if (!rtx_equal_p (operands[0], operands[1]))
26275 emit_move_insn (operands[0], operands[1]);
26277 split_double_mode (mode, operands, 1, low, high);
26279 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26280 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26282 if (TARGET_CMOVE && scratch)
26284 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26285 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26287 ix86_expand_clear (scratch);
26288 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26289 scratch));
26291 else
26293 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26294 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26296 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26301 /* Predict just emitted jump instruction to be taken with probability PROB. */
26302 static void
26303 predict_jump (int prob)
26305 rtx_insn *insn = get_last_insn ();
26306 gcc_assert (JUMP_P (insn));
26307 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26310 /* Helper function for the string operations below. Test whether VARIABLE
26311 is aligned to VALUE bytes; if so, jump to the returned label. */
26312 static rtx_code_label *
26313 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26315 rtx_code_label *label = gen_label_rtx ();
26316 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26317 if (GET_MODE (variable) == DImode)
26318 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26319 else
26320 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26321 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26322 1, label);
26323 if (epilogue)
26324 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26325 else
26326 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26327 return label;
26330 /* Decrease COUNTREG by VALUE. */
26331 static void
26332 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26334 rtx (*gen_add)(rtx, rtx, rtx)
26335 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26337 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26340 /* Zero extend possibly SImode EXP to Pmode register. */
26342 ix86_zero_extend_to_Pmode (rtx exp)
26344 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26347 /* Divide COUNTREG by SCALE. */
26348 static rtx
26349 scale_counter (rtx countreg, int scale)
26351 rtx sc;
26353 if (scale == 1)
26354 return countreg;
26355 if (CONST_INT_P (countreg))
26356 return GEN_INT (INTVAL (countreg) / scale);
26357 gcc_assert (REG_P (countreg));
26359 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26360 GEN_INT (exact_log2 (scale)),
26361 NULL, 1, OPTAB_DIRECT);
26362 return sc;
26365 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26366 DImode for constant loop counts. */
26368 static machine_mode
26369 counter_mode (rtx count_exp)
26371 if (GET_MODE (count_exp) != VOIDmode)
26372 return GET_MODE (count_exp);
26373 if (!CONST_INT_P (count_exp))
26374 return Pmode;
26375 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26376 return DImode;
26377 return SImode;
26380 /* Copy the address to a Pmode register. This is used for x32 to
26381 truncate DImode TLS address to a SImode register. */
26383 static rtx
26384 ix86_copy_addr_to_reg (rtx addr)
26386 rtx reg;
26387 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26389 reg = copy_addr_to_reg (addr);
26390 REG_POINTER (reg) = 1;
26391 return reg;
26393 else
26395 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26396 reg = copy_to_mode_reg (DImode, addr);
26397 REG_POINTER (reg) = 1;
26398 return gen_rtx_SUBREG (SImode, reg, 0);
26402 /* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
26403 to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
26404 COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
26405 loop to set memory to VALUE (supposed to be in MODE).
26407 The size is rounded down to a whole number of chunks moved at once.
26408 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
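/* Roughly, an illustrative sketch of the emitted loop (not the exact RTL):

     size = count & -(GET_MODE_SIZE (mode) * unroll);
     iter = 0;
     do
       {
         move (or store VALUE to) UNROLL chunks of MODE at offset iter;
         iter += GET_MODE_SIZE (mode) * unroll;
       }
     while (iter < size);
     destptr += iter;
     srcptr += iter;        -- only when !ISSETMEM

   When the chunk size is a single byte, a "size == 0" early exit around
   the loop is emitted first.  */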
26411 static void
26412 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26413 rtx destptr, rtx srcptr, rtx value,
26414 rtx count, machine_mode mode, int unroll,
26415 int expected_size, bool issetmem)
26417 rtx_code_label *out_label, *top_label;
26418 rtx iter, tmp;
26419 machine_mode iter_mode = counter_mode (count);
26420 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26421 rtx piece_size = GEN_INT (piece_size_n);
26422 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26423 rtx size;
26424 int i;
26426 top_label = gen_label_rtx ();
26427 out_label = gen_label_rtx ();
26428 iter = gen_reg_rtx (iter_mode);
26430 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26431 NULL, 1, OPTAB_DIRECT);
26432 /* Those two should combine. */
26433 if (piece_size == const1_rtx)
26435 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26436 true, out_label);
26437 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26439 emit_move_insn (iter, const0_rtx);
26441 emit_label (top_label);
26443 tmp = convert_modes (Pmode, iter_mode, iter, true);
26445 /* This assert could be relaxed - in this case we'll need to compute
26446 smallest power of two, containing in PIECE_SIZE_N and pass it to
26447 offset_address. */
26448 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26449 destmem = offset_address (destmem, tmp, piece_size_n);
26450 destmem = adjust_address (destmem, mode, 0);
26452 if (!issetmem)
26454 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26455 srcmem = adjust_address (srcmem, mode, 0);
26457 /* When unrolling for chips that reorder memory reads and writes,
26458 we can save registers by using a single temporary.
26459 Also using 4 temporaries is overkill in 32-bit mode. */
26460 if (!TARGET_64BIT && 0)
26462 for (i = 0; i < unroll; i++)
26464 if (i)
26466 destmem =
26467 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26468 srcmem =
26469 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26471 emit_move_insn (destmem, srcmem);
26474 else
26476 rtx tmpreg[4];
26477 gcc_assert (unroll <= 4);
26478 for (i = 0; i < unroll; i++)
26480 tmpreg[i] = gen_reg_rtx (mode);
26481 if (i)
26483 srcmem =
26484 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26486 emit_move_insn (tmpreg[i], srcmem);
26488 for (i = 0; i < unroll; i++)
26490 if (i)
26492 destmem =
26493 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26495 emit_move_insn (destmem, tmpreg[i]);
26499 else
26500 for (i = 0; i < unroll; i++)
26502 if (i)
26503 destmem =
26504 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26505 emit_move_insn (destmem, value);
26508 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26509 true, OPTAB_LIB_WIDEN);
26510 if (tmp != iter)
26511 emit_move_insn (iter, tmp);
26513 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26514 true, top_label);
26515 if (expected_size != -1)
26517 expected_size /= GET_MODE_SIZE (mode) * unroll;
26518 if (expected_size == 0)
26519 predict_jump (0);
26520 else if (expected_size > REG_BR_PROB_BASE)
26521 predict_jump (REG_BR_PROB_BASE - 1);
26522 else
26523 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26525 else
26526 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26527 iter = ix86_zero_extend_to_Pmode (iter);
26528 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26529 true, OPTAB_LIB_WIDEN);
26530 if (tmp != destptr)
26531 emit_move_insn (destptr, tmp);
26532 if (!issetmem)
26534 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26535 true, OPTAB_LIB_WIDEN);
26536 if (tmp != srcptr)
26537 emit_move_insn (srcptr, tmp);
26539 emit_label (out_label);
26542 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26543 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26544 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26545 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26546 ORIG_VALUE is the original value passed to memset to fill the memory with.
26547 Other arguments have the same meaning as for the previous function. */
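/* For illustration: a constant-size memset of N bytes, with N a multiple
   of 4 and a zero fill value, is widened from QImode to SImode, so a
   single "rep stos" of N / 4 four-byte stores is emitted instead of N
   byte stores; the same widening applies to the "rep mov" copy case.  */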
26549 static void
26550 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26551 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26552 rtx count,
26553 machine_mode mode, bool issetmem)
26555 rtx destexp;
26556 rtx srcexp;
26557 rtx countreg;
26558 HOST_WIDE_INT rounded_count;
26560 /* If possible, it is shorter to use rep movs.
26561 TODO: Maybe it is better to move this logic to decide_alg. */
26562 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26563 && (!issetmem || orig_value == const0_rtx))
26564 mode = SImode;
26566 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26567 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26569 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26570 GET_MODE_SIZE (mode)));
26571 if (mode != QImode)
26573 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26574 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26575 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26577 else
26578 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26579 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26581 rounded_count
26582 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26583 destmem = shallow_copy_rtx (destmem);
26584 set_mem_size (destmem, rounded_count);
26586 else if (MEM_SIZE_KNOWN_P (destmem))
26587 clear_mem_size (destmem);
26589 if (issetmem)
26591 value = force_reg (mode, gen_lowpart (mode, value));
26592 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26594 else
26596 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26597 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26598 if (mode != QImode)
26600 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26601 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26602 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26604 else
26605 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26606 if (CONST_INT_P (count))
26608 rounded_count
26609 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26610 srcmem = shallow_copy_rtx (srcmem);
26611 set_mem_size (srcmem, rounded_count);
26613 else
26615 if (MEM_SIZE_KNOWN_P (srcmem))
26616 clear_mem_size (srcmem);
26618 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26619 destexp, srcexp));
26623 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26624 DESTMEM.
26625 SRCMEM is passed by pointer so it can be updated on return.
26626 The return value is the updated DESTMEM. */
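/* For illustration: with SIZE_TO_MOVE == 16 the widest supported move mode
   is chosen, which on an SSE-capable target typically ends up as a single
   16-byte vector load/store pair through a temporary register; without
   vector support it falls back to a sequence of word-sized moves.  */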
26627 static rtx
26628 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26629 HOST_WIDE_INT size_to_move)
26631 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26632 enum insn_code code;
26633 machine_mode move_mode;
26634 int piece_size, i;
26636 /* Find the widest mode in which we could perform moves.
26637 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26638 it until move of such size is supported. */
26639 piece_size = 1 << floor_log2 (size_to_move);
26640 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26641 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26643 gcc_assert (piece_size > 1);
26644 piece_size >>= 1;
26647 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26648 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26649 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26651 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26652 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26653 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26655 move_mode = word_mode;
26656 piece_size = GET_MODE_SIZE (move_mode);
26657 code = optab_handler (mov_optab, move_mode);
26660 gcc_assert (code != CODE_FOR_nothing);
26662 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26663 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26665 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26666 gcc_assert (size_to_move % piece_size == 0);
26667 adjust = GEN_INT (piece_size);
26668 for (i = 0; i < size_to_move; i += piece_size)
26670 /* We move from memory to memory, so we'll need to do it via
26671 a temporary register. */
26672 tempreg = gen_reg_rtx (move_mode);
26673 emit_insn (GEN_FCN (code) (tempreg, src));
26674 emit_insn (GEN_FCN (code) (dst, tempreg));
26676 emit_move_insn (destptr,
26677 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26678 emit_move_insn (srcptr,
26679 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26681 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26682 piece_size);
26683 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26684 piece_size);
26687 /* Update DST and SRC rtx. */
26688 *srcmem = src;
26689 return dst;
26692 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26693 static void
26694 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26695 rtx destptr, rtx srcptr, rtx count, int max_size)
26697 rtx src, dest;
26698 if (CONST_INT_P (count))
26700 HOST_WIDE_INT countval = INTVAL (count);
26701 HOST_WIDE_INT epilogue_size = countval % max_size;
26702 int i;
26704 /* For now MAX_SIZE should be a power of 2. This assert could be
26705 relaxed, but it'll require a bit more complicated epilogue
26706 expanding. */
26707 gcc_assert ((max_size & (max_size - 1)) == 0);
26708 for (i = max_size; i >= 1; i >>= 1)
26710 if (epilogue_size & i)
26711 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26713 return;
26715 if (max_size > 8)
26717 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26718 count, 1, OPTAB_DIRECT);
26719 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26720 count, QImode, 1, 4, false);
26721 return;
26724 /* When there are stringops, we can cheaply increase dest and src pointers.
26725 Otherwise we save code size by maintaining an offset (zero is readily
26726 available from the preceding rep operation) and using x86 addressing modes.
26728 if (TARGET_SINGLE_STRINGOP)
26730 if (max_size > 4)
26732 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26733 src = change_address (srcmem, SImode, srcptr);
26734 dest = change_address (destmem, SImode, destptr);
26735 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26736 emit_label (label);
26737 LABEL_NUSES (label) = 1;
26739 if (max_size > 2)
26741 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26742 src = change_address (srcmem, HImode, srcptr);
26743 dest = change_address (destmem, HImode, destptr);
26744 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26745 emit_label (label);
26746 LABEL_NUSES (label) = 1;
26748 if (max_size > 1)
26750 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26751 src = change_address (srcmem, QImode, srcptr);
26752 dest = change_address (destmem, QImode, destptr);
26753 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26754 emit_label (label);
26755 LABEL_NUSES (label) = 1;
26758 else
26760 rtx offset = force_reg (Pmode, const0_rtx);
26761 rtx tmp;
26763 if (max_size > 4)
26765 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26766 src = change_address (srcmem, SImode, srcptr);
26767 dest = change_address (destmem, SImode, destptr);
26768 emit_move_insn (dest, src);
26769 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26770 true, OPTAB_LIB_WIDEN);
26771 if (tmp != offset)
26772 emit_move_insn (offset, tmp);
26773 emit_label (label);
26774 LABEL_NUSES (label) = 1;
26776 if (max_size > 2)
26778 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26779 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26780 src = change_address (srcmem, HImode, tmp);
26781 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26782 dest = change_address (destmem, HImode, tmp);
26783 emit_move_insn (dest, src);
26784 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26785 true, OPTAB_LIB_WIDEN);
26786 if (tmp != offset)
26787 emit_move_insn (offset, tmp);
26788 emit_label (label);
26789 LABEL_NUSES (label) = 1;
26791 if (max_size > 1)
26793 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26794 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26795 src = change_address (srcmem, QImode, tmp);
26796 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26797 dest = change_address (destmem, QImode, tmp);
26798 emit_move_insn (dest, src);
26799 emit_label (label);
26800 LABEL_NUSES (label) = 1;
26805 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26806 with value PROMOTED_VAL.
26807 The emitted code also advances DESTPTR past the stored bytes.
26808 The return value is the updated DESTMEM. */
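/* For illustration: with PROMOTED_VAL in SImode and SIZE_TO_MOVE == 12,
   three 4-byte stores are emitted through the strset pattern, each
   advancing DESTPTR by 4; a narrower request such as SIZE_TO_MOVE == 2
   first re-lowers the promoted value to HImode.  */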
26809 static rtx
26810 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26811 HOST_WIDE_INT size_to_move)
26813 rtx dst = destmem, adjust;
26814 enum insn_code code;
26815 machine_mode move_mode;
26816 int piece_size, i;
26818 /* Find the widest mode in which we could perform moves.
26819 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26820 it until move of such size is supported. */
26821 move_mode = GET_MODE (promoted_val);
26822 if (move_mode == VOIDmode)
26823 move_mode = QImode;
26824 if (size_to_move < GET_MODE_SIZE (move_mode))
26826 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26827 move_mode = int_mode_for_size (move_bits, 0).require ();
26828 promoted_val = gen_lowpart (move_mode, promoted_val);
26830 piece_size = GET_MODE_SIZE (move_mode);
26831 code = optab_handler (mov_optab, move_mode);
26832 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26834 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26836 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26837 gcc_assert (size_to_move % piece_size == 0);
26838 adjust = GEN_INT (piece_size);
26839 for (i = 0; i < size_to_move; i += piece_size)
26841 if (piece_size <= GET_MODE_SIZE (word_mode))
26843 emit_insn (gen_strset (destptr, dst, promoted_val));
26844 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26845 piece_size);
26846 continue;
26849 emit_insn (GEN_FCN (code) (dst, promoted_val));
26851 emit_move_insn (destptr,
26852 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26854 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26855 piece_size);
26858 /* Update DST rtx. */
26859 return dst;
26861 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26862 static void
26863 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26864 rtx count, int max_size)
26866 count =
26867 expand_simple_binop (counter_mode (count), AND, count,
26868 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26869 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26870 gen_lowpart (QImode, value), count, QImode,
26871 1, max_size / 2, true);
26874 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26875 static void
26876 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26877 rtx count, int max_size)
26879 rtx dest;
26881 if (CONST_INT_P (count))
26883 HOST_WIDE_INT countval = INTVAL (count);
26884 HOST_WIDE_INT epilogue_size = countval % max_size;
26885 int i;
26887 /* For now MAX_SIZE should be a power of 2. This assert could be
26888 relaxed, but it'll require a bit more complicated epilogue
26889 expanding. */
26890 gcc_assert ((max_size & (max_size - 1)) == 0);
26891 for (i = max_size; i >= 1; i >>= 1)
26893 if (epilogue_size & i)
26895 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26896 destmem = emit_memset (destmem, destptr, vec_value, i);
26897 else
26898 destmem = emit_memset (destmem, destptr, value, i);
26901 return;
26903 if (max_size > 32)
26905 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26906 return;
26908 if (max_size > 16)
26910 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26911 if (TARGET_64BIT)
26913 dest = change_address (destmem, DImode, destptr);
26914 emit_insn (gen_strset (destptr, dest, value));
26915 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26916 emit_insn (gen_strset (destptr, dest, value));
26918 else
26920 dest = change_address (destmem, SImode, destptr);
26921 emit_insn (gen_strset (destptr, dest, value));
26922 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26923 emit_insn (gen_strset (destptr, dest, value));
26924 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26925 emit_insn (gen_strset (destptr, dest, value));
26926 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26927 emit_insn (gen_strset (destptr, dest, value));
26929 emit_label (label);
26930 LABEL_NUSES (label) = 1;
26932 if (max_size > 8)
26934 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26935 if (TARGET_64BIT)
26937 dest = change_address (destmem, DImode, destptr);
26938 emit_insn (gen_strset (destptr, dest, value));
26940 else
26942 dest = change_address (destmem, SImode, destptr);
26943 emit_insn (gen_strset (destptr, dest, value));
26944 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26945 emit_insn (gen_strset (destptr, dest, value));
26947 emit_label (label);
26948 LABEL_NUSES (label) = 1;
26950 if (max_size > 4)
26952 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26953 dest = change_address (destmem, SImode, destptr);
26954 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26955 emit_label (label);
26956 LABEL_NUSES (label) = 1;
26958 if (max_size > 2)
26960 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26961 dest = change_address (destmem, HImode, destptr);
26962 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26963 emit_label (label);
26964 LABEL_NUSES (label) = 1;
26966 if (max_size > 1)
26968 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26969 dest = change_address (destmem, QImode, destptr);
26970 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26971 emit_label (label);
26972 LABEL_NUSES (label) = 1;
26976 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26977 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26978 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26979 ignored.
26980 Return value is updated DESTMEM. */
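/* For illustration: with ALIGN == 1 and DESIRED_ALIGNMENT == 16, the
   emitted prologue tests bits 1, 2, 4 and 8 of DESTPTR in turn, copying
   (or setting) 1, 2, 4 and 8 bytes respectively whenever the corresponding
   bit is set, so DESTPTR is 16-byte aligned when the main loop starts.  */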
26981 static rtx
26982 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26983 rtx destptr, rtx srcptr, rtx value,
26984 rtx vec_value, rtx count, int align,
26985 int desired_alignment, bool issetmem)
26987 int i;
26988 for (i = 1; i < desired_alignment; i <<= 1)
26990 if (align <= i)
26992 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26993 if (issetmem)
26995 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26996 destmem = emit_memset (destmem, destptr, vec_value, i);
26997 else
26998 destmem = emit_memset (destmem, destptr, value, i);
27000 else
27001 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27002 ix86_adjust_counter (count, i);
27003 emit_label (label);
27004 LABEL_NUSES (label) = 1;
27005 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27008 return destmem;
27011 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
27012 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27013 and jump to DONE_LABEL. */
27014 static void
27015 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27016 rtx destptr, rtx srcptr,
27017 rtx value, rtx vec_value,
27018 rtx count, int size,
27019 rtx done_label, bool issetmem)
27021 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27022 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
27023 rtx modesize;
27024 int n;
27026 /* If we do not have a vector value to copy, we must reduce the size. */
27027 if (issetmem)
27029 if (!vec_value)
27031 if (GET_MODE (value) == VOIDmode && size > 8)
27032 mode = Pmode;
27033 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27034 mode = GET_MODE (value);
27036 else
27037 mode = GET_MODE (vec_value), value = vec_value;
27039 else
27041 /* Choose appropriate vector mode. */
27042 if (size >= 32)
27043 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27044 else if (size >= 16)
27045 mode = TARGET_SSE ? V16QImode : DImode;
27046 srcmem = change_address (srcmem, mode, srcptr);
27048 destmem = change_address (destmem, mode, destptr);
27049 modesize = GEN_INT (GET_MODE_SIZE (mode));
27050 gcc_assert (GET_MODE_SIZE (mode) <= size);
27051 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27053 if (issetmem)
27054 emit_move_insn (destmem, gen_lowpart (mode, value));
27055 else
27057 emit_move_insn (destmem, srcmem);
27058 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27060 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27063 destmem = offset_address (destmem, count, 1);
27064 destmem = offset_address (destmem, GEN_INT (-2 * size),
27065 GET_MODE_SIZE (mode));
27066 if (!issetmem)
27068 srcmem = offset_address (srcmem, count, 1);
27069 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27070 GET_MODE_SIZE (mode));
27072 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27074 if (issetmem)
27075 emit_move_insn (destmem, gen_lowpart (mode, value));
27076 else
27078 emit_move_insn (destmem, srcmem);
27079 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27081 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27083 emit_jump_insn (gen_jump (done_label));
27084 emit_barrier ();
27086 emit_label (label);
27087 LABEL_NUSES (label) = 1;
27090 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27091 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27092 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
27093 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27094 DONE_LABEL is a label after the whole copying sequence. The label is created
27095 on demand if *DONE_LABEL is NULL.
27096 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27097 bounds after the initial copies.
27099 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27100 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27101 we will dispatch to a library call for large blocks.
27103 In pseudocode we do:
27105 if (COUNT < SIZE)
27107 Assume that SIZE is 4. Bigger sizes are handled analogously
27108 if (COUNT & 4)
27110 copy 4 bytes from SRCPTR to DESTPTR
27111 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27112 goto done_label
27114 if (!COUNT)
27115 goto done_label;
27116 copy 1 byte from SRCPTR to DESTPTR
27117 if (COUNT & 2)
27119 copy 2 bytes from SRCPTR to DESTPTR
27120 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27123 else
27125 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27126 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
27128 OLD_DESTPTR = DESTPTR;
27129 Align DESTPTR up to DESIRED_ALIGN
27130 SRCPTR += DESTPTR - OLD_DESTPTR
27131 COUNT -= DESTPTR - OLD_DESTPTR
27132 if (DYNAMIC_CHECK)
27133 Round COUNT down to multiple of SIZE
27134 << optional caller supplied zero size guard is here >>
27135 << optional caller supplied dynamic check is here >>
27136 << caller supplied main copy loop is here >>
27138 done_label:
27140 static void
27141 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27142 rtx *destptr, rtx *srcptr,
27143 machine_mode mode,
27144 rtx value, rtx vec_value,
27145 rtx *count,
27146 rtx_code_label **done_label,
27147 int size,
27148 int desired_align,
27149 int align,
27150 unsigned HOST_WIDE_INT *min_size,
27151 bool dynamic_check,
27152 bool issetmem)
27154 rtx_code_label *loop_label = NULL, *label;
27155 int n;
27156 rtx modesize;
27157 int prolog_size = 0;
27158 rtx mode_value;
27160 /* Choose the proper value to copy. */
27161 if (issetmem && VECTOR_MODE_P (mode))
27162 mode_value = vec_value;
27163 else
27164 mode_value = value;
27165 gcc_assert (GET_MODE_SIZE (mode) <= size);
27167 /* See if block is big or small, handle small blocks. */
27168 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27170 int size2 = size;
27171 loop_label = gen_label_rtx ();
27173 if (!*done_label)
27174 *done_label = gen_label_rtx ();
27176 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27177 1, loop_label);
27178 size2 >>= 1;
27180 /* Handle sizes > 3. */
27181 for (;size2 > 2; size2 >>= 1)
27182 expand_small_movmem_or_setmem (destmem, srcmem,
27183 *destptr, *srcptr,
27184 value, vec_value,
27185 *count,
27186 size2, *done_label, issetmem);
27187 /* Nothing to copy? Jump to DONE_LABEL if so */
27188 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27189 1, *done_label);
27191 /* Do a byte copy. */
27192 destmem = change_address (destmem, QImode, *destptr);
27193 if (issetmem)
27194 emit_move_insn (destmem, gen_lowpart (QImode, value));
27195 else
27197 srcmem = change_address (srcmem, QImode, *srcptr);
27198 emit_move_insn (destmem, srcmem);
27201 /* Handle sizes 2 and 3. */
27202 label = ix86_expand_aligntest (*count, 2, false);
27203 destmem = change_address (destmem, HImode, *destptr);
27204 destmem = offset_address (destmem, *count, 1);
27205 destmem = offset_address (destmem, GEN_INT (-2), 2);
27206 if (issetmem)
27207 emit_move_insn (destmem, gen_lowpart (HImode, value));
27208 else
27210 srcmem = change_address (srcmem, HImode, *srcptr);
27211 srcmem = offset_address (srcmem, *count, 1);
27212 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27213 emit_move_insn (destmem, srcmem);
27216 emit_label (label);
27217 LABEL_NUSES (label) = 1;
27218 emit_jump_insn (gen_jump (*done_label));
27219 emit_barrier ();
27221 else
27222 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27223 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27225 /* Start memcpy for COUNT >= SIZE. */
27226 if (loop_label)
27228 emit_label (loop_label);
27229 LABEL_NUSES (loop_label) = 1;
27232 /* Copy first desired_align bytes. */
27233 if (!issetmem)
27234 srcmem = change_address (srcmem, mode, *srcptr);
27235 destmem = change_address (destmem, mode, *destptr);
27236 modesize = GEN_INT (GET_MODE_SIZE (mode));
27237 for (n = 0; prolog_size < desired_align - align; n++)
27239 if (issetmem)
27240 emit_move_insn (destmem, mode_value);
27241 else
27243 emit_move_insn (destmem, srcmem);
27244 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27246 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27247 prolog_size += GET_MODE_SIZE (mode);
27251 /* Copy last SIZE bytes. */
27252 destmem = offset_address (destmem, *count, 1);
27253 destmem = offset_address (destmem,
27254 GEN_INT (-size - prolog_size),
27256 if (issetmem)
27257 emit_move_insn (destmem, mode_value);
27258 else
27260 srcmem = offset_address (srcmem, *count, 1);
27261 srcmem = offset_address (srcmem,
27262 GEN_INT (-size - prolog_size),
27264 emit_move_insn (destmem, srcmem);
27266 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27268 destmem = offset_address (destmem, modesize, 1);
27269 if (issetmem)
27270 emit_move_insn (destmem, mode_value);
27271 else
27273 srcmem = offset_address (srcmem, modesize, 1);
27274 emit_move_insn (destmem, srcmem);
27278 /* Align destination. */
27279 if (desired_align > 1 && desired_align > align)
27281 rtx saveddest = *destptr;
27283 gcc_assert (desired_align <= size);
27284 /* Align destptr up, place it in a new register. */
27285 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27286 GEN_INT (prolog_size),
27287 NULL_RTX, 1, OPTAB_DIRECT);
27288 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27289 REG_POINTER (*destptr) = 1;
27290 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27291 GEN_INT (-desired_align),
27292 *destptr, 1, OPTAB_DIRECT);
27293 /* See how many bytes we skipped. */
27294 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27295 *destptr,
27296 saveddest, 1, OPTAB_DIRECT);
27297 /* Adjust srcptr and count. */
27298 if (!issetmem)
27299 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27300 saveddest, *srcptr, 1, OPTAB_DIRECT);
27301 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27302 saveddest, *count, 1, OPTAB_DIRECT);
27303 /* We copied at most size + prolog_size. */
27304 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27305 *min_size
27306 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27307 else
27308 *min_size = 0;
27310 /* Our loops always round down the block size, but for dispatch to the
27311 library we need the precise value. */
27312 if (dynamic_check)
27313 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27314 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27316 else
27318 gcc_assert (prolog_size == 0);
27319 /* Decrease count, so we won't end up copying last word twice. */
27320 if (!CONST_INT_P (*count))
27321 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27322 constm1_rtx, *count, 1, OPTAB_DIRECT);
27323 else
27324 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27325 (unsigned HOST_WIDE_INT)size));
27326 if (*min_size)
27327 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
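/* The destination-alignment step above boils down to rounding DESTPTR up to
   DESIRED_ALIGN after PROLOG_SIZE bytes have already been stored, then moving
   SRCPTR and shrinking COUNT by the same number of skipped bytes.  A minimal
   host-C sketch with hypothetical names, kept under #if 0 as illustration
   only (assumes <stdint.h> and that desired_align is a power of two):  */
#if 0
static void
align_dst_sketch (char **dst, const char **src, size_t *count,
		  size_t prolog_size, size_t desired_align)
{
  char *old_dst = *dst;
  uintptr_t p = (uintptr_t) (*dst + prolog_size);

  *dst = (char *) (p & ~(uintptr_t) (desired_align - 1));
  size_t skipped = (size_t) (*dst - old_dst);
  *src += skipped;	/* keep src in step with dst */
  *count -= skipped;	/* those bytes were covered by the prologue */
}
#endif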
27332 /* This function is like the previous one, except here we know how many bytes
27333 need to be copied. That allows us to update alignment not only of DST, which
27334 is returned, but also of SRC, which is passed as a pointer for that
27335 reason. */
27336 static rtx
27337 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27338 rtx srcreg, rtx value, rtx vec_value,
27339 int desired_align, int align_bytes,
27340 bool issetmem)
27342 rtx src = NULL;
27343 rtx orig_dst = dst;
27344 rtx orig_src = NULL;
27345 int piece_size = 1;
27346 int copied_bytes = 0;
27348 if (!issetmem)
27350 gcc_assert (srcp != NULL);
27351 src = *srcp;
27352 orig_src = src;
27355 for (piece_size = 1;
27356 piece_size <= desired_align && copied_bytes < align_bytes;
27357 piece_size <<= 1)
27359 if (align_bytes & piece_size)
27361 if (issetmem)
27363 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27364 dst = emit_memset (dst, destreg, vec_value, piece_size);
27365 else
27366 dst = emit_memset (dst, destreg, value, piece_size);
27368 else
27369 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27370 copied_bytes += piece_size;
27373 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27374 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27375 if (MEM_SIZE_KNOWN_P (orig_dst))
27376 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27378 if (!issetmem)
27380 int src_align_bytes = get_mem_align_offset (src, desired_align
27381 * BITS_PER_UNIT);
27382 if (src_align_bytes >= 0)
27383 src_align_bytes = desired_align - src_align_bytes;
27384 if (src_align_bytes >= 0)
27386 unsigned int src_align;
27387 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27389 if ((src_align_bytes & (src_align - 1))
27390 == (align_bytes & (src_align - 1)))
27391 break;
27393 if (src_align > (unsigned int) desired_align)
27394 src_align = desired_align;
27395 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27396 set_mem_align (src, src_align * BITS_PER_UNIT);
27398 if (MEM_SIZE_KNOWN_P (orig_src))
27399 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27400 *srcp = src;
27403 return dst;
27406 /* Return true if ALG can be used in current context.
27407 Assume we expand memset if MEMSET is true. */
27408 static bool
27409 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27411 if (alg == no_stringop)
27412 return false;
27413 if (alg == vector_loop)
27414 return TARGET_SSE || TARGET_AVX;
27415 /* Algorithms using the rep prefix want at least edi and ecx;
27416 additionally, memset wants eax and memcpy wants esi. Don't
27417 consider such algorithms if the user has appropriated those
27418 registers for their own purposes, or if we have a non-default
27419 address space, since some string insns cannot override the segment. */
27420 if (alg == rep_prefix_1_byte
27421 || alg == rep_prefix_4_byte
27422 || alg == rep_prefix_8_byte)
27424 if (have_as)
27425 return false;
27426 if (fixed_regs[CX_REG]
27427 || fixed_regs[DI_REG]
27428 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27429 return false;
27431 return true;
27434 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27435 static enum stringop_alg
27436 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27437 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27438 bool memset, bool zero_memset, bool have_as,
27439 int *dynamic_check, bool *noalign, bool recur)
27441 const struct stringop_algs *algs;
27442 bool optimize_for_speed;
27443 int max = 0;
27444 const struct processor_costs *cost;
27445 int i;
27446 bool any_alg_usable_p = false;
27448 *noalign = false;
27449 *dynamic_check = -1;
27451 /* Even if the string operation call is cold, we still might spend a lot
27452 of time processing large blocks. */
27453 if (optimize_function_for_size_p (cfun)
27454 || (optimize_insn_for_size_p ()
27455 && (max_size < 256
27456 || (expected_size != -1 && expected_size < 256))))
27457 optimize_for_speed = false;
27458 else
27459 optimize_for_speed = true;
27461 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27462 if (memset)
27463 algs = &cost->memset[TARGET_64BIT != 0];
27464 else
27465 algs = &cost->memcpy[TARGET_64BIT != 0];
27467 /* See maximal size for user defined algorithm. */
27468 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27470 enum stringop_alg candidate = algs->size[i].alg;
27471 bool usable = alg_usable_p (candidate, memset, have_as);
27472 any_alg_usable_p |= usable;
27474 if (candidate != libcall && candidate && usable)
27475 max = algs->size[i].max;
27478 /* If the expected size is not known but the max size is small enough
27479 that the inline version is a win, set the expected size into
27480 the range. */
27481 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27482 && expected_size == -1)
27483 expected_size = min_size / 2 + max_size / 2;
27485 /* If user specified the algorithm, honor it if possible. */
27486 if (ix86_stringop_alg != no_stringop
27487 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27488 return ix86_stringop_alg;
27489 /* rep; movq or rep; movl is the smallest variant. */
27490 else if (!optimize_for_speed)
27492 *noalign = true;
27493 if (!count || (count & 3) || (memset && !zero_memset))
27494 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27495 ? rep_prefix_1_byte : loop_1_byte;
27496 else
27497 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27498 ? rep_prefix_4_byte : loop;
27500 /* Very tiny blocks are best handled via the loop, since REP is expensive to
27501 set up. */
27502 else if (expected_size != -1 && expected_size < 4)
27503 return loop_1_byte;
27504 else if (expected_size != -1)
27506 enum stringop_alg alg = libcall;
27507 bool alg_noalign = false;
27508 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27510 /* We get here if the algorithms that were not libcall-based
27511 were rep-prefix based and we are unable to use rep prefixes
27512 based on global register usage. Break out of the loop and
27513 use the heuristic below. */
27514 if (algs->size[i].max == 0)
27515 break;
27516 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27518 enum stringop_alg candidate = algs->size[i].alg;
27520 if (candidate != libcall
27521 && alg_usable_p (candidate, memset, have_as))
27523 alg = candidate;
27524 alg_noalign = algs->size[i].noalign;
27526 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27527 last non-libcall inline algorithm. */
27528 if (TARGET_INLINE_ALL_STRINGOPS)
27530 /* When the current size is best copied by a libcall,
27531 but we are still forced to inline, run the heuristic below
27532 that will pick code for medium-sized blocks. */
27533 if (alg != libcall)
27535 *noalign = alg_noalign;
27536 return alg;
27538 else if (!any_alg_usable_p)
27539 break;
27541 else if (alg_usable_p (candidate, memset, have_as))
27543 *noalign = algs->size[i].noalign;
27544 return candidate;
27549 /* When asked to inline the call anyway, try to pick a meaningful choice.
27550 We look for the maximal size of block that is faster to copy by hand and
27551 take blocks of at most that size, guessing that the average size will
27552 be roughly half of the maximum.
27554 If this turns out to be bad, we might simply specify the preferred
27555 choice in ix86_costs. */
27556 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27557 && (algs->unknown_size == libcall
27558 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27560 enum stringop_alg alg;
27561 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27563 /* If there aren't any usable algorithms or if recursing already,
27564 then recursing on smaller sizes or same size isn't going to
27565 find anything. Just return the simple byte-at-a-time copy loop. */
27566 if (!any_alg_usable_p || recur)
27568 /* Pick something reasonable. */
27569 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27570 *dynamic_check = 128;
27571 return loop_1_byte;
27573 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27574 zero_memset, have_as, dynamic_check, noalign, true);
27575 gcc_assert (*dynamic_check == -1);
27576 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27577 *dynamic_check = max;
27578 else
27579 gcc_assert (alg != libcall);
27580 return alg;
27582 return (alg_usable_p (algs->unknown_size, memset, have_as)
27583 ? algs->unknown_size : libcall);
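/* An illustrative, simplified model of the size-table walk performed above
   when EXPECTED_SIZE is known: each cost-table entry names the largest block
   size its algorithm should handle, with -1 meaning "no upper bound", and
   the first entry that covers the expected size wins.  Hypothetical types
   and names, kept under #if 0 as a sketch only (it omits the usability and
   TARGET_INLINE_ALL_STRINGOPS checks done by the real code):  */
#if 0
struct alg_entry_sketch { int max; int alg; };

static int
pick_alg_sketch (const struct alg_entry_sketch *table, int n,
		 int expected_size)
{
  for (int i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;	/* first entry covering the size */
  return -1;			/* nothing usable: fall back to a libcall */
}
#endif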
27586 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27587 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27588 static int
27589 decide_alignment (int align,
27590 enum stringop_alg alg,
27591 int expected_size,
27592 machine_mode move_mode)
27594 int desired_align = 0;
27596 gcc_assert (alg != no_stringop);
27598 if (alg == libcall)
27599 return 0;
27600 if (move_mode == VOIDmode)
27601 return 0;
27603 desired_align = GET_MODE_SIZE (move_mode);
27604 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
27605 copying a whole cache line at once. */
27606 if (TARGET_PENTIUMPRO
27607 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27608 desired_align = 8;
27610 if (optimize_size)
27611 desired_align = 1;
27612 if (desired_align < align)
27613 desired_align = align;
27614 if (expected_size != -1 && expected_size < 4)
27615 desired_align = align;
27617 return desired_align;
27621 /* Helper function for memcpy. For QImode value 0xXY produce
27622 0xXYXYXYXY of the width specified by MODE. This is essentially
27623 a * 0x01010101, but we can do slightly better than
27624 synth_mult by unwinding the sequence by hand on CPUs with
27625 slow multiply. */
27626 static rtx
27627 promote_duplicated_reg (machine_mode mode, rtx val)
27629 machine_mode valmode = GET_MODE (val);
27630 rtx tmp;
27631 int nops = mode == DImode ? 3 : 2;
27633 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27634 if (val == const0_rtx)
27635 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27636 if (CONST_INT_P (val))
27638 HOST_WIDE_INT v = INTVAL (val) & 255;
27640 v |= v << 8;
27641 v |= v << 16;
27642 if (mode == DImode)
27643 v |= (v << 16) << 16;
27644 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27647 if (valmode == VOIDmode)
27648 valmode = QImode;
27649 if (valmode != QImode)
27650 val = gen_lowpart (QImode, val);
27651 if (mode == QImode)
27652 return val;
27653 if (!TARGET_PARTIAL_REG_STALL)
27654 nops--;
27655 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27656 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27657 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27658 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27660 rtx reg = convert_modes (mode, QImode, val, true);
27661 tmp = promote_duplicated_reg (mode, const1_rtx);
27662 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27663 OPTAB_DIRECT);
27665 else
27667 rtx reg = convert_modes (mode, QImode, val, true);
27669 if (!TARGET_PARTIAL_REG_STALL)
27670 if (mode == SImode)
27671 emit_insn (gen_insvsi_1 (reg, reg));
27672 else
27673 emit_insn (gen_insvdi_1 (reg, reg));
27674 else
27676 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27677 NULL, 1, OPTAB_DIRECT);
27678 reg =
27679 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27681 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27682 NULL, 1, OPTAB_DIRECT);
27683 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27684 if (mode == SImode)
27685 return reg;
27686 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27687 NULL, 1, OPTAB_DIRECT);
27688 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27689 return reg;
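/* The shift-and-or sequence above is equivalent to multiplying the low byte
   by 0x01010101 (or 0x0101010101010101 for DImode), replicating it into
   every byte of the word.  A host-C sketch of the DImode case, with a
   hypothetical helper name and kept under #if 0 as illustration only:  */
#if 0
static unsigned long long
replicate_byte_sketch (unsigned char x)
{
  unsigned long long v = x;
  v |= v << 8;		/* 0x00XY -> 0xXYXY */
  v |= v << 16;		/* 0xXYXY -> 0xXYXYXYXY */
  v |= v << 32;		/* 0xXYXYXYXY -> 0xXYXYXYXYXYXYXYXY */
  return v;		/* same value as x * 0x0101010101010101ULL */
}
#endif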
27693 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27694 be needed by the main loop copying SIZE_NEEDED chunks and the prologue getting
27695 alignment from ALIGN to DESIRED_ALIGN. */
27696 static rtx
27697 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27698 int align)
27700 rtx promoted_val;
27702 if (TARGET_64BIT
27703 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27704 promoted_val = promote_duplicated_reg (DImode, val);
27705 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27706 promoted_val = promote_duplicated_reg (SImode, val);
27707 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27708 promoted_val = promote_duplicated_reg (HImode, val);
27709 else
27710 promoted_val = val;
27712 return promoted_val;
27715 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
27716 operations when profitable. The code depends upon architecture, block size
27717 and alignment, but always has one of the following overall structures:
27719 Aligned move sequence:
27721 1) Prologue guard: Conditional that jumps up to epilogues for small
27722 blocks that can be handled by epilogue alone. This is faster
27723 but also needed for correctness, since the prologue assumes the block
27724 is larger than the desired alignment.
27726 Optional dynamic check for size and libcall for large
27727 blocks is emitted here too, with -minline-stringops-dynamically.
27729 2) Prologue: copy first few bytes in order to get destination
27730 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27731 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27732 copied. We emit either a jump tree on power of two sized
27733 blocks, or a byte loop.
27735 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27736 with specified algorithm.
27738 4) Epilogue: code copying tail of the block that is too small to be
27739 handled by main body (or up to size guarded by prologue guard).
27741 Misaligned move sequence
27743 1) misaligned move prologue/epilogue containing:
27744 a) Prologue handling small memory blocks and jumping to done_label
27745 (skipped if blocks are known to be large enough)
27746 b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
27747 bytes if alignment is needed
27748 (skipped if alignment is not needed)
27749 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27751 2) Zero size guard dispatching to done_label, if needed
27753 3) dispatch to library call, if needed,
27755 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27756 with the specified algorithm. */
27757 bool
27758 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27759 rtx align_exp, rtx expected_align_exp,
27760 rtx expected_size_exp, rtx min_size_exp,
27761 rtx max_size_exp, rtx probable_max_size_exp,
27762 bool issetmem)
27764 rtx destreg;
27765 rtx srcreg = NULL;
27766 rtx_code_label *label = NULL;
27767 rtx tmp;
27768 rtx_code_label *jump_around_label = NULL;
27769 HOST_WIDE_INT align = 1;
27770 unsigned HOST_WIDE_INT count = 0;
27771 HOST_WIDE_INT expected_size = -1;
27772 int size_needed = 0, epilogue_size_needed;
27773 int desired_align = 0, align_bytes = 0;
27774 enum stringop_alg alg;
27775 rtx promoted_val = NULL;
27776 rtx vec_promoted_val = NULL;
27777 bool force_loopy_epilogue = false;
27778 int dynamic_check;
27779 bool need_zero_guard = false;
27780 bool noalign;
27781 machine_mode move_mode = VOIDmode;
27782 machine_mode wider_mode;
27783 int unroll_factor = 1;
27784 /* TODO: Once value ranges are available, fill in proper data. */
27785 unsigned HOST_WIDE_INT min_size = 0;
27786 unsigned HOST_WIDE_INT max_size = -1;
27787 unsigned HOST_WIDE_INT probable_max_size = -1;
27788 bool misaligned_prologue_used = false;
27789 bool have_as;
27791 if (CONST_INT_P (align_exp))
27792 align = INTVAL (align_exp);
27793 /* i386 can do misaligned access at a reasonably increased cost. */
27794 if (CONST_INT_P (expected_align_exp)
27795 && INTVAL (expected_align_exp) > align)
27796 align = INTVAL (expected_align_exp);
27797 /* ALIGN is the minimum of destination and source alignment, but we care here
27798 just about destination alignment. */
27799 else if (!issetmem
27800 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27801 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27803 if (CONST_INT_P (count_exp))
27805 min_size = max_size = probable_max_size = count = expected_size
27806 = INTVAL (count_exp);
27807 /* When COUNT is 0, there is nothing to do. */
27808 if (!count)
27809 return true;
27811 else
27813 if (min_size_exp)
27814 min_size = INTVAL (min_size_exp);
27815 if (max_size_exp)
27816 max_size = INTVAL (max_size_exp);
27817 if (probable_max_size_exp)
27818 probable_max_size = INTVAL (probable_max_size_exp);
27819 if (CONST_INT_P (expected_size_exp))
27820 expected_size = INTVAL (expected_size_exp);
27823 /* Make sure we don't need to care about overflow later on. */
27824 if (count > (HOST_WIDE_INT_1U << 30))
27825 return false;
27827 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27828 if (!issetmem)
27829 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27831 /* Step 0: Decide on preferred algorithm, desired alignment and
27832 size of chunks to be copied by main loop. */
27833 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27834 issetmem,
27835 issetmem && val_exp == const0_rtx, have_as,
27836 &dynamic_check, &noalign, false);
27837 if (alg == libcall)
27838 return false;
27839 gcc_assert (alg != no_stringop);
27841 /* For now the vector version of memset is generated only for memory zeroing, as
27842 creating the promoted vector value is very cheap in this case. */
27843 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27844 alg = unrolled_loop;
27846 if (!count)
27847 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27848 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27849 if (!issetmem)
27850 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27852 unroll_factor = 1;
27853 move_mode = word_mode;
27854 switch (alg)
27856 case libcall:
27857 case no_stringop:
27858 case last_alg:
27859 gcc_unreachable ();
27860 case loop_1_byte:
27861 need_zero_guard = true;
27862 move_mode = QImode;
27863 break;
27864 case loop:
27865 need_zero_guard = true;
27866 break;
27867 case unrolled_loop:
27868 need_zero_guard = true;
27869 unroll_factor = (TARGET_64BIT ? 4 : 2);
27870 break;
27871 case vector_loop:
27872 need_zero_guard = true;
27873 unroll_factor = 4;
27874 /* Find the widest supported mode. */
27875 move_mode = word_mode;
27876 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27877 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27878 move_mode = wider_mode;
27880 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27881 move_mode = TImode;
27883 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27884 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27885 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27887 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27888 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27889 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27890 move_mode = word_mode;
27892 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27893 break;
27894 case rep_prefix_8_byte:
27895 move_mode = DImode;
27896 break;
27897 case rep_prefix_4_byte:
27898 move_mode = SImode;
27899 break;
27900 case rep_prefix_1_byte:
27901 move_mode = QImode;
27902 break;
27904 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27905 epilogue_size_needed = size_needed;
27907 /* If we are going to emit any library calls conditionally, make sure any
27908 pending stack adjustments happen before the first conditional branch;
27909 otherwise they will be emitted before the library call only and won't
27910 happen on the other branches. */
27911 if (dynamic_check != -1)
27912 do_pending_stack_adjust ();
27914 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27915 if (!TARGET_ALIGN_STRINGOPS || noalign)
27916 align = desired_align;
27918 /* Step 1: Prologue guard. */
27920 /* Alignment code needs count to be in register. */
27921 if (CONST_INT_P (count_exp) && desired_align > align)
27923 if (INTVAL (count_exp) > desired_align
27924 && INTVAL (count_exp) > size_needed)
27926 align_bytes
27927 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27928 if (align_bytes <= 0)
27929 align_bytes = 0;
27930 else
27931 align_bytes = desired_align - align_bytes;
27933 if (align_bytes == 0)
27934 count_exp = force_reg (counter_mode (count_exp), count_exp);
27936 gcc_assert (desired_align >= 1 && align >= 1);
27938 /* Misaligned move sequences handle both prologue and epilogue at once.
27939 Default code generation results in smaller code for large alignments
27940 and also avoids redundant work when sizes are known precisely. */
27941 misaligned_prologue_used
27942 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27943 && MAX (desired_align, epilogue_size_needed) <= 32
27944 && desired_align <= epilogue_size_needed
27945 && ((desired_align > align && !align_bytes)
27946 || (!count && epilogue_size_needed > 1)));
27948 /* Do the cheap promotion to allow better CSE across the
27949 main loop and epilogue (i.e. one load of the big constant in
27950 front of all the code).
27951 For now the misaligned move sequences do not have a fast path
27952 without broadcasting. */
27953 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27955 if (alg == vector_loop)
27957 gcc_assert (val_exp == const0_rtx);
27958 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27959 promoted_val = promote_duplicated_reg_to_size (val_exp,
27960 GET_MODE_SIZE (word_mode),
27961 desired_align, align);
27963 else
27965 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27966 desired_align, align);
27969 /* Misaligned move sequences handle both prologues and epilogues at once.
27970 Default code generation results in smaller code for large alignments and
27971 also avoids redundant work when sizes are known precisely. */
27972 if (misaligned_prologue_used)
27974 /* The misaligned move prologue handles small blocks by itself. */
27975 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27976 (dst, src, &destreg, &srcreg,
27977 move_mode, promoted_val, vec_promoted_val,
27978 &count_exp,
27979 &jump_around_label,
27980 desired_align < align
27981 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27982 desired_align, align, &min_size, dynamic_check, issetmem);
27983 if (!issetmem)
27984 src = change_address (src, BLKmode, srcreg);
27985 dst = change_address (dst, BLKmode, destreg);
27986 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27987 epilogue_size_needed = 0;
27988 if (need_zero_guard
27989 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27991 /* It is possible that we copied enough so the main loop will not
27992 execute. */
27993 gcc_assert (size_needed > 1);
27994 if (jump_around_label == NULL_RTX)
27995 jump_around_label = gen_label_rtx ();
27996 emit_cmp_and_jump_insns (count_exp,
27997 GEN_INT (size_needed),
27998 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27999 if (expected_size == -1
28000 || expected_size < (desired_align - align) / 2 + size_needed)
28001 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28002 else
28003 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28006 /* Ensure that alignment prologue won't copy past end of block. */
28007 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28009 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28010 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28011 Make sure it is power of 2. */
28012 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28014 /* To improve performance on small blocks, we jump around the VAL
28015 promotion. This means that if the promoted VAL is not constant,
28016 we might not use it in the epilogue and have to use the byte
28017 loop variant. */
28018 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28019 force_loopy_epilogue = true;
28020 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28021 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28023 /* If main algorithm works on QImode, no epilogue is needed.
28024 For small sizes just don't align anything. */
28025 if (size_needed == 1)
28026 desired_align = align;
28027 else
28028 goto epilogue;
28030 else if (!count
28031 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28033 label = gen_label_rtx ();
28034 emit_cmp_and_jump_insns (count_exp,
28035 GEN_INT (epilogue_size_needed),
28036 LTU, 0, counter_mode (count_exp), 1, label);
28037 if (expected_size == -1 || expected_size < epilogue_size_needed)
28038 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28039 else
28040 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28044 /* Emit code to decide at runtime whether a library call or inline code should be
28045 used. */
28046 if (dynamic_check != -1)
28048 if (!issetmem && CONST_INT_P (count_exp))
28050 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28052 emit_block_copy_via_libcall (dst, src, count_exp);
28053 count_exp = const0_rtx;
28054 goto epilogue;
28057 else
28059 rtx_code_label *hot_label = gen_label_rtx ();
28060 if (jump_around_label == NULL_RTX)
28061 jump_around_label = gen_label_rtx ();
28062 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28063 LEU, 0, counter_mode (count_exp),
28064 1, hot_label);
28065 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28066 if (issetmem)
28067 set_storage_via_libcall (dst, count_exp, val_exp);
28068 else
28069 emit_block_copy_via_libcall (dst, src, count_exp);
28070 emit_jump (jump_around_label);
28071 emit_label (hot_label);
28075 /* Step 2: Alignment prologue. */
28076 /* Do the expensive promotion once we branched off the small blocks. */
28077 if (issetmem && !promoted_val)
28078 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28079 desired_align, align);
28081 if (desired_align > align && !misaligned_prologue_used)
28083 if (align_bytes == 0)
28085 /* Except for the first move in the prologue, we no longer know
28086 the constant offset in the aliasing info. It doesn't seem worth
28087 the pain to maintain it for the first move, so throw away
28088 the info early. */
28089 dst = change_address (dst, BLKmode, destreg);
28090 if (!issetmem)
28091 src = change_address (src, BLKmode, srcreg);
28092 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28093 promoted_val, vec_promoted_val,
28094 count_exp, align, desired_align,
28095 issetmem);
28096 /* At most desired_align - align bytes are copied. */
28097 if (min_size < (unsigned)(desired_align - align))
28098 min_size = 0;
28099 else
28100 min_size -= desired_align - align;
28102 else
28104 /* If we know how many bytes need to be stored before dst is
28105 sufficiently aligned, maintain aliasing info accurately. */
28106 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28107 srcreg,
28108 promoted_val,
28109 vec_promoted_val,
28110 desired_align,
28111 align_bytes,
28112 issetmem);
28114 count_exp = plus_constant (counter_mode (count_exp),
28115 count_exp, -align_bytes);
28116 count -= align_bytes;
28117 min_size -= align_bytes;
28118 max_size -= align_bytes;
28120 if (need_zero_guard
28121 && min_size < (unsigned HOST_WIDE_INT) size_needed
28122 && (count < (unsigned HOST_WIDE_INT) size_needed
28123 || (align_bytes == 0
28124 && count < ((unsigned HOST_WIDE_INT) size_needed
28125 + desired_align - align))))
28127 /* It is possible that we copied enough so the main loop will not
28128 execute. */
28129 gcc_assert (size_needed > 1);
28130 if (label == NULL_RTX)
28131 label = gen_label_rtx ();
28132 emit_cmp_and_jump_insns (count_exp,
28133 GEN_INT (size_needed),
28134 LTU, 0, counter_mode (count_exp), 1, label);
28135 if (expected_size == -1
28136 || expected_size < (desired_align - align) / 2 + size_needed)
28137 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28138 else
28139 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28142 if (label && size_needed == 1)
28144 emit_label (label);
28145 LABEL_NUSES (label) = 1;
28146 label = NULL;
28147 epilogue_size_needed = 1;
28148 if (issetmem)
28149 promoted_val = val_exp;
28151 else if (label == NULL_RTX && !misaligned_prologue_used)
28152 epilogue_size_needed = size_needed;
28154 /* Step 3: Main loop. */
28156 switch (alg)
28158 case libcall:
28159 case no_stringop:
28160 case last_alg:
28161 gcc_unreachable ();
28162 case loop_1_byte:
28163 case loop:
28164 case unrolled_loop:
28165 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28166 count_exp, move_mode, unroll_factor,
28167 expected_size, issetmem);
28168 break;
28169 case vector_loop:
28170 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28171 vec_promoted_val, count_exp, move_mode,
28172 unroll_factor, expected_size, issetmem);
28173 break;
28174 case rep_prefix_8_byte:
28175 case rep_prefix_4_byte:
28176 case rep_prefix_1_byte:
28177 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28178 val_exp, count_exp, move_mode, issetmem);
28179 break;
28181 /* Properly adjust the offset of src and dest memory for aliasing. */
28182 if (CONST_INT_P (count_exp))
28184 if (!issetmem)
28185 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28186 (count / size_needed) * size_needed);
28187 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28188 (count / size_needed) * size_needed);
28190 else
28192 if (!issetmem)
28193 src = change_address (src, BLKmode, srcreg);
28194 dst = change_address (dst, BLKmode, destreg);
28197 /* Step 4: Epilogue to copy the remaining bytes. */
28198 epilogue:
28199 if (label)
28201 /* When the main loop is done, COUNT_EXP might hold original count,
28202 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28203 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28204 bytes. Compensate if needed. */
28206 if (size_needed < epilogue_size_needed)
28208 tmp =
28209 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28210 GEN_INT (size_needed - 1), count_exp, 1,
28211 OPTAB_DIRECT);
28212 if (tmp != count_exp)
28213 emit_move_insn (count_exp, tmp);
28215 emit_label (label);
28216 LABEL_NUSES (label) = 1;
28219 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28221 if (force_loopy_epilogue)
28222 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28223 epilogue_size_needed);
28224 else
28226 if (issetmem)
28227 expand_setmem_epilogue (dst, destreg, promoted_val,
28228 vec_promoted_val, count_exp,
28229 epilogue_size_needed);
28230 else
28231 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28232 epilogue_size_needed);
28235 if (jump_around_label)
28236 emit_label (jump_around_label);
28237 return true;
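/* The split between the main loop and the epilogue above can be summarized
   as: the main loop handles COUNT rounded down to a multiple of SIZE_NEEDED,
   and the epilogue mops up the remaining COUNT modulo SIZE_NEEDED bytes.
   A host-C sketch with hypothetical names, kept under #if 0 as illustration
   only (assumes memcpy from <string.h> and that size_needed is a power of
   two):  */
#if 0
static void
split_copy_sketch (char *dst, const char *src,
		   size_t count, size_t size_needed)
{
  size_t main_part = count & ~(size_needed - 1);  /* main loop's share */
  size_t tail = count & (size_needed - 1);	  /* epilogue's share */

  memcpy (dst, src, main_part);
  memcpy (dst + main_part, src + main_part, tail);
}
#endif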
28241 /* Expand the appropriate insns for doing strlen if not just doing
28242 repnz; scasb
28244 out = result, initialized with the start address
28245 align_rtx = alignment of the address.
28246 scratch = scratch register, initialized with the start address when
28247 not aligned, otherwise undefined
28249 This is just the body. It needs the initializations mentioned above and
28250 some address computing at the end. These things are done in i386.md. */
28252 static void
28253 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28255 int align;
28256 rtx tmp;
28257 rtx_code_label *align_2_label = NULL;
28258 rtx_code_label *align_3_label = NULL;
28259 rtx_code_label *align_4_label = gen_label_rtx ();
28260 rtx_code_label *end_0_label = gen_label_rtx ();
28261 rtx mem;
28262 rtx tmpreg = gen_reg_rtx (SImode);
28263 rtx scratch = gen_reg_rtx (SImode);
28264 rtx cmp;
28266 align = 0;
28267 if (CONST_INT_P (align_rtx))
28268 align = INTVAL (align_rtx);
28270 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28272 /* Is there a known alignment and is it less than 4? */
28273 if (align < 4)
28275 rtx scratch1 = gen_reg_rtx (Pmode);
28276 emit_move_insn (scratch1, out);
28277 /* Is there a known alignment and is it not 2? */
28278 if (align != 2)
28280 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28281 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28283 /* Leave just the 3 lower bits. */
28284 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28285 NULL_RTX, 0, OPTAB_WIDEN);
28287 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28288 Pmode, 1, align_4_label);
28289 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28290 Pmode, 1, align_2_label);
28291 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28292 Pmode, 1, align_3_label);
28294 else
28296 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28297 check whether it is aligned to a 4-byte boundary. */
28299 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28300 NULL_RTX, 0, OPTAB_WIDEN);
28302 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28303 Pmode, 1, align_4_label);
28306 mem = change_address (src, QImode, out);
28308 /* Now compare the bytes. */
28311 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28311 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28312 QImode, 1, end_0_label);
28314 /* Increment the address. */
28315 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28317 /* Not needed with an alignment of 2 */
28318 if (align != 2)
28320 emit_label (align_2_label);
28322 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28323 end_0_label);
28325 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28327 emit_label (align_3_label);
28330 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28331 end_0_label);
28333 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28336 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28337 align this loop; doing so only bloats the program and does not help
28338 speed. */
28339 emit_label (align_4_label);
28341 mem = change_address (src, SImode, out);
28342 emit_move_insn (scratch, mem);
28343 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28345 /* This formula yields a nonzero result iff one of the bytes is zero.
28346 This saves three branches inside the loop and many cycles. */
28348 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28349 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28350 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28351 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28352 gen_int_mode (0x80808080, SImode)));
28353 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28354 align_4_label);
28356 if (TARGET_CMOVE)
28358 rtx reg = gen_reg_rtx (SImode);
28359 rtx reg2 = gen_reg_rtx (Pmode);
28360 emit_move_insn (reg, tmpreg);
28361 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28363 /* If zero is not in the first two bytes, move two bytes forward. */
28364 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28365 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28366 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28367 emit_insn (gen_rtx_SET (tmpreg,
28368 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28369 reg,
28370 tmpreg)));
28371 /* Emit lea manually to avoid clobbering of flags. */
28372 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28374 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28375 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28376 emit_insn (gen_rtx_SET (out,
28377 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28378 reg2,
28379 out)));
28381 else
28383 rtx_code_label *end_2_label = gen_label_rtx ();
28384 /* Is zero in the first two bytes? */
28386 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28387 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28388 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28389 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28390 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28391 pc_rtx);
28392 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28393 JUMP_LABEL (tmp) = end_2_label;
28395 /* Not in the first two. Move two bytes forward. */
28396 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28397 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28399 emit_label (end_2_label);
28403 /* Avoid branch in fixing the byte. */
28404 tmpreg = gen_lowpart (QImode, tmpreg);
28405 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28406 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28407 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28408 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28410 emit_label (end_0_label);
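/* The word-at-a-time zero-byte test emitted above is the classic
   (x - 0x01010101) & ~x & 0x80808080 trick: the result is nonzero iff at
   least one byte of x is zero, which is what lets the loop scan four bytes
   per iteration.  A host-C sketch with a hypothetical helper name, kept
   under #if 0 as illustration only:  */
#if 0
static int
has_zero_byte_sketch (unsigned int x)
{
  return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
}
#endif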
28413 /* Expand strlen. */
28415 bool
28416 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28418 rtx addr, scratch1, scratch2, scratch3, scratch4;
28420 /* The generic case of the strlen expander is long. Avoid
28421 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
28423 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28424 && !TARGET_INLINE_ALL_STRINGOPS
28425 && !optimize_insn_for_size_p ()
28426 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28427 return false;
28429 addr = force_reg (Pmode, XEXP (src, 0));
28430 scratch1 = gen_reg_rtx (Pmode);
28432 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28433 && !optimize_insn_for_size_p ())
28435 /* Well it seems that some optimizer does not combine a call like
28436 foo(strlen(bar), strlen(bar));
28437 when the move and the subtraction are done here. It does calculate
28438 the length just once when these instructions are done inside of
28439 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28440 often used and I use one fewer register for the lifetime of
28441 output_strlen_unroll(), this is better. */
28443 emit_move_insn (out, addr);
28445 ix86_expand_strlensi_unroll_1 (out, src, align);
28447 /* strlensi_unroll_1 returns the address of the zero at the end of
28448 the string, like memchr(), so compute the length by subtracting
28449 the start address. */
28450 emit_insn (ix86_gen_sub3 (out, out, addr));
28452 else
28454 rtx unspec;
28456 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28457 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28458 return false;
28459 /* Can't use this for non-default address spaces. */
28460 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28461 return false;
28463 scratch2 = gen_reg_rtx (Pmode);
28464 scratch3 = gen_reg_rtx (Pmode);
28465 scratch4 = force_reg (Pmode, constm1_rtx);
28467 emit_move_insn (scratch3, addr);
28468 eoschar = force_reg (QImode, eoschar);
28470 src = replace_equiv_address_nv (src, scratch3);
28472 /* If .md starts supporting :P, this can be done in .md. */
28473 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28474 scratch4), UNSPEC_SCAS);
28475 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28476 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28477 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28479 return true;
28482 /* For a given symbol (function) construct code to compute the address of its PLT
28483 entry in the large x86-64 PIC model. */
28484 static rtx
28485 construct_plt_address (rtx symbol)
28487 rtx tmp, unspec;
28489 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28490 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28491 gcc_assert (Pmode == DImode);
28493 tmp = gen_reg_rtx (Pmode);
28494 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28496 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28497 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28498 return tmp;
28502 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28503 rtx callarg2,
28504 rtx pop, bool sibcall)
28506 rtx vec[3];
28507 rtx use = NULL, call;
28508 unsigned int vec_len = 0;
28509 tree fndecl;
28511 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28513 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28514 if (fndecl
28515 && (lookup_attribute ("interrupt",
28516 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28517 error ("interrupt service routine can't be called directly");
28519 else
28520 fndecl = NULL_TREE;
28522 if (pop == const0_rtx)
28523 pop = NULL;
28524 gcc_assert (!TARGET_64BIT || !pop);
28526 if (TARGET_MACHO && !TARGET_64BIT)
28528 #if TARGET_MACHO
28529 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28530 fnaddr = machopic_indirect_call_target (fnaddr);
28531 #endif
28533 else
28535 /* Static functions and indirect calls don't need the pic register. Also,
28536 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28537 it an indirect call. */
28538 rtx addr = XEXP (fnaddr, 0);
28539 if (flag_pic
28540 && GET_CODE (addr) == SYMBOL_REF
28541 && !SYMBOL_REF_LOCAL_P (addr))
28543 if (flag_plt
28544 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28545 || !lookup_attribute ("noplt",
28546 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28548 if (!TARGET_64BIT
28549 || (ix86_cmodel == CM_LARGE_PIC
28550 && DEFAULT_ABI != MS_ABI))
28552 use_reg (&use, gen_rtx_REG (Pmode,
28553 REAL_PIC_OFFSET_TABLE_REGNUM));
28554 if (ix86_use_pseudo_pic_reg ())
28555 emit_move_insn (gen_rtx_REG (Pmode,
28556 REAL_PIC_OFFSET_TABLE_REGNUM),
28557 pic_offset_table_rtx);
28560 else if (!TARGET_PECOFF && !TARGET_MACHO)
28562 if (TARGET_64BIT)
28564 fnaddr = gen_rtx_UNSPEC (Pmode,
28565 gen_rtvec (1, addr),
28566 UNSPEC_GOTPCREL);
28567 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28569 else
28571 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28572 UNSPEC_GOT);
28573 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28574 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28575 fnaddr);
28577 fnaddr = gen_const_mem (Pmode, fnaddr);
28578 /* Pmode may not be the same as word_mode for x32, which
28579 doesn't support indirect branch via 32-bit memory slot.
28580 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28581 indirect branch via x32 GOT slot is OK. */
28582 if (GET_MODE (fnaddr) != word_mode)
28583 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28584 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28589 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28590 parameters passed in vector registers. */
28591 if (TARGET_64BIT
28592 && (INTVAL (callarg2) > 0
28593 || (INTVAL (callarg2) == 0
28594 && (TARGET_SSE || !flag_skip_rax_setup))))
28596 rtx al = gen_rtx_REG (QImode, AX_REG);
28597 emit_move_insn (al, callarg2);
28598 use_reg (&use, al);
28601 if (ix86_cmodel == CM_LARGE_PIC
28602 && !TARGET_PECOFF
28603 && MEM_P (fnaddr)
28604 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28605 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28606 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28607 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28608 branch via x32 GOT slot is OK. */
28609 else if (!(TARGET_X32
28610 && MEM_P (fnaddr)
28611 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28612 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28613 && (sibcall
28614 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28615 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28617 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28618 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28621 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28623 if (retval)
28625 /* We should add bounds as a destination register in case
28626 a pointer with bounds may be returned. */
28627 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28629 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28630 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28631 if (GET_CODE (retval) == PARALLEL)
28633 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28634 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28635 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28636 retval = chkp_join_splitted_slot (retval, par);
28638 else
28640 retval = gen_rtx_PARALLEL (VOIDmode,
28641 gen_rtvec (3, retval, b0, b1));
28642 chkp_put_regs_to_expr_list (retval);
28646 call = gen_rtx_SET (retval, call);
28648 vec[vec_len++] = call;
28650 if (pop)
28652 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28653 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28654 vec[vec_len++] = pop;
28657 if (cfun->machine->no_caller_saved_registers
28658 && (!fndecl
28659 || (!TREE_THIS_VOLATILE (fndecl)
28660 && !lookup_attribute ("no_caller_saved_registers",
28661 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28663 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28664 bool is_64bit_ms_abi = (TARGET_64BIT
28665 && ix86_function_abi (fndecl) == MS_ABI);
28666 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28668 /* If there are no caller-saved registers, add all registers
28669 that are clobbered by the call which returns. */
28670 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28671 if (!fixed_regs[i]
28672 && (ix86_call_used_regs[i] == 1
28673 || (ix86_call_used_regs[i] & c_mask))
28674 && !STACK_REGNO_P (i)
28675 && !MMX_REGNO_P (i))
28676 clobber_reg (&use,
28677 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28679 else if (TARGET_64BIT_MS_ABI
28680 && (!callarg2 || INTVAL (callarg2) != -2))
28682 unsigned i;
28684 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28686 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28687 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28689 clobber_reg (&use, gen_rtx_REG (mode, regno));
28692 /* Set here, but it may get cleared later. */
28693 if (TARGET_CALL_MS2SYSV_XLOGUES)
28695 if (!TARGET_SSE)
28698 /* Don't break hot-patched functions. */
28699 else if (ix86_function_ms_hook_prologue (current_function_decl))
28702 /* TODO: Cases not yet examined. */
28703 else if (flag_split_stack)
28704 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28706 else
28708 gcc_assert (!reload_completed);
28709 cfun->machine->call_ms2sysv = true;
28714 if (vec_len > 1)
28715 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28716 call = emit_call_insn (call);
28717 if (use)
28718 CALL_INSN_FUNCTION_USAGE (call) = use;
28720 return call;
28723 /* Return true if the function being called was marked with attribute
28724 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28725 to handle the non-PIC case in the backend because there is no easy
28726 interface for the front-end to force non-PLT calls to use the GOT.
28727 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28728 to call the function marked "noplt" indirectly. */
28730 static bool
28731 ix86_nopic_noplt_attribute_p (rtx call_op)
28733 if (flag_pic || ix86_cmodel == CM_LARGE
28734 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28735 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28736 || SYMBOL_REF_LOCAL_P (call_op))
28737 return false;
28739 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28741 if (!flag_plt
28742 || (symbol_decl != NULL_TREE
28743 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28744 return true;
28746 return false;
28749 /* Output indirect branch via a call and return thunk. CALL_OP is a
28750 register which contains the branch target.
28751 Branch is a tail call if SIBCALL_P is true.
28752 A normal call is converted to:
28754 call __x86_indirect_thunk_reg
28756 and a tail call is converted to:
28758 jmp __x86_indirect_thunk_reg
28761 static void
28762 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28764 char thunk_name_buf[32];
28765 char *thunk_name;
28766 enum indirect_thunk_prefix need_prefix
28767 = indirect_thunk_need_prefix (current_output_insn);
28768 int regno = REGNO (call_op);
28770 if (cfun->machine->indirect_branch_type
28771 != indirect_branch_thunk_inline)
28773 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28775 int i = regno;
28776 if (i >= FIRST_REX_INT_REG)
28777 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28778 if (need_prefix == indirect_thunk_prefix_bnd)
28779 indirect_thunks_bnd_used |= 1 << i;
28780 else
28781 indirect_thunks_used |= 1 << i;
28783 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28784 thunk_name = thunk_name_buf;
28786 else
28787 thunk_name = NULL;
28789 if (sibcall_p)
28791 if (thunk_name != NULL)
28793 if (need_prefix == indirect_thunk_prefix_bnd)
28794 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28795 else
28796 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28798 else
28799 output_indirect_thunk (need_prefix, regno);
28801 else
28803 if (thunk_name != NULL)
28805 if (need_prefix == indirect_thunk_prefix_bnd)
28806 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28807 else
28808 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28809 return;
28812 char indirectlabel1[32];
28813 char indirectlabel2[32];
28815 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28816 INDIRECT_LABEL,
28817 indirectlabelno++);
28818 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28819 INDIRECT_LABEL,
28820 indirectlabelno++);
28822 /* Jump. */
28823 if (need_prefix == indirect_thunk_prefix_bnd)
28824 fputs ("\tbnd jmp\t", asm_out_file);
28825 else
28826 fputs ("\tjmp\t", asm_out_file);
28827 assemble_name_raw (asm_out_file, indirectlabel2);
28828 fputc ('\n', asm_out_file);
28830 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28832 if (thunk_name != NULL)
28834 if (need_prefix == indirect_thunk_prefix_bnd)
28835 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28836 else
28837 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28839 else
28840 output_indirect_thunk (need_prefix, regno);
28842 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28844 /* Call. */
28845 if (need_prefix == indirect_thunk_prefix_bnd)
28846 fputs ("\tbnd call\t", asm_out_file);
28847 else
28848 fputs ("\tcall\t", asm_out_file);
28849 assemble_name_raw (asm_out_file, indirectlabel1);
28850 fputc ('\n', asm_out_file);
28854 /* Output indirect branch via a call and return thunk. CALL_OP is
28855 the branch target. XASM is the assembly template for CALL_OP.
28856 Branch is a tail call if SIBCALL_P is true. A normal call is
28857 converted to:
28859 jmp L2
L1:
28861 push CALL_OP
28862 jmp __x86_indirect_thunk
L2:
28864 call L1
28866 and a tail call is converted to:
28868 push CALL_OP
28869 jmp __x86_indirect_thunk
28872 static void
28873 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28874 bool sibcall_p)
28876 char thunk_name_buf[32];
28877 char *thunk_name;
28878 char push_buf[64];
28879 enum indirect_thunk_prefix need_prefix
28880 = indirect_thunk_need_prefix (current_output_insn);
28881 int regno = -1;
28883 if (cfun->machine->indirect_branch_type
28884 != indirect_branch_thunk_inline)
28886 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28888 if (need_prefix == indirect_thunk_prefix_bnd)
28889 indirect_thunk_bnd_needed = true;
28890 else
28891 indirect_thunk_needed = true;
28893 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28894 thunk_name = thunk_name_buf;
28896 else
28897 thunk_name = NULL;
28899 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28900 TARGET_64BIT ? 'q' : 'l', xasm);
28902 if (sibcall_p)
28904 output_asm_insn (push_buf, &call_op);
28905 if (thunk_name != NULL)
28907 if (need_prefix == indirect_thunk_prefix_bnd)
28908 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28909 else
28910 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28912 else
28913 output_indirect_thunk (need_prefix, regno);
28915 else
28917 char indirectlabel1[32];
28918 char indirectlabel2[32];
28920 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28921 INDIRECT_LABEL,
28922 indirectlabelno++);
28923 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28924 INDIRECT_LABEL,
28925 indirectlabelno++);
28927 /* Jump. */
28928 if (need_prefix == indirect_thunk_prefix_bnd)
28929 fputs ("\tbnd jmp\t", asm_out_file);
28930 else
28931 fputs ("\tjmp\t", asm_out_file);
28932 assemble_name_raw (asm_out_file, indirectlabel2);
28933 fputc ('\n', asm_out_file);
28935 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28937 /* An external function may be called via GOT, instead of PLT. */
28938 if (MEM_P (call_op))
28940 struct ix86_address parts;
28941 rtx addr = XEXP (call_op, 0);
28942 if (ix86_decompose_address (addr, &parts)
28943 && parts.base == stack_pointer_rtx)
28945 /* Since call will adjust stack by -UNITS_PER_WORD,
28946 we must convert "disp(stack, index, scale)" to
28947 "disp+UNITS_PER_WORD(stack, index, scale)". */
28948 if (parts.index)
28950 addr = gen_rtx_MULT (Pmode, parts.index,
28951 GEN_INT (parts.scale));
28952 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28953 addr);
28955 else
28956 addr = stack_pointer_rtx;
28958 rtx disp;
28959 if (parts.disp != NULL_RTX)
28960 disp = plus_constant (Pmode, parts.disp,
28961 UNITS_PER_WORD);
28962 else
28963 disp = GEN_INT (UNITS_PER_WORD);
28965 addr = gen_rtx_PLUS (Pmode, addr, disp);
28966 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28970 output_asm_insn (push_buf, &call_op);
28972 if (thunk_name != NULL)
28974 if (need_prefix == indirect_thunk_prefix_bnd)
28975 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28976 else
28977 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28979 else
28980 output_indirect_thunk (need_prefix, regno);
28982 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28984 /* Call. */
28985 if (need_prefix == indirect_thunk_prefix_bnd)
28986 fputs ("\tbnd call\t", asm_out_file);
28987 else
28988 fputs ("\tcall\t", asm_out_file);
28989 assemble_name_raw (asm_out_file, indirectlabel1);
28990 fputc ('\n', asm_out_file);
28994 /* Output indirect branch via a call and return thunk. CALL_OP is
28995 the branch target. XASM is the assembly template for CALL_OP.
28996 Branch is a tail call if SIBCALL_P is true. */
28998 static void
28999 ix86_output_indirect_branch (rtx call_op, const char *xasm,
29000 bool sibcall_p)
29002 if (REG_P (call_op))
29003 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
29004 else
29005 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
29008 /* Output indirect jump. CALL_OP is the jump target. */
29010 const char *
29011 ix86_output_indirect_jmp (rtx call_op)
29013 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
29015 /* We can't use the red zone, since the "call" in the indirect thunk
29016 pushes the return address onto the stack, clobbering the red zone. */
29017 if (ix86_red_zone_size != 0)
29018 gcc_unreachable ();
29020 ix86_output_indirect_branch (call_op, "%0", true);
29021 return "";
29023 else
29024 return "%!jmp\t%A0";
29027 /* Output function return.  Add a REP prefix to RET if LONG_P is true
29028 and the function return is kept (not converted to a return thunk). */
29030 const char *
29031 ix86_output_function_return (bool long_p)
29033 if (cfun->machine->function_return_type != indirect_branch_keep)
29035 char thunk_name[32];
29036 enum indirect_thunk_prefix need_prefix
29037 = indirect_thunk_need_prefix (current_output_insn);
29039 if (cfun->machine->function_return_type
29040 != indirect_branch_thunk_inline)
29042 bool need_thunk = (cfun->machine->function_return_type
29043 == indirect_branch_thunk);
29044 indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
29045 true);
29046 if (need_prefix == indirect_thunk_prefix_bnd)
29048 indirect_thunk_bnd_needed |= need_thunk;
29049 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29051 else
29053 indirect_thunk_needed |= need_thunk;
29054 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29057 else
29058 output_indirect_thunk (need_prefix, INVALID_REGNUM);
29060 return "";
29063 if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
29064 return "%!ret";
29066 return "rep%; ret";
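/* For example (a sketch): with -mfunction-return=thunk a function's "ret"
   is expected to be replaced by "jmp __x86_return_thunk", while with the
   default -mfunction-return=keep a "rep ret" may be emitted when LONG_P
   is set.  */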
29069 /* Output indirect function return. RET_OP is the function return
29070 target. */
29072 const char *
29073 ix86_output_indirect_function_return (rtx ret_op)
29075 if (cfun->machine->function_return_type != indirect_branch_keep)
29077 char thunk_name[32];
29078 enum indirect_thunk_prefix need_prefix
29079 = indirect_thunk_need_prefix (current_output_insn);
29080 unsigned int regno = REGNO (ret_op);
29081 gcc_assert (regno == CX_REG);
29083 if (cfun->machine->function_return_type
29084 != indirect_branch_thunk_inline)
29086 bool need_thunk = (cfun->machine->function_return_type
29087 == indirect_branch_thunk);
29088 indirect_thunk_name (thunk_name, regno, need_prefix, true);
29089 if (need_prefix == indirect_thunk_prefix_bnd)
29091 if (need_thunk)
29093 indirect_return_via_cx_bnd = true;
29094 indirect_thunks_bnd_used |= 1 << CX_REG;
29096 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
29098 else
29100 if (need_thunk)
29102 indirect_return_via_cx = true;
29103 indirect_thunks_used |= 1 << CX_REG;
29105 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
29108 else
29109 output_indirect_thunk (need_prefix, regno);
29111 return "";
29113 else
29114 return "%!jmp\t%A0";
29117 /* Split a simple return that pops POPC bytes from the stack into an
29118 indirect branch with an explicit stack adjustment. */
29120 void
29121 ix86_split_simple_return_pop_internal (rtx popc)
29123 struct machine_function *m = cfun->machine;
29124 rtx ecx = gen_rtx_REG (SImode, CX_REG);
29125 rtx_insn *insn;
29127 /* There is no "pascal" calling convention in any 64bit ABI. */
29128 gcc_assert (!TARGET_64BIT);
29130 insn = emit_insn (gen_pop (ecx));
29131 m->fs.cfa_offset -= UNITS_PER_WORD;
29132 m->fs.sp_offset -= UNITS_PER_WORD;
29134 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
29135 x = gen_rtx_SET (stack_pointer_rtx, x);
29136 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29137 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
29138 RTX_FRAME_RELATED_P (insn) = 1;
29140 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
29141 x = gen_rtx_SET (stack_pointer_rtx, x);
29142 insn = emit_insn (x);
29143 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
29144 RTX_FRAME_RELATED_P (insn) = 1;
29146 /* Now return address is in ECX. */
29147 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
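/* A sketch of the resulting code for a 32-bit "ret $12" (the 12 is an
   arbitrary example value of POPC):

	popl	%ecx
	addl	$12, %esp
	jmp	*%ecx

   with CFA notes recording that the return address now lives in %ecx.  */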
29150 /* Output the assembly for a call instruction. */
29152 const char *
29153 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29155 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29156 bool output_indirect_p
29157 = (!TARGET_SEH
29158 && cfun->machine->indirect_branch_type != indirect_branch_keep);
29159 bool seh_nop_p = false;
29160 const char *xasm;
29162 if (SIBLING_CALL_P (insn))
29164 if (direct_p)
29166 if (ix86_nopic_noplt_attribute_p (call_op))
29168 direct_p = false;
29169 if (TARGET_64BIT)
29171 if (output_indirect_p)
29172 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29173 else
29174 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29176 else
29178 if (output_indirect_p)
29179 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29180 else
29181 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29184 else
29185 xasm = "%!jmp\t%P0";
29187 /* SEH epilogue detection requires the indirect branch case
29188 to include REX.W. */
29189 else if (TARGET_SEH)
29190 xasm = "%!rex.W jmp\t%A0";
29191 else
29193 if (output_indirect_p)
29194 xasm = "%0";
29195 else
29196 xasm = "%!jmp\t%A0";
29199 if (output_indirect_p && !direct_p)
29200 ix86_output_indirect_branch (call_op, xasm, true);
29201 else
29202 output_asm_insn (xasm, &call_op);
29203 return "";
29206 /* SEH unwinding can require an extra nop to be emitted in several
29207 circumstances. Determine if we have one of those. */
29208 if (TARGET_SEH)
29210 rtx_insn *i;
29212 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29214 /* If we get to another real insn, we don't need the nop. */
29215 if (INSN_P (i))
29216 break;
29218 /* If we get to the epilogue note, prevent a catch region from
29219 being adjacent to the standard epilogue sequence. If non-
29220 call-exceptions, we'll have done this during epilogue emission. */
29221 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29222 && !flag_non_call_exceptions
29223 && !can_throw_internal (insn))
29225 seh_nop_p = true;
29226 break;
29230 /* If we didn't find a real insn following the call, prevent the
29231 unwinder from looking into the next function. */
29232 if (i == NULL)
29233 seh_nop_p = true;
29236 if (direct_p)
29238 if (ix86_nopic_noplt_attribute_p (call_op))
29240 direct_p = false;
29241 if (TARGET_64BIT)
29243 if (output_indirect_p)
29244 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29245 else
29246 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29248 else
29250 if (output_indirect_p)
29251 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29252 else
29253 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29256 else
29257 xasm = "%!call\t%P0";
29259 else
29261 if (output_indirect_p)
29262 xasm = "%0";
29263 else
29264 xasm = "%!call\t%A0";
29267 if (output_indirect_p && !direct_p)
29268 ix86_output_indirect_branch (call_op, xasm, false);
29269 else
29270 output_asm_insn (xasm, &call_op);
29272 if (seh_nop_p)
29273 return "nop";
29275 return "";
29278 /* Clear stack slot assignments remembered from previous functions.
29279 This is called from INIT_EXPANDERS once before RTL is emitted for each
29280 function. */
29282 static struct machine_function *
29283 ix86_init_machine_status (void)
29285 struct machine_function *f;
29287 f = ggc_cleared_alloc<machine_function> ();
29288 f->call_abi = ix86_abi;
29290 return f;
29293 /* Return a MEM corresponding to a stack slot with mode MODE.
29294 Allocate a new slot if necessary.
29296 The RTL for a function can have several slots available: N is
29297 which slot to use. */
29300 rtx assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29302 struct stack_local_entry *s;
29304 gcc_assert (n < MAX_386_STACK_LOCALS);
29306 for (s = ix86_stack_locals; s; s = s->next)
29307 if (s->mode == mode && s->n == n)
29308 return validize_mem (copy_rtx (s->rtl));
29310 s = ggc_alloc<stack_local_entry> ();
29311 s->n = n;
29312 s->mode = mode;
29313 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29315 s->next = ix86_stack_locals;
29316 ix86_stack_locals = s;
29317 return validize_mem (copy_rtx (s->rtl));
29320 static void
29321 ix86_instantiate_decls (void)
29323 struct stack_local_entry *s;
29325 for (s = ix86_stack_locals; s; s = s->next)
29326 if (s->rtl != NULL_RTX)
29327 instantiate_decl_rtl (s->rtl);
29330 /* Return the number used for encoding REG, in the range 0..7. */
29332 static int
29333 reg_encoded_number (rtx reg)
29335 unsigned regno = REGNO (reg);
29336 switch (regno)
29338 case AX_REG:
29339 return 0;
29340 case CX_REG:
29341 return 1;
29342 case DX_REG:
29343 return 2;
29344 case BX_REG:
29345 return 3;
29346 case SP_REG:
29347 return 4;
29348 case BP_REG:
29349 return 5;
29350 case SI_REG:
29351 return 6;
29352 case DI_REG:
29353 return 7;
29354 default:
29355 break;
29357 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29358 return regno - FIRST_STACK_REG;
29359 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29360 return regno - FIRST_SSE_REG;
29361 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29362 return regno - FIRST_MMX_REG;
29363 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29364 return regno - FIRST_REX_SSE_REG;
29365 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29366 return regno - FIRST_REX_INT_REG;
29367 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29368 return regno - FIRST_MASK_REG;
29369 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29370 return regno - FIRST_BND_REG;
29371 return -1;
29374 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29375 in its encoding if it could be relevant for ROP mitigation, otherwise
29376 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29377 used for calculating it into them. */
29379 static int
29380 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29381 int *popno0 = 0, int *popno1 = 0)
29383 if (asm_noperands (PATTERN (insn)) >= 0)
29384 return -1;
29385 int has_modrm = get_attr_modrm (insn);
29386 if (!has_modrm)
29387 return -1;
29388 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29389 rtx op0, op1;
29390 switch (cls)
29392 case MODRM_CLASS_OP02:
29393 gcc_assert (noperands >= 3);
29394 if (popno0)
29396 *popno0 = 0;
29397 *popno1 = 2;
29399 op0 = operands[0];
29400 op1 = operands[2];
29401 break;
29402 case MODRM_CLASS_OP01:
29403 gcc_assert (noperands >= 2);
29404 if (popno0)
29406 *popno0 = 0;
29407 *popno1 = 1;
29409 op0 = operands[0];
29410 op1 = operands[1];
29411 break;
29412 default:
29413 return -1;
29415 if (REG_P (op0) && REG_P (op1))
29417 int enc0 = reg_encoded_number (op0);
29418 int enc1 = reg_encoded_number (op1);
29419 return 0xc0 + (enc1 << 3) + enc0;
29421 return -1;
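/* A worked example of the formula above: under MODRM_CLASS_OP01 with
   operands[0] in %ebx (encoding 3) and operands[1] in %eax (encoding 0),
   the value returned is 0xc0 + (0 << 3) + 3 == 0xc3.  */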
29424 /* Check whether x86 address PARTS is a pc-relative address. */
29426 bool
29427 ix86_rip_relative_addr_p (struct ix86_address *parts)
29429 rtx base, index, disp;
29431 base = parts->base;
29432 index = parts->index;
29433 disp = parts->disp;
29435 if (disp && !base && !index)
29437 if (TARGET_64BIT)
29439 rtx symbol = disp;
29441 if (GET_CODE (disp) == CONST)
29442 symbol = XEXP (disp, 0);
29443 if (GET_CODE (symbol) == PLUS
29444 && CONST_INT_P (XEXP (symbol, 1)))
29445 symbol = XEXP (symbol, 0);
29447 if (GET_CODE (symbol) == LABEL_REF
29448 || (GET_CODE (symbol) == SYMBOL_REF
29449 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29450 || (GET_CODE (symbol) == UNSPEC
29451 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29452 || XINT (symbol, 1) == UNSPEC_PCREL
29453 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29454 return true;
29457 return false;
29460 /* Calculate the length of the memory address in the instruction encoding.
29461 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29462 or other prefixes. We never generate addr32 prefix for LEA insn. */
29465 int memory_address_length (rtx addr, bool lea)
29467 struct ix86_address parts;
29468 rtx base, index, disp;
29469 int len;
29470 int ok;
29472 if (GET_CODE (addr) == PRE_DEC
29473 || GET_CODE (addr) == POST_INC
29474 || GET_CODE (addr) == PRE_MODIFY
29475 || GET_CODE (addr) == POST_MODIFY)
29476 return 0;
29478 ok = ix86_decompose_address (addr, &parts);
29479 gcc_assert (ok);
29481 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29483 /* If this is not LEA instruction, add the length of addr32 prefix. */
29484 if (TARGET_64BIT && !lea
29485 && (SImode_address_operand (addr, VOIDmode)
29486 || (parts.base && GET_MODE (parts.base) == SImode)
29487 || (parts.index && GET_MODE (parts.index) == SImode)))
29488 len++;
29490 base = parts.base;
29491 index = parts.index;
29492 disp = parts.disp;
29494 if (base && SUBREG_P (base))
29495 base = SUBREG_REG (base);
29496 if (index && SUBREG_P (index))
29497 index = SUBREG_REG (index);
29499 gcc_assert (base == NULL_RTX || REG_P (base));
29500 gcc_assert (index == NULL_RTX || REG_P (index));
29502 /* Rule of thumb:
29503 - esp as the base always wants an index,
29504 - ebp as the base always wants a displacement,
29505 - r12 as the base always wants an index,
29506 - r13 as the base always wants a displacement. */
29508 /* Register Indirect. */
29509 if (base && !index && !disp)
29511 /* esp (for its index) and ebp (for its displacement) need
29512 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29513 code. */
29514 if (base == arg_pointer_rtx
29515 || base == frame_pointer_rtx
29516 || REGNO (base) == SP_REG
29517 || REGNO (base) == BP_REG
29518 || REGNO (base) == R12_REG
29519 || REGNO (base) == R13_REG)
29520 len++;
29523 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29524 is not disp32, but disp32(%rip), so for disp32
29525 a SIB byte is needed, unless print_operand_address
29526 optimizes it into disp32(%rip) or (%rip) is implied
29527 by UNSPEC. */
29528 else if (disp && !base && !index)
29530 len += 4;
29531 if (!ix86_rip_relative_addr_p (&parts))
29532 len++;
29534 else
29536 /* Find the length of the displacement constant. */
29537 if (disp)
29539 if (base && satisfies_constraint_K (disp))
29540 len += 1;
29541 else
29542 len += 4;
29544 /* ebp always wants a displacement. Similarly r13. */
29545 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29546 len++;
29548 /* An index requires the two-byte modrm form.... */
29549 if (index
29550 /* ...like esp (or r12), which always wants an index. */
29551 || base == arg_pointer_rtx
29552 || base == frame_pointer_rtx
29553 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29554 len++;
29557 return len;
29560 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29561 is set, expect that the insn has an 8-bit immediate alternative. */
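/* E.g. with SHORTFORM set, "addl $3, %eax" contributes a 1-byte immediate,
   while "addl $300, %eax" needs a full 4-byte immediate (a sketch; the
   actual operands come from recog_data).  */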
29563 int ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29565 int len = 0;
29566 int i;
29567 extract_insn_cached (insn);
29568 for (i = recog_data.n_operands - 1; i >= 0; --i)
29569 if (CONSTANT_P (recog_data.operand[i]))
29571 enum attr_mode mode = get_attr_mode (insn);
29573 gcc_assert (!len);
29574 if (shortform && CONST_INT_P (recog_data.operand[i]))
29576 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29577 switch (mode)
29579 case MODE_QI:
29580 len = 1;
29581 continue;
29582 case MODE_HI:
29583 ival = trunc_int_for_mode (ival, HImode);
29584 break;
29585 case MODE_SI:
29586 ival = trunc_int_for_mode (ival, SImode);
29587 break;
29588 default:
29589 break;
29591 if (IN_RANGE (ival, -128, 127))
29593 len = 1;
29594 continue;
29597 switch (mode)
29599 case MODE_QI:
29600 len = 1;
29601 break;
29602 case MODE_HI:
29603 len = 2;
29604 break;
29605 case MODE_SI:
29606 len = 4;
29607 break;
29608 /* Immediates for DImode instructions are encoded
29609 as 32bit sign extended values. */
29610 case MODE_DI:
29611 len = 4;
29612 break;
29613 default:
29614 fatal_insn ("unknown insn mode", insn);
29617 return len;
29620 /* Compute default value for "length_address" attribute. */
29622 int ix86_attr_length_address_default (rtx_insn *insn)
29624 int i;
29626 if (get_attr_type (insn) == TYPE_LEA)
29628 rtx set = PATTERN (insn), addr;
29630 if (GET_CODE (set) == PARALLEL)
29631 set = XVECEXP (set, 0, 0);
29633 gcc_assert (GET_CODE (set) == SET);
29635 addr = SET_SRC (set);
29637 return memory_address_length (addr, true);
29640 extract_insn_cached (insn);
29641 for (i = recog_data.n_operands - 1; i >= 0; --i)
29643 rtx op = recog_data.operand[i];
29644 if (MEM_P (op))
29646 constrain_operands_cached (insn, reload_completed);
29647 if (which_alternative != -1)
29649 const char *constraints = recog_data.constraints[i];
29650 int alt = which_alternative;
29652 while (*constraints == '=' || *constraints == '+')
29653 constraints++;
29654 while (alt-- > 0)
29655 while (*constraints++ != ',')
29657 /* Skip ignored operands. */
29658 if (*constraints == 'X')
29659 continue;
29662 int len = memory_address_length (XEXP (op, 0), false);
29664 /* Account for segment prefix for non-default addr spaces. */
29665 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29666 len++;
29668 return len;
29671 return 0;
29674 /* Compute default value for "length_vex" attribute. It includes
29675 2 or 3 byte VEX prefix and 1 opcode byte. */
29678 int ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29679 bool has_vex_w)
29681 int i;
29683 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
29684 requires the 3-byte VEX prefix. */
29685 if (!has_0f_opcode || has_vex_w)
29686 return 3 + 1;
29688 /* We can always use 2 byte VEX prefix in 32bit. */
29689 if (!TARGET_64BIT)
29690 return 2 + 1;
29692 extract_insn_cached (insn);
29694 for (i = recog_data.n_operands - 1; i >= 0; --i)
29695 if (REG_P (recog_data.operand[i]))
29697 /* REX.W bit uses 3 byte VEX prefix. */
29698 if (GET_MODE (recog_data.operand[i]) == DImode
29699 && GENERAL_REG_P (recog_data.operand[i]))
29700 return 3 + 1;
29702 else
29704 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29705 if (MEM_P (recog_data.operand[i])
29706 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29707 return 3 + 1;
29710 return 2 + 1;
29714 static bool
29715 ix86_class_likely_spilled_p (reg_class_t);
29717 /* Return true if the lhs of INSN is a HW function argument register; set
29718 *IS_SPILLED to true if it is a likely-spilled HW register. */
29719 static bool
29720 insn_is_function_arg (rtx insn, bool* is_spilled)
29722 rtx dst;
29724 if (!NONDEBUG_INSN_P (insn))
29725 return false;
29726 /* Call instructions are not movable; ignore them. */
29727 if (CALL_P (insn))
29728 return false;
29729 insn = PATTERN (insn);
29730 if (GET_CODE (insn) == PARALLEL)
29731 insn = XVECEXP (insn, 0, 0);
29732 if (GET_CODE (insn) != SET)
29733 return false;
29734 dst = SET_DEST (insn);
29735 if (REG_P (dst) && HARD_REGISTER_P (dst)
29736 && ix86_function_arg_regno_p (REGNO (dst)))
29738 /* Is it likely spilled HW register? */
29739 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29740 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29741 *is_spilled = true;
29742 return true;
29744 return false;
29747 /* Add output dependencies for a chain of adjacent function arguments, but
29748 only if there is a move to a likely-spilled HW register.  Return the first
29749 argument if at least one dependence was added, or NULL otherwise. */
29750 static rtx_insn *
29751 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29753 rtx_insn *insn;
29754 rtx_insn *last = call;
29755 rtx_insn *first_arg = NULL;
29756 bool is_spilled = false;
29758 head = PREV_INSN (head);
29760 /* Find the argument-passing instruction nearest to the call. */
29761 while (true)
29763 last = PREV_INSN (last);
29764 if (last == head)
29765 return NULL;
29766 if (!NONDEBUG_INSN_P (last))
29767 continue;
29768 if (insn_is_function_arg (last, &is_spilled))
29769 break;
29770 return NULL;
29773 first_arg = last;
29774 while (true)
29776 insn = PREV_INSN (last);
29777 if (!INSN_P (insn))
29778 break;
29779 if (insn == head)
29780 break;
29781 if (!NONDEBUG_INSN_P (insn))
29783 last = insn;
29784 continue;
29786 if (insn_is_function_arg (insn, &is_spilled))
29788 /* Add an output dependence between two function arguments if the chain
29789 of output arguments contains likely-spilled HW registers. */
29790 if (is_spilled)
29791 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29792 first_arg = last = insn;
29794 else
29795 break;
29797 if (!is_spilled)
29798 return NULL;
29799 return first_arg;
29802 /* Add output or anti dependency from insn to first_arg to restrict its code
29803 motion. */
29804 static void
29805 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29807 rtx set;
29808 rtx tmp;
29810 /* Add anti dependencies for bounds stores. */
29811 if (INSN_P (insn)
29812 && GET_CODE (PATTERN (insn)) == PARALLEL
29813 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29814 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29816 add_dependence (first_arg, insn, REG_DEP_ANTI);
29817 return;
29820 set = single_set (insn);
29821 if (!set)
29822 return;
29823 tmp = SET_DEST (set);
29824 if (REG_P (tmp))
29826 /* Add output dependency to the first function argument. */
29827 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29828 return;
29830 /* Add anti dependency. */
29831 add_dependence (first_arg, insn, REG_DEP_ANTI);
29834 /* Avoid cross-block motion of a function argument by adding a dependency
29835 from the first non-jump instruction in bb. */
29836 static void
29837 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29839 rtx_insn *insn = BB_END (bb);
29841 while (insn)
29843 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29845 rtx set = single_set (insn);
29846 if (set)
29848 avoid_func_arg_motion (arg, insn);
29849 return;
29852 if (insn == BB_HEAD (bb))
29853 return;
29854 insn = PREV_INSN (insn);
29858 /* Hook for pre-reload schedule - avoid motion of function arguments
29859 passed in likely spilled HW registers. */
29860 static void
29861 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29863 rtx_insn *insn;
29864 rtx_insn *first_arg = NULL;
29865 if (reload_completed)
29866 return;
29867 while (head != tail && DEBUG_INSN_P (head))
29868 head = NEXT_INSN (head);
29869 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29870 if (INSN_P (insn) && CALL_P (insn))
29872 first_arg = add_parameter_dependencies (insn, head);
29873 if (first_arg)
29875 /* Add a dependee for the first argument to predecessors, but only if the
29876 region contains more than one block. */
29877 basic_block bb = BLOCK_FOR_INSN (insn);
29878 int rgn = CONTAINING_RGN (bb->index);
29879 int nr_blks = RGN_NR_BLOCKS (rgn);
29880 /* Skip trivial regions and region head blocks that can have
29881 predecessors outside of region. */
29882 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29884 edge e;
29885 edge_iterator ei;
29887 /* Regions are SCCs with the exception of selective
29888 scheduling with pipelining of outer blocks enabled.
29889 So also check that immediate predecessors of a non-head
29890 block are in the same region. */
29891 FOR_EACH_EDGE (e, ei, bb->preds)
29893 /* Avoid creating loop-carried dependencies by using the
29894 topological ordering in the region. */
29895 if (rgn == CONTAINING_RGN (e->src->index)
29896 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29897 add_dependee_for_func_arg (first_arg, e->src);
29900 insn = first_arg;
29901 if (insn == head)
29902 break;
29905 else if (first_arg)
29906 avoid_func_arg_motion (first_arg, insn);
29909 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29910 HW registers to maximum, to schedule them as soon as possible. These are
29911 moves from function argument registers at the top of the function entry
29912 and moves from function return value registers after call. */
29913 static int
29914 ix86_adjust_priority (rtx_insn *insn, int priority)
29916 rtx set;
29918 if (reload_completed)
29919 return priority;
29921 if (!NONDEBUG_INSN_P (insn))
29922 return priority;
29924 set = single_set (insn);
29925 if (set)
29927 rtx tmp = SET_SRC (set);
29928 if (REG_P (tmp)
29929 && HARD_REGISTER_P (tmp)
29930 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29931 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29932 return current_sched_info->sched_max_insns_priority;
29935 return priority;
29938 /* Prepare for scheduling pass. */
29939 static void
29940 ix86_sched_init_global (FILE *, int, int)
29942 /* Install scheduling hooks for current CPU. Some of these hooks are used
29943 in time-critical parts of the scheduler, so we only set them up when
29944 they are actually used. */
29945 switch (ix86_tune)
29947 case PROCESSOR_CORE2:
29948 case PROCESSOR_NEHALEM:
29949 case PROCESSOR_SANDYBRIDGE:
29950 case PROCESSOR_HASWELL:
29951 case PROCESSOR_GENERIC:
29952 /* Do not perform multipass scheduling for pre-reload schedule
29953 to save compile time. */
29954 if (reload_completed)
29956 ix86_core2i7_init_hooks ();
29957 break;
29959 /* Fall through. */
29960 default:
29961 targetm.sched.dfa_post_advance_cycle = NULL;
29962 targetm.sched.first_cycle_multipass_init = NULL;
29963 targetm.sched.first_cycle_multipass_begin = NULL;
29964 targetm.sched.first_cycle_multipass_issue = NULL;
29965 targetm.sched.first_cycle_multipass_backtrack = NULL;
29966 targetm.sched.first_cycle_multipass_end = NULL;
29967 targetm.sched.first_cycle_multipass_fini = NULL;
29968 break;
29973 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29975 static HOST_WIDE_INT
29976 ix86_static_rtx_alignment (machine_mode mode)
29978 if (mode == DFmode)
29979 return 64;
29980 if (ALIGN_MODE_128 (mode))
29981 return MAX (128, GET_MODE_ALIGNMENT (mode));
29982 return GET_MODE_ALIGNMENT (mode);
29985 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29987 static HOST_WIDE_INT
29988 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29990 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29991 || TREE_CODE (exp) == INTEGER_CST)
29993 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29994 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29995 return MAX (mode_align, align);
29997 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29998 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29999 return BITS_PER_WORD;
30001 return align;
30004 /* Implement TARGET_EMPTY_RECORD_P. */
30006 static bool
30007 ix86_is_empty_record (const_tree type)
30009 if (!TARGET_64BIT)
30010 return false;
30011 return default_is_empty_record (type);
30014 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
30016 static void
30017 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
30019 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
30021 if (!cum->warn_empty)
30022 return;
30024 if (!TYPE_EMPTY_P (type))
30025 return;
30027 const_tree ctx = get_ultimate_context (cum->decl);
30028 if (ctx != NULL_TREE
30029 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
30030 return;
30032 /* If the actual size of the type is zero, then there is no change
30033 in how objects of this size are passed. */
30034 if (int_size_in_bytes (type) == 0)
30035 return;
30037 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
30038 "changes in -fabi-version=12 (GCC 8)", type);
30040 /* Only warn once. */
30041 cum->warn_empty = false;
30044 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30045 the data type, and ALIGN is the alignment that the object would
30046 ordinarily have. */
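/* For instance, a "long long" or "double" whose default alignment would be
   64 bits is reduced to 32-bit alignment here under -miamcu (a sketch of
   the effect of the switch below).  */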
30048 static int
30049 iamcu_alignment (tree type, int align)
30051 machine_mode mode;
30053 if (align < 32 || TYPE_USER_ALIGN (type))
30054 return align;
30056 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30057 bytes. */
30058 mode = TYPE_MODE (strip_array_types (type));
30059 switch (GET_MODE_CLASS (mode))
30061 case MODE_INT:
30062 case MODE_COMPLEX_INT:
30063 case MODE_COMPLEX_FLOAT:
30064 case MODE_FLOAT:
30065 case MODE_DECIMAL_FLOAT:
30066 return 32;
30067 default:
30068 return align;
30072 /* Compute the alignment for a static variable.
30073 TYPE is the data type, and ALIGN is the alignment that
30074 the object would ordinarily have. The value of this function is used
30075 instead of that alignment to align the object. */
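/* As a rough worked example (assuming a 64-byte prefetch block in the
   active tune costs): a 256-byte static aggregate is raised to cache-line
   (64-byte) alignment when OPT is true, and on x86-64 any array of 16
   bytes or more gets at least 16-byte alignment even when OPT is false.  */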
30078 int ix86_data_alignment (tree type, int align, bool opt)
30080 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30081 for symbols from other compilation units or symbols that don't need
30082 to bind locally. In order to preserve some ABI compatibility with
30083 those compilers, ensure we don't decrease alignment from what we
30084 used to assume. */
30086 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30088 /* A data structure, equal or greater than the size of a cache line
30089 (64 bytes in the Pentium 4 and other recent Intel processors, including
30090 processors based on Intel Core microarchitecture) should be aligned
30091 so that its base address is a multiple of a cache line size. */
30093 int max_align
30094 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30096 if (max_align < BITS_PER_WORD)
30097 max_align = BITS_PER_WORD;
30099 switch (ix86_align_data_type)
30101 case ix86_align_data_type_abi: opt = false; break;
30102 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30103 case ix86_align_data_type_cacheline: break;
30106 if (TARGET_IAMCU)
30107 align = iamcu_alignment (type, align);
30109 if (opt
30110 && AGGREGATE_TYPE_P (type)
30111 && TYPE_SIZE (type)
30112 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30114 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
30115 && align < max_align_compat)
30116 align = max_align_compat;
30117 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
30118 && align < max_align)
30119 align = max_align;
30122 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
30123 to a 16-byte boundary. */
30124 if (TARGET_64BIT)
30126 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30127 && TYPE_SIZE (type)
30128 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30129 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30130 && align < 128)
30131 return 128;
30134 if (!opt)
30135 return align;
30137 if (TREE_CODE (type) == ARRAY_TYPE)
30139 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30140 return 64;
30141 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30142 return 128;
30144 else if (TREE_CODE (type) == COMPLEX_TYPE)
30147 if (TYPE_MODE (type) == DCmode && align < 64)
30148 return 64;
30149 if ((TYPE_MODE (type) == XCmode
30150 || TYPE_MODE (type) == TCmode) && align < 128)
30151 return 128;
30153 else if ((TREE_CODE (type) == RECORD_TYPE
30154 || TREE_CODE (type) == UNION_TYPE
30155 || TREE_CODE (type) == QUAL_UNION_TYPE)
30156 && TYPE_FIELDS (type))
30158 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30159 return 64;
30160 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30161 return 128;
30163 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30164 || TREE_CODE (type) == INTEGER_TYPE)
30166 if (TYPE_MODE (type) == DFmode && align < 64)
30167 return 64;
30168 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30169 return 128;
30172 return align;
30175 /* Compute the alignment for a local variable or a stack slot. EXP is
30176 the data type or decl itself, MODE is the widest mode available and
30177 ALIGN is the alignment that the object would ordinarily have. The
30178 value of this macro is used instead of that alignment to align the
30179 object. */
30181 unsigned int
30182 ix86_local_alignment (tree exp, machine_mode mode,
30183 unsigned int align)
30185 tree type, decl;
30187 if (exp && DECL_P (exp))
30189 type = TREE_TYPE (exp);
30190 decl = exp;
30192 else
30194 type = exp;
30195 decl = NULL;
30198 /* Don't do dynamic stack realignment for long long objects with
30199 -mpreferred-stack-boundary=2. */
30200 if (!TARGET_64BIT
30201 && align == 64
30202 && ix86_preferred_stack_boundary < 64
30203 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30204 && (!type || !TYPE_USER_ALIGN (type))
30205 && (!decl || !DECL_USER_ALIGN (decl)))
30206 align = 32;
30208 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30209 register in MODE. We will return the largest alignment of XF
30210 and DF. */
30211 if (!type)
30213 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30214 align = GET_MODE_ALIGNMENT (DFmode);
30215 return align;
30218 /* Don't increase alignment for Intel MCU psABI. */
30219 if (TARGET_IAMCU)
30220 return align;
30222 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
30223 to a 16-byte boundary.  The exact wording is:
30225 An array uses the same alignment as its elements, except that a local or
30226 global array variable of length at least 16 bytes or
30227 a C99 variable-length array variable always has alignment of at least 16 bytes.
30229 This was added to allow the use of aligned SSE instructions on arrays.  The
30230 rule is meant for static storage (where the compiler cannot do the analysis
30231 by itself).  We follow it for automatic variables only when convenient:
30232 we fully control everything in the function being compiled, and functions
30233 from other units cannot rely on the alignment.
30235 Exclude the va_list type.  It is the common case of a local array where
30236 we cannot benefit from the alignment.
30238 TODO: Probably one should optimize for size only when the variable does not escape. */
30239 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30240 && TARGET_SSE)
30242 if (AGGREGATE_TYPE_P (type)
30243 && (va_list_type_node == NULL_TREE
30244 || (TYPE_MAIN_VARIANT (type)
30245 != TYPE_MAIN_VARIANT (va_list_type_node)))
30246 && TYPE_SIZE (type)
30247 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30248 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30249 && align < 128)
30250 return 128;
30252 if (TREE_CODE (type) == ARRAY_TYPE)
30254 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30255 return 64;
30256 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30257 return 128;
30259 else if (TREE_CODE (type) == COMPLEX_TYPE)
30261 if (TYPE_MODE (type) == DCmode && align < 64)
30262 return 64;
30263 if ((TYPE_MODE (type) == XCmode
30264 || TYPE_MODE (type) == TCmode) && align < 128)
30265 return 128;
30267 else if ((TREE_CODE (type) == RECORD_TYPE
30268 || TREE_CODE (type) == UNION_TYPE
30269 || TREE_CODE (type) == QUAL_UNION_TYPE)
30270 && TYPE_FIELDS (type))
30272 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30273 return 64;
30274 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30275 return 128;
30277 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30278 || TREE_CODE (type) == INTEGER_TYPE)
30281 if (TYPE_MODE (type) == DFmode && align < 64)
30282 return 64;
30283 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30284 return 128;
30286 return align;
30289 /* Compute the minimum required alignment for dynamic stack realignment
30290 purposes for a local variable, parameter or a stack slot. EXP is
30291 the data type or decl itself, MODE is its mode and ALIGN is the
30292 alignment that the object would ordinarily have. */
30294 unsigned int
30295 ix86_minimum_alignment (tree exp, machine_mode mode,
30296 unsigned int align)
30298 tree type, decl;
30300 if (exp && DECL_P (exp))
30302 type = TREE_TYPE (exp);
30303 decl = exp;
30305 else
30307 type = exp;
30308 decl = NULL;
30311 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30312 return align;
30314 /* Don't do dynamic stack realignment for long long objects with
30315 -mpreferred-stack-boundary=2. */
30316 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30317 && (!type || !TYPE_USER_ALIGN (type))
30318 && (!decl || !DECL_USER_ALIGN (decl)))
30320 gcc_checking_assert (!TARGET_STV);
30321 return 32;
30324 return align;
30327 /* Find a location for the static chain incoming to a nested function.
30328 This is a register, unless all free registers are used by arguments. */
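/* For instance (a sketch of the cases below): on x86-64 the static chain
   is always %r10; in 32-bit code it is normally %ecx, a fastcall or
   thiscall target gets %eax instead, and a regparm(3) target receives the
   chain on the stack via an alternate entry point that pushes %esi.  */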
30330 static rtx
30331 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30333 unsigned regno;
30335 if (TARGET_64BIT)
30337 /* We always use R10 in 64-bit mode. */
30338 regno = R10_REG;
30340 else
30342 const_tree fntype, fndecl;
30343 unsigned int ccvt;
30345 /* By default in 32-bit mode we use ECX to pass the static chain. */
30346 regno = CX_REG;
30348 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30350 fntype = TREE_TYPE (fndecl_or_type);
30351 fndecl = fndecl_or_type;
30353 else
30355 fntype = fndecl_or_type;
30356 fndecl = NULL;
30359 ccvt = ix86_get_callcvt (fntype);
30360 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30362 /* Fastcall functions use ecx/edx for arguments, which leaves
30363 us with EAX for the static chain.
30364 Thiscall functions use ecx for arguments, which also
30365 leaves us with EAX for the static chain. */
30366 regno = AX_REG;
30368 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30370 /* Thiscall functions use ecx for arguments, which leaves
30371 us with EAX and EDX for the static chain.
30372 For ABI compatibility we use EAX. */
30373 regno = AX_REG;
30375 else if (ix86_function_regparm (fntype, fndecl) == 3)
30377 /* For regparm 3, we have no free call-clobbered registers in
30378 which to store the static chain. In order to implement this,
30379 we have the trampoline push the static chain to the stack.
30380 However, we can't push a value below the return address when
30381 we call the nested function directly, so we have to use an
30382 alternate entry point. For this we use ESI, and have the
30383 alternate entry point push ESI, so that things appear the
30384 same once we're executing the nested function. */
30385 if (incoming_p)
30387 if (fndecl == current_function_decl
30388 && !ix86_static_chain_on_stack)
30390 gcc_assert (!reload_completed);
30391 ix86_static_chain_on_stack = true;
30393 return gen_frame_mem (SImode,
30394 plus_constant (Pmode,
30395 arg_pointer_rtx, -8));
30397 regno = SI_REG;
30401 return gen_rtx_REG (Pmode, regno);
30404 /* Emit RTL insns to initialize the variable parts of a trampoline.
30405 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30406 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30407 to be passed to the target function. */
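/* An illustrative sketch of the 64-bit bytes written below when ptr_mode
   == DImode (offsets and operands abbreviated):

	49 bb <8-byte fnaddr>	movabs $fnaddr, %r11
	49 ba <8-byte chain>	movabs $chain, %r10
	49 ff e3 90		rex.W jmp *%r11; nop

   When a 32-bit immediate suffices (e.g. ptr_mode == SImode), the shorter
   movl encodings 41 bb / 41 ba are used instead.  */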
30409 static void
30410 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30412 rtx mem, fnaddr;
30413 int opcode;
30414 int offset = 0;
30416 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30418 if (TARGET_64BIT)
30420 int size;
30422 /* Load the function address to r11. Try to load address using
30423 the shorter movl instead of movabs. We may want to support
30424 movq for kernel mode, but kernel does not use trampolines at
30425 the moment. FNADDR is a 32bit address and may not be in
30426 DImode when ptr_mode == SImode. Always use movl in this
30427 case. */
30428 if (ptr_mode == SImode
30429 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30431 fnaddr = copy_addr_to_reg (fnaddr);
30433 mem = adjust_address (m_tramp, HImode, offset);
30434 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30436 mem = adjust_address (m_tramp, SImode, offset + 2);
30437 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30438 offset += 6;
30440 else
30442 mem = adjust_address (m_tramp, HImode, offset);
30443 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30445 mem = adjust_address (m_tramp, DImode, offset + 2);
30446 emit_move_insn (mem, fnaddr);
30447 offset += 10;
30450 /* Load static chain using movabs to r10. Use the shorter movl
30451 instead of movabs when ptr_mode == SImode. */
30452 if (ptr_mode == SImode)
30454 opcode = 0xba41;
30455 size = 6;
30457 else
30459 opcode = 0xba49;
30460 size = 10;
30463 mem = adjust_address (m_tramp, HImode, offset);
30464 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30466 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30467 emit_move_insn (mem, chain_value);
30468 offset += size;
30470 /* Jump to r11; the last (unused) byte is a nop, only there to
30471 pad the write out to a single 32-bit store. */
30472 mem = adjust_address (m_tramp, SImode, offset);
30473 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30474 offset += 4;
30476 else
30478 rtx disp, chain;
30480 /* Depending on the static chain location, either load a register
30481 with a constant, or push the constant to the stack. All of the
30482 instructions are the same size. */
30483 chain = ix86_static_chain (fndecl, true);
30484 if (REG_P (chain))
30486 switch (REGNO (chain))
30488 case AX_REG:
30489 opcode = 0xb8; break;
30490 case CX_REG:
30491 opcode = 0xb9; break;
30492 default:
30493 gcc_unreachable ();
30496 else
30497 opcode = 0x68;
30499 mem = adjust_address (m_tramp, QImode, offset);
30500 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30502 mem = adjust_address (m_tramp, SImode, offset + 1);
30503 emit_move_insn (mem, chain_value);
30504 offset += 5;
30506 mem = adjust_address (m_tramp, QImode, offset);
30507 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30509 mem = adjust_address (m_tramp, SImode, offset + 1);
30511 /* Compute offset from the end of the jmp to the target function.
30512 In the case in which the trampoline stores the static chain on
30513 the stack, we need to skip the first insn which pushes the
30514 (call-saved) register static chain; this push is 1 byte. */
30515 offset += 5;
30516 disp = expand_binop (SImode, sub_optab, fnaddr,
30517 plus_constant (Pmode, XEXP (m_tramp, 0),
30518 offset - (MEM_P (chain) ? 1 : 0)),
30519 NULL_RTX, 1, OPTAB_DIRECT);
30520 emit_move_insn (mem, disp);
30523 gcc_assert (offset <= TRAMPOLINE_SIZE);
30525 #ifdef HAVE_ENABLE_EXECUTE_STACK
30526 #ifdef CHECK_EXECUTE_STACK_ENABLED
30527 if (CHECK_EXECUTE_STACK_ENABLED)
30528 #endif
30529 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30530 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30531 #endif
30534 static bool
30535 ix86_allocate_stack_slots_for_args (void)
30537 /* Naked functions should not allocate stack slots for arguments. */
30538 return !ix86_function_naked (current_function_decl);
30541 static bool
30542 ix86_warn_func_return (tree decl)
30544 /* Naked functions are implemented entirely in assembly, including the
30545 return sequence, so suppress warnings about this. */
30546 return !ix86_function_naked (decl);
30549 /* The following file contains several enumerations and data structures
30550 built from the definitions in i386-builtin-types.def. */
30552 #include "i386-builtin-types.inc"
30554 /* Table for the ix86 builtin non-function types. */
30555 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30557 /* Retrieve an element from the above table, building some of
30558 the types lazily. */
30560 static tree
30561 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30563 unsigned int index;
30564 tree type, itype;
30566 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30568 type = ix86_builtin_type_tab[(int) tcode];
30569 if (type != NULL)
30570 return type;
30572 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30573 if (tcode <= IX86_BT_LAST_VECT)
30575 machine_mode mode;
30577 index = tcode - IX86_BT_LAST_PRIM - 1;
30578 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30579 mode = ix86_builtin_type_vect_mode[index];
30581 type = build_vector_type_for_mode (itype, mode);
30583 else
30585 int quals;
30587 index = tcode - IX86_BT_LAST_VECT - 1;
30588 if (tcode <= IX86_BT_LAST_PTR)
30589 quals = TYPE_UNQUALIFIED;
30590 else
30591 quals = TYPE_QUAL_CONST;
30593 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30594 if (quals != TYPE_UNQUALIFIED)
30595 itype = build_qualified_type (itype, quals);
30597 type = build_pointer_type (itype);
30600 ix86_builtin_type_tab[(int) tcode] = type;
30601 return type;
30604 /* Table for the ix86 builtin function types. */
30605 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30607 /* Retrieve an element from the above table, building some of
30608 the types lazily. */
30610 static tree
30611 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30613 tree type;
30615 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30617 type = ix86_builtin_func_type_tab[(int) tcode];
30618 if (type != NULL)
30619 return type;
30621 if (tcode <= IX86_BT_LAST_FUNC)
30623 unsigned start = ix86_builtin_func_start[(int) tcode];
30624 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30625 tree rtype, atype, args = void_list_node;
30626 unsigned i;
30628 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30629 for (i = after - 1; i > start; --i)
30631 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30632 args = tree_cons (NULL, atype, args);
30635 type = build_function_type (rtype, args);
30637 else
30639 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30640 enum ix86_builtin_func_type icode;
30642 icode = ix86_builtin_func_alias_base[index];
30643 type = ix86_get_builtin_func_type (icode);
30646 ix86_builtin_func_type_tab[(int) tcode] = type;
30647 return type;
30651 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30652 bdesc_* arrays below should come first, then builtins for each bdesc_*
30653 array in ascending order, so that we can use direct array accesses. */
30654 enum ix86_builtins
30656 IX86_BUILTIN_MASKMOVQ,
30657 IX86_BUILTIN_LDMXCSR,
30658 IX86_BUILTIN_STMXCSR,
30659 IX86_BUILTIN_MASKMOVDQU,
30660 IX86_BUILTIN_PSLLDQ128,
30661 IX86_BUILTIN_CLFLUSH,
30662 IX86_BUILTIN_MONITOR,
30663 IX86_BUILTIN_MWAIT,
30664 IX86_BUILTIN_CLZERO,
30665 IX86_BUILTIN_VEC_INIT_V2SI,
30666 IX86_BUILTIN_VEC_INIT_V4HI,
30667 IX86_BUILTIN_VEC_INIT_V8QI,
30668 IX86_BUILTIN_VEC_EXT_V2DF,
30669 IX86_BUILTIN_VEC_EXT_V2DI,
30670 IX86_BUILTIN_VEC_EXT_V4SF,
30671 IX86_BUILTIN_VEC_EXT_V4SI,
30672 IX86_BUILTIN_VEC_EXT_V8HI,
30673 IX86_BUILTIN_VEC_EXT_V2SI,
30674 IX86_BUILTIN_VEC_EXT_V4HI,
30675 IX86_BUILTIN_VEC_EXT_V16QI,
30676 IX86_BUILTIN_VEC_SET_V2DI,
30677 IX86_BUILTIN_VEC_SET_V4SF,
30678 IX86_BUILTIN_VEC_SET_V4SI,
30679 IX86_BUILTIN_VEC_SET_V8HI,
30680 IX86_BUILTIN_VEC_SET_V4HI,
30681 IX86_BUILTIN_VEC_SET_V16QI,
30682 IX86_BUILTIN_GATHERSIV2DF,
30683 IX86_BUILTIN_GATHERSIV4DF,
30684 IX86_BUILTIN_GATHERDIV2DF,
30685 IX86_BUILTIN_GATHERDIV4DF,
30686 IX86_BUILTIN_GATHERSIV4SF,
30687 IX86_BUILTIN_GATHERSIV8SF,
30688 IX86_BUILTIN_GATHERDIV4SF,
30689 IX86_BUILTIN_GATHERDIV8SF,
30690 IX86_BUILTIN_GATHERSIV2DI,
30691 IX86_BUILTIN_GATHERSIV4DI,
30692 IX86_BUILTIN_GATHERDIV2DI,
30693 IX86_BUILTIN_GATHERDIV4DI,
30694 IX86_BUILTIN_GATHERSIV4SI,
30695 IX86_BUILTIN_GATHERSIV8SI,
30696 IX86_BUILTIN_GATHERDIV4SI,
30697 IX86_BUILTIN_GATHERDIV8SI,
30698 IX86_BUILTIN_VFMSUBSD3_MASK3,
30699 IX86_BUILTIN_VFMSUBSS3_MASK3,
30700 IX86_BUILTIN_GATHER3SIV8SF,
30701 IX86_BUILTIN_GATHER3SIV4SF,
30702 IX86_BUILTIN_GATHER3SIV4DF,
30703 IX86_BUILTIN_GATHER3SIV2DF,
30704 IX86_BUILTIN_GATHER3DIV8SF,
30705 IX86_BUILTIN_GATHER3DIV4SF,
30706 IX86_BUILTIN_GATHER3DIV4DF,
30707 IX86_BUILTIN_GATHER3DIV2DF,
30708 IX86_BUILTIN_GATHER3SIV8SI,
30709 IX86_BUILTIN_GATHER3SIV4SI,
30710 IX86_BUILTIN_GATHER3SIV4DI,
30711 IX86_BUILTIN_GATHER3SIV2DI,
30712 IX86_BUILTIN_GATHER3DIV8SI,
30713 IX86_BUILTIN_GATHER3DIV4SI,
30714 IX86_BUILTIN_GATHER3DIV4DI,
30715 IX86_BUILTIN_GATHER3DIV2DI,
30716 IX86_BUILTIN_SCATTERSIV8SF,
30717 IX86_BUILTIN_SCATTERSIV4SF,
30718 IX86_BUILTIN_SCATTERSIV4DF,
30719 IX86_BUILTIN_SCATTERSIV2DF,
30720 IX86_BUILTIN_SCATTERDIV8SF,
30721 IX86_BUILTIN_SCATTERDIV4SF,
30722 IX86_BUILTIN_SCATTERDIV4DF,
30723 IX86_BUILTIN_SCATTERDIV2DF,
30724 IX86_BUILTIN_SCATTERSIV8SI,
30725 IX86_BUILTIN_SCATTERSIV4SI,
30726 IX86_BUILTIN_SCATTERSIV4DI,
30727 IX86_BUILTIN_SCATTERSIV2DI,
30728 IX86_BUILTIN_SCATTERDIV8SI,
30729 IX86_BUILTIN_SCATTERDIV4SI,
30730 IX86_BUILTIN_SCATTERDIV4DI,
30731 IX86_BUILTIN_SCATTERDIV2DI,
30732 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30733 where all operands are 32-byte or 64-byte wide respectively. */
30734 IX86_BUILTIN_GATHERALTSIV4DF,
30735 IX86_BUILTIN_GATHERALTDIV8SF,
30736 IX86_BUILTIN_GATHERALTSIV4DI,
30737 IX86_BUILTIN_GATHERALTDIV8SI,
30738 IX86_BUILTIN_GATHER3ALTDIV16SF,
30739 IX86_BUILTIN_GATHER3ALTDIV16SI,
30740 IX86_BUILTIN_GATHER3ALTSIV4DF,
30741 IX86_BUILTIN_GATHER3ALTDIV8SF,
30742 IX86_BUILTIN_GATHER3ALTSIV4DI,
30743 IX86_BUILTIN_GATHER3ALTDIV8SI,
30744 IX86_BUILTIN_GATHER3ALTSIV8DF,
30745 IX86_BUILTIN_GATHER3ALTSIV8DI,
30746 IX86_BUILTIN_GATHER3DIV16SF,
30747 IX86_BUILTIN_GATHER3DIV16SI,
30748 IX86_BUILTIN_GATHER3DIV8DF,
30749 IX86_BUILTIN_GATHER3DIV8DI,
30750 IX86_BUILTIN_GATHER3SIV16SF,
30751 IX86_BUILTIN_GATHER3SIV16SI,
30752 IX86_BUILTIN_GATHER3SIV8DF,
30753 IX86_BUILTIN_GATHER3SIV8DI,
30754 IX86_BUILTIN_SCATTERALTSIV8DF,
30755 IX86_BUILTIN_SCATTERALTDIV16SF,
30756 IX86_BUILTIN_SCATTERALTSIV8DI,
30757 IX86_BUILTIN_SCATTERALTDIV16SI,
30758 IX86_BUILTIN_SCATTERDIV16SF,
30759 IX86_BUILTIN_SCATTERDIV16SI,
30760 IX86_BUILTIN_SCATTERDIV8DF,
30761 IX86_BUILTIN_SCATTERDIV8DI,
30762 IX86_BUILTIN_SCATTERSIV16SF,
30763 IX86_BUILTIN_SCATTERSIV16SI,
30764 IX86_BUILTIN_SCATTERSIV8DF,
30765 IX86_BUILTIN_SCATTERSIV8DI,
30766 IX86_BUILTIN_GATHERPFQPD,
30767 IX86_BUILTIN_GATHERPFDPS,
30768 IX86_BUILTIN_GATHERPFDPD,
30769 IX86_BUILTIN_GATHERPFQPS,
30770 IX86_BUILTIN_SCATTERPFDPD,
30771 IX86_BUILTIN_SCATTERPFDPS,
30772 IX86_BUILTIN_SCATTERPFQPD,
30773 IX86_BUILTIN_SCATTERPFQPS,
30774 IX86_BUILTIN_CLWB,
30775 IX86_BUILTIN_CLFLUSHOPT,
30776 IX86_BUILTIN_INFQ,
30777 IX86_BUILTIN_HUGE_VALQ,
30778 IX86_BUILTIN_NANQ,
30779 IX86_BUILTIN_NANSQ,
30780 IX86_BUILTIN_XABORT,
30781 IX86_BUILTIN_ADDCARRYX32,
30782 IX86_BUILTIN_ADDCARRYX64,
30783 IX86_BUILTIN_SBB32,
30784 IX86_BUILTIN_SBB64,
30785 IX86_BUILTIN_RDRAND16_STEP,
30786 IX86_BUILTIN_RDRAND32_STEP,
30787 IX86_BUILTIN_RDRAND64_STEP,
30788 IX86_BUILTIN_RDSEED16_STEP,
30789 IX86_BUILTIN_RDSEED32_STEP,
30790 IX86_BUILTIN_RDSEED64_STEP,
30791 IX86_BUILTIN_MONITORX,
30792 IX86_BUILTIN_MWAITX,
30793 IX86_BUILTIN_CFSTRING,
30794 IX86_BUILTIN_CPU_INIT,
30795 IX86_BUILTIN_CPU_IS,
30796 IX86_BUILTIN_CPU_SUPPORTS,
30797 IX86_BUILTIN_READ_FLAGS,
30798 IX86_BUILTIN_WRITE_FLAGS,
30800 /* All the remaining builtins are tracked in bdesc_* arrays in
30801 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30802 this point. */
30803 #define BDESC(mask, icode, name, code, comparison, flag) \
30804 code,
30805 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30806 code, \
30807 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30808 #define BDESC_END(kind, next_kind)
30810 #include "i386-builtin.def"
30812 #undef BDESC
30813 #undef BDESC_FIRST
30814 #undef BDESC_END
30816 IX86_BUILTIN_MAX,
30818 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30820 /* Now just the aliases for bdesc_* start/end. */
30821 #define BDESC(mask, icode, name, code, comparison, flag)
30822 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30823 #define BDESC_END(kind, next_kind) \
30824 IX86_BUILTIN__BDESC_##kind##_LAST \
30825 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30827 #include "i386-builtin.def"
30829 #undef BDESC
30830 #undef BDESC_FIRST
30831 #undef BDESC_END
30833 /* Just to make sure there is no comma after the last enumerator. */
30834 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30837 /* Table for the ix86 builtin decls. */
30838 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30840 /* Table of all the builtin functions that are possible with different ISAs
30841 but are waiting to be built until a function is declared to use that
30842 ISA. */
30843 struct builtin_isa {
30844 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30845 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30846 const char *name; /* function name */
30847 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30848 unsigned char const_p:1; /* true if the declaration is constant */
30849 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30850 bool leaf_p; /* true if the declaration has leaf attribute */
30851 bool nothrow_p; /* true if the declaration has nothrow attribute */
30852 bool set_and_not_built_p;
30855 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30857 /* Bits that can still enable any inclusion of a builtin. */
30858 static HOST_WIDE_INT deferred_isa_values = 0;
30859 static HOST_WIDE_INT deferred_isa_values2 = 0;
30861 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30862 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30863 function decl in the ix86_builtins array. Returns the function decl or
30864 NULL_TREE, if the builtin was not added.
30866 If the front end has a special hook for builtin functions, delay adding
30867 builtin functions that aren't in the current ISA until the ISA is changed
30868 with function specific optimization. Doing so can save about 300K for the
30869 default compiler. When the builtin is expanded, check at that time whether
30870 it is valid.
30872 If the front end doesn't have a special hook, record all builtins, even if
30873 their instruction set isn't in the current ISA, in case the user uses
30874 function specific options for a different ISA, so that we don't get scope
30875 errors if a builtin is added in the middle of a function scope. */
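/* A minimal usage sketch (the builtin name and enum value here are
   hypothetical, for illustration only):
     def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_example",
		  V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_EXAMPLE);
   If AVX2 is already in ix86_isa_flags (or the front end uses the
   ext_scope hook), the decl is built at once and stored in ix86_builtins[];
   otherwise only the name, type and ISA mask are recorded in
   ix86_builtins_isa[] and the mask is added to deferred_isa_values so that
   ix86_add_new_builtins can create the decl later.  */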
30877 static inline tree
30878 def_builtin (HOST_WIDE_INT mask, const char *name,
30879 enum ix86_builtin_func_type tcode,
30880 enum ix86_builtins code)
30882 tree decl = NULL_TREE;
30884 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30886 ix86_builtins_isa[(int) code].isa = mask;
30888 mask &= ~OPTION_MASK_ISA_64BIT;
30890 /* Filter out the masks most often ORed together with others. */
30891 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30892 && mask != OPTION_MASK_ISA_AVX512VL)
30893 mask &= ~OPTION_MASK_ISA_AVX512VL;
30894 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30895 && mask != OPTION_MASK_ISA_AVX512BW)
30896 mask &= ~OPTION_MASK_ISA_AVX512BW;
30898 if (mask == 0
30899 || (mask & ix86_isa_flags) != 0
30900 || (lang_hooks.builtin_function
30901 == lang_hooks.builtin_function_ext_scope))
30903 tree type = ix86_get_builtin_func_type (tcode);
30904 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30905 NULL, NULL_TREE);
30906 ix86_builtins[(int) code] = decl;
30907 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30909 else
30911 /* Only a MASK with set_and_not_built_p == true can potentially
30912 enable a builtin later. */
30913 deferred_isa_values |= mask;
30914 ix86_builtins[(int) code] = NULL_TREE;
30915 ix86_builtins_isa[(int) code].tcode = tcode;
30916 ix86_builtins_isa[(int) code].name = name;
30917 ix86_builtins_isa[(int) code].leaf_p = false;
30918 ix86_builtins_isa[(int) code].nothrow_p = false;
30919 ix86_builtins_isa[(int) code].const_p = false;
30920 ix86_builtins_isa[(int) code].pure_p = false;
30921 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30925 return decl;
30928 /* Like def_builtin, but also marks the function decl "const". */
30930 static inline tree
30931 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30932 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30934 tree decl = def_builtin (mask, name, tcode, code);
30935 if (decl)
30936 TREE_READONLY (decl) = 1;
30937 else
30938 ix86_builtins_isa[(int) code].const_p = true;
30940 return decl;
30943 /* Like def_builtin, but also marks the function decl "pure". */
30945 static inline tree
30946 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30947 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30949 tree decl = def_builtin (mask, name, tcode, code);
30950 if (decl)
30951 DECL_PURE_P (decl) = 1;
30952 else
30953 ix86_builtins_isa[(int) code].pure_p = true;
30955 return decl;
30958 /* Like def_builtin, but for additional isa2 flags. */
30960 static inline tree
30961 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30962 enum ix86_builtin_func_type tcode,
30963 enum ix86_builtins code)
30965 tree decl = NULL_TREE;
30967 ix86_builtins_isa[(int) code].isa2 = mask;
30969 if (mask == 0
30970 || (mask & ix86_isa_flags2) != 0
30971 || (lang_hooks.builtin_function
30972 == lang_hooks.builtin_function_ext_scope))
30975 tree type = ix86_get_builtin_func_type (tcode);
30976 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30977 NULL, NULL_TREE);
30978 ix86_builtins[(int) code] = decl;
30979 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30981 else
30983 /* Only a MASK with set_and_not_built_p == true can potentially
30984 enable a builtin later. */
30985 deferred_isa_values2 |= mask;
30986 ix86_builtins[(int) code] = NULL_TREE;
30987 ix86_builtins_isa[(int) code].tcode = tcode;
30988 ix86_builtins_isa[(int) code].name = name;
30989 ix86_builtins_isa[(int) code].leaf_p = false;
30990 ix86_builtins_isa[(int) code].nothrow_p = false;
30991 ix86_builtins_isa[(int) code].const_p = false;
30992 ix86_builtins_isa[(int) code].pure_p = false;
30993 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30996 return decl;
30999 /* Like def_builtin, but also marks the function decl "const". */
31001 static inline tree
31002 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
31003 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31005 tree decl = def_builtin2 (mask, name, tcode, code);
31006 if (decl)
31007 TREE_READONLY (decl) = 1;
31008 else
31009 ix86_builtins_isa[(int) code].const_p = true;
31011 return decl;
31014 /* Like def_builtin, but also marks the function decl "pure". */
31016 static inline tree
31017 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
31018 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
31020 tree decl = def_builtin2 (mask, name, tcode, code);
31021 if (decl)
31022 DECL_PURE_P (decl) = 1;
31023 else
31024 ix86_builtins_isa[(int) code].pure_p = true;
31026 return decl;
31029 /* Add any new builtin functions for a given ISA that may not have been
31030 declared. This saves a bit of space compared to adding all of the
31031 declarations to the tree, even if we didn't use them. */
31033 static void
31034 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
31036 isa &= ~OPTION_MASK_ISA_64BIT;
31038 if ((isa & deferred_isa_values) == 0
31039 && (isa2 & deferred_isa_values2) == 0)
31040 return;
31042 /* Bits in ISA value can be removed from potential isa values. */
31043 deferred_isa_values &= ~isa;
31044 deferred_isa_values2 &= ~isa2;
31046 int i;
31047 tree saved_current_target_pragma = current_target_pragma;
31048 current_target_pragma = NULL_TREE;
31050 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
31052 if (((ix86_builtins_isa[i].isa & isa) != 0
31053 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
31054 && ix86_builtins_isa[i].set_and_not_built_p)
31056 tree decl, type;
31058 /* Don't define the builtin again. */
31059 ix86_builtins_isa[i].set_and_not_built_p = false;
31061 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
31062 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
31063 type, i, BUILT_IN_MD, NULL,
31064 NULL_TREE);
31066 ix86_builtins[i] = decl;
31067 if (ix86_builtins_isa[i].const_p)
31068 TREE_READONLY (decl) = 1;
31069 if (ix86_builtins_isa[i].pure_p)
31070 DECL_PURE_P (decl) = 1;
31071 if (ix86_builtins_isa[i].leaf_p)
31072 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31073 NULL_TREE);
31074 if (ix86_builtins_isa[i].nothrow_p)
31075 TREE_NOTHROW (decl) = 1;
31079 current_target_pragma = saved_current_target_pragma;
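/* As a rough example (hedged; the exact call sites are not shown here),
   compiling a TU with plain -msse2 that contains

     __attribute__((target ("avx2"))) void f (void);

   or a "#pragma GCC target" enabling AVX2 is expected to reach this
   function with the newly enabled ISA bits, at which point any builtins
   deferred by def_builtin/def_builtin2 above finally get their decls via
   add_builtin_function_ext_scope.  */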
31082 /* Bits for builtin_description.flag. */
31084 /* Set when we don't support the comparison natively, and should
31085 swap_comparison in order to support it. */
31086 #define BUILTIN_DESC_SWAP_OPERANDS 1
31088 struct builtin_description
31090 const HOST_WIDE_INT mask;
31091 const enum insn_code icode;
31092 const char *const name;
31093 const enum ix86_builtins code;
31094 const enum rtx_code comparison;
31095 const int flag;
31098 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
31099 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
31100 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
31101 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
31102 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
31103 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
31104 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31105 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31106 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31107 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31108 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31109 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31110 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31111 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31112 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31113 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31114 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31115 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31116 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31117 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31118 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31119 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31120 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31121 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31122 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31123 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31124 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31125 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31126 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31127 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31128 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31129 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31130 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31131 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31132 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31133 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31134 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31135 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31136 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31137 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31138 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31139 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31140 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31141 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31142 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31143 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31144 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31145 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31146 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31147 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31148 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31149 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31151 #define BDESC(mask, icode, name, code, comparison, flag) \
31152 { mask, icode, name, code, comparison, flag },
31153 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31154 static const struct builtin_description bdesc_##kind[] = \
31156 BDESC (mask, icode, name, code, comparison, flag)
31157 #define BDESC_END(kind, next_kind) \
31160 #include "i386-builtin.def"
31162 #undef BDESC
31163 #undef BDESC_FIRST
31164 #undef BDESC_END
31166 /* TM vector builtins. */
31168 /* Reuse the existing x86-specific `struct builtin_description' because
31169 we're lazy. Add casts to make them fit. */
31170 static const struct builtin_description bdesc_tm[] =
31172 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31173 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31174 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31175 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31176 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31177 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31178 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31180 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31181 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31182 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31183 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31184 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31185 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31186 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31188 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31189 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31190 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31191 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31192 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31193 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31194 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31196 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31197 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31198 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31201 /* Initialize the transactional memory vector load/store builtins. */
31203 static void
31204 ix86_init_tm_builtins (void)
31206 enum ix86_builtin_func_type ftype;
31207 const struct builtin_description *d;
31208 size_t i;
31209 tree decl;
31210 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31211 tree attrs_log, attrs_type_log;
31213 if (!flag_tm)
31214 return;
31216 /* If there are no builtins defined, we must be compiling in a
31217 language without trans-mem support. */
31218 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31219 return;
31221 /* Use whatever attributes a normal TM load has. */
31222 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31223 attrs_load = DECL_ATTRIBUTES (decl);
31224 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31225 /* Use whatever attributes a normal TM store has. */
31226 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31227 attrs_store = DECL_ATTRIBUTES (decl);
31228 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31229 /* Use whatever attributes a normal TM log has. */
31230 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31231 attrs_log = DECL_ATTRIBUTES (decl);
31232 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31234 for (i = 0, d = bdesc_tm;
31235 i < ARRAY_SIZE (bdesc_tm);
31236 i++, d++)
31238 if ((d->mask & ix86_isa_flags) != 0
31239 || (lang_hooks.builtin_function
31240 == lang_hooks.builtin_function_ext_scope))
31242 tree type, attrs, attrs_type;
31243 enum built_in_function code = (enum built_in_function) d->code;
31245 ftype = (enum ix86_builtin_func_type) d->flag;
31246 type = ix86_get_builtin_func_type (ftype);
31248 if (BUILTIN_TM_LOAD_P (code))
31250 attrs = attrs_load;
31251 attrs_type = attrs_type_load;
31253 else if (BUILTIN_TM_STORE_P (code))
31255 attrs = attrs_store;
31256 attrs_type = attrs_type_store;
31258 else
31260 attrs = attrs_log;
31261 attrs_type = attrs_type_log;
31263 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31264 /* The builtin without the prefix for
31265 calling it directly. */
31266 d->name + strlen ("__builtin_"),
31267 attrs);
31268 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31269 set the TYPE_ATTRIBUTES. */
31270 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31272 set_builtin_decl (code, decl, false);
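/* Note (illustrative): since add_builtin_function receives
   d->name + strlen ("__builtin_") as the library name, an entry such as
   "__builtin__ITM_WM128" above can also be called directly as _ITM_WM128,
   matching the _ITM_* symbols the transactional-memory runtime is expected
   to provide.  */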
31277 /* Macros for verification of enum ix86_builtins order. */
31278 #define BDESC_VERIFY(x, y, z) \
31279 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31280 #define BDESC_VERIFYS(x, y, z) \
31281 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31283 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31284 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31285 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31286 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31287 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31288 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31289 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31290 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31291 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31292 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31293 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31294 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31295 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31296 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31297 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31298 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
31299 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31300 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31301 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31302 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31303 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31304 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31305 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31306 IX86_BUILTIN__BDESC_CET_LAST, 1);
31307 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31308 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31310 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31311 in the current target ISA, to allow the user to compile particular modules
31312 with target specific options that differ from the command line
31313 options. */
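/* A hedged example of the use case: with only -msse2 on the command line,
   a module may still define

     __attribute__((target ("avx2")))
     void f (void);

   whose body uses AVX2 intrinsics; the builtins behind those intrinsics
   must therefore be registered (or at least deferred) here rather than
   only for the command-line ISA.  */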
31314 static void
31315 ix86_init_mmx_sse_builtins (void)
31317 const struct builtin_description * d;
31318 enum ix86_builtin_func_type ftype;
31319 size_t i;
31321 /* Add all special builtins with variable number of operands. */
31322 for (i = 0, d = bdesc_special_args;
31323 i < ARRAY_SIZE (bdesc_special_args);
31324 i++, d++)
31326 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31327 if (d->name == 0)
31328 continue;
31330 ftype = (enum ix86_builtin_func_type) d->flag;
31331 def_builtin (d->mask, d->name, ftype, d->code);
31333 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31334 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31335 ARRAY_SIZE (bdesc_special_args) - 1);
31337 /* Add all special builtins with variable number of operands. */
31338 for (i = 0, d = bdesc_special_args2;
31339 i < ARRAY_SIZE (bdesc_special_args2);
31340 i++, d++)
31342 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
31343 if (d->name == 0)
31344 continue;
31346 ftype = (enum ix86_builtin_func_type) d->flag;
31347 def_builtin2 (d->mask, d->name, ftype, d->code);
31349 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
31350 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31351 ARRAY_SIZE (bdesc_special_args2) - 1);
31353 /* Add all builtins with variable number of operands. */
31354 for (i = 0, d = bdesc_args;
31355 i < ARRAY_SIZE (bdesc_args);
31356 i++, d++)
31358 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31359 if (d->name == 0)
31360 continue;
31362 ftype = (enum ix86_builtin_func_type) d->flag;
31363 def_builtin_const (d->mask, d->name, ftype, d->code);
31365 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31366 IX86_BUILTIN__BDESC_ARGS_FIRST,
31367 ARRAY_SIZE (bdesc_args) - 1);
31369 /* Add all builtins with variable number of operands. */
31370 for (i = 0, d = bdesc_args2;
31371 i < ARRAY_SIZE (bdesc_args2);
31372 i++, d++)
31374 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31375 if (d->name == 0)
31376 continue;
31378 ftype = (enum ix86_builtin_func_type) d->flag;
31379 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31381 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31382 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31383 ARRAY_SIZE (bdesc_args2) - 1);
31385 /* Add all builtins with rounding. */
31386 for (i = 0, d = bdesc_round_args;
31387 i < ARRAY_SIZE (bdesc_round_args);
31388 i++, d++)
31390 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31391 if (d->name == 0)
31392 continue;
31394 ftype = (enum ix86_builtin_func_type) d->flag;
31395 def_builtin_const (d->mask, d->name, ftype, d->code);
31397 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31398 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31399 ARRAY_SIZE (bdesc_round_args) - 1);
31401 /* pcmpestr[im] insns. */
31402 for (i = 0, d = bdesc_pcmpestr;
31403 i < ARRAY_SIZE (bdesc_pcmpestr);
31404 i++, d++)
31406 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31407 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31408 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31409 else
31410 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31411 def_builtin_const (d->mask, d->name, ftype, d->code);
31413 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31414 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31415 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31417 /* pcmpistr[im] insns. */
31418 for (i = 0, d = bdesc_pcmpistr;
31419 i < ARRAY_SIZE (bdesc_pcmpistr);
31420 i++, d++)
31422 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31423 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31424 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31425 else
31426 ftype = INT_FTYPE_V16QI_V16QI_INT;
31427 def_builtin_const (d->mask, d->name, ftype, d->code);
31429 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31430 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31431 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31433 /* comi/ucomi insns. */
31434 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31436 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31437 if (d->mask == OPTION_MASK_ISA_SSE2)
31438 ftype = INT_FTYPE_V2DF_V2DF;
31439 else
31440 ftype = INT_FTYPE_V4SF_V4SF;
31441 def_builtin_const (d->mask, d->name, ftype, d->code);
31443 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31444 IX86_BUILTIN__BDESC_COMI_FIRST,
31445 ARRAY_SIZE (bdesc_comi) - 1);
31447 /* SSE */
31448 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31449 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31450 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31451 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31453 /* SSE or 3DNow!A */
31454 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31455 /* As it uses V4HImode, we have to require -mmmx too. */
31456 | OPTION_MASK_ISA_MMX,
31457 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31458 IX86_BUILTIN_MASKMOVQ);
31460 /* SSE2 */
31461 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31462 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31464 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31465 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31466 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31467 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31469 /* SSE3. */
31470 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31471 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31472 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31473 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31475 /* AES */
31476 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31477 "__builtin_ia32_aesenc128",
31478 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31479 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31480 "__builtin_ia32_aesenclast128",
31481 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31482 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31483 "__builtin_ia32_aesdec128",
31484 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31485 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31486 "__builtin_ia32_aesdeclast128",
31487 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31488 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31489 "__builtin_ia32_aesimc128",
31490 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31491 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31492 "__builtin_ia32_aeskeygenassist128",
31493 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31495 /* PCLMUL */
31496 def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2,
31497 "__builtin_ia32_pclmulqdq128",
31498 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31500 /* RDRND */
31501 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31502 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31503 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31504 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31505 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31506 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31507 IX86_BUILTIN_RDRAND64_STEP);
31509 /* AVX2 */
31510 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31511 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31512 IX86_BUILTIN_GATHERSIV2DF);
31514 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31515 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31516 IX86_BUILTIN_GATHERSIV4DF);
31518 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31519 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31520 IX86_BUILTIN_GATHERDIV2DF);
31522 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31523 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31524 IX86_BUILTIN_GATHERDIV4DF);
31526 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31527 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31528 IX86_BUILTIN_GATHERSIV4SF);
31530 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31531 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31532 IX86_BUILTIN_GATHERSIV8SF);
31534 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31535 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31536 IX86_BUILTIN_GATHERDIV4SF);
31538 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31539 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31540 IX86_BUILTIN_GATHERDIV8SF);
31542 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31543 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31544 IX86_BUILTIN_GATHERSIV2DI);
31546 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31547 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31548 IX86_BUILTIN_GATHERSIV4DI);
31550 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31551 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31552 IX86_BUILTIN_GATHERDIV2DI);
31554 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31555 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31556 IX86_BUILTIN_GATHERDIV4DI);
31558 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31559 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31560 IX86_BUILTIN_GATHERSIV4SI);
31562 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31563 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31564 IX86_BUILTIN_GATHERSIV8SI);
31566 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31567 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31568 IX86_BUILTIN_GATHERDIV4SI);
31570 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31571 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31572 IX86_BUILTIN_GATHERDIV8SI);
31574 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31575 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31576 IX86_BUILTIN_GATHERALTSIV4DF);
31578 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31579 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31580 IX86_BUILTIN_GATHERALTDIV8SF);
31582 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31583 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31584 IX86_BUILTIN_GATHERALTSIV4DI);
31586 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31587 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31588 IX86_BUILTIN_GATHERALTDIV8SI);
31590 /* AVX512F */
31591 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31592 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31593 IX86_BUILTIN_GATHER3SIV16SF);
31595 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31596 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31597 IX86_BUILTIN_GATHER3SIV8DF);
31599 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31600 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31601 IX86_BUILTIN_GATHER3DIV16SF);
31603 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31604 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31605 IX86_BUILTIN_GATHER3DIV8DF);
31607 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31608 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31609 IX86_BUILTIN_GATHER3SIV16SI);
31611 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31612 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31613 IX86_BUILTIN_GATHER3SIV8DI);
31615 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31616 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31617 IX86_BUILTIN_GATHER3DIV16SI);
31619 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31620 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31621 IX86_BUILTIN_GATHER3DIV8DI);
31623 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31624 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31625 IX86_BUILTIN_GATHER3ALTSIV8DF);
31627 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31628 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31629 IX86_BUILTIN_GATHER3ALTDIV16SF);
31631 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31632 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31633 IX86_BUILTIN_GATHER3ALTSIV8DI);
31635 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31636 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31637 IX86_BUILTIN_GATHER3ALTDIV16SI);
31639 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31640 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31641 IX86_BUILTIN_SCATTERSIV16SF);
31643 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31644 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31645 IX86_BUILTIN_SCATTERSIV8DF);
31647 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31648 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31649 IX86_BUILTIN_SCATTERDIV16SF);
31651 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31652 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31653 IX86_BUILTIN_SCATTERDIV8DF);
31655 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31656 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31657 IX86_BUILTIN_SCATTERSIV16SI);
31659 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31660 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31661 IX86_BUILTIN_SCATTERSIV8DI);
31663 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31664 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31665 IX86_BUILTIN_SCATTERDIV16SI);
31667 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31668 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31669 IX86_BUILTIN_SCATTERDIV8DI);
31671 /* AVX512VL */
31672 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31673 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31674 IX86_BUILTIN_GATHER3SIV2DF);
31676 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31677 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31678 IX86_BUILTIN_GATHER3SIV4DF);
31680 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31681 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31682 IX86_BUILTIN_GATHER3DIV2DF);
31684 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31685 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31686 IX86_BUILTIN_GATHER3DIV4DF);
31688 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31689 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31690 IX86_BUILTIN_GATHER3SIV4SF);
31692 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31693 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31694 IX86_BUILTIN_GATHER3SIV8SF);
31696 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31697 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31698 IX86_BUILTIN_GATHER3DIV4SF);
31700 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31701 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31702 IX86_BUILTIN_GATHER3DIV8SF);
31704 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31705 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31706 IX86_BUILTIN_GATHER3SIV2DI);
31708 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31709 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31710 IX86_BUILTIN_GATHER3SIV4DI);
31712 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31713 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31714 IX86_BUILTIN_GATHER3DIV2DI);
31716 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31717 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31718 IX86_BUILTIN_GATHER3DIV4DI);
31720 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31721 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31722 IX86_BUILTIN_GATHER3SIV4SI);
31724 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31725 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31726 IX86_BUILTIN_GATHER3SIV8SI);
31728 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31729 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31730 IX86_BUILTIN_GATHER3DIV4SI);
31732 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31733 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31734 IX86_BUILTIN_GATHER3DIV8SI);
31736 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31737 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31738 IX86_BUILTIN_GATHER3ALTSIV4DF);
31740 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31741 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31742 IX86_BUILTIN_GATHER3ALTDIV8SF);
31744 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31745 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31746 IX86_BUILTIN_GATHER3ALTSIV4DI);
31748 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31749 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31750 IX86_BUILTIN_GATHER3ALTDIV8SI);
31752 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31753 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31754 IX86_BUILTIN_SCATTERSIV8SF);
31756 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31757 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31758 IX86_BUILTIN_SCATTERSIV4SF);
31760 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31761 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31762 IX86_BUILTIN_SCATTERSIV4DF);
31764 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31765 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31766 IX86_BUILTIN_SCATTERSIV2DF);
31768 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31769 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31770 IX86_BUILTIN_SCATTERDIV8SF);
31772 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31773 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31774 IX86_BUILTIN_SCATTERDIV4SF);
31776 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31777 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31778 IX86_BUILTIN_SCATTERDIV4DF);
31780 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31781 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31782 IX86_BUILTIN_SCATTERDIV2DF);
31784 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31785 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31786 IX86_BUILTIN_SCATTERSIV8SI);
31788 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31789 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31790 IX86_BUILTIN_SCATTERSIV4SI);
31792 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31793 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31794 IX86_BUILTIN_SCATTERSIV4DI);
31796 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31797 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31798 IX86_BUILTIN_SCATTERSIV2DI);
31800 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31801 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31802 IX86_BUILTIN_SCATTERDIV8SI);
31804 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31805 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31806 IX86_BUILTIN_SCATTERDIV4SI);
31808 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31809 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31810 IX86_BUILTIN_SCATTERDIV4DI);
31812 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31813 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31814 IX86_BUILTIN_SCATTERDIV2DI);
31815 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31816 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31817 IX86_BUILTIN_SCATTERALTSIV8DF);
31819 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31820 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31821 IX86_BUILTIN_SCATTERALTDIV16SF);
31823 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31824 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31825 IX86_BUILTIN_SCATTERALTSIV8DI);
31827 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31828 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31829 IX86_BUILTIN_SCATTERALTDIV16SI);
31831 /* AVX512PF */
31832 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31833 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31834 IX86_BUILTIN_GATHERPFDPD);
31835 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31836 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31837 IX86_BUILTIN_GATHERPFDPS);
31838 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31839 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31840 IX86_BUILTIN_GATHERPFQPD);
31841 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31842 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31843 IX86_BUILTIN_GATHERPFQPS);
31844 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31845 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31846 IX86_BUILTIN_SCATTERPFDPD);
31847 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31848 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31849 IX86_BUILTIN_SCATTERPFDPS);
31850 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31851 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31852 IX86_BUILTIN_SCATTERPFQPD);
31853 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31854 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31855 IX86_BUILTIN_SCATTERPFQPS);
31857 /* SHA */
31858 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31859 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31860 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31861 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31862 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31863 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31864 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31865 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31866 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31867 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31868 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31869 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31870 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31871 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31873 /* RTM. */
31874 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31875 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31877 /* MMX access to the vec_init patterns. */
31878 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31879 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31881 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31882 V4HI_FTYPE_HI_HI_HI_HI,
31883 IX86_BUILTIN_VEC_INIT_V4HI);
31885 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31886 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31887 IX86_BUILTIN_VEC_INIT_V8QI);
31889 /* Access to the vec_extract patterns. */
31890 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31891 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31892 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31893 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31894 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31895 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31896 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31897 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31898 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31899 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31901 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31902 /* As it uses V4HImode, we have to require -mmmx too. */
31903 | OPTION_MASK_ISA_MMX,
31904 "__builtin_ia32_vec_ext_v4hi",
31905 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31907 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31908 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31910 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31911 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31913 /* Access to the vec_set patterns. */
31914 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31915 "__builtin_ia32_vec_set_v2di",
31916 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31918 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31919 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31921 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31922 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31924 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31925 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31927 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31928 /* As it uses V4HImode, we have to require -mmmx too. */
31929 | OPTION_MASK_ISA_MMX,
31930 "__builtin_ia32_vec_set_v4hi",
31931 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31933 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31934 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31936 /* RDSEED */
31937 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31938 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31939 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31940 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31941 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31942 "__builtin_ia32_rdseed_di_step",
31943 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31945 /* ADCX */
31946 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31947 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31948 def_builtin (OPTION_MASK_ISA_64BIT,
31949 "__builtin_ia32_addcarryx_u64",
31950 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31951 IX86_BUILTIN_ADDCARRYX64);
31953 /* SBB */
31954 def_builtin (0, "__builtin_ia32_sbb_u32",
31955 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31956 def_builtin (OPTION_MASK_ISA_64BIT,
31957 "__builtin_ia32_sbb_u64",
31958 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31959 IX86_BUILTIN_SBB64);
31961 /* Read/write FLAGS. */
31962 def_builtin (0, "__builtin_ia32_readeflags_u32",
31963 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31964 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31965 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31966 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31967 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31968 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31969 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31971 /* CLFLUSHOPT. */
31972 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31973 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31975 /* CLWB. */
31976 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31977 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31979 /* MONITORX and MWAITX. */
31980 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31981 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31982 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31983 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31985 /* CLZERO. */
31986 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31987 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31989 /* Add FMA4 multi-arg instructions. */
31990 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31992 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31993 if (d->name == 0)
31994 continue;
31996 ftype = (enum ix86_builtin_func_type) d->flag;
31997 def_builtin_const (d->mask, d->name, ftype, d->code);
31999 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
32000 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32001 ARRAY_SIZE (bdesc_multi_arg) - 1);
32003 /* Add CET intrinsics. */
32004 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
32006 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
32007 if (d->name == 0)
32008 continue;
32010 ftype = (enum ix86_builtin_func_type) d->flag;
32011 def_builtin (d->mask, d->name, ftype, d->code);
32013 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
32014 IX86_BUILTIN__BDESC_CET_FIRST,
32015 ARRAY_SIZE (bdesc_cet) - 1);
32017 for (i = 0, d = bdesc_cet_rdssp;
32018 i < ARRAY_SIZE (bdesc_cet_rdssp);
32019 i++, d++)
32021 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
32022 if (d->name == 0)
32023 continue;
32025 ftype = (enum ix86_builtin_func_type) d->flag;
32026 def_builtin (d->mask, d->name, ftype, d->code);
32028 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
32029 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
32030 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
32033 static void
32034 ix86_init_mpx_builtins ()
32036 const struct builtin_description * d;
32037 enum ix86_builtin_func_type ftype;
32038 tree decl;
32039 size_t i;
32041 for (i = 0, d = bdesc_mpx;
32042 i < ARRAY_SIZE (bdesc_mpx);
32043 i++, d++)
32045 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
32046 if (d->name == 0)
32047 continue;
32049 ftype = (enum ix86_builtin_func_type) d->flag;
32050 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
32052 /* Without the leaf and nothrow flags on MPX builtins,
32053 abnormal edges may follow their calls when setjmp
32054 is present in the function. Since we may have a lot
32055 of MPX builtin calls, this causes lots of useless
32056 edges and enormous PHI nodes. To avoid this we mark
32057 MPX builtins as leaf and nothrow. */
32058 if (decl)
32060 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32061 NULL_TREE);
32062 TREE_NOTHROW (decl) = 1;
32064 else
32066 ix86_builtins_isa[(int)d->code].leaf_p = true;
32067 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32070 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
32071 IX86_BUILTIN__BDESC_MPX_FIRST,
32072 ARRAY_SIZE (bdesc_mpx) - 1);
32074 for (i = 0, d = bdesc_mpx_const;
32075 i < ARRAY_SIZE (bdesc_mpx_const);
32076 i++, d++)
32078 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
32079 if (d->name == 0)
32080 continue;
32082 ftype = (enum ix86_builtin_func_type) d->flag;
32083 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
32085 if (decl)
32087 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32088 NULL_TREE);
32089 TREE_NOTHROW (decl) = 1;
32091 else
32093 ix86_builtins_isa[(int)d->code].leaf_p = true;
32094 ix86_builtins_isa[(int)d->code].nothrow_p = true;
32097 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
32098 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32099 ARRAY_SIZE (bdesc_mpx_const) - 1);
32101 #undef BDESC_VERIFY
32102 #undef BDESC_VERIFYS
32104 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
32105 to return a pointer to VERSION_DECL if the outcome of the expression
32106 formed by PREDICATE_CHAIN is true. This function will be called during
32107 version dispatch to decide which function version to execute. It returns
32108 the basic block at the end, to which more conditions can be added. */
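/* Roughly, for a single predicate the generated code looks like the
   following sketch (illustrative GIMPLE-like pseudo code, not a literal
   dump; "foo.avx2" stands for some version_decl):

     cond_var = __builtin_cpu_supports ("avx2");
     if (cond_var > 0)
       {
	 result_var = (void *) &foo.avx2;
	 return result_var;
       }

   and the basic block reached when the condition is false is returned so
   the caller can chain the next condition or the default version.  */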
32110 static basic_block
32111 add_condition_to_bb (tree function_decl, tree version_decl,
32112 tree predicate_chain, basic_block new_bb)
32114 gimple *return_stmt;
32115 tree convert_expr, result_var;
32116 gimple *convert_stmt;
32117 gimple *call_cond_stmt;
32118 gimple *if_else_stmt;
32120 basic_block bb1, bb2, bb3;
32121 edge e12, e23;
32123 tree cond_var, and_expr_var = NULL_TREE;
32124 gimple_seq gseq;
32126 tree predicate_decl, predicate_arg;
32128 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
32130 gcc_assert (new_bb != NULL);
32131 gseq = bb_seq (new_bb);
32134 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
32135 build_fold_addr_expr (version_decl));
32136 result_var = create_tmp_var (ptr_type_node);
32137 convert_stmt = gimple_build_assign (result_var, convert_expr);
32138 return_stmt = gimple_build_return (result_var);
32140 if (predicate_chain == NULL_TREE)
32142 gimple_seq_add_stmt (&gseq, convert_stmt);
32143 gimple_seq_add_stmt (&gseq, return_stmt);
32144 set_bb_seq (new_bb, gseq);
32145 gimple_set_bb (convert_stmt, new_bb);
32146 gimple_set_bb (return_stmt, new_bb);
32147 pop_cfun ();
32148 return new_bb;
32151 while (predicate_chain != NULL)
32153 cond_var = create_tmp_var (integer_type_node);
32154 predicate_decl = TREE_PURPOSE (predicate_chain);
32155 predicate_arg = TREE_VALUE (predicate_chain);
32156 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
32157 gimple_call_set_lhs (call_cond_stmt, cond_var);
32159 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
32160 gimple_set_bb (call_cond_stmt, new_bb);
32161 gimple_seq_add_stmt (&gseq, call_cond_stmt);
32163 predicate_chain = TREE_CHAIN (predicate_chain);
32165 if (and_expr_var == NULL)
32166 and_expr_var = cond_var;
32167 else
32169 gimple *assign_stmt;
32170 /* Use MIN_EXPR to check whether any integer is zero:
32171 and_expr_var = min_expr <cond_var, and_expr_var> */
32172 assign_stmt = gimple_build_assign (and_expr_var,
32173 build2 (MIN_EXPR, integer_type_node,
32174 cond_var, and_expr_var));
32176 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
32177 gimple_set_bb (assign_stmt, new_bb);
32178 gimple_seq_add_stmt (&gseq, assign_stmt);
32182 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
32183 integer_zero_node,
32184 NULL_TREE, NULL_TREE);
32185 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
32186 gimple_set_bb (if_else_stmt, new_bb);
32187 gimple_seq_add_stmt (&gseq, if_else_stmt);
32189 gimple_seq_add_stmt (&gseq, convert_stmt);
32190 gimple_seq_add_stmt (&gseq, return_stmt);
32191 set_bb_seq (new_bb, gseq);
32193 bb1 = new_bb;
32194 e12 = split_block (bb1, if_else_stmt);
32195 bb2 = e12->dest;
32196 e12->flags &= ~EDGE_FALLTHRU;
32197 e12->flags |= EDGE_TRUE_VALUE;
32199 e23 = split_block (bb2, return_stmt);
32201 gimple_set_bb (convert_stmt, bb2);
32202 gimple_set_bb (return_stmt, bb2);
32204 bb3 = e23->dest;
32205 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32207 remove_edge (e23);
32208 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32210 pop_cfun ();
32212 return bb3;
32215 /* This parses the attribute arguments to target in DECL and determines
32216 the right builtin to use to match the platform specification.
32217 It returns the priority value for this version decl. If PREDICATE_LIST
32218 is not NULL, it stores the list of cpu features that need to be checked
32219 before dispatching this function. */
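/* A hedged example (hypothetical attribute string): for a version declared
   with __attribute__((target ("arch=haswell,popcnt"))) this records
   __builtin_cpu_is ("haswell") and __builtin_cpu_supports ("popcnt") on
   *PREDICATE_LIST and returns P_PROC_AVX2, the higher of the two candidate
   priorities (arch=haswell maps to P_PROC_AVX2, which outranks P_POPCNT).  */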
32221 static unsigned int
32222 get_builtin_code_for_version (tree decl, tree *predicate_list)
32224 tree attrs;
32225 struct cl_target_option cur_target;
32226 tree target_node;
32227 struct cl_target_option *new_target;
32228 const char *arg_str = NULL;
32229 const char *attrs_str = NULL;
32230 char *tok_str = NULL;
32231 char *token;
32233 /* Priority of i386 features, greater value is higher priority. This is
32234 used to decide the order in which function dispatch must happen. For
32235 instance, a version specialized for SSE4.2 should be checked for dispatch
32236 before a version for SSE3, as SSE4.2 implies SSE3. */
32237 enum feature_priority
32239 P_ZERO = 0,
32240 P_MMX,
32241 P_SSE,
32242 P_SSE2,
32243 P_SSE3,
32244 P_SSSE3,
32245 P_PROC_SSSE3,
32246 P_SSE4_A,
32247 P_PROC_SSE4_A,
32248 P_SSE4_1,
32249 P_SSE4_2,
32250 P_PROC_SSE4_2,
32251 P_POPCNT,
32252 P_AES,
32253 P_PCLMUL,
32254 P_AVX,
32255 P_PROC_AVX,
32256 P_BMI,
32257 P_PROC_BMI,
32258 P_FMA4,
32259 P_XOP,
32260 P_PROC_XOP,
32261 P_FMA,
32262 P_PROC_FMA,
32263 P_BMI2,
32264 P_AVX2,
32265 P_PROC_AVX2,
32266 P_AVX512F,
32267 P_PROC_AVX512F
32270 enum feature_priority priority = P_ZERO;
32272 /* These are the target attribute strings for which a dispatcher is
32273 available, from fold_builtin_cpu. */
32275 static struct _feature_list
32277 const char *const name;
32278 const enum feature_priority priority;
32280 const feature_list[] =
32282 {"mmx", P_MMX},
32283 {"sse", P_SSE},
32284 {"sse2", P_SSE2},
32285 {"sse3", P_SSE3},
32286 {"sse4a", P_SSE4_A},
32287 {"ssse3", P_SSSE3},
32288 {"sse4.1", P_SSE4_1},
32289 {"sse4.2", P_SSE4_2},
32290 {"popcnt", P_POPCNT},
32291 {"aes", P_AES},
32292 {"pclmul", P_PCLMUL},
32293 {"avx", P_AVX},
32294 {"bmi", P_BMI},
32295 {"fma4", P_FMA4},
32296 {"xop", P_XOP},
32297 {"fma", P_FMA},
32298 {"bmi2", P_BMI2},
32299 {"avx2", P_AVX2},
32300 {"avx512f", P_AVX512F}
32304 static unsigned int NUM_FEATURES
32305 = sizeof (feature_list) / sizeof (struct _feature_list);
32307 unsigned int i;
32309 tree predicate_chain = NULL_TREE;
32310 tree predicate_decl, predicate_arg;
32312 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32313 gcc_assert (attrs != NULL);
32315 attrs = TREE_VALUE (TREE_VALUE (attrs));
32317 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32318 attrs_str = TREE_STRING_POINTER (attrs);
32320 /* Return priority zero for default function. */
32321 if (strcmp (attrs_str, "default") == 0)
32322 return 0;
32324 /* Handle arch= if specified. For priority, set it to be 1 more than
32325 the best instruction set the processor can handle. For instance, if
32326 there is a version for atom and a version for ssse3 (the highest ISA
32327 priority for atom), the atom version must be checked for dispatch
32328 before the ssse3 version. */
32329 if (strstr (attrs_str, "arch=") != NULL)
32331 cl_target_option_save (&cur_target, &global_options);
32332 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32333 &global_options_set);
32335 gcc_assert (target_node);
32336 new_target = TREE_TARGET_OPTION (target_node);
32337 gcc_assert (new_target);
32339 if (new_target->arch_specified && new_target->arch > 0)
32341 switch (new_target->arch)
32343 case PROCESSOR_CORE2:
32344 arg_str = "core2";
32345 priority = P_PROC_SSSE3;
32346 break;
32347 case PROCESSOR_NEHALEM:
32348 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32350 arg_str = "westmere";
32351 priority = P_AES;
32353 else
32355 /* We translate "arch=corei7" and "arch=nehalem" to
32356 "corei7" so that it will be mapped to M_INTEL_COREI7
32357 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32358 arg_str = "corei7";
32359 priority = P_PROC_SSE4_2;
32361 break;
32362 case PROCESSOR_SANDYBRIDGE:
32363 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32364 arg_str = "ivybridge";
32365 else
32366 arg_str = "sandybridge";
32367 priority = P_PROC_AVX;
32368 break;
32369 case PROCESSOR_HASWELL:
32370 case PROCESSOR_SKYLAKE_AVX512:
32371 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_GFNI)
32372 arg_str = "icelake";
32373 else if (new_target->x_ix86_isa_flags
32374 & OPTION_MASK_ISA_AVX512VBMI)
32375 arg_str = "cannonlake";
32376 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32377 arg_str = "skylake-avx512";
32378 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32379 arg_str = "skylake";
32380 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32381 arg_str = "broadwell";
32382 else
32383 arg_str = "haswell";
32384 priority = P_PROC_AVX2;
32385 break;
32386 case PROCESSOR_BONNELL:
32387 arg_str = "bonnell";
32388 priority = P_PROC_SSSE3;
32389 break;
32390 case PROCESSOR_KNL:
32391 arg_str = "knl";
32392 priority = P_PROC_AVX512F;
32393 break;
32394 case PROCESSOR_KNM:
32395 arg_str = "knm";
32396 priority = P_PROC_AVX512F;
32397 break;
32398 case PROCESSOR_SILVERMONT:
32399 arg_str = "silvermont";
32400 priority = P_PROC_SSE4_2;
32401 break;
32402 case PROCESSOR_AMDFAM10:
32403 arg_str = "amdfam10h";
32404 priority = P_PROC_SSE4_A;
32405 break;
32406 case PROCESSOR_BTVER1:
32407 arg_str = "btver1";
32408 priority = P_PROC_SSE4_A;
32409 break;
32410 case PROCESSOR_BTVER2:
32411 arg_str = "btver2";
32412 priority = P_PROC_BMI;
32413 break;
32414 case PROCESSOR_BDVER1:
32415 arg_str = "bdver1";
32416 priority = P_PROC_XOP;
32417 break;
32418 case PROCESSOR_BDVER2:
32419 arg_str = "bdver2";
32420 priority = P_PROC_FMA;
32421 break;
32422 case PROCESSOR_BDVER3:
32423 arg_str = "bdver3";
32424 priority = P_PROC_FMA;
32425 break;
32426 case PROCESSOR_BDVER4:
32427 arg_str = "bdver4";
32428 priority = P_PROC_AVX2;
32429 break;
32430 case PROCESSOR_ZNVER1:
32431 arg_str = "znver1";
32432 priority = P_PROC_AVX2;
32433 break;
32437 cl_target_option_restore (&global_options, &cur_target);
32439 if (predicate_list && arg_str == NULL)
32441 error_at (DECL_SOURCE_LOCATION (decl),
32442 "No dispatcher found for the versioning attributes");
32443 return 0;
32446 if (predicate_list)
32448 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32449 /* For a C string literal the length includes the trailing NULL. */
32450 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32451 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32452 predicate_chain);
32456 /* Process feature name. */
32457 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32458 strcpy (tok_str, attrs_str);
32459 token = strtok (tok_str, ",");
32460 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32462 while (token != NULL)
32464 /* Do not process "arch=". */
32465 if (strncmp (token, "arch=", 5) == 0)
32467 token = strtok (NULL, ",");
32468 continue;
32470 for (i = 0; i < NUM_FEATURES; ++i)
32472 if (strcmp (token, feature_list[i].name) == 0)
32474 if (predicate_list)
32476 predicate_arg = build_string_literal (
32477 strlen (feature_list[i].name) + 1,
32478 feature_list[i].name);
32479 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32480 predicate_chain);
32482 /* Find the maximum priority feature. */
32483 if (feature_list[i].priority > priority)
32484 priority = feature_list[i].priority;
32486 break;
32489 if (predicate_list && i == NUM_FEATURES)
32491 error_at (DECL_SOURCE_LOCATION (decl),
32492 "No dispatcher found for %s", token);
32493 return 0;
32495 token = strtok (NULL, ",");
32497 free (tok_str);
32499 if (predicate_list && predicate_chain == NULL_TREE)
32501 error_at (DECL_SOURCE_LOCATION (decl),
32502 "No dispatcher found for the versioning attributes : %s",
32503 attrs_str);
32504 return 0;
32506 else if (predicate_list)
32508 predicate_chain = nreverse (predicate_chain);
32509 *predicate_list = predicate_chain;
32512 return priority;
32515 /* This compares the priority of target features in function DECL1
32516 and DECL2. It returns positive value if DECL1 is higher priority,
32517 negative value if DECL2 is higher priority and 0 if they are the
32518 same. */
32520 static int
32521 ix86_compare_version_priority (tree decl1, tree decl2)
32523 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32524 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32526 return (int)priority1 - (int)priority2;
32529 /* V1 and V2 point to function versions with different priorities
32530 based on the target ISA. This function compares their priorities. */
32532 static int
32533 feature_compare (const void *v1, const void *v2)
32535 typedef struct _function_version_info
32537 tree version_decl;
32538 tree predicate_chain;
32539 unsigned int dispatch_priority;
32540 } function_version_info;
32542 const function_version_info c1 = *(const function_version_info *)v1;
32543 const function_version_info c2 = *(const function_version_info *)v2;
32544 return (c2.dispatch_priority - c1.dispatch_priority);
32547 /* This function generates the dispatch function for
32548 multi-versioned functions. DISPATCH_DECL is the function which will
32549 contain the dispatch logic. FNDECLS are the function choices for
32550 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32551 in DISPATCH_DECL in which the dispatch code is generated. */
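/* Roughly speaking, the resolver body built here ends up equivalent to
   (a sketch, not literal generated code; the predicate names are
   placeholders):

     __builtin_cpu_init ();
     if (predicate_for_version_1 ())  return &foo_version_1;
     if (predicate_for_version_2 ())  return &foo_version_2;
     return &foo_default;

   with the versions tried in decreasing dispatch priority.  */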
32553 static int
32554 dispatch_function_versions (tree dispatch_decl,
32555 void *fndecls_p,
32556 basic_block *empty_bb)
32558 tree default_decl;
32559 gimple *ifunc_cpu_init_stmt;
32560 gimple_seq gseq;
32561 int ix;
32562 tree ele;
32563 vec<tree> *fndecls;
32564 unsigned int num_versions = 0;
32565 unsigned int actual_versions = 0;
32566 unsigned int i;
32568 struct _function_version_info
32570 tree version_decl;
32571 tree predicate_chain;
32572 unsigned int dispatch_priority;
32573 }*function_version_info;
32575 gcc_assert (dispatch_decl != NULL
32576 && fndecls_p != NULL
32577 && empty_bb != NULL);
32579 /* fndecls_p is actually a vector. */
32580 fndecls = static_cast<vec<tree> *> (fndecls_p);
32582 /* At least one more version other than the default. */
32583 num_versions = fndecls->length ();
32584 gcc_assert (num_versions >= 2);
32586 function_version_info = (struct _function_version_info *)
32587 XNEWVEC (struct _function_version_info, (num_versions - 1));
32589 /* The first version in the vector is the default decl. */
32590 default_decl = (*fndecls)[0];
32592 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32594 gseq = bb_seq (*empty_bb);
32595 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32596 constructors, so explicitly call __builtin_cpu_init here. */
32597 ifunc_cpu_init_stmt = gimple_build_call_vec (
32598 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32599 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32600 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32601 set_bb_seq (*empty_bb, gseq);
32603 pop_cfun ();
32606 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32608 tree version_decl = ele;
32609 tree predicate_chain = NULL_TREE;
32610 unsigned int priority;
32611 /* Get attribute string, parse it and find the right predicate decl.
32612 The predicate function could be a lengthy combination of many
32613 features, like arch-type and various isa-variants. */
32614 priority = get_builtin_code_for_version (version_decl,
32615 &predicate_chain);
32617 if (predicate_chain == NULL_TREE)
32618 continue;
32620 function_version_info [actual_versions].version_decl = version_decl;
32621 function_version_info [actual_versions].predicate_chain
32622 = predicate_chain;
32623 function_version_info [actual_versions].dispatch_priority = priority;
32624 actual_versions++;
32627 /* Sort the versions according to descending order of dispatch priority. The
32628 priority is based on the ISA. This is not a perfect solution. There
32629 could still be ambiguity. If more than one function version is suitable
32630 to execute, which one should be dispatched? In future, allow the user
32631 to specify a dispatch priority next to the version. */
32632 qsort (function_version_info, actual_versions,
32633 sizeof (struct _function_version_info), feature_compare);
32635 for (i = 0; i < actual_versions; ++i)
32636 *empty_bb = add_condition_to_bb (dispatch_decl,
32637 function_version_info[i].version_decl,
32638 function_version_info[i].predicate_chain,
32639 *empty_bb);
32641 /* Dispatch the default version at the end. */
32642 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32643 NULL, *empty_bb);
32645 free (function_version_info);
32646 return 0;
32649 /* This function changes the assembler name for functions that are
32650 versions. If DECL is a function version and has a "target"
32651 attribute, it appends the attribute string to its assembler name. */
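/* For example (a hedged sketch): a version declared as

     __attribute__ ((target ("avx2"))) int foo ();

   keeps its normal mangling and gets the sorted attribute string appended,
   so its assembler-level name becomes something like "foo.avx2"
   ("_Z3foov.avx2" under C++ mangling).  */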
32653 static tree
32654 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32656 tree version_attr;
32657 const char *orig_name, *version_string;
32658 char *attr_str, *assembler_name;
32660 if (DECL_DECLARED_INLINE_P (decl)
32661 && lookup_attribute ("gnu_inline",
32662 DECL_ATTRIBUTES (decl)))
32663 error_at (DECL_SOURCE_LOCATION (decl),
32664 "Function versions cannot be marked as gnu_inline,"
32665 " bodies have to be generated");
32667 if (DECL_VIRTUAL_P (decl)
32668 || DECL_VINDEX (decl))
32669 sorry ("Virtual function multiversioning not supported");
32671 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32673 /* target attribute string cannot be NULL. */
32674 gcc_assert (version_attr != NULL_TREE);
32676 orig_name = IDENTIFIER_POINTER (id);
32677 version_string
32678 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32680 if (strcmp (version_string, "default") == 0)
32681 return id;
32683 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32684 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32686 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32688 /* Allow assembler name to be modified if already set. */
32689 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32690 SET_DECL_RTL (decl, NULL);
32692 tree ret = get_identifier (assembler_name);
32693 XDELETEVEC (attr_str);
32694 XDELETEVEC (assembler_name);
32695 return ret;
32699 static tree
32700 ix86_mangle_decl_assembler_name (tree decl, tree id)
32702 /* For function version, add the target suffix to the assembler name. */
32703 if (TREE_CODE (decl) == FUNCTION_DECL
32704 && DECL_FUNCTION_VERSIONED (decl))
32705 id = ix86_mangle_function_version_assembler_name (decl, id);
32706 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32707 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32708 #endif
32710 return id;
32713 /* Make a dispatcher declaration for the multi-versioned function DECL.
32714 Calls to DECL function will be replaced with calls to the dispatcher
32715 by the front-end. Returns the decl of the dispatcher function. */
32717 static tree
32718 ix86_get_function_versions_dispatcher (void *decl)
32720 tree fn = (tree) decl;
32721 struct cgraph_node *node = NULL;
32722 struct cgraph_node *default_node = NULL;
32723 struct cgraph_function_version_info *node_v = NULL;
32724 struct cgraph_function_version_info *first_v = NULL;
32726 tree dispatch_decl = NULL;
32728 struct cgraph_function_version_info *default_version_info = NULL;
32730 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32732 node = cgraph_node::get (fn);
32733 gcc_assert (node != NULL);
32735 node_v = node->function_version ();
32736 gcc_assert (node_v != NULL);
32738 if (node_v->dispatcher_resolver != NULL)
32739 return node_v->dispatcher_resolver;
32741 /* Find the default version and make it the first node. */
32742 first_v = node_v;
32743 /* Go to the beginning of the chain. */
32744 while (first_v->prev != NULL)
32745 first_v = first_v->prev;
32746 default_version_info = first_v;
32747 while (default_version_info != NULL)
32749 if (is_function_default_version
32750 (default_version_info->this_node->decl))
32751 break;
32752 default_version_info = default_version_info->next;
32755 /* If there is no default node, just return NULL. */
32756 if (default_version_info == NULL)
32757 return NULL;
32759 /* Make default info the first node. */
32760 if (first_v != default_version_info)
32762 default_version_info->prev->next = default_version_info->next;
32763 if (default_version_info->next)
32764 default_version_info->next->prev = default_version_info->prev;
32765 first_v->prev = default_version_info;
32766 default_version_info->next = first_v;
32767 default_version_info->prev = NULL;
32770 default_node = default_version_info->this_node;
32772 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32773 if (targetm.has_ifunc_p ())
32775 struct cgraph_function_version_info *it_v = NULL;
32776 struct cgraph_node *dispatcher_node = NULL;
32777 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32779 /* Right now, the dispatching is done via ifunc. */
32780 dispatch_decl = make_dispatcher_decl (default_node->decl);
32782 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32783 gcc_assert (dispatcher_node != NULL);
32784 dispatcher_node->dispatcher_function = 1;
32785 dispatcher_version_info
32786 = dispatcher_node->insert_new_function_version ();
32787 dispatcher_version_info->next = default_version_info;
32788 dispatcher_node->definition = 1;
32790 /* Set the dispatcher for all the versions. */
32791 it_v = default_version_info;
32792 while (it_v != NULL)
32794 it_v->dispatcher_resolver = dispatch_decl;
32795 it_v = it_v->next;
32798 else
32799 #endif
32801 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32802 "multiversioning needs ifunc which is not supported "
32803 "on this target");
32806 return dispatch_decl;
32809 /* Make the resolver function decl to dispatch the versions of
32810 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32811 ifunc alias that will point to the created resolver. Create an
32812 empty basic block in the resolver and store the pointer in
32813 EMPTY_BB. Return the decl of the resolver function. */
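/* The net effect corresponds roughly to (a sketch; the actual resolver
   name is derived from the default decl and made unique as needed):

     static void *foo_resolver (void) { /- dispatch code -/ }
     int foo () __attribute__ ((ifunc ("foo_resolver")));

   i.e. the dispatcher symbol is an IFUNC whose resolver picks a version
   at load time.  */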
32815 static tree
32816 make_resolver_func (const tree default_decl,
32817 const tree ifunc_alias_decl,
32818 basic_block *empty_bb)
32820 char *resolver_name;
32821 tree decl, type, decl_name, t;
32823 /* IFUNC's have to be globally visible. So, if the default_decl is
32824 not, then the name of the IFUNC should be made unique. */
32825 if (TREE_PUBLIC (default_decl) == 0)
32827 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32828 symtab->change_decl_assembler_name (ifunc_alias_decl,
32829 get_identifier (ifunc_name));
32830 XDELETEVEC (ifunc_name);
32833 resolver_name = make_unique_name (default_decl, "resolver", false);
32835 /* The resolver function should return a (void *). */
32836 type = build_function_type_list (ptr_type_node, NULL_TREE);
32838 decl = build_fn_decl (resolver_name, type);
32839 decl_name = get_identifier (resolver_name);
32840 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32842 DECL_NAME (decl) = decl_name;
32843 TREE_USED (decl) = 1;
32844 DECL_ARTIFICIAL (decl) = 1;
32845 DECL_IGNORED_P (decl) = 1;
32846 TREE_PUBLIC (decl) = 0;
32847 DECL_UNINLINABLE (decl) = 1;
32849 /* Resolver is not external, body is generated. */
32850 DECL_EXTERNAL (decl) = 0;
32851 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32853 DECL_CONTEXT (decl) = NULL_TREE;
32854 DECL_INITIAL (decl) = make_node (BLOCK);
32855 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32857 if (DECL_COMDAT_GROUP (default_decl)
32858 || TREE_PUBLIC (default_decl))
32860 /* In this case, each translation unit with a call to this
32861 versioned function will put out a resolver. Ensure it
32862 is comdat to keep just one copy. */
32863 DECL_COMDAT (decl) = 1;
32864 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32866 /* Build result decl and add to function_decl. */
32867 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32868 DECL_ARTIFICIAL (t) = 1;
32869 DECL_IGNORED_P (t) = 1;
32870 DECL_RESULT (decl) = t;
32872 gimplify_function_tree (decl);
32873 push_cfun (DECL_STRUCT_FUNCTION (decl));
32874 *empty_bb = init_lowered_empty_function (decl, false,
32875 profile_count::uninitialized ());
32877 cgraph_node::add_new_function (decl, true);
32878 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32880 pop_cfun ();
32882 gcc_assert (ifunc_alias_decl != NULL);
32883 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32884 DECL_ATTRIBUTES (ifunc_alias_decl)
32885 = make_attribute ("ifunc", resolver_name,
32886 DECL_ATTRIBUTES (ifunc_alias_decl));
32888 /* Create the alias for dispatch to resolver here. */
32889 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32890 XDELETEVEC (resolver_name);
32891 return decl;
32894 /* Generate the dispatching code body to dispatch multi-versioned function
32895 DECL. The target hook is called to process the "target" attributes and
32896 provide the code to dispatch the right function at run-time. NODE points
32897 to the dispatcher decl whose body will be created. */
32899 static tree
32900 ix86_generate_version_dispatcher_body (void *node_p)
32902 tree resolver_decl;
32903 basic_block empty_bb;
32904 tree default_ver_decl;
32905 struct cgraph_node *versn;
32906 struct cgraph_node *node;
32908 struct cgraph_function_version_info *node_version_info = NULL;
32909 struct cgraph_function_version_info *versn_info = NULL;
32911 node = (cgraph_node *)node_p;
32913 node_version_info = node->function_version ();
32914 gcc_assert (node->dispatcher_function
32915 && node_version_info != NULL);
32917 if (node_version_info->dispatcher_resolver)
32918 return node_version_info->dispatcher_resolver;
32920 /* The first version in the chain corresponds to the default version. */
32921 default_ver_decl = node_version_info->next->this_node->decl;
32923 /* node is going to be an alias, so remove the finalized bit. */
32924 node->definition = false;
32926 resolver_decl = make_resolver_func (default_ver_decl,
32927 node->decl, &empty_bb);
32929 node_version_info->dispatcher_resolver = resolver_decl;
32931 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32933 auto_vec<tree, 2> fn_ver_vec;
32935 for (versn_info = node_version_info->next; versn_info;
32936 versn_info = versn_info->next)
32938 versn = versn_info->this_node;
32939 /* Check for virtual functions here again, as by this time it should
32940 have been determined if this function needs a vtable index or
32941 not. This happens for methods in derived classes that override
32942 virtual methods in base classes but are not explicitly marked as
32943 virtual. */
32944 if (DECL_VINDEX (versn->decl))
32945 sorry ("Virtual function multiversioning not supported");
32947 fn_ver_vec.safe_push (versn->decl);
32950 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32951 cgraph_edge::rebuild_edges ();
32952 pop_cfun ();
32953 return resolver_decl;
32955 /* This builds the processor_model struct type defined in
32956 libgcc/config/i386/cpuinfo.c. */
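/* The resulting type is meant to match the libgcc definition, roughly
   (a sketch of the cpuinfo.c side):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   so the fields built below must stay in sync with that layout.  */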
32958 static tree
32959 build_processor_model_struct (void)
32961 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32962 "__cpu_features"};
32963 tree field = NULL_TREE, field_chain = NULL_TREE;
32964 int i;
32965 tree type = make_node (RECORD_TYPE);
32967 /* The first 3 fields are unsigned int. */
32968 for (i = 0; i < 3; ++i)
32970 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32971 get_identifier (field_name[i]), unsigned_type_node);
32972 if (field_chain != NULL_TREE)
32973 DECL_CHAIN (field) = field_chain;
32974 field_chain = field;
32977 /* The last field is an array of unsigned integers of size one. */
32978 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32979 get_identifier (field_name[3]),
32980 build_array_type (unsigned_type_node,
32981 build_index_type (size_one_node)));
32982 if (field_chain != NULL_TREE)
32983 DECL_CHAIN (field) = field_chain;
32984 field_chain = field;
32986 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32987 return type;
32990 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32992 static tree
32993 make_var_decl (tree type, const char *name)
32995 tree new_decl;
32997 new_decl = build_decl (UNKNOWN_LOCATION,
32998 VAR_DECL,
32999 get_identifier(name),
33000 type);
33002 DECL_EXTERNAL (new_decl) = 1;
33003 TREE_STATIC (new_decl) = 1;
33004 TREE_PUBLIC (new_decl) = 1;
33005 DECL_INITIAL (new_decl) = 0;
33006 DECL_ARTIFICIAL (new_decl) = 0;
33007 DECL_PRESERVE_P (new_decl) = 1;
33009 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33010 assemble_variable (new_decl, 0, 0, 0);
33012 return new_decl;
33015 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33016 into an integer defined in libgcc/config/i386/cpuinfo.c. */
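/* Typical calls that get folded here (a hedged sketch):

     if (__builtin_cpu_is ("haswell")) ...      compares __cpu_model fields
     if (__builtin_cpu_supports ("avx2")) ...   tests a bit in __cpu_features[0]
*/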
33018 static tree
33019 fold_builtin_cpu (tree fndecl, tree *args)
33021 unsigned int i;
33022 enum ix86_builtins fn_code = (enum ix86_builtins)
33023 DECL_FUNCTION_CODE (fndecl);
33024 tree param_string_cst = NULL;
33026 /* This is the order of bit-fields in __processor_features in cpuinfo.c. */
33027 enum processor_features
33029 F_CMOV = 0,
33030 F_MMX,
33031 F_POPCNT,
33032 F_SSE,
33033 F_SSE2,
33034 F_SSE3,
33035 F_SSSE3,
33036 F_SSE4_1,
33037 F_SSE4_2,
33038 F_AVX,
33039 F_AVX2,
33040 F_SSE4_A,
33041 F_FMA4,
33042 F_XOP,
33043 F_FMA,
33044 F_AVX512F,
33045 F_BMI,
33046 F_BMI2,
33047 F_AES,
33048 F_PCLMUL,
33049 F_AVX512VL,
33050 F_AVX512BW,
33051 F_AVX512DQ,
33052 F_AVX512CD,
33053 F_AVX512ER,
33054 F_AVX512PF,
33055 F_AVX512VBMI,
33056 F_AVX512IFMA,
33057 F_AVX5124VNNIW,
33058 F_AVX5124FMAPS,
33059 F_AVX512VPOPCNTDQ,
33060 F_MAX
33063 /* These are the values for vendor types and cpu types and subtypes
33064 in cpuinfo.c. Cpu types and subtypes should have the corresponding
33065 start value subtracted from them. */
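/* For instance (a sketch): "nehalem" maps to M_INTEL_COREI7_NEHALEM, which
   is a subtype, so the value actually compared against
   __cpu_model.__cpu_subtype is M_INTEL_COREI7_NEHALEM - M_CPU_SUBTYPE_START.  */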
33066 enum processor_model
33068 M_INTEL = 1,
33069 M_AMD,
33070 M_CPU_TYPE_START,
33071 M_INTEL_BONNELL,
33072 M_INTEL_CORE2,
33073 M_INTEL_COREI7,
33074 M_AMDFAM10H,
33075 M_AMDFAM15H,
33076 M_INTEL_SILVERMONT,
33077 M_INTEL_KNL,
33078 M_AMD_BTVER1,
33079 M_AMD_BTVER2,
33080 M_AMDFAM17H,
33081 M_INTEL_KNM,
33082 M_CPU_SUBTYPE_START,
33083 M_INTEL_COREI7_NEHALEM,
33084 M_INTEL_COREI7_WESTMERE,
33085 M_INTEL_COREI7_SANDYBRIDGE,
33086 M_AMDFAM10H_BARCELONA,
33087 M_AMDFAM10H_SHANGHAI,
33088 M_AMDFAM10H_ISTANBUL,
33089 M_AMDFAM15H_BDVER1,
33090 M_AMDFAM15H_BDVER2,
33091 M_AMDFAM15H_BDVER3,
33092 M_AMDFAM15H_BDVER4,
33093 M_AMDFAM17H_ZNVER1,
33094 M_INTEL_COREI7_IVYBRIDGE,
33095 M_INTEL_COREI7_HASWELL,
33096 M_INTEL_COREI7_BROADWELL,
33097 M_INTEL_COREI7_SKYLAKE,
33098 M_INTEL_COREI7_SKYLAKE_AVX512,
33099 M_INTEL_COREI7_CANNONLAKE,
33100 M_INTEL_COREI7_ICELAKE
33103 static struct _arch_names_table
33105 const char *const name;
33106 const enum processor_model model;
33108 const arch_names_table[] =
33110 {"amd", M_AMD},
33111 {"intel", M_INTEL},
33112 {"atom", M_INTEL_BONNELL},
33113 {"slm", M_INTEL_SILVERMONT},
33114 {"core2", M_INTEL_CORE2},
33115 {"corei7", M_INTEL_COREI7},
33116 {"nehalem", M_INTEL_COREI7_NEHALEM},
33117 {"westmere", M_INTEL_COREI7_WESTMERE},
33118 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33119 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33120 {"haswell", M_INTEL_COREI7_HASWELL},
33121 {"broadwell", M_INTEL_COREI7_BROADWELL},
33122 {"skylake", M_INTEL_COREI7_SKYLAKE},
33123 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33124 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
33125 {"icelake", M_INTEL_COREI7_ICELAKE},
33126 {"bonnell", M_INTEL_BONNELL},
33127 {"silvermont", M_INTEL_SILVERMONT},
33128 {"knl", M_INTEL_KNL},
33129 {"knm", M_INTEL_KNM},
33130 {"amdfam10h", M_AMDFAM10H},
33131 {"barcelona", M_AMDFAM10H_BARCELONA},
33132 {"shanghai", M_AMDFAM10H_SHANGHAI},
33133 {"istanbul", M_AMDFAM10H_ISTANBUL},
33134 {"btver1", M_AMD_BTVER1},
33135 {"amdfam15h", M_AMDFAM15H},
33136 {"bdver1", M_AMDFAM15H_BDVER1},
33137 {"bdver2", M_AMDFAM15H_BDVER2},
33138 {"bdver3", M_AMDFAM15H_BDVER3},
33139 {"bdver4", M_AMDFAM15H_BDVER4},
33140 {"btver2", M_AMD_BTVER2},
33141 {"amdfam17h", M_AMDFAM17H},
33142 {"znver1", M_AMDFAM17H_ZNVER1},
33145 static struct _isa_names_table
33147 const char *const name;
33148 const enum processor_features feature;
33150 const isa_names_table[] =
33152 {"cmov", F_CMOV},
33153 {"mmx", F_MMX},
33154 {"popcnt", F_POPCNT},
33155 {"sse", F_SSE},
33156 {"sse2", F_SSE2},
33157 {"sse3", F_SSE3},
33158 {"ssse3", F_SSSE3},
33159 {"sse4a", F_SSE4_A},
33160 {"sse4.1", F_SSE4_1},
33161 {"sse4.2", F_SSE4_2},
33162 {"avx", F_AVX},
33163 {"fma4", F_FMA4},
33164 {"xop", F_XOP},
33165 {"fma", F_FMA},
33166 {"avx2", F_AVX2},
33167 {"avx512f", F_AVX512F},
33168 {"bmi", F_BMI},
33169 {"bmi2", F_BMI2},
33170 {"aes", F_AES},
33171 {"pclmul", F_PCLMUL},
33172 {"avx512vl",F_AVX512VL},
33173 {"avx512bw",F_AVX512BW},
33174 {"avx512dq",F_AVX512DQ},
33175 {"avx512cd",F_AVX512CD},
33176 {"avx512er",F_AVX512ER},
33177 {"avx512pf",F_AVX512PF},
33178 {"avx512vbmi",F_AVX512VBMI},
33179 {"avx512ifma",F_AVX512IFMA},
33180 {"avx5124vnniw",F_AVX5124VNNIW},
33181 {"avx5124fmaps",F_AVX5124FMAPS},
33182 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
33185 tree __processor_model_type = build_processor_model_struct ();
33186 tree __cpu_model_var = make_var_decl (__processor_model_type,
33187 "__cpu_model");
33190 varpool_node::add (__cpu_model_var);
33192 gcc_assert ((args != NULL) && (*args != NULL));
33194 param_string_cst = *args;
33195 while (param_string_cst
33196 && TREE_CODE (param_string_cst) != STRING_CST)
33198 /* *args must be an expr that can contain other EXPRs leading to a
33199 STRING_CST. */
33200 if (!EXPR_P (param_string_cst))
33202 error ("Parameter to builtin must be a string constant or literal");
33203 return integer_zero_node;
33205 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33208 gcc_assert (param_string_cst);
33210 if (fn_code == IX86_BUILTIN_CPU_IS)
33212 tree ref;
33213 tree field;
33214 tree final;
33216 unsigned int field_val = 0;
33217 unsigned int NUM_ARCH_NAMES
33218 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33220 for (i = 0; i < NUM_ARCH_NAMES; i++)
33221 if (strcmp (arch_names_table[i].name,
33222 TREE_STRING_POINTER (param_string_cst)) == 0)
33223 break;
33225 if (i == NUM_ARCH_NAMES)
33227 error ("Parameter to builtin not valid: %s",
33228 TREE_STRING_POINTER (param_string_cst));
33229 return integer_zero_node;
33232 field = TYPE_FIELDS (__processor_model_type);
33233 field_val = arch_names_table[i].model;
33235 /* CPU types are stored in the next field. */
33236 if (field_val > M_CPU_TYPE_START
33237 && field_val < M_CPU_SUBTYPE_START)
33239 field = DECL_CHAIN (field);
33240 field_val -= M_CPU_TYPE_START;
33243 /* CPU subtypes are stored in the next field. */
33244 if (field_val > M_CPU_SUBTYPE_START)
33246 field = DECL_CHAIN ( DECL_CHAIN (field));
33247 field_val -= M_CPU_SUBTYPE_START;
33250 /* Get the appropriate field in __cpu_model. */
33251 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33252 field, NULL_TREE);
33254 /* Check the value. */
33255 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33256 build_int_cstu (unsigned_type_node, field_val));
33257 return build1 (CONVERT_EXPR, integer_type_node, final);
33259 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33261 tree ref;
33262 tree array_elt;
33263 tree field;
33264 tree final;
33266 unsigned int field_val = 0;
33267 unsigned int NUM_ISA_NAMES
33268 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33270 for (i = 0; i < NUM_ISA_NAMES; i++)
33271 if (strcmp (isa_names_table[i].name,
33272 TREE_STRING_POINTER (param_string_cst)) == 0)
33273 break;
33275 if (i == NUM_ISA_NAMES)
33277 error ("Parameter to builtin not valid: %s",
33278 TREE_STRING_POINTER (param_string_cst));
33279 return integer_zero_node;
33282 field = TYPE_FIELDS (__processor_model_type);
33283 /* Get the last field, which is __cpu_features. */
33284 while (DECL_CHAIN (field))
33285 field = DECL_CHAIN (field);
33287 /* Get the appropriate field: __cpu_model.__cpu_features */
33288 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33289 field, NULL_TREE);
33291 /* Access the 0th element of __cpu_features array. */
33292 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33293 integer_zero_node, NULL_TREE, NULL_TREE);
33295 field_val = (1 << isa_names_table[i].feature);
33296 /* Return __cpu_model.__cpu_features[0] & field_val */
33297 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33298 build_int_cstu (unsigned_type_node, field_val));
33299 return build1 (CONVERT_EXPR, integer_type_node, final);
33301 gcc_unreachable ();
33304 static tree
33305 ix86_fold_builtin (tree fndecl, int n_args,
33306 tree *args, bool ignore ATTRIBUTE_UNUSED)
33308 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33310 enum ix86_builtins fn_code = (enum ix86_builtins)
33311 DECL_FUNCTION_CODE (fndecl);
33312 switch (fn_code)
33314 case IX86_BUILTIN_CPU_IS:
33315 case IX86_BUILTIN_CPU_SUPPORTS:
33316 gcc_assert (n_args == 1);
33317 return fold_builtin_cpu (fndecl, args);
33319 case IX86_BUILTIN_NANQ:
33320 case IX86_BUILTIN_NANSQ:
33322 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33323 const char *str = c_getstr (*args);
33324 int quiet = fn_code == IX86_BUILTIN_NANQ;
33325 REAL_VALUE_TYPE real;
33327 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33328 return build_real (type, real);
33329 return NULL_TREE;
33332 case IX86_BUILTIN_INFQ:
33333 case IX86_BUILTIN_HUGE_VALQ:
33335 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33336 REAL_VALUE_TYPE inf;
33337 real_inf (&inf);
33338 return build_real (type, inf);
33341 case IX86_BUILTIN_TZCNT16:
33342 case IX86_BUILTIN_CTZS:
33343 case IX86_BUILTIN_TZCNT32:
33344 case IX86_BUILTIN_TZCNT64:
33345 gcc_assert (n_args == 1);
33346 if (TREE_CODE (args[0]) == INTEGER_CST)
33348 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33349 tree arg = args[0];
33350 if (fn_code == IX86_BUILTIN_TZCNT16
33351 || fn_code == IX86_BUILTIN_CTZS)
33352 arg = fold_convert (short_unsigned_type_node, arg);
33353 if (integer_zerop (arg))
33354 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33355 else
33356 return fold_const_call (CFN_CTZ, type, arg);
33358 break;
33360 case IX86_BUILTIN_LZCNT16:
33361 case IX86_BUILTIN_CLZS:
33362 case IX86_BUILTIN_LZCNT32:
33363 case IX86_BUILTIN_LZCNT64:
33364 gcc_assert (n_args == 1);
33365 if (TREE_CODE (args[0]) == INTEGER_CST)
33367 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33368 tree arg = args[0];
33369 if (fn_code == IX86_BUILTIN_LZCNT16
33370 || fn_code == IX86_BUILTIN_CLZS)
33371 arg = fold_convert (short_unsigned_type_node, arg);
33372 if (integer_zerop (arg))
33373 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33374 else
33375 return fold_const_call (CFN_CLZ, type, arg);
33377 break;
33379 case IX86_BUILTIN_BEXTR32:
33380 case IX86_BUILTIN_BEXTR64:
33381 case IX86_BUILTIN_BEXTRI32:
33382 case IX86_BUILTIN_BEXTRI64:
33383 gcc_assert (n_args == 2);
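/* Worked example (a sketch, assuming the usual BEXTR operand encoding of
   start in bits 0-7 and length in bits 8-15): folding
   __builtin_ia32_bextr_u32 (0xabcd, 0x0804) extracts 8 bits starting at
   bit 4, giving 0xbc.  */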
33384 if (tree_fits_uhwi_p (args[1]))
33386 unsigned HOST_WIDE_INT res = 0;
33387 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33388 unsigned int start = tree_to_uhwi (args[1]);
33389 unsigned int len = (start & 0xff00) >> 8;
33390 start &= 0xff;
33391 if (start >= prec || len == 0)
33392 res = 0;
33393 else if (!tree_fits_uhwi_p (args[0]))
33394 break;
33395 else
33396 res = tree_to_uhwi (args[0]) >> start;
33397 if (len > prec)
33398 len = prec;
33399 if (len < HOST_BITS_PER_WIDE_INT)
33400 res &= (HOST_WIDE_INT_1U << len) - 1;
33401 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33403 break;
33405 case IX86_BUILTIN_BZHI32:
33406 case IX86_BUILTIN_BZHI64:
33407 gcc_assert (n_args == 2);
33408 if (tree_fits_uhwi_p (args[1]))
33410 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33411 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33412 return args[0];
33413 if (!tree_fits_uhwi_p (args[0]))
33414 break;
33415 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33416 res &= ~(HOST_WIDE_INT_M1U << idx);
33417 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33419 break;
33421 case IX86_BUILTIN_PDEP32:
33422 case IX86_BUILTIN_PDEP64:
33423 gcc_assert (n_args == 2);
33424 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33426 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33427 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33428 unsigned HOST_WIDE_INT res = 0;
33429 unsigned HOST_WIDE_INT m, k = 1;
33430 for (m = 1; m; m <<= 1)
33431 if ((mask & m) != 0)
33433 if ((src & k) != 0)
33434 res |= m;
33435 k <<= 1;
33437 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33439 break;
33441 case IX86_BUILTIN_PEXT32:
33442 case IX86_BUILTIN_PEXT64:
33443 gcc_assert (n_args == 2);
33444 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33446 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33447 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33448 unsigned HOST_WIDE_INT res = 0;
33449 unsigned HOST_WIDE_INT m, k = 1;
33450 for (m = 1; m; m <<= 1)
33451 if ((mask & m) != 0)
33453 if ((src & m) != 0)
33454 res |= k;
33455 k <<= 1;
33457 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33459 break;
33461 default:
33462 break;
33466 #ifdef SUBTARGET_FOLD_BUILTIN
33467 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33468 #endif
33470 return NULL_TREE;
33473 /* Fold an MD builtin (use ix86_fold_builtin for folding into
33474 constant) in GIMPLE. */
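/* For instance (a hedged sketch): when its argument is known to be
   non-zero,

     __builtin_ia32_tzcnt_u32 (arg0)

   is rewritten below into the generic __builtin_ctz (arg0), which the
   middle-end optimizes much better.  */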
33476 bool
33477 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33479 gimple *stmt = gsi_stmt (*gsi);
33480 tree fndecl = gimple_call_fndecl (stmt);
33481 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33482 int n_args = gimple_call_num_args (stmt);
33483 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33484 tree decl = NULL_TREE;
33485 tree arg0, arg1;
33487 switch (fn_code)
33489 case IX86_BUILTIN_TZCNT32:
33490 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33491 goto fold_tzcnt_lzcnt;
33493 case IX86_BUILTIN_TZCNT64:
33494 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33495 goto fold_tzcnt_lzcnt;
33497 case IX86_BUILTIN_LZCNT32:
33498 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33499 goto fold_tzcnt_lzcnt;
33501 case IX86_BUILTIN_LZCNT64:
33502 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33503 goto fold_tzcnt_lzcnt;
33505 fold_tzcnt_lzcnt:
33506 gcc_assert (n_args == 1);
33507 arg0 = gimple_call_arg (stmt, 0);
33508 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33510 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33511 /* If arg0 is provably non-zero, optimize into generic
33512 __builtin_c[tl]z{,ll} function the middle-end handles
33513 better. */
33514 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33515 return false;
33517 location_t loc = gimple_location (stmt);
33518 gimple *g = gimple_build_call (decl, 1, arg0);
33519 gimple_set_location (g, loc);
33520 tree lhs = make_ssa_name (integer_type_node);
33521 gimple_call_set_lhs (g, lhs);
33522 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33523 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33524 gimple_set_location (g, loc);
33525 gsi_replace (gsi, g, false);
33526 return true;
33528 break;
33530 case IX86_BUILTIN_BZHI32:
33531 case IX86_BUILTIN_BZHI64:
33532 gcc_assert (n_args == 2);
33533 arg1 = gimple_call_arg (stmt, 1);
33534 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33536 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33537 arg0 = gimple_call_arg (stmt, 0);
33538 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33539 break;
33540 location_t loc = gimple_location (stmt);
33541 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33542 gimple_set_location (g, loc);
33543 gsi_replace (gsi, g, false);
33544 return true;
33546 break;
33548 case IX86_BUILTIN_PDEP32:
33549 case IX86_BUILTIN_PDEP64:
33550 case IX86_BUILTIN_PEXT32:
33551 case IX86_BUILTIN_PEXT64:
33552 gcc_assert (n_args == 2);
33553 arg1 = gimple_call_arg (stmt, 1);
33554 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33556 location_t loc = gimple_location (stmt);
33557 arg0 = gimple_call_arg (stmt, 0);
33558 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33559 gimple_set_location (g, loc);
33560 gsi_replace (gsi, g, false);
33561 return true;
33563 break;
33565 default:
33566 break;
33569 return false;
33572 /* Make builtins to detect cpu type and features supported. NAME is
33573 the builtin name, CODE is the builtin code, and FTYPE is the function
33574 type of the builtin. */
33576 static void
33577 make_cpu_type_builtin (const char* name, int code,
33578 enum ix86_builtin_func_type ftype, bool is_const)
33580 tree decl;
33581 tree type;
33583 type = ix86_get_builtin_func_type (ftype);
33584 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33585 NULL, NULL_TREE);
33586 gcc_assert (decl != NULL_TREE);
33587 ix86_builtins[(int) code] = decl;
33588 TREE_READONLY (decl) = is_const;
33591 /* Make builtins to get CPU type and features supported. The created
33592 builtins are :
33594 __builtin_cpu_init (), to detect cpu type and features,
33595 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33596 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33599 static void
33600 ix86_init_platform_type_builtins (void)
33602 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33603 INT_FTYPE_VOID, false);
33604 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33605 INT_FTYPE_PCCHAR, true);
33606 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33607 INT_FTYPE_PCCHAR, true);
33610 /* Internal method for ix86_init_builtins. */
33612 static void
33613 ix86_init_builtins_va_builtins_abi (void)
33615 tree ms_va_ref, sysv_va_ref;
33616 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33617 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33618 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33619 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33621 if (!TARGET_64BIT)
33622 return;
33623 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33624 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33625 ms_va_ref = build_reference_type (ms_va_list_type_node);
33626 sysv_va_ref =
33627 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33629 fnvoid_va_end_ms =
33630 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33631 fnvoid_va_start_ms =
33632 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33633 fnvoid_va_end_sysv =
33634 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33635 fnvoid_va_start_sysv =
33636 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33637 NULL_TREE);
33638 fnvoid_va_copy_ms =
33639 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33640 NULL_TREE);
33641 fnvoid_va_copy_sysv =
33642 build_function_type_list (void_type_node, sysv_va_ref,
33643 sysv_va_ref, NULL_TREE);
33645 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33646 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33647 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33648 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33649 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33650 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33651 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33652 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33653 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33654 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33655 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33656 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33659 static void
33660 ix86_init_builtin_types (void)
33662 tree float80_type_node, const_string_type_node;
33664 /* The __float80 type. */
33665 float80_type_node = long_double_type_node;
33666 if (TYPE_MODE (float80_type_node) != XFmode)
33668 if (float64x_type_node != NULL_TREE
33669 && TYPE_MODE (float64x_type_node) == XFmode)
33670 float80_type_node = float64x_type_node;
33671 else
33673 /* The __float80 type. */
33674 float80_type_node = make_node (REAL_TYPE);
33676 TYPE_PRECISION (float80_type_node) = 80;
33677 layout_type (float80_type_node);
33680 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33682 /* The __float128 type. The node has already been created as
33683 _Float128, so we only need to register the __float128 name for
33684 it. */
33685 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
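/* User-level usage of the two types registered above (a hedged sketch;
   the w/q literal suffixes follow the GCC extended floating type docs):

     __float80  x = 1.0w;    80-bit extended precision, XFmode
     __float128 y = 1.0q;    128-bit IEEE quad, TFmode
*/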
33687 const_string_type_node
33688 = build_pointer_type (build_qualified_type
33689 (char_type_node, TYPE_QUAL_CONST));
33691 /* This macro is built by i386-builtin-types.awk. */
33692 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33695 static void
33696 ix86_init_builtins (void)
33698 tree ftype, decl;
33700 ix86_init_builtin_types ();
33702 /* Builtins to get CPU type and features. */
33703 ix86_init_platform_type_builtins ();
33705 /* TFmode support builtins. */
33706 def_builtin_const (0, "__builtin_infq",
33707 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33708 def_builtin_const (0, "__builtin_huge_valq",
33709 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33711 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33712 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33713 BUILT_IN_MD, "nanq", NULL_TREE);
33714 TREE_READONLY (decl) = 1;
33715 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33717 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33718 BUILT_IN_MD, "nansq", NULL_TREE);
33719 TREE_READONLY (decl) = 1;
33720 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33722 /* We will expand them to a normal call if SSE isn't available, since
33723 they are used by libgcc. */
33724 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33725 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33726 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33727 TREE_READONLY (decl) = 1;
33728 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33730 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33731 decl = add_builtin_function ("__builtin_copysignq", ftype,
33732 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33733 "__copysigntf3", NULL_TREE);
33734 TREE_READONLY (decl) = 1;
33735 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33737 ix86_init_tm_builtins ();
33738 ix86_init_mmx_sse_builtins ();
33739 ix86_init_mpx_builtins ();
33741 if (TARGET_LP64)
33742 ix86_init_builtins_va_builtins_abi ();
33744 #ifdef SUBTARGET_INIT_BUILTINS
33745 SUBTARGET_INIT_BUILTINS;
33746 #endif
33749 /* Return the ix86 builtin for CODE. */
33751 static tree
33752 ix86_builtin_decl (unsigned code, bool)
33754 if (code >= IX86_BUILTIN_MAX)
33755 return error_mark_node;
33757 return ix86_builtins[code];
33760 /* Errors in the source file can cause expand_expr to return const0_rtx
33761 where we expect a vector. To avoid crashing, use one of the vector
33762 clear instructions. */
33763 static rtx
33764 safe_vector_operand (rtx x, machine_mode mode)
33766 if (x == const0_rtx)
33767 x = CONST0_RTX (mode);
33768 return x;
33771 /* Fixup modeless constants to fit required mode. */
33772 static rtx
33773 fixup_modeless_constant (rtx x, machine_mode mode)
33775 if (GET_MODE (x) == VOIDmode)
33776 x = convert_to_mode (mode, x, 1);
33777 return x;
33780 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33782 static rtx
33783 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33785 rtx pat;
33786 tree arg0 = CALL_EXPR_ARG (exp, 0);
33787 tree arg1 = CALL_EXPR_ARG (exp, 1);
33788 rtx op0 = expand_normal (arg0);
33789 rtx op1 = expand_normal (arg1);
33790 machine_mode tmode = insn_data[icode].operand[0].mode;
33791 machine_mode mode0 = insn_data[icode].operand[1].mode;
33792 machine_mode mode1 = insn_data[icode].operand[2].mode;
33794 if (VECTOR_MODE_P (mode0))
33795 op0 = safe_vector_operand (op0, mode0);
33796 if (VECTOR_MODE_P (mode1))
33797 op1 = safe_vector_operand (op1, mode1);
33799 if (optimize || !target
33800 || GET_MODE (target) != tmode
33801 || !insn_data[icode].operand[0].predicate (target, tmode))
33802 target = gen_reg_rtx (tmode);
33804 if (GET_MODE (op1) == SImode && mode1 == TImode)
33806 rtx x = gen_reg_rtx (V4SImode);
33807 emit_insn (gen_sse2_loadd (x, op1));
33808 op1 = gen_lowpart (TImode, x);
33811 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33812 op0 = copy_to_mode_reg (mode0, op0);
33813 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33814 op1 = copy_to_mode_reg (mode1, op1);
33816 pat = GEN_FCN (icode) (target, op0, op1);
33817 if (! pat)
33818 return 0;
33820 emit_insn (pat);
33822 return target;
33825 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33827 static rtx
33828 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33829 enum ix86_builtin_func_type m_type,
33830 enum rtx_code sub_code)
33832 rtx pat;
33833 int i;
33834 int nargs;
33835 bool comparison_p = false;
33836 bool tf_p = false;
33837 bool last_arg_constant = false;
33838 int num_memory = 0;
33839 struct {
33840 rtx op;
33841 machine_mode mode;
33842 } args[4];
33844 machine_mode tmode = insn_data[icode].operand[0].mode;
33846 switch (m_type)
33848 case MULTI_ARG_4_DF2_DI_I:
33849 case MULTI_ARG_4_DF2_DI_I1:
33850 case MULTI_ARG_4_SF2_SI_I:
33851 case MULTI_ARG_4_SF2_SI_I1:
33852 nargs = 4;
33853 last_arg_constant = true;
33854 break;
33856 case MULTI_ARG_3_SF:
33857 case MULTI_ARG_3_DF:
33858 case MULTI_ARG_3_SF2:
33859 case MULTI_ARG_3_DF2:
33860 case MULTI_ARG_3_DI:
33861 case MULTI_ARG_3_SI:
33862 case MULTI_ARG_3_SI_DI:
33863 case MULTI_ARG_3_HI:
33864 case MULTI_ARG_3_HI_SI:
33865 case MULTI_ARG_3_QI:
33866 case MULTI_ARG_3_DI2:
33867 case MULTI_ARG_3_SI2:
33868 case MULTI_ARG_3_HI2:
33869 case MULTI_ARG_3_QI2:
33870 nargs = 3;
33871 break;
33873 case MULTI_ARG_2_SF:
33874 case MULTI_ARG_2_DF:
33875 case MULTI_ARG_2_DI:
33876 case MULTI_ARG_2_SI:
33877 case MULTI_ARG_2_HI:
33878 case MULTI_ARG_2_QI:
33879 nargs = 2;
33880 break;
33882 case MULTI_ARG_2_DI_IMM:
33883 case MULTI_ARG_2_SI_IMM:
33884 case MULTI_ARG_2_HI_IMM:
33885 case MULTI_ARG_2_QI_IMM:
33886 nargs = 2;
33887 last_arg_constant = true;
33888 break;
33890 case MULTI_ARG_1_SF:
33891 case MULTI_ARG_1_DF:
33892 case MULTI_ARG_1_SF2:
33893 case MULTI_ARG_1_DF2:
33894 case MULTI_ARG_1_DI:
33895 case MULTI_ARG_1_SI:
33896 case MULTI_ARG_1_HI:
33897 case MULTI_ARG_1_QI:
33898 case MULTI_ARG_1_SI_DI:
33899 case MULTI_ARG_1_HI_DI:
33900 case MULTI_ARG_1_HI_SI:
33901 case MULTI_ARG_1_QI_DI:
33902 case MULTI_ARG_1_QI_SI:
33903 case MULTI_ARG_1_QI_HI:
33904 nargs = 1;
33905 break;
33907 case MULTI_ARG_2_DI_CMP:
33908 case MULTI_ARG_2_SI_CMP:
33909 case MULTI_ARG_2_HI_CMP:
33910 case MULTI_ARG_2_QI_CMP:
33911 nargs = 2;
33912 comparison_p = true;
33913 break;
33915 case MULTI_ARG_2_SF_TF:
33916 case MULTI_ARG_2_DF_TF:
33917 case MULTI_ARG_2_DI_TF:
33918 case MULTI_ARG_2_SI_TF:
33919 case MULTI_ARG_2_HI_TF:
33920 case MULTI_ARG_2_QI_TF:
33921 nargs = 2;
33922 tf_p = true;
33923 break;
33925 default:
33926 gcc_unreachable ();
33929 if (optimize || !target
33930 || GET_MODE (target) != tmode
33931 || !insn_data[icode].operand[0].predicate (target, tmode))
33932 target = gen_reg_rtx (tmode);
33933 else if (memory_operand (target, tmode))
33934 num_memory++;
33936 gcc_assert (nargs <= 4);
33938 for (i = 0; i < nargs; i++)
33940 tree arg = CALL_EXPR_ARG (exp, i);
33941 rtx op = expand_normal (arg);
33942 int adjust = (comparison_p) ? 1 : 0;
33943 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33945 if (last_arg_constant && i == nargs - 1)
33947 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33949 enum insn_code new_icode = icode;
33950 switch (icode)
33952 case CODE_FOR_xop_vpermil2v2df3:
33953 case CODE_FOR_xop_vpermil2v4sf3:
33954 case CODE_FOR_xop_vpermil2v4df3:
33955 case CODE_FOR_xop_vpermil2v8sf3:
33956 error ("the last argument must be a 2-bit immediate");
33957 return gen_reg_rtx (tmode);
33958 case CODE_FOR_xop_rotlv2di3:
33959 new_icode = CODE_FOR_rotlv2di3;
33960 goto xop_rotl;
33961 case CODE_FOR_xop_rotlv4si3:
33962 new_icode = CODE_FOR_rotlv4si3;
33963 goto xop_rotl;
33964 case CODE_FOR_xop_rotlv8hi3:
33965 new_icode = CODE_FOR_rotlv8hi3;
33966 goto xop_rotl;
33967 case CODE_FOR_xop_rotlv16qi3:
33968 new_icode = CODE_FOR_rotlv16qi3;
33969 xop_rotl:
33970 if (CONST_INT_P (op))
33972 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33973 op = GEN_INT (INTVAL (op) & mask);
33974 gcc_checking_assert
33975 (insn_data[icode].operand[i + 1].predicate (op, mode));
33977 else
33979 gcc_checking_assert
33980 (nargs == 2
33981 && insn_data[new_icode].operand[0].mode == tmode
33982 && insn_data[new_icode].operand[1].mode == tmode
33983 && insn_data[new_icode].operand[2].mode == mode
33984 && insn_data[new_icode].operand[0].predicate
33985 == insn_data[icode].operand[0].predicate
33986 && insn_data[new_icode].operand[1].predicate
33987 == insn_data[icode].operand[1].predicate);
33988 icode = new_icode;
33989 goto non_constant;
33991 break;
33992 default:
33993 gcc_unreachable ();
33997 else
33999 non_constant:
34000 if (VECTOR_MODE_P (mode))
34001 op = safe_vector_operand (op, mode);
34003 /* If we aren't optimizing, only allow one memory operand to be
34004 generated. */
34005 if (memory_operand (op, mode))
34006 num_memory++;
34008 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34010 if (optimize
34011 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34012 || num_memory > 1)
34013 op = force_reg (mode, op);
34016 args[i].op = op;
34017 args[i].mode = mode;
34020 switch (nargs)
34022 case 1:
34023 pat = GEN_FCN (icode) (target, args[0].op);
34024 break;
34026 case 2:
34027 if (tf_p)
34028 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34029 GEN_INT ((int)sub_code));
34030 else if (! comparison_p)
34031 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34032 else
34034 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34035 args[0].op,
34036 args[1].op);
34038 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34040 break;
34042 case 3:
34043 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34044 break;
34046 case 4:
34047 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34048 break;
34050 default:
34051 gcc_unreachable ();
34054 if (! pat)
34055 return 0;
34057 emit_insn (pat);
34058 return target;
34061 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34062 insns with vec_merge. */
34064 static rtx
34065 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34066 rtx target)
34068 rtx pat;
34069 tree arg0 = CALL_EXPR_ARG (exp, 0);
34070 rtx op1, op0 = expand_normal (arg0);
34071 machine_mode tmode = insn_data[icode].operand[0].mode;
34072 machine_mode mode0 = insn_data[icode].operand[1].mode;
34074 if (optimize || !target
34075 || GET_MODE (target) != tmode
34076 || !insn_data[icode].operand[0].predicate (target, tmode))
34077 target = gen_reg_rtx (tmode);
34079 if (VECTOR_MODE_P (mode0))
34080 op0 = safe_vector_operand (op0, mode0);
34082 if ((optimize && !register_operand (op0, mode0))
34083 || !insn_data[icode].operand[1].predicate (op0, mode0))
34084 op0 = copy_to_mode_reg (mode0, op0);
34086 op1 = op0;
34087 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34088 op1 = copy_to_mode_reg (mode0, op1);
34090 pat = GEN_FCN (icode) (target, op0, op1);
34091 if (! pat)
34092 return 0;
34093 emit_insn (pat);
34094 return target;
34097 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34099 static rtx
34100 ix86_expand_sse_compare (const struct builtin_description *d,
34101 tree exp, rtx target, bool swap)
34103 rtx pat;
34104 tree arg0 = CALL_EXPR_ARG (exp, 0);
34105 tree arg1 = CALL_EXPR_ARG (exp, 1);
34106 rtx op0 = expand_normal (arg0);
34107 rtx op1 = expand_normal (arg1);
34108 rtx op2;
34109 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34110 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34111 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34112 enum rtx_code comparison = d->comparison;
34114 if (VECTOR_MODE_P (mode0))
34115 op0 = safe_vector_operand (op0, mode0);
34116 if (VECTOR_MODE_P (mode1))
34117 op1 = safe_vector_operand (op1, mode1);
34119 /* Swap operands if we have a comparison that isn't available in
34120 hardware. */
34121 if (swap)
34122 std::swap (op0, op1);
34124 if (optimize || !target
34125 || GET_MODE (target) != tmode
34126 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34127 target = gen_reg_rtx (tmode);
34129 if ((optimize && !register_operand (op0, mode0))
34130 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34131 op0 = copy_to_mode_reg (mode0, op0);
34132 if ((optimize && !register_operand (op1, mode1))
34133 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34134 op1 = copy_to_mode_reg (mode1, op1);
34136 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34137 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34138 if (! pat)
34139 return 0;
34140 emit_insn (pat);
34141 return target;
34144 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34146 static rtx
34147 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34148 rtx target)
34150 rtx pat;
34151 tree arg0 = CALL_EXPR_ARG (exp, 0);
34152 tree arg1 = CALL_EXPR_ARG (exp, 1);
34153 rtx op0 = expand_normal (arg0);
34154 rtx op1 = expand_normal (arg1);
34155 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34156 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34157 enum rtx_code comparison = d->comparison;
34159 if (VECTOR_MODE_P (mode0))
34160 op0 = safe_vector_operand (op0, mode0);
34161 if (VECTOR_MODE_P (mode1))
34162 op1 = safe_vector_operand (op1, mode1);
34164 /* Swap operands if we have a comparison that isn't available in
34165 hardware. */
34166 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34167 std::swap (op0, op1);
34169 target = gen_reg_rtx (SImode);
34170 emit_move_insn (target, const0_rtx);
34171 target = gen_rtx_SUBREG (QImode, target, 0);
34173 if ((optimize && !register_operand (op0, mode0))
34174 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34175 op0 = copy_to_mode_reg (mode0, op0);
34176 if ((optimize && !register_operand (op1, mode1))
34177 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34178 op1 = copy_to_mode_reg (mode1, op1);
34180 pat = GEN_FCN (d->icode) (op0, op1);
34181 if (! pat)
34182 return 0;
34183 emit_insn (pat);
34184 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34185 gen_rtx_fmt_ee (comparison, QImode,
34186 SET_DEST (pat),
34187 const0_rtx)));
34189 return SUBREG_REG (target);
34192 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34194 static rtx
34195 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34196 rtx target)
34198 rtx pat;
34199 tree arg0 = CALL_EXPR_ARG (exp, 0);
34200 rtx op1, op0 = expand_normal (arg0);
34201 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34202 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34204 if (optimize || target == 0
34205 || GET_MODE (target) != tmode
34206 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34207 target = gen_reg_rtx (tmode);
34209 if (VECTOR_MODE_P (mode0))
34210 op0 = safe_vector_operand (op0, mode0);
34212 if ((optimize && !register_operand (op0, mode0))
34213 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34214 op0 = copy_to_mode_reg (mode0, op0);
34216 op1 = GEN_INT (d->comparison);
34218 pat = GEN_FCN (d->icode) (target, op0, op1);
34219 if (! pat)
34220 return 0;
34221 emit_insn (pat);
34222 return target;
34225 static rtx
34226 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34227 tree exp, rtx target)
34229 rtx pat;
34230 tree arg0 = CALL_EXPR_ARG (exp, 0);
34231 tree arg1 = CALL_EXPR_ARG (exp, 1);
34232 rtx op0 = expand_normal (arg0);
34233 rtx op1 = expand_normal (arg1);
34234 rtx op2;
34235 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34236 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34237 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34239 if (optimize || target == 0
34240 || GET_MODE (target) != tmode
34241 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34242 target = gen_reg_rtx (tmode);
34244 op0 = safe_vector_operand (op0, mode0);
34245 op1 = safe_vector_operand (op1, mode1);
34247 if ((optimize && !register_operand (op0, mode0))
34248 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34249 op0 = copy_to_mode_reg (mode0, op0);
34250 if ((optimize && !register_operand (op1, mode1))
34251 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34252 op1 = copy_to_mode_reg (mode1, op1);
34254 op2 = GEN_INT (d->comparison);
34256 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34257 if (! pat)
34258 return 0;
34259 emit_insn (pat);
34260 return target;
34263 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34265 static rtx
34266 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34267 rtx target)
34269 rtx pat;
34270 tree arg0 = CALL_EXPR_ARG (exp, 0);
34271 tree arg1 = CALL_EXPR_ARG (exp, 1);
34272 rtx op0 = expand_normal (arg0);
34273 rtx op1 = expand_normal (arg1);
34274 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34275 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34276 enum rtx_code comparison = d->comparison;
34278 if (VECTOR_MODE_P (mode0))
34279 op0 = safe_vector_operand (op0, mode0);
34280 if (VECTOR_MODE_P (mode1))
34281 op1 = safe_vector_operand (op1, mode1);
34283 target = gen_reg_rtx (SImode);
34284 emit_move_insn (target, const0_rtx);
34285 target = gen_rtx_SUBREG (QImode, target, 0);
34287 if ((optimize && !register_operand (op0, mode0))
34288 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34289 op0 = copy_to_mode_reg (mode0, op0);
34290 if ((optimize && !register_operand (op1, mode1))
34291 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34292 op1 = copy_to_mode_reg (mode1, op1);
34294 pat = GEN_FCN (d->icode) (op0, op1);
34295 if (! pat)
34296 return 0;
34297 emit_insn (pat);
34298 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34299 gen_rtx_fmt_ee (comparison, QImode,
34300 SET_DEST (pat),
34301 const0_rtx)));
34303 return SUBREG_REG (target);
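/* Illustration (not part of the original source): a user-level example of
   a ptest builtin handled here.

     #include <smmintrin.h>
     int all_zero (__m128i a) { return _mm_testz_si128 (a, a); }

   As in the comi expansion above, the flags produced by ptest are lowered
   into a 0/1 SImode result through a STRICT_LOW_PART store of the QImode
   flag comparison.  */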
34306 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34308 static rtx
34309 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34310 tree exp, rtx target)
34312 rtx pat;
34313 tree arg0 = CALL_EXPR_ARG (exp, 0);
34314 tree arg1 = CALL_EXPR_ARG (exp, 1);
34315 tree arg2 = CALL_EXPR_ARG (exp, 2);
34316 tree arg3 = CALL_EXPR_ARG (exp, 3);
34317 tree arg4 = CALL_EXPR_ARG (exp, 4);
34318 rtx scratch0, scratch1;
34319 rtx op0 = expand_normal (arg0);
34320 rtx op1 = expand_normal (arg1);
34321 rtx op2 = expand_normal (arg2);
34322 rtx op3 = expand_normal (arg3);
34323 rtx op4 = expand_normal (arg4);
34324 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34326 tmode0 = insn_data[d->icode].operand[0].mode;
34327 tmode1 = insn_data[d->icode].operand[1].mode;
34328 modev2 = insn_data[d->icode].operand[2].mode;
34329 modei3 = insn_data[d->icode].operand[3].mode;
34330 modev4 = insn_data[d->icode].operand[4].mode;
34331 modei5 = insn_data[d->icode].operand[5].mode;
34332 modeimm = insn_data[d->icode].operand[6].mode;
34334 if (VECTOR_MODE_P (modev2))
34335 op0 = safe_vector_operand (op0, modev2);
34336 if (VECTOR_MODE_P (modev4))
34337 op2 = safe_vector_operand (op2, modev4);
34339 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34340 op0 = copy_to_mode_reg (modev2, op0);
34341 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34342 op1 = copy_to_mode_reg (modei3, op1);
34343 if ((optimize && !register_operand (op2, modev4))
34344 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34345 op2 = copy_to_mode_reg (modev4, op2);
34346 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34347 op3 = copy_to_mode_reg (modei5, op3);
34349 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34351 error ("the fifth argument must be an 8-bit immediate");
34352 return const0_rtx;
34355 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34357 if (optimize || !target
34358 || GET_MODE (target) != tmode0
34359 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34360 target = gen_reg_rtx (tmode0);
34362 scratch1 = gen_reg_rtx (tmode1);
34364 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34366 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34368 if (optimize || !target
34369 || GET_MODE (target) != tmode1
34370 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34371 target = gen_reg_rtx (tmode1);
34373 scratch0 = gen_reg_rtx (tmode0);
34375 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34377 else
34379 gcc_assert (d->flag);
34381 scratch0 = gen_reg_rtx (tmode0);
34382 scratch1 = gen_reg_rtx (tmode1);
34384 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34387 if (! pat)
34388 return 0;
34390 emit_insn (pat);
34392 if (d->flag)
34394 target = gen_reg_rtx (SImode);
34395 emit_move_insn (target, const0_rtx);
34396 target = gen_rtx_SUBREG (QImode, target, 0);
34398 emit_insn
34399 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34400 gen_rtx_fmt_ee (EQ, QImode,
34401 gen_rtx_REG ((machine_mode) d->flag,
34402 FLAGS_REG),
34403 const0_rtx)));
34404 return SUBREG_REG (target);
34406 else
34407 return target;
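/* Illustration (not part of the original source): the explicit-length
   string-compare builtins dispatched here back intrinsics such as

     #include <nmmintrin.h>
     int first_match (__m128i a, int la, __m128i b, int lb)
     {
       return _mm_cmpestri (a, la, b, lb, _SIDD_CMP_EQUAL_ANY);
     }

   IX86_BUILTIN_PCMPESTRI128 returns the index result, the ...STRM128
   variant returns the mask, and a nonzero d->flag selects one of the
   flag-reading forms (_mm_cmpestrc and friends), which is why the tail
   above rebuilds a 0/1 value from FLAGS_REG.  */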
34411 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34413 static rtx
34414 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34415 tree exp, rtx target)
34417 rtx pat;
34418 tree arg0 = CALL_EXPR_ARG (exp, 0);
34419 tree arg1 = CALL_EXPR_ARG (exp, 1);
34420 tree arg2 = CALL_EXPR_ARG (exp, 2);
34421 rtx scratch0, scratch1;
34422 rtx op0 = expand_normal (arg0);
34423 rtx op1 = expand_normal (arg1);
34424 rtx op2 = expand_normal (arg2);
34425 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34427 tmode0 = insn_data[d->icode].operand[0].mode;
34428 tmode1 = insn_data[d->icode].operand[1].mode;
34429 modev2 = insn_data[d->icode].operand[2].mode;
34430 modev3 = insn_data[d->icode].operand[3].mode;
34431 modeimm = insn_data[d->icode].operand[4].mode;
34433 if (VECTOR_MODE_P (modev2))
34434 op0 = safe_vector_operand (op0, modev2);
34435 if (VECTOR_MODE_P (modev3))
34436 op1 = safe_vector_operand (op1, modev3);
34438 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34439 op0 = copy_to_mode_reg (modev2, op0);
34440 if ((optimize && !register_operand (op1, modev3))
34441 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34442 op1 = copy_to_mode_reg (modev3, op1);
34444 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34446 error ("the third argument must be an 8-bit immediate");
34447 return const0_rtx;
34450 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34452 if (optimize || !target
34453 || GET_MODE (target) != tmode0
34454 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34455 target = gen_reg_rtx (tmode0);
34457 scratch1 = gen_reg_rtx (tmode1);
34459 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34461 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34463 if (optimize || !target
34464 || GET_MODE (target) != tmode1
34465 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34466 target = gen_reg_rtx (tmode1);
34468 scratch0 = gen_reg_rtx (tmode0);
34470 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34472 else
34474 gcc_assert (d->flag);
34476 scratch0 = gen_reg_rtx (tmode0);
34477 scratch1 = gen_reg_rtx (tmode1);
34479 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34482 if (! pat)
34483 return 0;
34485 emit_insn (pat);
34487 if (d->flag)
34489 target = gen_reg_rtx (SImode);
34490 emit_move_insn (target, const0_rtx);
34491 target = gen_rtx_SUBREG (QImode, target, 0);
34493 emit_insn
34494 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34495 gen_rtx_fmt_ee (EQ, QImode,
34496 gen_rtx_REG ((machine_mode) d->flag,
34497 FLAGS_REG),
34498 const0_rtx)));
34499 return SUBREG_REG (target);
34501 else
34502 return target;
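/* Illustration (not part of the original source): same scheme as the
   pcmpestr case above, for the implicit-length forms, e.g.
   _mm_cmpistri (a, b, _SIDD_CMP_EQUAL_EACH).  */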
34505 /* Subroutine of ix86_expand_builtin to take care of insns with
34506 variable number of operands. */
34508 static rtx
34509 ix86_expand_args_builtin (const struct builtin_description *d,
34510 tree exp, rtx target)
34512 rtx pat, real_target;
34513 unsigned int i, nargs;
34514 unsigned int nargs_constant = 0;
34515 unsigned int mask_pos = 0;
34516 int num_memory = 0;
34517 struct
34519 rtx op;
34520 machine_mode mode;
34521 } args[6];
34522 bool second_arg_count = false;
34523 enum insn_code icode = d->icode;
34524 const struct insn_data_d *insn_p = &insn_data[icode];
34525 machine_mode tmode = insn_p->operand[0].mode;
34526 machine_mode rmode = VOIDmode;
34527 bool swap = false;
34528 enum rtx_code comparison = d->comparison;
34530 switch ((enum ix86_builtin_func_type) d->flag)
34532 case V2DF_FTYPE_V2DF_ROUND:
34533 case V4DF_FTYPE_V4DF_ROUND:
34534 case V8DF_FTYPE_V8DF_ROUND:
34535 case V4SF_FTYPE_V4SF_ROUND:
34536 case V8SF_FTYPE_V8SF_ROUND:
34537 case V16SF_FTYPE_V16SF_ROUND:
34538 case V4SI_FTYPE_V4SF_ROUND:
34539 case V8SI_FTYPE_V8SF_ROUND:
34540 case V16SI_FTYPE_V16SF_ROUND:
34541 return ix86_expand_sse_round (d, exp, target);
34542 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34543 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34544 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34545 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34546 case INT_FTYPE_V8SF_V8SF_PTEST:
34547 case INT_FTYPE_V4DI_V4DI_PTEST:
34548 case INT_FTYPE_V4DF_V4DF_PTEST:
34549 case INT_FTYPE_V4SF_V4SF_PTEST:
34550 case INT_FTYPE_V2DI_V2DI_PTEST:
34551 case INT_FTYPE_V2DF_V2DF_PTEST:
34552 return ix86_expand_sse_ptest (d, exp, target);
34553 case FLOAT128_FTYPE_FLOAT128:
34554 case FLOAT_FTYPE_FLOAT:
34555 case INT_FTYPE_INT:
34556 case UINT_FTYPE_UINT:
34557 case UINT16_FTYPE_UINT16:
34558 case UINT64_FTYPE_INT:
34559 case UINT64_FTYPE_UINT64:
34560 case INT64_FTYPE_INT64:
34561 case INT64_FTYPE_V4SF:
34562 case INT64_FTYPE_V2DF:
34563 case INT_FTYPE_V16QI:
34564 case INT_FTYPE_V8QI:
34565 case INT_FTYPE_V8SF:
34566 case INT_FTYPE_V4DF:
34567 case INT_FTYPE_V4SF:
34568 case INT_FTYPE_V2DF:
34569 case INT_FTYPE_V32QI:
34570 case V16QI_FTYPE_V16QI:
34571 case V8SI_FTYPE_V8SF:
34572 case V8SI_FTYPE_V4SI:
34573 case V8HI_FTYPE_V8HI:
34574 case V8HI_FTYPE_V16QI:
34575 case V8QI_FTYPE_V8QI:
34576 case V8SF_FTYPE_V8SF:
34577 case V8SF_FTYPE_V8SI:
34578 case V8SF_FTYPE_V4SF:
34579 case V8SF_FTYPE_V8HI:
34580 case V4SI_FTYPE_V4SI:
34581 case V4SI_FTYPE_V16QI:
34582 case V4SI_FTYPE_V4SF:
34583 case V4SI_FTYPE_V8SI:
34584 case V4SI_FTYPE_V8HI:
34585 case V4SI_FTYPE_V4DF:
34586 case V4SI_FTYPE_V2DF:
34587 case V4HI_FTYPE_V4HI:
34588 case V4DF_FTYPE_V4DF:
34589 case V4DF_FTYPE_V4SI:
34590 case V4DF_FTYPE_V4SF:
34591 case V4DF_FTYPE_V2DF:
34592 case V4SF_FTYPE_V4SF:
34593 case V4SF_FTYPE_V4SI:
34594 case V4SF_FTYPE_V8SF:
34595 case V4SF_FTYPE_V4DF:
34596 case V4SF_FTYPE_V8HI:
34597 case V4SF_FTYPE_V2DF:
34598 case V2DI_FTYPE_V2DI:
34599 case V2DI_FTYPE_V16QI:
34600 case V2DI_FTYPE_V8HI:
34601 case V2DI_FTYPE_V4SI:
34602 case V2DF_FTYPE_V2DF:
34603 case V2DF_FTYPE_V4SI:
34604 case V2DF_FTYPE_V4DF:
34605 case V2DF_FTYPE_V4SF:
34606 case V2DF_FTYPE_V2SI:
34607 case V2SI_FTYPE_V2SI:
34608 case V2SI_FTYPE_V4SF:
34609 case V2SI_FTYPE_V2SF:
34610 case V2SI_FTYPE_V2DF:
34611 case V2SF_FTYPE_V2SF:
34612 case V2SF_FTYPE_V2SI:
34613 case V32QI_FTYPE_V32QI:
34614 case V32QI_FTYPE_V16QI:
34615 case V16HI_FTYPE_V16HI:
34616 case V16HI_FTYPE_V8HI:
34617 case V8SI_FTYPE_V8SI:
34618 case V16HI_FTYPE_V16QI:
34619 case V8SI_FTYPE_V16QI:
34620 case V4DI_FTYPE_V16QI:
34621 case V8SI_FTYPE_V8HI:
34622 case V4DI_FTYPE_V8HI:
34623 case V4DI_FTYPE_V4SI:
34624 case V4DI_FTYPE_V2DI:
34625 case UQI_FTYPE_UQI:
34626 case UHI_FTYPE_UHI:
34627 case USI_FTYPE_USI:
34628 case USI_FTYPE_UQI:
34629 case USI_FTYPE_UHI:
34630 case UDI_FTYPE_UDI:
34631 case UHI_FTYPE_V16QI:
34632 case USI_FTYPE_V32QI:
34633 case UDI_FTYPE_V64QI:
34634 case V16QI_FTYPE_UHI:
34635 case V32QI_FTYPE_USI:
34636 case V64QI_FTYPE_UDI:
34637 case V8HI_FTYPE_UQI:
34638 case V16HI_FTYPE_UHI:
34639 case V32HI_FTYPE_USI:
34640 case V4SI_FTYPE_UQI:
34641 case V8SI_FTYPE_UQI:
34642 case V4SI_FTYPE_UHI:
34643 case V8SI_FTYPE_UHI:
34644 case UQI_FTYPE_V8HI:
34645 case UHI_FTYPE_V16HI:
34646 case USI_FTYPE_V32HI:
34647 case UQI_FTYPE_V4SI:
34648 case UQI_FTYPE_V8SI:
34649 case UHI_FTYPE_V16SI:
34650 case UQI_FTYPE_V2DI:
34651 case UQI_FTYPE_V4DI:
34652 case UQI_FTYPE_V8DI:
34653 case V16SI_FTYPE_UHI:
34654 case V2DI_FTYPE_UQI:
34655 case V4DI_FTYPE_UQI:
34656 case V16SI_FTYPE_INT:
34657 case V16SF_FTYPE_V8SF:
34658 case V16SI_FTYPE_V8SI:
34659 case V16SF_FTYPE_V4SF:
34660 case V16SI_FTYPE_V4SI:
34661 case V16SI_FTYPE_V16SF:
34662 case V16SI_FTYPE_V16SI:
34663 case V64QI_FTYPE_V64QI:
34664 case V32HI_FTYPE_V32HI:
34665 case V16SF_FTYPE_V16SF:
34666 case V8DI_FTYPE_UQI:
34667 case V8DI_FTYPE_V8DI:
34668 case V8DF_FTYPE_V4DF:
34669 case V8DF_FTYPE_V2DF:
34670 case V8DF_FTYPE_V8DF:
34671 case V4DI_FTYPE_V4DI:
34672 nargs = 1;
34673 break;
34674 case V4SF_FTYPE_V4SF_VEC_MERGE:
34675 case V2DF_FTYPE_V2DF_VEC_MERGE:
34676 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34677 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34678 case V16QI_FTYPE_V16QI_V16QI:
34679 case V16QI_FTYPE_V8HI_V8HI:
34680 case V16SF_FTYPE_V16SF_V16SF:
34681 case V8QI_FTYPE_V8QI_V8QI:
34682 case V8QI_FTYPE_V4HI_V4HI:
34683 case V8HI_FTYPE_V8HI_V8HI:
34684 case V8HI_FTYPE_V16QI_V16QI:
34685 case V8HI_FTYPE_V4SI_V4SI:
34686 case V8SF_FTYPE_V8SF_V8SF:
34687 case V8SF_FTYPE_V8SF_V8SI:
34688 case V8DF_FTYPE_V8DF_V8DF:
34689 case V4SI_FTYPE_V4SI_V4SI:
34690 case V4SI_FTYPE_V8HI_V8HI:
34691 case V4SI_FTYPE_V2DF_V2DF:
34692 case V4HI_FTYPE_V4HI_V4HI:
34693 case V4HI_FTYPE_V8QI_V8QI:
34694 case V4HI_FTYPE_V2SI_V2SI:
34695 case V4DF_FTYPE_V4DF_V4DF:
34696 case V4DF_FTYPE_V4DF_V4DI:
34697 case V4SF_FTYPE_V4SF_V4SF:
34698 case V4SF_FTYPE_V4SF_V4SI:
34699 case V4SF_FTYPE_V4SF_V2SI:
34700 case V4SF_FTYPE_V4SF_V2DF:
34701 case V4SF_FTYPE_V4SF_UINT:
34702 case V4SF_FTYPE_V4SF_DI:
34703 case V4SF_FTYPE_V4SF_SI:
34704 case V2DI_FTYPE_V2DI_V2DI:
34705 case V2DI_FTYPE_V16QI_V16QI:
34706 case V2DI_FTYPE_V4SI_V4SI:
34707 case V2DI_FTYPE_V2DI_V16QI:
34708 case V2SI_FTYPE_V2SI_V2SI:
34709 case V2SI_FTYPE_V4HI_V4HI:
34710 case V2SI_FTYPE_V2SF_V2SF:
34711 case V2DF_FTYPE_V2DF_V2DF:
34712 case V2DF_FTYPE_V2DF_V4SF:
34713 case V2DF_FTYPE_V2DF_V2DI:
34714 case V2DF_FTYPE_V2DF_DI:
34715 case V2DF_FTYPE_V2DF_SI:
34716 case V2DF_FTYPE_V2DF_UINT:
34717 case V2SF_FTYPE_V2SF_V2SF:
34718 case V1DI_FTYPE_V1DI_V1DI:
34719 case V1DI_FTYPE_V8QI_V8QI:
34720 case V1DI_FTYPE_V2SI_V2SI:
34721 case V32QI_FTYPE_V16HI_V16HI:
34722 case V16HI_FTYPE_V8SI_V8SI:
34723 case V64QI_FTYPE_V64QI_V64QI:
34724 case V32QI_FTYPE_V32QI_V32QI:
34725 case V16HI_FTYPE_V32QI_V32QI:
34726 case V16HI_FTYPE_V16HI_V16HI:
34727 case V8SI_FTYPE_V4DF_V4DF:
34728 case V8SI_FTYPE_V8SI_V8SI:
34729 case V8SI_FTYPE_V16HI_V16HI:
34730 case V4DI_FTYPE_V4DI_V4DI:
34731 case V4DI_FTYPE_V8SI_V8SI:
34732 case V8DI_FTYPE_V64QI_V64QI:
34733 if (comparison == UNKNOWN)
34734 return ix86_expand_binop_builtin (icode, exp, target);
34735 nargs = 2;
34736 break;
34737 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34738 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34739 gcc_assert (comparison != UNKNOWN);
34740 nargs = 2;
34741 swap = true;
34742 break;
34743 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34744 case V16HI_FTYPE_V16HI_SI_COUNT:
34745 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34746 case V8SI_FTYPE_V8SI_SI_COUNT:
34747 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34748 case V4DI_FTYPE_V4DI_INT_COUNT:
34749 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34750 case V8HI_FTYPE_V8HI_SI_COUNT:
34751 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34752 case V4SI_FTYPE_V4SI_SI_COUNT:
34753 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34754 case V4HI_FTYPE_V4HI_SI_COUNT:
34755 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34756 case V2DI_FTYPE_V2DI_SI_COUNT:
34757 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34758 case V2SI_FTYPE_V2SI_SI_COUNT:
34759 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34760 case V1DI_FTYPE_V1DI_SI_COUNT:
34761 nargs = 2;
34762 second_arg_count = true;
34763 break;
34764 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34765 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34766 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34767 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34768 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34769 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34770 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34771 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34772 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34773 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34774 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34775 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34776 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34777 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34778 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34779 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34780 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34781 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34782 nargs = 4;
34783 second_arg_count = true;
34784 break;
34785 case UINT64_FTYPE_UINT64_UINT64:
34786 case UINT_FTYPE_UINT_UINT:
34787 case UINT_FTYPE_UINT_USHORT:
34788 case UINT_FTYPE_UINT_UCHAR:
34789 case UINT16_FTYPE_UINT16_INT:
34790 case UINT8_FTYPE_UINT8_INT:
34791 case UQI_FTYPE_UQI_UQI:
34792 case UHI_FTYPE_UHI_UHI:
34793 case USI_FTYPE_USI_USI:
34794 case UDI_FTYPE_UDI_UDI:
34795 case V16SI_FTYPE_V8DF_V8DF:
34796 nargs = 2;
34797 break;
34798 case V2DI_FTYPE_V2DI_INT_CONVERT:
34799 nargs = 2;
34800 rmode = V1TImode;
34801 nargs_constant = 1;
34802 break;
34803 case V4DI_FTYPE_V4DI_INT_CONVERT:
34804 nargs = 2;
34805 rmode = V2TImode;
34806 nargs_constant = 1;
34807 break;
34808 case V8DI_FTYPE_V8DI_INT_CONVERT:
34809 nargs = 2;
34810 rmode = V4TImode;
34811 nargs_constant = 1;
34812 break;
34813 case V8HI_FTYPE_V8HI_INT:
34814 case V8HI_FTYPE_V8SF_INT:
34815 case V16HI_FTYPE_V16SF_INT:
34816 case V8HI_FTYPE_V4SF_INT:
34817 case V8SF_FTYPE_V8SF_INT:
34818 case V4SF_FTYPE_V16SF_INT:
34819 case V16SF_FTYPE_V16SF_INT:
34820 case V4SI_FTYPE_V4SI_INT:
34821 case V4SI_FTYPE_V8SI_INT:
34822 case V4HI_FTYPE_V4HI_INT:
34823 case V4DF_FTYPE_V4DF_INT:
34824 case V4DF_FTYPE_V8DF_INT:
34825 case V4SF_FTYPE_V4SF_INT:
34826 case V4SF_FTYPE_V8SF_INT:
34827 case V2DI_FTYPE_V2DI_INT:
34828 case V2DF_FTYPE_V2DF_INT:
34829 case V2DF_FTYPE_V4DF_INT:
34830 case V16HI_FTYPE_V16HI_INT:
34831 case V8SI_FTYPE_V8SI_INT:
34832 case V16SI_FTYPE_V16SI_INT:
34833 case V4SI_FTYPE_V16SI_INT:
34834 case V4DI_FTYPE_V4DI_INT:
34835 case V2DI_FTYPE_V4DI_INT:
34836 case V4DI_FTYPE_V8DI_INT:
34837 case QI_FTYPE_V4SF_INT:
34838 case QI_FTYPE_V2DF_INT:
34839 case UQI_FTYPE_UQI_UQI_CONST:
34840 case UHI_FTYPE_UHI_UQI:
34841 case USI_FTYPE_USI_UQI:
34842 case UDI_FTYPE_UDI_UQI:
34843 nargs = 2;
34844 nargs_constant = 1;
34845 break;
34846 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34847 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34848 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34849 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34850 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34851 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34852 case UHI_FTYPE_V16SI_V16SI_UHI:
34853 case UQI_FTYPE_V8DI_V8DI_UQI:
34854 case V16HI_FTYPE_V16SI_V16HI_UHI:
34855 case V16QI_FTYPE_V16SI_V16QI_UHI:
34856 case V16QI_FTYPE_V8DI_V16QI_UQI:
34857 case V16SF_FTYPE_V16SF_V16SF_UHI:
34858 case V16SF_FTYPE_V4SF_V16SF_UHI:
34859 case V16SI_FTYPE_SI_V16SI_UHI:
34860 case V16SI_FTYPE_V16HI_V16SI_UHI:
34861 case V16SI_FTYPE_V16QI_V16SI_UHI:
34862 case V8SF_FTYPE_V4SF_V8SF_UQI:
34863 case V4DF_FTYPE_V2DF_V4DF_UQI:
34864 case V8SI_FTYPE_V4SI_V8SI_UQI:
34865 case V8SI_FTYPE_SI_V8SI_UQI:
34866 case V4SI_FTYPE_V4SI_V4SI_UQI:
34867 case V4SI_FTYPE_SI_V4SI_UQI:
34868 case V4DI_FTYPE_V2DI_V4DI_UQI:
34869 case V4DI_FTYPE_DI_V4DI_UQI:
34870 case V2DI_FTYPE_V2DI_V2DI_UQI:
34871 case V2DI_FTYPE_DI_V2DI_UQI:
34872 case V64QI_FTYPE_V64QI_V64QI_UDI:
34873 case V64QI_FTYPE_V16QI_V64QI_UDI:
34874 case V64QI_FTYPE_QI_V64QI_UDI:
34875 case V32QI_FTYPE_V32QI_V32QI_USI:
34876 case V32QI_FTYPE_V16QI_V32QI_USI:
34877 case V32QI_FTYPE_QI_V32QI_USI:
34878 case V16QI_FTYPE_V16QI_V16QI_UHI:
34879 case V16QI_FTYPE_QI_V16QI_UHI:
34880 case V32HI_FTYPE_V8HI_V32HI_USI:
34881 case V32HI_FTYPE_HI_V32HI_USI:
34882 case V16HI_FTYPE_V8HI_V16HI_UHI:
34883 case V16HI_FTYPE_HI_V16HI_UHI:
34884 case V8HI_FTYPE_V8HI_V8HI_UQI:
34885 case V8HI_FTYPE_HI_V8HI_UQI:
34886 case V8SF_FTYPE_V8HI_V8SF_UQI:
34887 case V4SF_FTYPE_V8HI_V4SF_UQI:
34888 case V8SI_FTYPE_V8SF_V8SI_UQI:
34889 case V4SI_FTYPE_V4SF_V4SI_UQI:
34890 case V4DI_FTYPE_V4SF_V4DI_UQI:
34891 case V2DI_FTYPE_V4SF_V2DI_UQI:
34892 case V4SF_FTYPE_V4DI_V4SF_UQI:
34893 case V4SF_FTYPE_V2DI_V4SF_UQI:
34894 case V4DF_FTYPE_V4DI_V4DF_UQI:
34895 case V2DF_FTYPE_V2DI_V2DF_UQI:
34896 case V16QI_FTYPE_V8HI_V16QI_UQI:
34897 case V16QI_FTYPE_V16HI_V16QI_UHI:
34898 case V16QI_FTYPE_V4SI_V16QI_UQI:
34899 case V16QI_FTYPE_V8SI_V16QI_UQI:
34900 case V8HI_FTYPE_V4SI_V8HI_UQI:
34901 case V8HI_FTYPE_V8SI_V8HI_UQI:
34902 case V16QI_FTYPE_V2DI_V16QI_UQI:
34903 case V16QI_FTYPE_V4DI_V16QI_UQI:
34904 case V8HI_FTYPE_V2DI_V8HI_UQI:
34905 case V8HI_FTYPE_V4DI_V8HI_UQI:
34906 case V4SI_FTYPE_V2DI_V4SI_UQI:
34907 case V4SI_FTYPE_V4DI_V4SI_UQI:
34908 case V32QI_FTYPE_V32HI_V32QI_USI:
34909 case UHI_FTYPE_V16QI_V16QI_UHI:
34910 case USI_FTYPE_V32QI_V32QI_USI:
34911 case UDI_FTYPE_V64QI_V64QI_UDI:
34912 case UQI_FTYPE_V8HI_V8HI_UQI:
34913 case UHI_FTYPE_V16HI_V16HI_UHI:
34914 case USI_FTYPE_V32HI_V32HI_USI:
34915 case UQI_FTYPE_V4SI_V4SI_UQI:
34916 case UQI_FTYPE_V8SI_V8SI_UQI:
34917 case UQI_FTYPE_V2DI_V2DI_UQI:
34918 case UQI_FTYPE_V4DI_V4DI_UQI:
34919 case V4SF_FTYPE_V2DF_V4SF_UQI:
34920 case V4SF_FTYPE_V4DF_V4SF_UQI:
34921 case V16SI_FTYPE_V16SI_V16SI_UHI:
34922 case V16SI_FTYPE_V4SI_V16SI_UHI:
34923 case V2DI_FTYPE_V4SI_V2DI_UQI:
34924 case V2DI_FTYPE_V8HI_V2DI_UQI:
34925 case V2DI_FTYPE_V16QI_V2DI_UQI:
34926 case V4DI_FTYPE_V4DI_V4DI_UQI:
34927 case V4DI_FTYPE_V4SI_V4DI_UQI:
34928 case V4DI_FTYPE_V8HI_V4DI_UQI:
34929 case V4DI_FTYPE_V16QI_V4DI_UQI:
34930 case V4DI_FTYPE_V4DF_V4DI_UQI:
34931 case V2DI_FTYPE_V2DF_V2DI_UQI:
34932 case V4SI_FTYPE_V4DF_V4SI_UQI:
34933 case V4SI_FTYPE_V2DF_V4SI_UQI:
34934 case V4SI_FTYPE_V8HI_V4SI_UQI:
34935 case V4SI_FTYPE_V16QI_V4SI_UQI:
34936 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34937 case V8DF_FTYPE_V2DF_V8DF_UQI:
34938 case V8DF_FTYPE_V4DF_V8DF_UQI:
34939 case V8DF_FTYPE_V8DF_V8DF_UQI:
34940 case V8SF_FTYPE_V8SF_V8SF_UQI:
34941 case V8SF_FTYPE_V8SI_V8SF_UQI:
34942 case V4DF_FTYPE_V4DF_V4DF_UQI:
34943 case V4SF_FTYPE_V4SF_V4SF_UQI:
34944 case V2DF_FTYPE_V2DF_V2DF_UQI:
34945 case V2DF_FTYPE_V4SF_V2DF_UQI:
34946 case V2DF_FTYPE_V4SI_V2DF_UQI:
34947 case V4SF_FTYPE_V4SI_V4SF_UQI:
34948 case V4DF_FTYPE_V4SF_V4DF_UQI:
34949 case V4DF_FTYPE_V4SI_V4DF_UQI:
34950 case V8SI_FTYPE_V8SI_V8SI_UQI:
34951 case V8SI_FTYPE_V8HI_V8SI_UQI:
34952 case V8SI_FTYPE_V16QI_V8SI_UQI:
34953 case V8DF_FTYPE_V8SI_V8DF_UQI:
34954 case V8DI_FTYPE_DI_V8DI_UQI:
34955 case V16SF_FTYPE_V8SF_V16SF_UHI:
34956 case V16SI_FTYPE_V8SI_V16SI_UHI:
34957 case V16HI_FTYPE_V16HI_V16HI_UHI:
34958 case V8HI_FTYPE_V16QI_V8HI_UQI:
34959 case V16HI_FTYPE_V16QI_V16HI_UHI:
34960 case V32HI_FTYPE_V32HI_V32HI_USI:
34961 case V32HI_FTYPE_V32QI_V32HI_USI:
34962 case V8DI_FTYPE_V16QI_V8DI_UQI:
34963 case V8DI_FTYPE_V2DI_V8DI_UQI:
34964 case V8DI_FTYPE_V4DI_V8DI_UQI:
34965 case V8DI_FTYPE_V8DI_V8DI_UQI:
34966 case V8DI_FTYPE_V8HI_V8DI_UQI:
34967 case V8DI_FTYPE_V8SI_V8DI_UQI:
34968 case V8HI_FTYPE_V8DI_V8HI_UQI:
34969 case V8SI_FTYPE_V8DI_V8SI_UQI:
34970 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34971 case V16SI_FTYPE_V16SI_V16SI_V16SI:
34972 case V8DI_FTYPE_V8DI_V8DI_V8DI:
34973 case V32HI_FTYPE_V32HI_V32HI_V32HI:
34974 case V2DI_FTYPE_V2DI_V2DI_V2DI:
34975 case V16HI_FTYPE_V16HI_V16HI_V16HI:
34976 case V8SI_FTYPE_V8SI_V8SI_V8SI:
34977 case V8HI_FTYPE_V8HI_V8HI_V8HI:
34978 nargs = 3;
34979 break;
34980 case V32QI_FTYPE_V32QI_V32QI_INT:
34981 case V16HI_FTYPE_V16HI_V16HI_INT:
34982 case V16QI_FTYPE_V16QI_V16QI_INT:
34983 case V4DI_FTYPE_V4DI_V4DI_INT:
34984 case V8HI_FTYPE_V8HI_V8HI_INT:
34985 case V8SI_FTYPE_V8SI_V8SI_INT:
34986 case V8SI_FTYPE_V8SI_V4SI_INT:
34987 case V8SF_FTYPE_V8SF_V8SF_INT:
34988 case V8SF_FTYPE_V8SF_V4SF_INT:
34989 case V4SI_FTYPE_V4SI_V4SI_INT:
34990 case V4DF_FTYPE_V4DF_V4DF_INT:
34991 case V16SF_FTYPE_V16SF_V16SF_INT:
34992 case V16SF_FTYPE_V16SF_V4SF_INT:
34993 case V16SI_FTYPE_V16SI_V4SI_INT:
34994 case V4DF_FTYPE_V4DF_V2DF_INT:
34995 case V4SF_FTYPE_V4SF_V4SF_INT:
34996 case V2DI_FTYPE_V2DI_V2DI_INT:
34997 case V4DI_FTYPE_V4DI_V2DI_INT:
34998 case V2DF_FTYPE_V2DF_V2DF_INT:
34999 case UQI_FTYPE_V8DI_V8UDI_INT:
35000 case UQI_FTYPE_V8DF_V8DF_INT:
35001 case UQI_FTYPE_V2DF_V2DF_INT:
35002 case UQI_FTYPE_V4SF_V4SF_INT:
35003 case UHI_FTYPE_V16SI_V16SI_INT:
35004 case UHI_FTYPE_V16SF_V16SF_INT:
35005 case V64QI_FTYPE_V64QI_V64QI_INT:
35006 case V32HI_FTYPE_V32HI_V32HI_INT:
35007 case V16SI_FTYPE_V16SI_V16SI_INT:
35008 case V8DI_FTYPE_V8DI_V8DI_INT:
35009 nargs = 3;
35010 nargs_constant = 1;
35011 break;
35012 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35013 nargs = 3;
35014 rmode = V4DImode;
35015 nargs_constant = 1;
35016 break;
35017 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35018 nargs = 3;
35019 rmode = V2DImode;
35020 nargs_constant = 1;
35021 break;
35022 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35023 nargs = 3;
35024 rmode = DImode;
35025 nargs_constant = 1;
35026 break;
35027 case V2DI_FTYPE_V2DI_UINT_UINT:
35028 nargs = 3;
35029 nargs_constant = 2;
35030 break;
35031 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35032 nargs = 3;
35033 rmode = V8DImode;
35034 nargs_constant = 1;
35035 break;
35036 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35037 nargs = 5;
35038 rmode = V8DImode;
35039 mask_pos = 2;
35040 nargs_constant = 1;
35041 break;
35042 case QI_FTYPE_V8DF_INT_UQI:
35043 case QI_FTYPE_V4DF_INT_UQI:
35044 case QI_FTYPE_V2DF_INT_UQI:
35045 case HI_FTYPE_V16SF_INT_UHI:
35046 case QI_FTYPE_V8SF_INT_UQI:
35047 case QI_FTYPE_V4SF_INT_UQI:
35048 case V4SI_FTYPE_V4SI_V4SI_UHI:
35049 case V8SI_FTYPE_V8SI_V8SI_UHI:
35050 nargs = 3;
35051 mask_pos = 1;
35052 nargs_constant = 1;
35053 break;
35054 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35055 nargs = 5;
35056 rmode = V4DImode;
35057 mask_pos = 2;
35058 nargs_constant = 1;
35059 break;
35060 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35061 nargs = 5;
35062 rmode = V2DImode;
35063 mask_pos = 2;
35064 nargs_constant = 1;
35065 break;
35066 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35067 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35068 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35069 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35070 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35071 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35072 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35073 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35074 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35075 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35076 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35077 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35078 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35079 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35080 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35081 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35082 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35083 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35084 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35085 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35086 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35087 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35088 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35089 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35090 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35091 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35092 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35093 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35094 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35095 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35096 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35097 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35098 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35099 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35100 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35101 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35102 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35103 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35104 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35105 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35106 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35107 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35108 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35109 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35110 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35111 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35112 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35113 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35114 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35115 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35116 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35117 nargs = 4;
35118 break;
35119 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35120 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35121 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35122 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35123 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35124 nargs = 4;
35125 nargs_constant = 1;
35126 break;
35127 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35128 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35129 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35130 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35131 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35132 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35133 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35134 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35135 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35136 case USI_FTYPE_V32QI_V32QI_INT_USI:
35137 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35138 case USI_FTYPE_V32HI_V32HI_INT_USI:
35139 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35140 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35141 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
35142 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
35143 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
35144 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
35145 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
35146 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
35147 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
35148 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
35149 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
35150 nargs = 4;
35151 mask_pos = 1;
35152 nargs_constant = 1;
35153 break;
35154 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35155 nargs = 4;
35156 nargs_constant = 2;
35157 break;
35158 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35159 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35160 nargs = 4;
35161 break;
35162 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35163 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35164 mask_pos = 1;
35165 nargs = 4;
35166 nargs_constant = 1;
35167 break;
35168 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35169 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35170 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35171 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35172 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35173 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35174 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35175 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35176 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35177 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35178 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35179 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35180 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35181 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35182 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35183 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35184 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35185 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35186 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35187 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35188 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35189 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35190 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35191 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35192 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35193 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35194 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35195 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35196 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35197 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35198 nargs = 4;
35199 mask_pos = 2;
35200 nargs_constant = 1;
35201 break;
35202 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35203 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35204 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35205 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35206 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35207 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35208 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35209 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35210 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35211 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35212 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35213 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35214 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35215 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35216 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35217 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35218 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35219 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35220 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35221 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35222 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35223 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35224 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35225 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35226 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35227 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35228 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35229 nargs = 5;
35230 mask_pos = 2;
35231 nargs_constant = 1;
35232 break;
35233 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35234 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35235 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35236 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35237 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35238 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35239 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35240 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35241 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35242 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35243 nargs = 5;
35244 mask_pos = 1;
35245 nargs_constant = 1;
35246 break;
35247 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35248 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35249 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35250 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35251 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35252 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35253 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35254 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35255 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35256 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35257 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35258 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35259 nargs = 5;
35260 mask_pos = 1;
35261 nargs_constant = 2;
35262 break;
35264 default:
35265 gcc_unreachable ();
35268 gcc_assert (nargs <= ARRAY_SIZE (args));
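/* Illustration (not part of the original source): the
   ix86_builtin_func_type names above encode the operand layout.
   V8SF_FTYPE_V8SF_INT_V8SF_UQI, for instance, is an insn taking a vector
   source, an 8-bit immediate, a merge/pass-through vector and a write
   mask, hence nargs = 4, mask_pos = 2 and nargs_constant = 1; those three
   values tell the argument loop below which operand must come in as a
   CONST_INT.  */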
35270 if (comparison != UNKNOWN)
35272 gcc_assert (nargs == 2);
35273 return ix86_expand_sse_compare (d, exp, target, swap);
35276 if (rmode == VOIDmode || rmode == tmode)
35278 if (optimize
35279 || target == 0
35280 || GET_MODE (target) != tmode
35281 || !insn_p->operand[0].predicate (target, tmode))
35282 target = gen_reg_rtx (tmode);
35283 else if (memory_operand (target, tmode))
35284 num_memory++;
35285 real_target = target;
35287 else
35289 real_target = gen_reg_rtx (tmode);
35290 target = lowpart_subreg (rmode, real_target, tmode);
35293 for (i = 0; i < nargs; i++)
35295 tree arg = CALL_EXPR_ARG (exp, i);
35296 rtx op = expand_normal (arg);
35297 machine_mode mode = insn_p->operand[i + 1].mode;
35298 bool match = insn_p->operand[i + 1].predicate (op, mode);
35300 if (second_arg_count && i == 1)
35302 /* SIMD shift insns take either an 8-bit immediate or a
35303 register as the count.  But the builtin functions take an
35304 int as the count.  If the count doesn't match, put it in a
35305 register.  The instructions use a 64-bit count; if op is
35306 only 32 bits wide, zero-extend it, as negative shift counts
35307 are undefined behavior and zero extension is more
35308 efficient. */
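/* Illustration (not part of the original source): e.g. a shift builtin
   such as the one behind _mm_slli_epi32 (V4SI_FTYPE_V4SI_SI_COUNT) passes
   its count as a 32-bit int; if it is not already an acceptable immediate
   it is zero-extended to the insn's count mode here before being forced
   into a register.  */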
35309 if (!match)
35311 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35312 op = convert_modes (mode, GET_MODE (op), op, 1);
35313 else
35314 op = lowpart_subreg (mode, op, GET_MODE (op));
35315 if (!insn_p->operand[i + 1].predicate (op, mode))
35316 op = copy_to_reg (op);
35319 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35320 (!mask_pos && (nargs - i) <= nargs_constant))
35322 if (!match)
35323 switch (icode)
35325 case CODE_FOR_avx_vinsertf128v4di:
35326 case CODE_FOR_avx_vextractf128v4di:
35327 error ("the last argument must be a 1-bit immediate");
35328 return const0_rtx;
35330 case CODE_FOR_avx512f_cmpv8di3_mask:
35331 case CODE_FOR_avx512f_cmpv16si3_mask:
35332 case CODE_FOR_avx512f_ucmpv8di3_mask:
35333 case CODE_FOR_avx512f_ucmpv16si3_mask:
35334 case CODE_FOR_avx512vl_cmpv4di3_mask:
35335 case CODE_FOR_avx512vl_cmpv8si3_mask:
35336 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35337 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35338 case CODE_FOR_avx512vl_cmpv2di3_mask:
35339 case CODE_FOR_avx512vl_cmpv4si3_mask:
35340 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35341 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35342 error ("the last argument must be a 3-bit immediate");
35343 return const0_rtx;
35345 case CODE_FOR_sse4_1_roundsd:
35346 case CODE_FOR_sse4_1_roundss:
35348 case CODE_FOR_sse4_1_roundpd:
35349 case CODE_FOR_sse4_1_roundps:
35350 case CODE_FOR_avx_roundpd256:
35351 case CODE_FOR_avx_roundps256:
35353 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35354 case CODE_FOR_sse4_1_roundps_sfix:
35355 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35356 case CODE_FOR_avx_roundps_sfix256:
35358 case CODE_FOR_sse4_1_blendps:
35359 case CODE_FOR_avx_blendpd256:
35360 case CODE_FOR_avx_vpermilv4df:
35361 case CODE_FOR_avx_vpermilv4df_mask:
35362 case CODE_FOR_avx512f_getmantv8df_mask:
35363 case CODE_FOR_avx512f_getmantv16sf_mask:
35364 case CODE_FOR_avx512vl_getmantv8sf_mask:
35365 case CODE_FOR_avx512vl_getmantv4df_mask:
35366 case CODE_FOR_avx512vl_getmantv4sf_mask:
35367 case CODE_FOR_avx512vl_getmantv2df_mask:
35368 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35369 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35370 case CODE_FOR_avx512dq_rangepv4df_mask:
35371 case CODE_FOR_avx512dq_rangepv8sf_mask:
35372 case CODE_FOR_avx512dq_rangepv2df_mask:
35373 case CODE_FOR_avx512dq_rangepv4sf_mask:
35374 case CODE_FOR_avx_shufpd256_mask:
35375 error ("the last argument must be a 4-bit immediate");
35376 return const0_rtx;
35378 case CODE_FOR_sha1rnds4:
35379 case CODE_FOR_sse4_1_blendpd:
35380 case CODE_FOR_avx_vpermilv2df:
35381 case CODE_FOR_avx_vpermilv2df_mask:
35382 case CODE_FOR_xop_vpermil2v2df3:
35383 case CODE_FOR_xop_vpermil2v4sf3:
35384 case CODE_FOR_xop_vpermil2v4df3:
35385 case CODE_FOR_xop_vpermil2v8sf3:
35386 case CODE_FOR_avx512f_vinsertf32x4_mask:
35387 case CODE_FOR_avx512f_vinserti32x4_mask:
35388 case CODE_FOR_avx512f_vextractf32x4_mask:
35389 case CODE_FOR_avx512f_vextracti32x4_mask:
35390 case CODE_FOR_sse2_shufpd:
35391 case CODE_FOR_sse2_shufpd_mask:
35392 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35393 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35394 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35395 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35396 error ("the last argument must be a 2-bit immediate");
35397 return const0_rtx;
35399 case CODE_FOR_avx_vextractf128v4df:
35400 case CODE_FOR_avx_vextractf128v8sf:
35401 case CODE_FOR_avx_vextractf128v8si:
35402 case CODE_FOR_avx_vinsertf128v4df:
35403 case CODE_FOR_avx_vinsertf128v8sf:
35404 case CODE_FOR_avx_vinsertf128v8si:
35405 case CODE_FOR_avx512f_vinsertf64x4_mask:
35406 case CODE_FOR_avx512f_vinserti64x4_mask:
35407 case CODE_FOR_avx512f_vextractf64x4_mask:
35408 case CODE_FOR_avx512f_vextracti64x4_mask:
35409 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35410 case CODE_FOR_avx512dq_vinserti32x8_mask:
35411 case CODE_FOR_avx512vl_vinsertv4df:
35412 case CODE_FOR_avx512vl_vinsertv4di:
35413 case CODE_FOR_avx512vl_vinsertv8sf:
35414 case CODE_FOR_avx512vl_vinsertv8si:
35415 error ("the last argument must be a 1-bit immediate");
35416 return const0_rtx;
35418 case CODE_FOR_avx_vmcmpv2df3:
35419 case CODE_FOR_avx_vmcmpv4sf3:
35420 case CODE_FOR_avx_cmpv2df3:
35421 case CODE_FOR_avx_cmpv4sf3:
35422 case CODE_FOR_avx_cmpv4df3:
35423 case CODE_FOR_avx_cmpv8sf3:
35424 case CODE_FOR_avx512f_cmpv8df3_mask:
35425 case CODE_FOR_avx512f_cmpv16sf3_mask:
35426 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35427 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35428 error ("the last argument must be a 5-bit immediate");
35429 return const0_rtx;
35431 default:
35432 switch (nargs_constant)
35434 case 2:
35435 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35436 (!mask_pos && (nargs - i) == nargs_constant))
35438 error ("the next to last argument must be an 8-bit immediate");
35439 break;
35441 /* FALLTHRU */
35442 case 1:
35443 error ("the last argument must be an 8-bit immediate");
35444 break;
35445 default:
35446 gcc_unreachable ();
35448 return const0_rtx;
35451 else
35453 if (VECTOR_MODE_P (mode))
35454 op = safe_vector_operand (op, mode);
35456 /* If we aren't optimizing, only allow one memory operand to
35457 be generated. */
35458 if (memory_operand (op, mode))
35459 num_memory++;
35461 op = fixup_modeless_constant (op, mode);
35463 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35465 if (optimize || !match || num_memory > 1)
35466 op = copy_to_mode_reg (mode, op);
35468 else
35470 op = copy_to_reg (op);
35471 op = lowpart_subreg (mode, op, GET_MODE (op));
35475 args[i].op = op;
35476 args[i].mode = mode;
35479 switch (nargs)
35481 case 1:
35482 pat = GEN_FCN (icode) (real_target, args[0].op);
35483 break;
35484 case 2:
35485 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35486 break;
35487 case 3:
35488 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35489 args[2].op);
35490 break;
35491 case 4:
35492 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35493 args[2].op, args[3].op);
35494 break;
35495 case 5:
35496 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35497 args[2].op, args[3].op, args[4].op);
35498 break;
35499 case 6:
35500 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35501 args[2].op, args[3].op, args[4].op,
35502 args[5].op);
35503 break;
35504 default:
35505 gcc_unreachable ();
35508 if (! pat)
35509 return 0;
35511 emit_insn (pat);
35512 return target;
35515 /* Transform a pattern of the following layout:
35516 (set A
35517 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35519 into:
35520 (set A B).  */
35522 static rtx
35523 ix86_erase_embedded_rounding (rtx pat)
35525 if (GET_CODE (pat) == INSN)
35526 pat = PATTERN (pat);
35528 gcc_assert (GET_CODE (pat) == SET);
35529 rtx src = SET_SRC (pat);
35530 gcc_assert (XVECLEN (src, 0) == 2);
35531 rtx p0 = XVECEXP (src, 0, 0);
35532 gcc_assert (GET_CODE (src) == UNSPEC
35533 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35534 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35535 return res;
35538 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35539 with rounding. */
35540 static rtx
35541 ix86_expand_sse_comi_round (const struct builtin_description *d,
35542 tree exp, rtx target)
35544 rtx pat, set_dst;
35545 tree arg0 = CALL_EXPR_ARG (exp, 0);
35546 tree arg1 = CALL_EXPR_ARG (exp, 1);
35547 tree arg2 = CALL_EXPR_ARG (exp, 2);
35548 tree arg3 = CALL_EXPR_ARG (exp, 3);
35549 rtx op0 = expand_normal (arg0);
35550 rtx op1 = expand_normal (arg1);
35551 rtx op2 = expand_normal (arg2);
35552 rtx op3 = expand_normal (arg3);
35553 enum insn_code icode = d->icode;
35554 const struct insn_data_d *insn_p = &insn_data[icode];
35555 machine_mode mode0 = insn_p->operand[0].mode;
35556 machine_mode mode1 = insn_p->operand[1].mode;
35557 enum rtx_code comparison = UNEQ;
35558 bool need_ucomi = false;
35560 /* See avxintrin.h for values. */
35561 enum rtx_code comi_comparisons[32] =
35563 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35564 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35565 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35567 bool need_ucomi_values[32] =
35569 true, false, false, true, true, false, false, true,
35570 true, false, false, true, true, false, false, true,
35571 false, true, true, false, false, true, true, false,
35572 false, true, true, false, false, true, true, false
35575 if (!CONST_INT_P (op2))
35577 error ("the third argument must be a comparison constant");
35578 return const0_rtx;
35580 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35582 error ("incorrect comparison mode");
35583 return const0_rtx;
35586 if (!insn_p->operand[2].predicate (op3, SImode))
35588 error ("incorrect rounding operand");
35589 return const0_rtx;
35592 comparison = comi_comparisons[INTVAL (op2)];
35593 need_ucomi = need_ucomi_values[INTVAL (op2)];
35595 if (VECTOR_MODE_P (mode0))
35596 op0 = safe_vector_operand (op0, mode0);
35597 if (VECTOR_MODE_P (mode1))
35598 op1 = safe_vector_operand (op1, mode1);
35600 target = gen_reg_rtx (SImode);
35601 emit_move_insn (target, const0_rtx);
35602 target = gen_rtx_SUBREG (QImode, target, 0);
35604 if ((optimize && !register_operand (op0, mode0))
35605 || !insn_p->operand[0].predicate (op0, mode0))
35606 op0 = copy_to_mode_reg (mode0, op0);
35607 if ((optimize && !register_operand (op1, mode1))
35608 || !insn_p->operand[1].predicate (op1, mode1))
35609 op1 = copy_to_mode_reg (mode1, op1);
35611 if (need_ucomi)
35612 icode = icode == CODE_FOR_sse_comi_round
35613 ? CODE_FOR_sse_ucomi_round
35614 : CODE_FOR_sse2_ucomi_round;
35616 pat = GEN_FCN (icode) (op0, op1, op3);
35617 if (! pat)
35618 return 0;
35620 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35621 if (INTVAL (op3) == NO_ROUND)
35623 pat = ix86_erase_embedded_rounding (pat);
35624 if (! pat)
35625 return 0;
35627 set_dst = SET_DEST (pat);
35629 else
35631 gcc_assert (GET_CODE (pat) == SET);
35632 set_dst = SET_DEST (pat);
35635 emit_insn (pat);
35636 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35637 gen_rtx_fmt_ee (comparison, QImode,
35638 set_dst,
35639 const0_rtx)));
35641 return SUBREG_REG (target);
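/* Illustration (not part of the original source): op2 above is one of the
   32 _CMP_* predicates from avxintrin.h; _CMP_EQ_OQ (0), for example, is
   mapped by the tables to UNEQ with the non-signaling ucomi form.  op3 is
   the SAE operand, which at this point is presumably either
   _MM_FROUND_CUR_DIRECTION (NO_ROUND) or _MM_FROUND_NO_EXC (ROUND_SAE).  */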
35644 static rtx
35645 ix86_expand_round_builtin (const struct builtin_description *d,
35646 tree exp, rtx target)
35648 rtx pat;
35649 unsigned int i, nargs;
35650 struct
35652 rtx op;
35653 machine_mode mode;
35654 } args[6];
35655 enum insn_code icode = d->icode;
35656 const struct insn_data_d *insn_p = &insn_data[icode];
35657 machine_mode tmode = insn_p->operand[0].mode;
35658 unsigned int nargs_constant = 0;
35659 unsigned int redundant_embed_rnd = 0;
35661 switch ((enum ix86_builtin_func_type) d->flag)
35663 case UINT64_FTYPE_V2DF_INT:
35664 case UINT64_FTYPE_V4SF_INT:
35665 case UINT_FTYPE_V2DF_INT:
35666 case UINT_FTYPE_V4SF_INT:
35667 case INT64_FTYPE_V2DF_INT:
35668 case INT64_FTYPE_V4SF_INT:
35669 case INT_FTYPE_V2DF_INT:
35670 case INT_FTYPE_V4SF_INT:
35671 nargs = 2;
35672 break;
35673 case V4SF_FTYPE_V4SF_UINT_INT:
35674 case V4SF_FTYPE_V4SF_UINT64_INT:
35675 case V2DF_FTYPE_V2DF_UINT64_INT:
35676 case V4SF_FTYPE_V4SF_INT_INT:
35677 case V4SF_FTYPE_V4SF_INT64_INT:
35678 case V2DF_FTYPE_V2DF_INT64_INT:
35679 case V4SF_FTYPE_V4SF_V4SF_INT:
35680 case V2DF_FTYPE_V2DF_V2DF_INT:
35681 case V4SF_FTYPE_V4SF_V2DF_INT:
35682 case V2DF_FTYPE_V2DF_V4SF_INT:
35683 nargs = 3;
35684 break;
35685 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35686 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35687 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35688 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35689 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35690 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35691 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35692 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35693 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35694 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35695 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35696 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35697 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35698 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35699 nargs = 4;
35700 break;
35701 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35702 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35703 nargs_constant = 2;
35704 nargs = 4;
35705 break;
35706 case INT_FTYPE_V4SF_V4SF_INT_INT:
35707 case INT_FTYPE_V2DF_V2DF_INT_INT:
35708 return ix86_expand_sse_comi_round (d, exp, target);
35709 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35710 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35711 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35712 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35713 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35714 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35715 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35716 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35717 nargs = 5;
35718 break;
35719 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35720 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35721 nargs_constant = 4;
35722 nargs = 5;
35723 break;
35724 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35725 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35726 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35727 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35728 nargs_constant = 3;
35729 nargs = 5;
35730 break;
35731 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35732 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35733 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35734 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35735 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35736 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35737 nargs = 6;
35738 nargs_constant = 4;
35739 break;
35740 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35741 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35742 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35743 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35744 nargs = 6;
35745 nargs_constant = 3;
35746 break;
35747 default:
35748 gcc_unreachable ();
35750 gcc_assert (nargs <= ARRAY_SIZE (args));
35752 if (optimize
35753 || target == 0
35754 || GET_MODE (target) != tmode
35755 || !insn_p->operand[0].predicate (target, tmode))
35756 target = gen_reg_rtx (tmode);
35758 for (i = 0; i < nargs; i++)
35760 tree arg = CALL_EXPR_ARG (exp, i);
35761 rtx op = expand_normal (arg);
35762 machine_mode mode = insn_p->operand[i + 1].mode;
35763 bool match = insn_p->operand[i + 1].predicate (op, mode);
35765 if (i == nargs - nargs_constant)
35767 if (!match)
35769 switch (icode)
35771 case CODE_FOR_avx512f_getmantv8df_mask_round:
35772 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35773 case CODE_FOR_avx512f_vgetmantv2df_round:
35774 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35775 case CODE_FOR_avx512f_vgetmantv4sf_round:
35776 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35777 error ("the immediate argument must be a 4-bit immediate");
35778 return const0_rtx;
35779 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35780 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35781 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35782 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35783 error ("the immediate argument must be a 5-bit immediate");
35784 return const0_rtx;
35785 default:
35786 error ("the immediate argument must be an 8-bit immediate");
35787 return const0_rtx;
35791 else if (i == nargs-1)
35793 if (!insn_p->operand[nargs].predicate (op, SImode))
35795 error ("incorrect rounding operand");
35796 return const0_rtx;
35799 /* If there is no rounding, use the normal version of the pattern. */
35800 if (INTVAL (op) == NO_ROUND)
35801 redundant_embed_rnd = 1;
35803 else
35805 if (VECTOR_MODE_P (mode))
35806 op = safe_vector_operand (op, mode);
35808 op = fixup_modeless_constant (op, mode);
35810 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35812 if (optimize || !match)
35813 op = copy_to_mode_reg (mode, op);
35815 else
35817 op = copy_to_reg (op);
35818 op = lowpart_subreg (mode, op, GET_MODE (op));
35822 args[i].op = op;
35823 args[i].mode = mode;
35826 switch (nargs)
35828 case 1:
35829 pat = GEN_FCN (icode) (target, args[0].op);
35830 break;
35831 case 2:
35832 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35833 break;
35834 case 3:
35835 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35836 args[2].op);
35837 break;
35838 case 4:
35839 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35840 args[2].op, args[3].op);
35841 break;
35842 case 5:
35843 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35844 args[2].op, args[3].op, args[4].op);
35845 break;
35846 case 6:
35847 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35848 args[2].op, args[3].op, args[4].op,
35849 args[5].op);
35850 break;
35851 default:
35852 gcc_unreachable ();
35855 if (!pat)
35856 return 0;
35858 if (redundant_embed_rnd)
35859 pat = ix86_erase_embedded_rounding (pat);
35861 emit_insn (pat);
35862 return target;
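/* Illustration (not part of the original source): every builtin routed
   through this function carries the rounding/SAE control as its last
   argument, e.g.

     #include <immintrin.h>
     __m512d add_rne (__m512d x, __m512d y)
     {
       return _mm512_add_round_pd (x, y, _MM_FROUND_TO_NEAREST_INT
					 | _MM_FROUND_NO_EXC);
     }

   When that argument is NO_ROUND (presumably _MM_FROUND_CUR_DIRECTION),
   redundant_embed_rnd is set and the UNSPEC_EMBEDDED_ROUNDING wrapper is
   stripped again by ix86_erase_embedded_rounding, falling back to the
   plain pattern.  */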
35865 /* Subroutine of ix86_expand_builtin to take care of special insns
35866 with variable number of operands. */
35868 static rtx
35869 ix86_expand_special_args_builtin (const struct builtin_description *d,
35870 tree exp, rtx target)
35872 tree arg;
35873 rtx pat, op;
35874 unsigned int i, nargs, arg_adjust, memory;
35875 bool aligned_mem = false;
35876 struct
35878 rtx op;
35879 machine_mode mode;
35880 } args[3];
35881 enum insn_code icode = d->icode;
35882 bool last_arg_constant = false;
35883 const struct insn_data_d *insn_p = &insn_data[icode];
35884 machine_mode tmode = insn_p->operand[0].mode;
35885 enum { load, store } klass;
35887 switch ((enum ix86_builtin_func_type) d->flag)
35889 case VOID_FTYPE_VOID:
35890 emit_insn (GEN_FCN (icode) (target));
35891 return 0;
35892 case VOID_FTYPE_UINT64:
35893 case VOID_FTYPE_UNSIGNED:
35894 nargs = 0;
35895 klass = store;
35896 memory = 0;
35897 break;
35899 case INT_FTYPE_VOID:
35900 case USHORT_FTYPE_VOID:
35901 case UINT64_FTYPE_VOID:
35902 case UINT_FTYPE_VOID:
35903 case UNSIGNED_FTYPE_VOID:
35904 nargs = 0;
35905 klass = load;
35906 memory = 0;
35907 break;
35908 case UINT64_FTYPE_PUNSIGNED:
35909 case V2DI_FTYPE_PV2DI:
35910 case V4DI_FTYPE_PV4DI:
35911 case V32QI_FTYPE_PCCHAR:
35912 case V16QI_FTYPE_PCCHAR:
35913 case V8SF_FTYPE_PCV4SF:
35914 case V8SF_FTYPE_PCFLOAT:
35915 case V4SF_FTYPE_PCFLOAT:
35916 case V4DF_FTYPE_PCV2DF:
35917 case V4DF_FTYPE_PCDOUBLE:
35918 case V2DF_FTYPE_PCDOUBLE:
35919 case VOID_FTYPE_PVOID:
35920 case V8DI_FTYPE_PV8DI:
35921 nargs = 1;
35922 klass = load;
35923 memory = 0;
35924 switch (icode)
35926 case CODE_FOR_sse4_1_movntdqa:
35927 case CODE_FOR_avx2_movntdqa:
35928 case CODE_FOR_avx512f_movntdqa:
35929 aligned_mem = true;
35930 break;
35931 default:
35932 break;
35934 break;
35935 case VOID_FTYPE_PV2SF_V4SF:
35936 case VOID_FTYPE_PV8DI_V8DI:
35937 case VOID_FTYPE_PV4DI_V4DI:
35938 case VOID_FTYPE_PV2DI_V2DI:
35939 case VOID_FTYPE_PCHAR_V32QI:
35940 case VOID_FTYPE_PCHAR_V16QI:
35941 case VOID_FTYPE_PFLOAT_V16SF:
35942 case VOID_FTYPE_PFLOAT_V8SF:
35943 case VOID_FTYPE_PFLOAT_V4SF:
35944 case VOID_FTYPE_PDOUBLE_V8DF:
35945 case VOID_FTYPE_PDOUBLE_V4DF:
35946 case VOID_FTYPE_PDOUBLE_V2DF:
35947 case VOID_FTYPE_PLONGLONG_LONGLONG:
35948 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35949 case VOID_FTYPE_PINT_INT:
35950 nargs = 1;
35951 klass = store;
35952 /* Reserve memory operand for target. */
35953 memory = ARRAY_SIZE (args);
35954 switch (icode)
35956 /* These builtins and instructions require the memory
35957 to be properly aligned. */
35958 case CODE_FOR_avx_movntv4di:
35959 case CODE_FOR_sse2_movntv2di:
35960 case CODE_FOR_avx_movntv8sf:
35961 case CODE_FOR_sse_movntv4sf:
35962 case CODE_FOR_sse4a_vmmovntv4sf:
35963 case CODE_FOR_avx_movntv4df:
35964 case CODE_FOR_sse2_movntv2df:
35965 case CODE_FOR_sse4a_vmmovntv2df:
35966 case CODE_FOR_sse2_movntidi:
35967 case CODE_FOR_sse_movntq:
35968 case CODE_FOR_sse2_movntisi:
35969 case CODE_FOR_avx512f_movntv16sf:
35970 case CODE_FOR_avx512f_movntv8df:
35971 case CODE_FOR_avx512f_movntv8di:
35972 aligned_mem = true;
35973 break;
35974 default:
35975 break;
35977 break;
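/* Illustration (not part of the original source): the aligned_mem cases
   above are the non-temporal stores, whose intrinsics require a suitably
   aligned destination, e.g.

     #include <xmmintrin.h>
     void spill (float *p, __m128 v)
     {
       _mm_stream_ps (p, v);	/* p must be 16-byte aligned.  */
     }

   aligned_mem is checked further down so the generated MEM can be marked
   with the alignment these instructions expect rather than the
   conservative default.  */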
35978 case V4SF_FTYPE_V4SF_PCV2SF:
35979 case V2DF_FTYPE_V2DF_PCDOUBLE:
35980 nargs = 2;
35981 klass = load;
35982 memory = 1;
35983 break;
35984 case V8SF_FTYPE_PCV8SF_V8SI:
35985 case V4DF_FTYPE_PCV4DF_V4DI:
35986 case V4SF_FTYPE_PCV4SF_V4SI:
35987 case V2DF_FTYPE_PCV2DF_V2DI:
35988 case V8SI_FTYPE_PCV8SI_V8SI:
35989 case V4DI_FTYPE_PCV4DI_V4DI:
35990 case V4SI_FTYPE_PCV4SI_V4SI:
35991 case V2DI_FTYPE_PCV2DI_V2DI:
35992 case VOID_FTYPE_INT_INT64:
35993 nargs = 2;
35994 klass = load;
35995 memory = 0;
35996 break;
35997 case VOID_FTYPE_PV8DF_V8DF_UQI:
35998 case VOID_FTYPE_PV4DF_V4DF_UQI:
35999 case VOID_FTYPE_PV2DF_V2DF_UQI:
36000 case VOID_FTYPE_PV16SF_V16SF_UHI:
36001 case VOID_FTYPE_PV8SF_V8SF_UQI:
36002 case VOID_FTYPE_PV4SF_V4SF_UQI:
36003 case VOID_FTYPE_PV8DI_V8DI_UQI:
36004 case VOID_FTYPE_PV4DI_V4DI_UQI:
36005 case VOID_FTYPE_PV2DI_V2DI_UQI:
36006 case VOID_FTYPE_PV16SI_V16SI_UHI:
36007 case VOID_FTYPE_PV8SI_V8SI_UQI:
36008 case VOID_FTYPE_PV4SI_V4SI_UQI:
36009 case VOID_FTYPE_PV64QI_V64QI_UDI:
36010 case VOID_FTYPE_PV32HI_V32HI_USI:
36011 case VOID_FTYPE_PV32QI_V32QI_USI:
36012 case VOID_FTYPE_PV16QI_V16QI_UHI:
36013 case VOID_FTYPE_PV16HI_V16HI_UHI:
36014 case VOID_FTYPE_PV8HI_V8HI_UQI:
36015 switch (icode)
36017 /* These builtins and instructions require the memory
36018 to be properly aligned. */
36019 case CODE_FOR_avx512f_storev16sf_mask:
36020 case CODE_FOR_avx512f_storev16si_mask:
36021 case CODE_FOR_avx512f_storev8df_mask:
36022 case CODE_FOR_avx512f_storev8di_mask:
36023 case CODE_FOR_avx512vl_storev8sf_mask:
36024 case CODE_FOR_avx512vl_storev8si_mask:
36025 case CODE_FOR_avx512vl_storev4df_mask:
36026 case CODE_FOR_avx512vl_storev4di_mask:
36027 case CODE_FOR_avx512vl_storev4sf_mask:
36028 case CODE_FOR_avx512vl_storev4si_mask:
36029 case CODE_FOR_avx512vl_storev2df_mask:
36030 case CODE_FOR_avx512vl_storev2di_mask:
36031 aligned_mem = true;
36032 break;
36033 default:
36034 break;
36036 /* FALLTHRU */
36037 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36038 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36039 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36040 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36041 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36042 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36043 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36044 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36045 case VOID_FTYPE_PV8SI_V8DI_UQI:
36046 case VOID_FTYPE_PV8HI_V8DI_UQI:
36047 case VOID_FTYPE_PV16HI_V16SI_UHI:
36048 case VOID_FTYPE_PV16QI_V8DI_UQI:
36049 case VOID_FTYPE_PV16QI_V16SI_UHI:
36050 case VOID_FTYPE_PV4SI_V4DI_UQI:
36051 case VOID_FTYPE_PV4SI_V2DI_UQI:
36052 case VOID_FTYPE_PV8HI_V4DI_UQI:
36053 case VOID_FTYPE_PV8HI_V2DI_UQI:
36054 case VOID_FTYPE_PV8HI_V8SI_UQI:
36055 case VOID_FTYPE_PV8HI_V4SI_UQI:
36056 case VOID_FTYPE_PV16QI_V4DI_UQI:
36057 case VOID_FTYPE_PV16QI_V2DI_UQI:
36058 case VOID_FTYPE_PV16QI_V8SI_UQI:
36059 case VOID_FTYPE_PV16QI_V4SI_UQI:
36060 case VOID_FTYPE_PCHAR_V64QI_UDI:
36061 case VOID_FTYPE_PCHAR_V32QI_USI:
36062 case VOID_FTYPE_PCHAR_V16QI_UHI:
36063 case VOID_FTYPE_PSHORT_V32HI_USI:
36064 case VOID_FTYPE_PSHORT_V16HI_UHI:
36065 case VOID_FTYPE_PSHORT_V8HI_UQI:
36066 case VOID_FTYPE_PINT_V16SI_UHI:
36067 case VOID_FTYPE_PINT_V8SI_UQI:
36068 case VOID_FTYPE_PINT_V4SI_UQI:
36069 case VOID_FTYPE_PINT64_V8DI_UQI:
36070 case VOID_FTYPE_PINT64_V4DI_UQI:
36071 case VOID_FTYPE_PINT64_V2DI_UQI:
36072 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36073 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36074 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36075 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36076 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36077 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36078 case VOID_FTYPE_PV32QI_V32HI_USI:
36079 case VOID_FTYPE_PV16QI_V16HI_UHI:
36080 case VOID_FTYPE_PV8QI_V8HI_UQI:
36081 nargs = 2;
36082 klass = store;
36083 /* Reserve memory operand for target. */
36084 memory = ARRAY_SIZE (args);
36085 break;
36086 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36087 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36088 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36089 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36090 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36091 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36092 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36093 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36094 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36095 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36096 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36097 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36098 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
36099 case V32HI_FTYPE_PCV32HI_V32HI_USI:
36100 case V32QI_FTYPE_PCV32QI_V32QI_USI:
36101 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
36102 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
36103 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
36104 switch (icode)
36106 /* These builtins and instructions require the memory
36107 to be properly aligned. */
36108 case CODE_FOR_avx512f_loadv16sf_mask:
36109 case CODE_FOR_avx512f_loadv16si_mask:
36110 case CODE_FOR_avx512f_loadv8df_mask:
36111 case CODE_FOR_avx512f_loadv8di_mask:
36112 case CODE_FOR_avx512vl_loadv8sf_mask:
36113 case CODE_FOR_avx512vl_loadv8si_mask:
36114 case CODE_FOR_avx512vl_loadv4df_mask:
36115 case CODE_FOR_avx512vl_loadv4di_mask:
36116 case CODE_FOR_avx512vl_loadv4sf_mask:
36117 case CODE_FOR_avx512vl_loadv4si_mask:
36118 case CODE_FOR_avx512vl_loadv2df_mask:
36119 case CODE_FOR_avx512vl_loadv2di_mask:
36120 case CODE_FOR_avx512bw_loadv64qi_mask:
36121 case CODE_FOR_avx512vl_loadv32qi_mask:
36122 case CODE_FOR_avx512vl_loadv16qi_mask:
36123 case CODE_FOR_avx512bw_loadv32hi_mask:
36124 case CODE_FOR_avx512vl_loadv16hi_mask:
36125 case CODE_FOR_avx512vl_loadv8hi_mask:
36126 aligned_mem = true;
36127 break;
36128 default:
36129 break;
36131 /* FALLTHRU */
36132 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36133 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36134 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36135 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36136 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36137 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36138 case V16SI_FTYPE_PCINT_V16SI_UHI:
36139 case V8SI_FTYPE_PCINT_V8SI_UQI:
36140 case V4SI_FTYPE_PCINT_V4SI_UQI:
36141 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36142 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36143 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36144 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36145 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36146 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36147 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36148 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36149 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36150 nargs = 3;
36151 klass = load;
36152 memory = 0;
36153 break;
36154 case VOID_FTYPE_UINT_UINT_UINT:
36155 case VOID_FTYPE_UINT64_UINT_UINT:
36156 case UCHAR_FTYPE_UINT_UINT_UINT:
36157 case UCHAR_FTYPE_UINT64_UINT_UINT:
36158 nargs = 3;
36159 klass = load;
36160 memory = ARRAY_SIZE (args);
36161 last_arg_constant = true;
36162 break;
36163 default:
36164 gcc_unreachable ();
36167 gcc_assert (nargs <= ARRAY_SIZE (args));
36169 if (klass == store)
36171 arg = CALL_EXPR_ARG (exp, 0);
36172 op = expand_normal (arg);
36173 gcc_assert (target == 0);
36174 if (memory)
36176 op = ix86_zero_extend_to_Pmode (op);
36177 target = gen_rtx_MEM (tmode, op);
36178 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36179 on it. Try to improve it using get_pointer_alignment,
36180 and if the special builtin is one that requires strict
36181 mode alignment, also from its GET_MODE_ALIGNMENT.
36182 Failure to do so could lead to ix86_legitimate_combined_insn
36183 rejecting all changes to such insns. */
36184 unsigned int align = get_pointer_alignment (arg);
36185 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36186 align = GET_MODE_ALIGNMENT (tmode);
36187 if (MEM_ALIGN (target) < align)
36188 set_mem_align (target, align);
36190 else
36191 target = force_reg (tmode, op);
36192 arg_adjust = 1;
36194 else
36196 arg_adjust = 0;
36197 if (optimize
36198 || target == 0
36199 || !register_operand (target, tmode)
36200 || GET_MODE (target) != tmode)
36201 target = gen_reg_rtx (tmode);
36204 for (i = 0; i < nargs; i++)
36206 machine_mode mode = insn_p->operand[i + 1].mode;
36207 bool match;
36209 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36210 op = expand_normal (arg);
36211 match = insn_p->operand[i + 1].predicate (op, mode);
36213 if (last_arg_constant && (i + 1) == nargs)
36215 if (!match)
36217 if (icode == CODE_FOR_lwp_lwpvalsi3
36218 || icode == CODE_FOR_lwp_lwpinssi3
36219 || icode == CODE_FOR_lwp_lwpvaldi3
36220 || icode == CODE_FOR_lwp_lwpinsdi3)
36221 error ("the last argument must be a 32-bit immediate");
36222 else
36223 error ("the last argument must be an 8-bit immediate");
36224 return const0_rtx;
36227 else
36229 if (i == memory)
36231 /* This must be the memory operand. */
36232 op = ix86_zero_extend_to_Pmode (op);
36233 op = gen_rtx_MEM (mode, op);
36234 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36235 on it. Try to improve it using get_pointer_alignment,
36236 and if the special builtin is one that requires strict
36237 mode alignment, also from its GET_MODE_ALIGNMENT.
36238 Failure to do so could lead to ix86_legitimate_combined_insn
36239 rejecting all changes to such insns. */
36240 unsigned int align = get_pointer_alignment (arg);
36241 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36242 align = GET_MODE_ALIGNMENT (mode);
36243 if (MEM_ALIGN (op) < align)
36244 set_mem_align (op, align);
36246 else
36248 /* This must be a register operand. */
36249 if (VECTOR_MODE_P (mode))
36250 op = safe_vector_operand (op, mode);
36252 op = fixup_modeless_constant (op, mode);
36254 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36255 op = copy_to_mode_reg (mode, op);
36256 else
36258 op = copy_to_reg (op);
36259 op = lowpart_subreg (mode, op, GET_MODE (op));
36264 args[i].op = op;
36265 args[i].mode = mode;
36268 switch (nargs)
36270 case 0:
36271 pat = GEN_FCN (icode) (target);
36272 break;
36273 case 1:
36274 pat = GEN_FCN (icode) (target, args[0].op);
36275 break;
36276 case 2:
36277 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36278 break;
36279 case 3:
36280 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36281 break;
36282 default:
36283 gcc_unreachable ();
36286 if (! pat)
36287 return 0;
36288 emit_insn (pat);
36289 return klass == store ? 0 : target;
36292 /* Return the integer constant in ARG. Constrain it to be in the range
36293 of the subparts of VEC_TYPE; issue an error if not. */
36295 static int
36296 get_element_number (tree vec_type, tree arg)
36298 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36300 if (!tree_fits_uhwi_p (arg)
36301 || (elt = tree_to_uhwi (arg), elt > max))
36303 error ("selector must be an integer constant in the range 0..%wi", max);
36304 return 0;
36307 return elt;
36310 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36311 ix86_expand_vector_init. We DO have language-level syntax for this, in
36312 the form of (type){ init-list }. Except that since we can't place emms
36313 instructions from inside the compiler, we can't allow the use of MMX
36314 registers unless the user explicitly asks for it. So we do *not* define
36315 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36316 we have builtins invoked by mmintrin.h that give us license to emit
36317 these sorts of instructions. */
36319 static rtx
36320 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36322 machine_mode tmode = TYPE_MODE (type);
36323 machine_mode inner_mode = GET_MODE_INNER (tmode);
36324 int i, n_elt = GET_MODE_NUNITS (tmode);
36325 rtvec v = rtvec_alloc (n_elt);
36327 gcc_assert (VECTOR_MODE_P (tmode));
36328 gcc_assert (call_expr_nargs (exp) == n_elt);
36330 for (i = 0; i < n_elt; ++i)
36332 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36333 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36336 if (!target || !register_operand (target, tmode))
36337 target = gen_reg_rtx (tmode);
36339 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36340 return target;
36343 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36344 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36345 had a language-level syntax for referencing vector elements. */
36347 static rtx
36348 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36350 machine_mode tmode, mode0;
36351 tree arg0, arg1;
36352 int elt;
36353 rtx op0;
36355 arg0 = CALL_EXPR_ARG (exp, 0);
36356 arg1 = CALL_EXPR_ARG (exp, 1);
36358 op0 = expand_normal (arg0);
36359 elt = get_element_number (TREE_TYPE (arg0), arg1);
36361 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36362 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36363 gcc_assert (VECTOR_MODE_P (mode0));
36365 op0 = force_reg (mode0, op0);
36367 if (optimize || !target || !register_operand (target, tmode))
36368 target = gen_reg_rtx (tmode);
36370 ix86_expand_vector_extract (true, target, op0, elt);
36372 return target;
36375 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36376 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36377 a language-level syntax for referencing vector elements. */
36379 static rtx
36380 ix86_expand_vec_set_builtin (tree exp)
36382 machine_mode tmode, mode1;
36383 tree arg0, arg1, arg2;
36384 int elt;
36385 rtx op0, op1, target;
36387 arg0 = CALL_EXPR_ARG (exp, 0);
36388 arg1 = CALL_EXPR_ARG (exp, 1);
36389 arg2 = CALL_EXPR_ARG (exp, 2);
36391 tmode = TYPE_MODE (TREE_TYPE (arg0));
36392 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36393 gcc_assert (VECTOR_MODE_P (tmode));
36395 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36396 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36397 elt = get_element_number (TREE_TYPE (arg0), arg2);
36399 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36400 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36402 op0 = force_reg (tmode, op0);
36403 op1 = force_reg (mode1, op1);
36405 /* OP0 is the source of these builtin functions and shouldn't be
36406 modified. Create a copy, use it and return it as target. */
36407 target = gen_reg_rtx (tmode);
36408 emit_move_insn (target, op0);
36409 ix86_expand_vector_set (true, target, op1, elt);
36411 return target;
36414 /* Emit conditional move of SRC to DST with condition
36415 OP1 CODE OP2. */
36416 static void
36417 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36419 rtx t;
36421 if (TARGET_CMOVE)
36423 t = ix86_expand_compare (code, op1, op2);
36424 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36425 src, dst)));
36427 else
36429 rtx_code_label *nomove = gen_label_rtx ();
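/* Without CMOV, branch around the move: jump to NOMOVE when the reversed
   condition holds, so DST is overwritten only when OP1 CODE OP2 is true.  */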
36430 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36431 const0_rtx, GET_MODE (op1), 1, nomove);
36432 emit_move_insn (dst, src);
36433 emit_label (nomove);
36437 /* Choose the unsigned maximum of DST and SRC and put it into DST. */
36438 static void
36439 ix86_emit_move_max (rtx dst, rtx src)
36441 ix86_emit_cmove (dst, src, LTU, dst, src);
36444 /* Expand an expression EXP that calls a built-in function,
36445 with result going to TARGET if that's convenient
36446 (and in mode MODE if that's convenient).
36447 SUBTARGET may be used as the target for computing one of EXP's operands.
36448 IGNORE is nonzero if the value is to be ignored. */
36450 static rtx
36451 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36452 machine_mode mode, int ignore)
36454 size_t i;
36455 enum insn_code icode, icode2;
36456 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36457 tree arg0, arg1, arg2, arg3, arg4;
36458 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36459 machine_mode mode0, mode1, mode2, mode3, mode4;
36460 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36462 /* For CPU builtins that can be folded, fold first and expand the fold. */
36463 switch (fcode)
36465 case IX86_BUILTIN_CPU_INIT:
36467 /* Make it call __cpu_indicator_init in libgcc. */
36468 tree call_expr, fndecl, type;
36469 type = build_function_type_list (integer_type_node, NULL_TREE);
36470 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36471 call_expr = build_call_expr (fndecl, 0);
36472 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36474 case IX86_BUILTIN_CPU_IS:
36475 case IX86_BUILTIN_CPU_SUPPORTS:
36477 tree arg0 = CALL_EXPR_ARG (exp, 0);
36478 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36479 gcc_assert (fold_expr != NULL_TREE);
36480 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36484 HOST_WIDE_INT isa = ix86_isa_flags;
36485 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36486 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36487 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36488 /* The general case is we require all the ISAs specified in bisa{,2}
36489 to be enabled.
36490 The exceptions are:
36491 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36492 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36493 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36494 where for each such pair it is sufficient if either of the ISAs is
36495 enabled; if the pair is ORed with other options, those must be enabled too. */
36496 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36497 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36498 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36499 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36500 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36501 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36502 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36503 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36504 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36505 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36506 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36507 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36508 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36510 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36511 (enum fpmath_unit) 0, false);
36512 if (!opts)
36513 error ("%qE needs unknown isa option", fndecl);
36514 else
36516 gcc_assert (opts != NULL);
36517 error ("%qE needs isa option %s", fndecl, opts);
36518 free (opts);
36520 return expand_call (exp, target, ignore);
36523 switch (fcode)
36525 case IX86_BUILTIN_BNDMK:
36526 if (!target
36527 || GET_MODE (target) != BNDmode
36528 || !register_operand (target, BNDmode))
36529 target = gen_reg_rtx (BNDmode);
36531 arg0 = CALL_EXPR_ARG (exp, 0);
36532 arg1 = CALL_EXPR_ARG (exp, 1);
36534 op0 = expand_normal (arg0);
36535 op1 = expand_normal (arg1);
36537 if (!register_operand (op0, Pmode))
36538 op0 = ix86_zero_extend_to_Pmode (op0);
36539 if (!register_operand (op1, Pmode))
36540 op1 = ix86_zero_extend_to_Pmode (op1);
36542 /* Builtin arg1 is the size of the block, but instruction op1 should
36543 be (size - 1). */
36544 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36545 NULL_RTX, 1, OPTAB_DIRECT);
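/* E.g. for a 16-byte object ARG1 is 16 and the instruction operand becomes
   15, the offset of the last valid byte.  */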
36547 emit_insn (BNDmode == BND64mode
36548 ? gen_bnd64_mk (target, op0, op1)
36549 : gen_bnd32_mk (target, op0, op1));
36550 return target;
36552 case IX86_BUILTIN_BNDSTX:
36553 arg0 = CALL_EXPR_ARG (exp, 0);
36554 arg1 = CALL_EXPR_ARG (exp, 1);
36555 arg2 = CALL_EXPR_ARG (exp, 2);
36557 op0 = expand_normal (arg0);
36558 op1 = expand_normal (arg1);
36559 op2 = expand_normal (arg2);
36561 if (!register_operand (op0, Pmode))
36562 op0 = ix86_zero_extend_to_Pmode (op0);
36563 if (!register_operand (op1, BNDmode))
36564 op1 = copy_to_mode_reg (BNDmode, op1);
36565 if (!register_operand (op2, Pmode))
36566 op2 = ix86_zero_extend_to_Pmode (op2);
36568 emit_insn (BNDmode == BND64mode
36569 ? gen_bnd64_stx (op2, op0, op1)
36570 : gen_bnd32_stx (op2, op0, op1));
36571 return 0;
36573 case IX86_BUILTIN_BNDLDX:
36574 if (!target
36575 || GET_MODE (target) != BNDmode
36576 || !register_operand (target, BNDmode))
36577 target = gen_reg_rtx (BNDmode);
36579 arg0 = CALL_EXPR_ARG (exp, 0);
36580 arg1 = CALL_EXPR_ARG (exp, 1);
36582 op0 = expand_normal (arg0);
36583 op1 = expand_normal (arg1);
36585 if (!register_operand (op0, Pmode))
36586 op0 = ix86_zero_extend_to_Pmode (op0);
36587 if (!register_operand (op1, Pmode))
36588 op1 = ix86_zero_extend_to_Pmode (op1);
36590 emit_insn (BNDmode == BND64mode
36591 ? gen_bnd64_ldx (target, op0, op1)
36592 : gen_bnd32_ldx (target, op0, op1));
36593 return target;
36595 case IX86_BUILTIN_BNDCL:
36596 arg0 = CALL_EXPR_ARG (exp, 0);
36597 arg1 = CALL_EXPR_ARG (exp, 1);
36599 op0 = expand_normal (arg0);
36600 op1 = expand_normal (arg1);
36602 if (!register_operand (op0, Pmode))
36603 op0 = ix86_zero_extend_to_Pmode (op0);
36604 if (!register_operand (op1, BNDmode))
36605 op1 = copy_to_mode_reg (BNDmode, op1);
36607 emit_insn (BNDmode == BND64mode
36608 ? gen_bnd64_cl (op1, op0)
36609 : gen_bnd32_cl (op1, op0));
36610 return 0;
36612 case IX86_BUILTIN_BNDCU:
36613 arg0 = CALL_EXPR_ARG (exp, 0);
36614 arg1 = CALL_EXPR_ARG (exp, 1);
36616 op0 = expand_normal (arg0);
36617 op1 = expand_normal (arg1);
36619 if (!register_operand (op0, Pmode))
36620 op0 = ix86_zero_extend_to_Pmode (op0);
36621 if (!register_operand (op1, BNDmode))
36622 op1 = copy_to_mode_reg (BNDmode, op1);
36624 emit_insn (BNDmode == BND64mode
36625 ? gen_bnd64_cu (op1, op0)
36626 : gen_bnd32_cu (op1, op0));
36627 return 0;
36629 case IX86_BUILTIN_BNDRET:
36630 arg0 = CALL_EXPR_ARG (exp, 0);
36631 target = chkp_get_rtl_bounds (arg0);
36633 /* If no bounds were specified for the returned value,
36634 then use INIT bounds. This usually happens when
36635 some built-in function is expanded. */
36636 if (!target)
36638 rtx t1 = gen_reg_rtx (Pmode);
36639 rtx t2 = gen_reg_rtx (Pmode);
36640 target = gen_reg_rtx (BNDmode);
36641 emit_move_insn (t1, const0_rtx);
36642 emit_move_insn (t2, constm1_rtx);
36643 emit_insn (BNDmode == BND64mode
36644 ? gen_bnd64_mk (target, t1, t2)
36645 : gen_bnd32_mk (target, t1, t2));
36648 gcc_assert (target && REG_P (target));
36649 return target;
36651 case IX86_BUILTIN_BNDNARROW:
36653 rtx m1, m1h1, m1h2, lb, ub, t1;
36655 /* Return value and lb. */
36656 arg0 = CALL_EXPR_ARG (exp, 0);
36657 /* Bounds. */
36658 arg1 = CALL_EXPR_ARG (exp, 1);
36659 /* Size. */
36660 arg2 = CALL_EXPR_ARG (exp, 2);
36662 lb = expand_normal (arg0);
36663 op1 = expand_normal (arg1);
36664 op2 = expand_normal (arg2);
36666 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36667 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36668 NULL_RTX, 1, OPTAB_DIRECT);
36670 /* Add LB to the size and invert to get UB. */
36671 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36672 op2, 1, OPTAB_DIRECT);
36673 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
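/* UB now holds ~(LB + size - 1); upper bounds are kept in one's complement
   form, as noted below.  */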
36675 if (!register_operand (lb, Pmode))
36676 lb = ix86_zero_extend_to_Pmode (lb);
36677 if (!register_operand (ub, Pmode))
36678 ub = ix86_zero_extend_to_Pmode (ub);
36680 /* We need to move bounds to memory before any computations. */
36681 if (MEM_P (op1))
36682 m1 = op1;
36683 else
36685 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36686 emit_move_insn (m1, op1);
36689 /* Generate mem expression to be used for access to LB and UB. */
36690 m1h1 = adjust_address (m1, Pmode, 0);
36691 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36693 t1 = gen_reg_rtx (Pmode);
36695 /* Compute LB. */
36696 emit_move_insn (t1, m1h1);
36697 ix86_emit_move_max (t1, lb);
36698 emit_move_insn (m1h1, t1);
36700 /* Compute UB. UB is stored in 1's complement form. Therefore
36701 we also use max here. */
36702 emit_move_insn (t1, m1h2);
36703 ix86_emit_move_max (t1, ub);
36704 emit_move_insn (m1h2, t1);
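/* Because the stored values are complemented, the unsigned maximum
   corresponds to the smaller real upper bound, i.e. the narrowed one.  */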
36706 op2 = gen_reg_rtx (BNDmode);
36707 emit_move_insn (op2, m1);
36709 return chkp_join_splitted_slot (lb, op2);
36712 case IX86_BUILTIN_BNDINT:
36714 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36716 if (!target
36717 || GET_MODE (target) != BNDmode
36718 || !register_operand (target, BNDmode))
36719 target = gen_reg_rtx (BNDmode);
36721 arg0 = CALL_EXPR_ARG (exp, 0);
36722 arg1 = CALL_EXPR_ARG (exp, 1);
36724 op0 = expand_normal (arg0);
36725 op1 = expand_normal (arg1);
36727 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36728 rh1 = adjust_address (res, Pmode, 0);
36729 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36731 /* Put the first bounds into temporaries. */
36732 lb1 = gen_reg_rtx (Pmode);
36733 ub1 = gen_reg_rtx (Pmode);
36734 if (MEM_P (op0))
36736 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36737 emit_move_insn (ub1, adjust_address (op0, Pmode,
36738 GET_MODE_SIZE (Pmode)));
36740 else
36742 emit_move_insn (res, op0);
36743 emit_move_insn (lb1, rh1);
36744 emit_move_insn (ub1, rh2);
36747 /* Put the second bounds into temporaries. */
36748 lb2 = gen_reg_rtx (Pmode);
36749 ub2 = gen_reg_rtx (Pmode);
36750 if (MEM_P (op1))
36752 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36753 emit_move_insn (ub2, adjust_address (op1, Pmode,
36754 GET_MODE_SIZE (Pmode)));
36756 else
36758 emit_move_insn (res, op1);
36759 emit_move_insn (lb2, rh1);
36760 emit_move_insn (ub2, rh2);
36763 /* Compute LB. */
36764 ix86_emit_move_max (lb1, lb2);
36765 emit_move_insn (rh1, lb1);
36767 /* Compute UB. UB is stored in 1's complement form. Therefore
36768 we also use max here. */
36769 ix86_emit_move_max (ub1, ub2);
36770 emit_move_insn (rh2, ub1);
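/* The result is [max (LB1, LB2), min (UB1, UB2)], i.e. the intersection
   of the two input bound ranges.  */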
36772 emit_move_insn (target, res);
36774 return target;
36777 case IX86_BUILTIN_SIZEOF:
36779 tree name;
36780 rtx symbol;
36782 if (!target
36783 || GET_MODE (target) != Pmode
36784 || !register_operand (target, Pmode))
36785 target = gen_reg_rtx (Pmode);
36787 arg0 = CALL_EXPR_ARG (exp, 0);
36788 gcc_assert (VAR_P (arg0));
36790 name = DECL_ASSEMBLER_NAME (arg0);
36791 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36793 emit_insn (Pmode == SImode
36794 ? gen_move_size_reloc_si (target, symbol)
36795 : gen_move_size_reloc_di (target, symbol));
36797 return target;
36800 case IX86_BUILTIN_BNDLOWER:
36802 rtx mem, hmem;
36804 if (!target
36805 || GET_MODE (target) != Pmode
36806 || !register_operand (target, Pmode))
36807 target = gen_reg_rtx (Pmode);
36809 arg0 = CALL_EXPR_ARG (exp, 0);
36810 op0 = expand_normal (arg0);
36812 /* We need to move bounds to memory first. */
36813 if (MEM_P (op0))
36814 mem = op0;
36815 else
36817 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36818 emit_move_insn (mem, op0);
36821 /* Generate mem expression to access LB and load it. */
36822 hmem = adjust_address (mem, Pmode, 0);
36823 emit_move_insn (target, hmem);
36825 return target;
36828 case IX86_BUILTIN_BNDUPPER:
36830 rtx mem, hmem, res;
36832 if (!target
36833 || GET_MODE (target) != Pmode
36834 || !register_operand (target, Pmode))
36835 target = gen_reg_rtx (Pmode);
36837 arg0 = CALL_EXPR_ARG (exp, 0);
36838 op0 = expand_normal (arg0);
36840 /* We need to move bounds to memory first. */
36841 if (MEM_P (op0))
36842 mem = op0;
36843 else
36845 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36846 emit_move_insn (mem, op0);
36849 /* Generate mem expression to access UB. */
36850 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36852 /* We need to invert all bits of UB. */
36853 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36855 if (res != target)
36856 emit_move_insn (target, res);
36858 return target;
36861 case IX86_BUILTIN_MASKMOVQ:
36862 case IX86_BUILTIN_MASKMOVDQU:
36863 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36864 ? CODE_FOR_mmx_maskmovq
36865 : CODE_FOR_sse2_maskmovdqu);
36866 /* Note the arg order is different from the operand order. */
36867 arg1 = CALL_EXPR_ARG (exp, 0);
36868 arg2 = CALL_EXPR_ARG (exp, 1);
36869 arg0 = CALL_EXPR_ARG (exp, 2);
36870 op0 = expand_normal (arg0);
36871 op1 = expand_normal (arg1);
36872 op2 = expand_normal (arg2);
36873 mode0 = insn_data[icode].operand[0].mode;
36874 mode1 = insn_data[icode].operand[1].mode;
36875 mode2 = insn_data[icode].operand[2].mode;
36877 op0 = ix86_zero_extend_to_Pmode (op0);
36878 op0 = gen_rtx_MEM (mode1, op0);
36880 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36881 op0 = copy_to_mode_reg (mode0, op0);
36882 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36883 op1 = copy_to_mode_reg (mode1, op1);
36884 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36885 op2 = copy_to_mode_reg (mode2, op2);
36886 pat = GEN_FCN (icode) (op0, op1, op2);
36887 if (! pat)
36888 return 0;
36889 emit_insn (pat);
36890 return 0;
36892 case IX86_BUILTIN_LDMXCSR:
36893 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36894 target = assign_386_stack_local (SImode, SLOT_TEMP);
36895 emit_move_insn (target, op0);
36896 emit_insn (gen_sse_ldmxcsr (target));
36897 return 0;
36899 case IX86_BUILTIN_STMXCSR:
36900 target = assign_386_stack_local (SImode, SLOT_TEMP);
36901 emit_insn (gen_sse_stmxcsr (target));
36902 return copy_to_mode_reg (SImode, target);
36904 case IX86_BUILTIN_CLFLUSH:
36905 arg0 = CALL_EXPR_ARG (exp, 0);
36906 op0 = expand_normal (arg0);
36907 icode = CODE_FOR_sse2_clflush;
36908 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36909 op0 = ix86_zero_extend_to_Pmode (op0);
36911 emit_insn (gen_sse2_clflush (op0));
36912 return 0;
36914 case IX86_BUILTIN_CLWB:
36915 arg0 = CALL_EXPR_ARG (exp, 0);
36916 op0 = expand_normal (arg0);
36917 icode = CODE_FOR_clwb;
36918 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36919 op0 = ix86_zero_extend_to_Pmode (op0);
36921 emit_insn (gen_clwb (op0));
36922 return 0;
36924 case IX86_BUILTIN_CLFLUSHOPT:
36925 arg0 = CALL_EXPR_ARG (exp, 0);
36926 op0 = expand_normal (arg0);
36927 icode = CODE_FOR_clflushopt;
36928 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36929 op0 = ix86_zero_extend_to_Pmode (op0);
36931 emit_insn (gen_clflushopt (op0));
36932 return 0;
36934 case IX86_BUILTIN_MONITOR:
36935 case IX86_BUILTIN_MONITORX:
36936 arg0 = CALL_EXPR_ARG (exp, 0);
36937 arg1 = CALL_EXPR_ARG (exp, 1);
36938 arg2 = CALL_EXPR_ARG (exp, 2);
36939 op0 = expand_normal (arg0);
36940 op1 = expand_normal (arg1);
36941 op2 = expand_normal (arg2);
36942 if (!REG_P (op0))
36943 op0 = ix86_zero_extend_to_Pmode (op0);
36944 if (!REG_P (op1))
36945 op1 = copy_to_mode_reg (SImode, op1);
36946 if (!REG_P (op2))
36947 op2 = copy_to_mode_reg (SImode, op2);
36949 emit_insn (fcode == IX86_BUILTIN_MONITOR
36950 ? ix86_gen_monitor (op0, op1, op2)
36951 : ix86_gen_monitorx (op0, op1, op2));
36952 return 0;
36954 case IX86_BUILTIN_MWAIT:
36955 arg0 = CALL_EXPR_ARG (exp, 0);
36956 arg1 = CALL_EXPR_ARG (exp, 1);
36957 op0 = expand_normal (arg0);
36958 op1 = expand_normal (arg1);
36959 if (!REG_P (op0))
36960 op0 = copy_to_mode_reg (SImode, op0);
36961 if (!REG_P (op1))
36962 op1 = copy_to_mode_reg (SImode, op1);
36963 emit_insn (gen_sse3_mwait (op0, op1));
36964 return 0;
36966 case IX86_BUILTIN_MWAITX:
36967 arg0 = CALL_EXPR_ARG (exp, 0);
36968 arg1 = CALL_EXPR_ARG (exp, 1);
36969 arg2 = CALL_EXPR_ARG (exp, 2);
36970 op0 = expand_normal (arg0);
36971 op1 = expand_normal (arg1);
36972 op2 = expand_normal (arg2);
36973 if (!REG_P (op0))
36974 op0 = copy_to_mode_reg (SImode, op0);
36975 if (!REG_P (op1))
36976 op1 = copy_to_mode_reg (SImode, op1);
36977 if (!REG_P (op2))
36978 op2 = copy_to_mode_reg (SImode, op2);
36979 emit_insn (gen_mwaitx (op0, op1, op2));
36980 return 0;
36982 case IX86_BUILTIN_CLZERO:
36983 arg0 = CALL_EXPR_ARG (exp, 0);
36984 op0 = expand_normal (arg0);
36985 if (!REG_P (op0))
36986 op0 = ix86_zero_extend_to_Pmode (op0);
36987 emit_insn (ix86_gen_clzero (op0));
36988 return 0;
36990 case IX86_BUILTIN_VEC_INIT_V2SI:
36991 case IX86_BUILTIN_VEC_INIT_V4HI:
36992 case IX86_BUILTIN_VEC_INIT_V8QI:
36993 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36995 case IX86_BUILTIN_VEC_EXT_V2DF:
36996 case IX86_BUILTIN_VEC_EXT_V2DI:
36997 case IX86_BUILTIN_VEC_EXT_V4SF:
36998 case IX86_BUILTIN_VEC_EXT_V4SI:
36999 case IX86_BUILTIN_VEC_EXT_V8HI:
37000 case IX86_BUILTIN_VEC_EXT_V2SI:
37001 case IX86_BUILTIN_VEC_EXT_V4HI:
37002 case IX86_BUILTIN_VEC_EXT_V16QI:
37003 return ix86_expand_vec_ext_builtin (exp, target);
37005 case IX86_BUILTIN_VEC_SET_V2DI:
37006 case IX86_BUILTIN_VEC_SET_V4SF:
37007 case IX86_BUILTIN_VEC_SET_V4SI:
37008 case IX86_BUILTIN_VEC_SET_V8HI:
37009 case IX86_BUILTIN_VEC_SET_V4HI:
37010 case IX86_BUILTIN_VEC_SET_V16QI:
37011 return ix86_expand_vec_set_builtin (exp);
37013 case IX86_BUILTIN_NANQ:
37014 case IX86_BUILTIN_NANSQ:
37015 return expand_call (exp, target, ignore);
37017 case IX86_BUILTIN_RDPID:
37019 op0 = gen_reg_rtx (TARGET_64BIT ? DImode : SImode);
37021 if (TARGET_64BIT)
37023 insn = gen_rdpid_rex64 (op0);
37024 op0 = convert_to_mode (SImode, op0, 1);
37026 else
37027 insn = gen_rdpid (op0);
37028 emit_insn (insn);
37030 if (target == 0)
37032 /* mode is VOIDmode if __builtin_rdpid has been called
37033 without lhs. */
37034 if (mode == VOIDmode)
37035 return target;
37036 target = gen_reg_rtx (mode);
37038 emit_move_insn (target, op0);
37039 return target;
37040 case IX86_BUILTIN_RDPMC:
37041 case IX86_BUILTIN_RDTSC:
37042 case IX86_BUILTIN_RDTSCP:
37043 case IX86_BUILTIN_XGETBV:
37045 op0 = gen_reg_rtx (DImode);
37046 op1 = gen_reg_rtx (DImode);
37048 if (fcode == IX86_BUILTIN_RDPMC)
37050 arg0 = CALL_EXPR_ARG (exp, 0);
37051 op2 = expand_normal (arg0);
37052 if (!register_operand (op2, SImode))
37053 op2 = copy_to_mode_reg (SImode, op2);
37055 insn = (TARGET_64BIT
37056 ? gen_rdpmc_rex64 (op0, op1, op2)
37057 : gen_rdpmc (op0, op2));
37058 emit_insn (insn);
37060 else if (fcode == IX86_BUILTIN_XGETBV)
37062 arg0 = CALL_EXPR_ARG (exp, 0);
37063 op2 = expand_normal (arg0);
37064 if (!register_operand (op2, SImode))
37065 op2 = copy_to_mode_reg (SImode, op2);
37067 insn = (TARGET_64BIT
37068 ? gen_xgetbv_rex64 (op0, op1, op2)
37069 : gen_xgetbv (op0, op2));
37070 emit_insn (insn);
37072 else if (fcode == IX86_BUILTIN_RDTSC)
37074 insn = (TARGET_64BIT
37075 ? gen_rdtsc_rex64 (op0, op1)
37076 : gen_rdtsc (op0));
37077 emit_insn (insn);
37079 else
37081 op2 = gen_reg_rtx (SImode);
37083 insn = (TARGET_64BIT
37084 ? gen_rdtscp_rex64 (op0, op1, op2)
37085 : gen_rdtscp (op0, op2));
37086 emit_insn (insn);
37088 arg0 = CALL_EXPR_ARG (exp, 0);
37089 op4 = expand_normal (arg0);
37090 if (!address_operand (op4, VOIDmode))
37092 op4 = convert_memory_address (Pmode, op4);
37093 op4 = copy_addr_to_reg (op4);
37095 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37098 if (target == 0)
37100 /* mode is VOIDmode if __builtin_rd* has been called
37101 without lhs. */
37102 if (mode == VOIDmode)
37103 return target;
37104 target = gen_reg_rtx (mode);
37107 if (TARGET_64BIT)
37109 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37110 op1, 1, OPTAB_DIRECT);
37111 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37112 op0, 1, OPTAB_DIRECT);
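/* RDPMC/RDTSC/RDTSCP/XGETBV return the low half of the result in EAX and
   the high half in EDX; on 64-bit targets combine the two halves into a
   single DImode value.  */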
37115 emit_move_insn (target, op0);
37116 return target;
37118 case IX86_BUILTIN_FXSAVE:
37119 case IX86_BUILTIN_FXRSTOR:
37120 case IX86_BUILTIN_FXSAVE64:
37121 case IX86_BUILTIN_FXRSTOR64:
37122 case IX86_BUILTIN_FNSTENV:
37123 case IX86_BUILTIN_FLDENV:
37124 mode0 = BLKmode;
37125 switch (fcode)
37127 case IX86_BUILTIN_FXSAVE:
37128 icode = CODE_FOR_fxsave;
37129 break;
37130 case IX86_BUILTIN_FXRSTOR:
37131 icode = CODE_FOR_fxrstor;
37132 break;
37133 case IX86_BUILTIN_FXSAVE64:
37134 icode = CODE_FOR_fxsave64;
37135 break;
37136 case IX86_BUILTIN_FXRSTOR64:
37137 icode = CODE_FOR_fxrstor64;
37138 break;
37139 case IX86_BUILTIN_FNSTENV:
37140 icode = CODE_FOR_fnstenv;
37141 break;
37142 case IX86_BUILTIN_FLDENV:
37143 icode = CODE_FOR_fldenv;
37144 break;
37145 default:
37146 gcc_unreachable ();
37149 arg0 = CALL_EXPR_ARG (exp, 0);
37150 op0 = expand_normal (arg0);
37152 if (!address_operand (op0, VOIDmode))
37154 op0 = convert_memory_address (Pmode, op0);
37155 op0 = copy_addr_to_reg (op0);
37157 op0 = gen_rtx_MEM (mode0, op0);
37159 pat = GEN_FCN (icode) (op0);
37160 if (pat)
37161 emit_insn (pat);
37162 return 0;
37164 case IX86_BUILTIN_XSETBV:
37165 arg0 = CALL_EXPR_ARG (exp, 0);
37166 arg1 = CALL_EXPR_ARG (exp, 1);
37167 op0 = expand_normal (arg0);
37168 op1 = expand_normal (arg1);
37170 if (!REG_P (op0))
37171 op0 = copy_to_mode_reg (SImode, op0);
37173 if (TARGET_64BIT)
37175 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37176 NULL, 1, OPTAB_DIRECT);
37178 op2 = gen_lowpart (SImode, op2);
37179 op1 = gen_lowpart (SImode, op1);
37180 if (!REG_P (op1))
37181 op1 = copy_to_mode_reg (SImode, op1);
37182 if (!REG_P (op2))
37183 op2 = copy_to_mode_reg (SImode, op2);
37184 icode = CODE_FOR_xsetbv_rex64;
37185 pat = GEN_FCN (icode) (op0, op1, op2);
37187 else
37189 if (!REG_P (op1))
37190 op1 = copy_to_mode_reg (DImode, op1);
37191 icode = CODE_FOR_xsetbv;
37192 pat = GEN_FCN (icode) (op0, op1);
37194 if (pat)
37195 emit_insn (pat);
37196 return 0;
37198 case IX86_BUILTIN_XSAVE:
37199 case IX86_BUILTIN_XRSTOR:
37200 case IX86_BUILTIN_XSAVE64:
37201 case IX86_BUILTIN_XRSTOR64:
37202 case IX86_BUILTIN_XSAVEOPT:
37203 case IX86_BUILTIN_XSAVEOPT64:
37204 case IX86_BUILTIN_XSAVES:
37205 case IX86_BUILTIN_XRSTORS:
37206 case IX86_BUILTIN_XSAVES64:
37207 case IX86_BUILTIN_XRSTORS64:
37208 case IX86_BUILTIN_XSAVEC:
37209 case IX86_BUILTIN_XSAVEC64:
37210 arg0 = CALL_EXPR_ARG (exp, 0);
37211 arg1 = CALL_EXPR_ARG (exp, 1);
37212 op0 = expand_normal (arg0);
37213 op1 = expand_normal (arg1);
37215 if (!address_operand (op0, VOIDmode))
37217 op0 = convert_memory_address (Pmode, op0);
37218 op0 = copy_addr_to_reg (op0);
37220 op0 = gen_rtx_MEM (BLKmode, op0);
37222 op1 = force_reg (DImode, op1);
37224 if (TARGET_64BIT)
37226 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37227 NULL, 1, OPTAB_DIRECT);
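/* The XSAVE family takes the 64-bit state-component mask in EDX:EAX, so
   split OP1 into its low and high 32-bit halves for the *_rex64 patterns
   selected below.  */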
37228 switch (fcode)
37230 case IX86_BUILTIN_XSAVE:
37231 icode = CODE_FOR_xsave_rex64;
37232 break;
37233 case IX86_BUILTIN_XRSTOR:
37234 icode = CODE_FOR_xrstor_rex64;
37235 break;
37236 case IX86_BUILTIN_XSAVE64:
37237 icode = CODE_FOR_xsave64;
37238 break;
37239 case IX86_BUILTIN_XRSTOR64:
37240 icode = CODE_FOR_xrstor64;
37241 break;
37242 case IX86_BUILTIN_XSAVEOPT:
37243 icode = CODE_FOR_xsaveopt_rex64;
37244 break;
37245 case IX86_BUILTIN_XSAVEOPT64:
37246 icode = CODE_FOR_xsaveopt64;
37247 break;
37248 case IX86_BUILTIN_XSAVES:
37249 icode = CODE_FOR_xsaves_rex64;
37250 break;
37251 case IX86_BUILTIN_XRSTORS:
37252 icode = CODE_FOR_xrstors_rex64;
37253 break;
37254 case IX86_BUILTIN_XSAVES64:
37255 icode = CODE_FOR_xsaves64;
37256 break;
37257 case IX86_BUILTIN_XRSTORS64:
37258 icode = CODE_FOR_xrstors64;
37259 break;
37260 case IX86_BUILTIN_XSAVEC:
37261 icode = CODE_FOR_xsavec_rex64;
37262 break;
37263 case IX86_BUILTIN_XSAVEC64:
37264 icode = CODE_FOR_xsavec64;
37265 break;
37266 default:
37267 gcc_unreachable ();
37270 op2 = gen_lowpart (SImode, op2);
37271 op1 = gen_lowpart (SImode, op1);
37272 pat = GEN_FCN (icode) (op0, op1, op2);
37274 else
37276 switch (fcode)
37278 case IX86_BUILTIN_XSAVE:
37279 icode = CODE_FOR_xsave;
37280 break;
37281 case IX86_BUILTIN_XRSTOR:
37282 icode = CODE_FOR_xrstor;
37283 break;
37284 case IX86_BUILTIN_XSAVEOPT:
37285 icode = CODE_FOR_xsaveopt;
37286 break;
37287 case IX86_BUILTIN_XSAVES:
37288 icode = CODE_FOR_xsaves;
37289 break;
37290 case IX86_BUILTIN_XRSTORS:
37291 icode = CODE_FOR_xrstors;
37292 break;
37293 case IX86_BUILTIN_XSAVEC:
37294 icode = CODE_FOR_xsavec;
37295 break;
37296 default:
37297 gcc_unreachable ();
37299 pat = GEN_FCN (icode) (op0, op1);
37302 if (pat)
37303 emit_insn (pat);
37304 return 0;
37306 case IX86_BUILTIN_LLWPCB:
37307 arg0 = CALL_EXPR_ARG (exp, 0);
37308 op0 = expand_normal (arg0);
37309 icode = CODE_FOR_lwp_llwpcb;
37310 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37311 op0 = ix86_zero_extend_to_Pmode (op0);
37312 emit_insn (gen_lwp_llwpcb (op0));
37313 return 0;
37315 case IX86_BUILTIN_SLWPCB:
37316 icode = CODE_FOR_lwp_slwpcb;
37317 if (!target
37318 || !insn_data[icode].operand[0].predicate (target, Pmode))
37319 target = gen_reg_rtx (Pmode);
37320 emit_insn (gen_lwp_slwpcb (target));
37321 return target;
37323 case IX86_BUILTIN_BEXTRI32:
37324 case IX86_BUILTIN_BEXTRI64:
37325 arg0 = CALL_EXPR_ARG (exp, 0);
37326 arg1 = CALL_EXPR_ARG (exp, 1);
37327 op0 = expand_normal (arg0);
37328 op1 = expand_normal (arg1);
37329 icode = (fcode == IX86_BUILTIN_BEXTRI32
37330 ? CODE_FOR_tbm_bextri_si
37331 : CODE_FOR_tbm_bextri_di);
37332 if (!CONST_INT_P (op1))
37334 error ("last argument must be an immediate");
37335 return const0_rtx;
37337 else
37339 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37340 unsigned char lsb_index = INTVAL (op1) & 0xFF;
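/* The TBM BEXTRI immediate encodes the starting bit position in bits [7:0]
   and the field length in bits [15:8]; the insn pattern takes them as two
   separate operands.  */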
37341 op1 = GEN_INT (length);
37342 op2 = GEN_INT (lsb_index);
37343 pat = GEN_FCN (icode) (target, op0, op1, op2);
37344 if (pat)
37345 emit_insn (pat);
37346 return target;
37349 case IX86_BUILTIN_RDRAND16_STEP:
37350 icode = CODE_FOR_rdrandhi_1;
37351 mode0 = HImode;
37352 goto rdrand_step;
37354 case IX86_BUILTIN_RDRAND32_STEP:
37355 icode = CODE_FOR_rdrandsi_1;
37356 mode0 = SImode;
37357 goto rdrand_step;
37359 case IX86_BUILTIN_RDRAND64_STEP:
37360 icode = CODE_FOR_rdranddi_1;
37361 mode0 = DImode;
37363 rdrand_step:
37364 arg0 = CALL_EXPR_ARG (exp, 0);
37365 op1 = expand_normal (arg0);
37366 if (!address_operand (op1, VOIDmode))
37368 op1 = convert_memory_address (Pmode, op1);
37369 op1 = copy_addr_to_reg (op1);
37372 op0 = gen_reg_rtx (mode0);
37373 emit_insn (GEN_FCN (icode) (op0));
37375 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37377 op1 = gen_reg_rtx (SImode);
37378 emit_move_insn (op1, CONST1_RTX (SImode));
37380 /* Emit SImode conditional move. */
37381 if (mode0 == HImode)
37383 if (TARGET_ZERO_EXTEND_WITH_AND
37384 && optimize_function_for_speed_p (cfun))
37386 op2 = force_reg (SImode, const0_rtx);
37388 emit_insn (gen_movstricthi
37389 (gen_lowpart (HImode, op2), op0));
37391 else
37393 op2 = gen_reg_rtx (SImode);
37395 emit_insn (gen_zero_extendhisi2 (op2, op0));
37398 else if (mode0 == SImode)
37399 op2 = op0;
37400 else
37401 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37403 if (target == 0
37404 || !register_operand (target, SImode))
37405 target = gen_reg_rtx (SImode);
37407 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37408 const0_rtx);
37409 emit_insn (gen_rtx_SET (target,
37410 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37411 return target;
37413 case IX86_BUILTIN_RDSEED16_STEP:
37414 icode = CODE_FOR_rdseedhi_1;
37415 mode0 = HImode;
37416 goto rdseed_step;
37418 case IX86_BUILTIN_RDSEED32_STEP:
37419 icode = CODE_FOR_rdseedsi_1;
37420 mode0 = SImode;
37421 goto rdseed_step;
37423 case IX86_BUILTIN_RDSEED64_STEP:
37424 icode = CODE_FOR_rdseeddi_1;
37425 mode0 = DImode;
37427 rdseed_step:
37428 arg0 = CALL_EXPR_ARG (exp, 0);
37429 op1 = expand_normal (arg0);
37430 if (!address_operand (op1, VOIDmode))
37432 op1 = convert_memory_address (Pmode, op1);
37433 op1 = copy_addr_to_reg (op1);
37436 op0 = gen_reg_rtx (mode0);
37437 emit_insn (GEN_FCN (icode) (op0));
37439 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37441 op2 = gen_reg_rtx (QImode);
37443 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37444 const0_rtx);
37445 emit_insn (gen_rtx_SET (op2, pat));
37447 if (target == 0
37448 || !register_operand (target, SImode))
37449 target = gen_reg_rtx (SImode);
37451 emit_insn (gen_zero_extendqisi2 (target, op2));
37452 return target;
37454 case IX86_BUILTIN_SBB32:
37455 icode = CODE_FOR_subborrowsi;
37456 icode2 = CODE_FOR_subborrowsi_0;
37457 mode0 = SImode;
37458 mode1 = DImode;
37459 mode2 = CCmode;
37460 goto handlecarry;
37462 case IX86_BUILTIN_SBB64:
37463 icode = CODE_FOR_subborrowdi;
37464 icode2 = CODE_FOR_subborrowdi_0;
37465 mode0 = DImode;
37466 mode1 = TImode;
37467 mode2 = CCmode;
37468 goto handlecarry;
37470 case IX86_BUILTIN_ADDCARRYX32:
37471 icode = CODE_FOR_addcarrysi;
37472 icode2 = CODE_FOR_addcarrysi_0;
37473 mode0 = SImode;
37474 mode1 = DImode;
37475 mode2 = CCCmode;
37476 goto handlecarry;
37478 case IX86_BUILTIN_ADDCARRYX64:
37479 icode = CODE_FOR_addcarrydi;
37480 icode2 = CODE_FOR_addcarrydi_0;
37481 mode0 = DImode;
37482 mode1 = TImode;
37483 mode2 = CCCmode;
37485 handlecarry:
37486 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37487 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37488 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37489 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37491 op1 = expand_normal (arg0);
37492 if (!integer_zerop (arg0))
37493 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37495 op2 = expand_normal (arg1);
37496 if (!register_operand (op2, mode0))
37497 op2 = copy_to_mode_reg (mode0, op2);
37499 op3 = expand_normal (arg2);
37500 if (!register_operand (op3, mode0))
37501 op3 = copy_to_mode_reg (mode0, op3);
37503 op4 = expand_normal (arg3);
37504 if (!address_operand (op4, VOIDmode))
37506 op4 = convert_memory_address (Pmode, op4);
37507 op4 = copy_addr_to_reg (op4);
37510 op0 = gen_reg_rtx (mode0);
37511 if (integer_zerop (arg0))
37513 /* If arg0 is 0, optimize right away into add or sub
37514 instruction that sets CCCmode flags. */
37515 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37516 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37518 else
37520 /* Generate CF from input operand. */
37521 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
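/* Adding 0xff to the QImode carry-in overflows exactly when it is nonzero,
   leaving CF set iff a carry was requested.  */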
37523 /* Generate instruction that consumes CF. */
37524 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37525 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37526 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37527 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37530 /* Return current CF value. */
37531 if (target == 0)
37532 target = gen_reg_rtx (QImode);
37534 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37535 emit_insn (gen_rtx_SET (target, pat));
37537 /* Store the result. */
37538 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37540 return target;
37542 case IX86_BUILTIN_READ_FLAGS:
37543 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37545 if (optimize
37546 || target == NULL_RTX
37547 || !nonimmediate_operand (target, word_mode)
37548 || GET_MODE (target) != word_mode)
37549 target = gen_reg_rtx (word_mode);
37551 emit_insn (gen_pop (target));
37552 return target;
37554 case IX86_BUILTIN_WRITE_FLAGS:
37556 arg0 = CALL_EXPR_ARG (exp, 0);
37557 op0 = expand_normal (arg0);
37558 if (!general_no_elim_operand (op0, word_mode))
37559 op0 = copy_to_mode_reg (word_mode, op0);
37561 emit_insn (gen_push (op0));
37562 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37563 return 0;
37565 case IX86_BUILTIN_KTESTC8:
37566 icode = CODE_FOR_ktestqi;
37567 mode3 = CCCmode;
37568 goto kortest;
37570 case IX86_BUILTIN_KTESTZ8:
37571 icode = CODE_FOR_ktestqi;
37572 mode3 = CCZmode;
37573 goto kortest;
37575 case IX86_BUILTIN_KTESTC16:
37576 icode = CODE_FOR_ktesthi;
37577 mode3 = CCCmode;
37578 goto kortest;
37580 case IX86_BUILTIN_KTESTZ16:
37581 icode = CODE_FOR_ktesthi;
37582 mode3 = CCZmode;
37583 goto kortest;
37585 case IX86_BUILTIN_KTESTC32:
37586 icode = CODE_FOR_ktestsi;
37587 mode3 = CCCmode;
37588 goto kortest;
37590 case IX86_BUILTIN_KTESTZ32:
37591 icode = CODE_FOR_ktestsi;
37592 mode3 = CCZmode;
37593 goto kortest;
37595 case IX86_BUILTIN_KTESTC64:
37596 icode = CODE_FOR_ktestdi;
37597 mode3 = CCCmode;
37598 goto kortest;
37600 case IX86_BUILTIN_KTESTZ64:
37601 icode = CODE_FOR_ktestdi;
37602 mode3 = CCZmode;
37603 goto kortest;
37605 case IX86_BUILTIN_KORTESTC8:
37606 icode = CODE_FOR_kortestqi;
37607 mode3 = CCCmode;
37608 goto kortest;
37610 case IX86_BUILTIN_KORTESTZ8:
37611 icode = CODE_FOR_kortestqi;
37612 mode3 = CCZmode;
37613 goto kortest;
37615 case IX86_BUILTIN_KORTESTC16:
37616 icode = CODE_FOR_kortesthi;
37617 mode3 = CCCmode;
37618 goto kortest;
37620 case IX86_BUILTIN_KORTESTZ16:
37621 icode = CODE_FOR_kortesthi;
37622 mode3 = CCZmode;
37623 goto kortest;
37625 case IX86_BUILTIN_KORTESTC32:
37626 icode = CODE_FOR_kortestsi;
37627 mode3 = CCCmode;
37628 goto kortest;
37630 case IX86_BUILTIN_KORTESTZ32:
37631 icode = CODE_FOR_kortestsi;
37632 mode3 = CCZmode;
37633 goto kortest;
37635 case IX86_BUILTIN_KORTESTC64:
37636 icode = CODE_FOR_kortestdi;
37637 mode3 = CCCmode;
37638 goto kortest;
37640 case IX86_BUILTIN_KORTESTZ64:
37641 icode = CODE_FOR_kortestdi;
37642 mode3 = CCZmode;
37644 kortest:
37645 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37646 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37647 op0 = expand_normal (arg0);
37648 op1 = expand_normal (arg1);
37650 mode0 = insn_data[icode].operand[0].mode;
37651 mode1 = insn_data[icode].operand[1].mode;
37653 if (GET_MODE (op0) != VOIDmode)
37654 op0 = force_reg (GET_MODE (op0), op0);
37656 op0 = gen_lowpart (mode0, op0);
37658 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37659 op0 = copy_to_mode_reg (mode0, op0);
37661 if (GET_MODE (op1) != VOIDmode)
37662 op1 = force_reg (GET_MODE (op1), op1);
37664 op1 = gen_lowpart (mode1, op1);
37666 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37667 op1 = copy_to_mode_reg (mode1, op1);
37669 target = gen_reg_rtx (QImode);
37671 /* Emit kortest. */
37672 emit_insn (GEN_FCN (icode) (op0, op1));
37673 /* And use setcc to return result from flags. */
37674 ix86_expand_setcc (target, EQ,
37675 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37676 return target;
37678 case IX86_BUILTIN_GATHERSIV2DF:
37679 icode = CODE_FOR_avx2_gathersiv2df;
37680 goto gather_gen;
37681 case IX86_BUILTIN_GATHERSIV4DF:
37682 icode = CODE_FOR_avx2_gathersiv4df;
37683 goto gather_gen;
37684 case IX86_BUILTIN_GATHERDIV2DF:
37685 icode = CODE_FOR_avx2_gatherdiv2df;
37686 goto gather_gen;
37687 case IX86_BUILTIN_GATHERDIV4DF:
37688 icode = CODE_FOR_avx2_gatherdiv4df;
37689 goto gather_gen;
37690 case IX86_BUILTIN_GATHERSIV4SF:
37691 icode = CODE_FOR_avx2_gathersiv4sf;
37692 goto gather_gen;
37693 case IX86_BUILTIN_GATHERSIV8SF:
37694 icode = CODE_FOR_avx2_gathersiv8sf;
37695 goto gather_gen;
37696 case IX86_BUILTIN_GATHERDIV4SF:
37697 icode = CODE_FOR_avx2_gatherdiv4sf;
37698 goto gather_gen;
37699 case IX86_BUILTIN_GATHERDIV8SF:
37700 icode = CODE_FOR_avx2_gatherdiv8sf;
37701 goto gather_gen;
37702 case IX86_BUILTIN_GATHERSIV2DI:
37703 icode = CODE_FOR_avx2_gathersiv2di;
37704 goto gather_gen;
37705 case IX86_BUILTIN_GATHERSIV4DI:
37706 icode = CODE_FOR_avx2_gathersiv4di;
37707 goto gather_gen;
37708 case IX86_BUILTIN_GATHERDIV2DI:
37709 icode = CODE_FOR_avx2_gatherdiv2di;
37710 goto gather_gen;
37711 case IX86_BUILTIN_GATHERDIV4DI:
37712 icode = CODE_FOR_avx2_gatherdiv4di;
37713 goto gather_gen;
37714 case IX86_BUILTIN_GATHERSIV4SI:
37715 icode = CODE_FOR_avx2_gathersiv4si;
37716 goto gather_gen;
37717 case IX86_BUILTIN_GATHERSIV8SI:
37718 icode = CODE_FOR_avx2_gathersiv8si;
37719 goto gather_gen;
37720 case IX86_BUILTIN_GATHERDIV4SI:
37721 icode = CODE_FOR_avx2_gatherdiv4si;
37722 goto gather_gen;
37723 case IX86_BUILTIN_GATHERDIV8SI:
37724 icode = CODE_FOR_avx2_gatherdiv8si;
37725 goto gather_gen;
37726 case IX86_BUILTIN_GATHERALTSIV4DF:
37727 icode = CODE_FOR_avx2_gathersiv4df;
37728 goto gather_gen;
37729 case IX86_BUILTIN_GATHERALTDIV8SF:
37730 icode = CODE_FOR_avx2_gatherdiv8sf;
37731 goto gather_gen;
37732 case IX86_BUILTIN_GATHERALTSIV4DI:
37733 icode = CODE_FOR_avx2_gathersiv4di;
37734 goto gather_gen;
37735 case IX86_BUILTIN_GATHERALTDIV8SI:
37736 icode = CODE_FOR_avx2_gatherdiv8si;
37737 goto gather_gen;
37738 case IX86_BUILTIN_GATHER3SIV16SF:
37739 icode = CODE_FOR_avx512f_gathersiv16sf;
37740 goto gather_gen;
37741 case IX86_BUILTIN_GATHER3SIV8DF:
37742 icode = CODE_FOR_avx512f_gathersiv8df;
37743 goto gather_gen;
37744 case IX86_BUILTIN_GATHER3DIV16SF:
37745 icode = CODE_FOR_avx512f_gatherdiv16sf;
37746 goto gather_gen;
37747 case IX86_BUILTIN_GATHER3DIV8DF:
37748 icode = CODE_FOR_avx512f_gatherdiv8df;
37749 goto gather_gen;
37750 case IX86_BUILTIN_GATHER3SIV16SI:
37751 icode = CODE_FOR_avx512f_gathersiv16si;
37752 goto gather_gen;
37753 case IX86_BUILTIN_GATHER3SIV8DI:
37754 icode = CODE_FOR_avx512f_gathersiv8di;
37755 goto gather_gen;
37756 case IX86_BUILTIN_GATHER3DIV16SI:
37757 icode = CODE_FOR_avx512f_gatherdiv16si;
37758 goto gather_gen;
37759 case IX86_BUILTIN_GATHER3DIV8DI:
37760 icode = CODE_FOR_avx512f_gatherdiv8di;
37761 goto gather_gen;
37762 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37763 icode = CODE_FOR_avx512f_gathersiv8df;
37764 goto gather_gen;
37765 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37766 icode = CODE_FOR_avx512f_gatherdiv16sf;
37767 goto gather_gen;
37768 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37769 icode = CODE_FOR_avx512f_gathersiv8di;
37770 goto gather_gen;
37771 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37772 icode = CODE_FOR_avx512f_gatherdiv16si;
37773 goto gather_gen;
37774 case IX86_BUILTIN_GATHER3SIV2DF:
37775 icode = CODE_FOR_avx512vl_gathersiv2df;
37776 goto gather_gen;
37777 case IX86_BUILTIN_GATHER3SIV4DF:
37778 icode = CODE_FOR_avx512vl_gathersiv4df;
37779 goto gather_gen;
37780 case IX86_BUILTIN_GATHER3DIV2DF:
37781 icode = CODE_FOR_avx512vl_gatherdiv2df;
37782 goto gather_gen;
37783 case IX86_BUILTIN_GATHER3DIV4DF:
37784 icode = CODE_FOR_avx512vl_gatherdiv4df;
37785 goto gather_gen;
37786 case IX86_BUILTIN_GATHER3SIV4SF:
37787 icode = CODE_FOR_avx512vl_gathersiv4sf;
37788 goto gather_gen;
37789 case IX86_BUILTIN_GATHER3SIV8SF:
37790 icode = CODE_FOR_avx512vl_gathersiv8sf;
37791 goto gather_gen;
37792 case IX86_BUILTIN_GATHER3DIV4SF:
37793 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37794 goto gather_gen;
37795 case IX86_BUILTIN_GATHER3DIV8SF:
37796 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37797 goto gather_gen;
37798 case IX86_BUILTIN_GATHER3SIV2DI:
37799 icode = CODE_FOR_avx512vl_gathersiv2di;
37800 goto gather_gen;
37801 case IX86_BUILTIN_GATHER3SIV4DI:
37802 icode = CODE_FOR_avx512vl_gathersiv4di;
37803 goto gather_gen;
37804 case IX86_BUILTIN_GATHER3DIV2DI:
37805 icode = CODE_FOR_avx512vl_gatherdiv2di;
37806 goto gather_gen;
37807 case IX86_BUILTIN_GATHER3DIV4DI:
37808 icode = CODE_FOR_avx512vl_gatherdiv4di;
37809 goto gather_gen;
37810 case IX86_BUILTIN_GATHER3SIV4SI:
37811 icode = CODE_FOR_avx512vl_gathersiv4si;
37812 goto gather_gen;
37813 case IX86_BUILTIN_GATHER3SIV8SI:
37814 icode = CODE_FOR_avx512vl_gathersiv8si;
37815 goto gather_gen;
37816 case IX86_BUILTIN_GATHER3DIV4SI:
37817 icode = CODE_FOR_avx512vl_gatherdiv4si;
37818 goto gather_gen;
37819 case IX86_BUILTIN_GATHER3DIV8SI:
37820 icode = CODE_FOR_avx512vl_gatherdiv8si;
37821 goto gather_gen;
37822 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37823 icode = CODE_FOR_avx512vl_gathersiv4df;
37824 goto gather_gen;
37825 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37826 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37827 goto gather_gen;
37828 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37829 icode = CODE_FOR_avx512vl_gathersiv4di;
37830 goto gather_gen;
37831 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37832 icode = CODE_FOR_avx512vl_gatherdiv8si;
37833 goto gather_gen;
37834 case IX86_BUILTIN_SCATTERSIV16SF:
37835 icode = CODE_FOR_avx512f_scattersiv16sf;
37836 goto scatter_gen;
37837 case IX86_BUILTIN_SCATTERSIV8DF:
37838 icode = CODE_FOR_avx512f_scattersiv8df;
37839 goto scatter_gen;
37840 case IX86_BUILTIN_SCATTERDIV16SF:
37841 icode = CODE_FOR_avx512f_scatterdiv16sf;
37842 goto scatter_gen;
37843 case IX86_BUILTIN_SCATTERDIV8DF:
37844 icode = CODE_FOR_avx512f_scatterdiv8df;
37845 goto scatter_gen;
37846 case IX86_BUILTIN_SCATTERSIV16SI:
37847 icode = CODE_FOR_avx512f_scattersiv16si;
37848 goto scatter_gen;
37849 case IX86_BUILTIN_SCATTERSIV8DI:
37850 icode = CODE_FOR_avx512f_scattersiv8di;
37851 goto scatter_gen;
37852 case IX86_BUILTIN_SCATTERDIV16SI:
37853 icode = CODE_FOR_avx512f_scatterdiv16si;
37854 goto scatter_gen;
37855 case IX86_BUILTIN_SCATTERDIV8DI:
37856 icode = CODE_FOR_avx512f_scatterdiv8di;
37857 goto scatter_gen;
37858 case IX86_BUILTIN_SCATTERSIV8SF:
37859 icode = CODE_FOR_avx512vl_scattersiv8sf;
37860 goto scatter_gen;
37861 case IX86_BUILTIN_SCATTERSIV4SF:
37862 icode = CODE_FOR_avx512vl_scattersiv4sf;
37863 goto scatter_gen;
37864 case IX86_BUILTIN_SCATTERSIV4DF:
37865 icode = CODE_FOR_avx512vl_scattersiv4df;
37866 goto scatter_gen;
37867 case IX86_BUILTIN_SCATTERSIV2DF:
37868 icode = CODE_FOR_avx512vl_scattersiv2df;
37869 goto scatter_gen;
37870 case IX86_BUILTIN_SCATTERDIV8SF:
37871 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37872 goto scatter_gen;
37873 case IX86_BUILTIN_SCATTERDIV4SF:
37874 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37875 goto scatter_gen;
37876 case IX86_BUILTIN_SCATTERDIV4DF:
37877 icode = CODE_FOR_avx512vl_scatterdiv4df;
37878 goto scatter_gen;
37879 case IX86_BUILTIN_SCATTERDIV2DF:
37880 icode = CODE_FOR_avx512vl_scatterdiv2df;
37881 goto scatter_gen;
37882 case IX86_BUILTIN_SCATTERSIV8SI:
37883 icode = CODE_FOR_avx512vl_scattersiv8si;
37884 goto scatter_gen;
37885 case IX86_BUILTIN_SCATTERSIV4SI:
37886 icode = CODE_FOR_avx512vl_scattersiv4si;
37887 goto scatter_gen;
37888 case IX86_BUILTIN_SCATTERSIV4DI:
37889 icode = CODE_FOR_avx512vl_scattersiv4di;
37890 goto scatter_gen;
37891 case IX86_BUILTIN_SCATTERSIV2DI:
37892 icode = CODE_FOR_avx512vl_scattersiv2di;
37893 goto scatter_gen;
37894 case IX86_BUILTIN_SCATTERDIV8SI:
37895 icode = CODE_FOR_avx512vl_scatterdiv8si;
37896 goto scatter_gen;
37897 case IX86_BUILTIN_SCATTERDIV4SI:
37898 icode = CODE_FOR_avx512vl_scatterdiv4si;
37899 goto scatter_gen;
37900 case IX86_BUILTIN_SCATTERDIV4DI:
37901 icode = CODE_FOR_avx512vl_scatterdiv4di;
37902 goto scatter_gen;
37903 case IX86_BUILTIN_SCATTERDIV2DI:
37904 icode = CODE_FOR_avx512vl_scatterdiv2di;
37905 goto scatter_gen;
37906 case IX86_BUILTIN_GATHERPFDPD:
37907 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37908 goto vec_prefetch_gen;
37909 case IX86_BUILTIN_SCATTERALTSIV8DF:
37910 icode = CODE_FOR_avx512f_scattersiv8df;
37911 goto scatter_gen;
37912 case IX86_BUILTIN_SCATTERALTDIV16SF:
37913 icode = CODE_FOR_avx512f_scatterdiv16sf;
37914 goto scatter_gen;
37915 case IX86_BUILTIN_SCATTERALTSIV8DI:
37916 icode = CODE_FOR_avx512f_scattersiv8di;
37917 goto scatter_gen;
37918 case IX86_BUILTIN_SCATTERALTDIV16SI:
37919 icode = CODE_FOR_avx512f_scatterdiv16si;
37920 goto scatter_gen;
37921 case IX86_BUILTIN_GATHERPFDPS:
37922 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37923 goto vec_prefetch_gen;
37924 case IX86_BUILTIN_GATHERPFQPD:
37925 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37926 goto vec_prefetch_gen;
37927 case IX86_BUILTIN_GATHERPFQPS:
37928 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37929 goto vec_prefetch_gen;
37930 case IX86_BUILTIN_SCATTERPFDPD:
37931 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37932 goto vec_prefetch_gen;
37933 case IX86_BUILTIN_SCATTERPFDPS:
37934 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37935 goto vec_prefetch_gen;
37936 case IX86_BUILTIN_SCATTERPFQPD:
37937 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37938 goto vec_prefetch_gen;
37939 case IX86_BUILTIN_SCATTERPFQPS:
37940 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37941 goto vec_prefetch_gen;
37943 gather_gen:
37944 rtx half;
37945 rtx (*gen) (rtx, rtx);
37947 arg0 = CALL_EXPR_ARG (exp, 0);
37948 arg1 = CALL_EXPR_ARG (exp, 1);
37949 arg2 = CALL_EXPR_ARG (exp, 2);
37950 arg3 = CALL_EXPR_ARG (exp, 3);
37951 arg4 = CALL_EXPR_ARG (exp, 4);
37952 op0 = expand_normal (arg0);
37953 op1 = expand_normal (arg1);
37954 op2 = expand_normal (arg2);
37955 op3 = expand_normal (arg3);
37956 op4 = expand_normal (arg4);
37957 /* Note the arg order is different from the operand order. */
37958 mode0 = insn_data[icode].operand[1].mode;
37959 mode2 = insn_data[icode].operand[3].mode;
37960 mode3 = insn_data[icode].operand[4].mode;
37961 mode4 = insn_data[icode].operand[5].mode;
37963 if (target == NULL_RTX
37964 || GET_MODE (target) != insn_data[icode].operand[0].mode
37965 || !insn_data[icode].operand[0].predicate (target,
37966 GET_MODE (target)))
37967 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37968 else
37969 subtarget = target;
37971 switch (fcode)
37973 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37974 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37975 half = gen_reg_rtx (V8SImode);
37976 if (!nonimmediate_operand (op2, V16SImode))
37977 op2 = copy_to_mode_reg (V16SImode, op2);
37978 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37979 op2 = half;
37980 break;
37981 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37982 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37983 case IX86_BUILTIN_GATHERALTSIV4DF:
37984 case IX86_BUILTIN_GATHERALTSIV4DI:
37985 half = gen_reg_rtx (V4SImode);
37986 if (!nonimmediate_operand (op2, V8SImode))
37987 op2 = copy_to_mode_reg (V8SImode, op2);
37988 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37989 op2 = half;
37990 break;
37991 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37992 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37993 half = gen_reg_rtx (mode0);
37994 if (mode0 == V8SFmode)
37995 gen = gen_vec_extract_lo_v16sf;
37996 else
37997 gen = gen_vec_extract_lo_v16si;
37998 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37999 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38000 emit_insn (gen (half, op0));
38001 op0 = half;
38002 if (GET_MODE (op3) != VOIDmode)
38004 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38005 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38006 emit_insn (gen (half, op3));
38007 op3 = half;
38009 break;
38010 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38011 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38012 case IX86_BUILTIN_GATHERALTDIV8SF:
38013 case IX86_BUILTIN_GATHERALTDIV8SI:
38014 half = gen_reg_rtx (mode0);
38015 if (mode0 == V4SFmode)
38016 gen = gen_vec_extract_lo_v8sf;
38017 else
38018 gen = gen_vec_extract_lo_v8si;
38019 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38020 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38021 emit_insn (gen (half, op0));
38022 op0 = half;
38023 if (GET_MODE (op3) != VOIDmode)
38025 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38026 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38027 emit_insn (gen (half, op3));
38028 op3 = half;
38030 break;
38031 default:
38032 break;
38035 /* Force the memory operand to use only a base register here; we
38036 don't want to do this for the memory operands of other builtin
38037 functions. */
38038 op1 = ix86_zero_extend_to_Pmode (op1);
38040 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38041 op0 = copy_to_mode_reg (mode0, op0);
38042 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38043 op1 = copy_to_mode_reg (Pmode, op1);
38044 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38045 op2 = copy_to_mode_reg (mode2, op2);
38047 op3 = fixup_modeless_constant (op3, mode3);
38049 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38051 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38052 op3 = copy_to_mode_reg (mode3, op3);
38054 else
38056 op3 = copy_to_reg (op3);
38057 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38059 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38061 error ("the last argument must be scale 1, 2, 4, 8");
38062 return const0_rtx;
38065 /* Optimize. If mask is known to have all high bits set,
38066 replace op0 with pc_rtx to signal that the instruction
38067 overwrites the whole destination and doesn't use its
38068 previous contents. */
38069 if (optimize)
38071 if (TREE_CODE (arg3) == INTEGER_CST)
38073 if (integer_all_onesp (arg3))
38074 op0 = pc_rtx;
38076 else if (TREE_CODE (arg3) == VECTOR_CST)
38078 unsigned int negative = 0;
38079 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38081 tree cst = VECTOR_CST_ELT (arg3, i);
38082 if (TREE_CODE (cst) == INTEGER_CST
38083 && tree_int_cst_sign_bit (cst))
38084 negative++;
38085 else if (TREE_CODE (cst) == REAL_CST
38086 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38087 negative++;
38089 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38090 op0 = pc_rtx;
38092 else if (TREE_CODE (arg3) == SSA_NAME
38093 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38095 /* Recognize also when mask is like:
38096 __v2df src = _mm_setzero_pd ();
38097 __v2df mask = _mm_cmpeq_pd (src, src);
38099 __v8sf src = _mm256_setzero_ps ();
38100 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38101 as that is a cheaper way to load all ones into
38102 a register than having to load a constant from
38103 memory. */
38104 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38105 if (is_gimple_call (def_stmt))
38107 tree fndecl = gimple_call_fndecl (def_stmt);
38108 if (fndecl
38109 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38110 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38112 case IX86_BUILTIN_CMPPD:
38113 case IX86_BUILTIN_CMPPS:
38114 case IX86_BUILTIN_CMPPD256:
38115 case IX86_BUILTIN_CMPPS256:
38116 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38117 break;
38118 /* FALLTHRU */
38119 case IX86_BUILTIN_CMPEQPD:
38120 case IX86_BUILTIN_CMPEQPS:
38121 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38122 && initializer_zerop (gimple_call_arg (def_stmt,
38123 1)))
38124 op0 = pc_rtx;
38125 break;
38126 default:
38127 break;
38133 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38134 if (! pat)
38135 return const0_rtx;
38136 emit_insn (pat);
38138 switch (fcode)
38140 case IX86_BUILTIN_GATHER3DIV16SF:
38141 if (target == NULL_RTX)
38142 target = gen_reg_rtx (V8SFmode);
38143 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38144 break;
38145 case IX86_BUILTIN_GATHER3DIV16SI:
38146 if (target == NULL_RTX)
38147 target = gen_reg_rtx (V8SImode);
38148 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38149 break;
38150 case IX86_BUILTIN_GATHER3DIV8SF:
38151 case IX86_BUILTIN_GATHERDIV8SF:
38152 if (target == NULL_RTX)
38153 target = gen_reg_rtx (V4SFmode);
38154 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38155 break;
38156 case IX86_BUILTIN_GATHER3DIV8SI:
38157 case IX86_BUILTIN_GATHERDIV8SI:
38158 if (target == NULL_RTX)
38159 target = gen_reg_rtx (V4SImode);
38160 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38161 break;
38162 default:
38163 target = subtarget;
38164 break;
38166 return target;
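/* Illustrative sketch only (the standard AVX-512F intrinsics from
   <immintrin.h> are assumed here; none of these names are used by this
   expander): for IX86_BUILTIN_GATHER3DIV16SF the machine pattern
   produces a full V16SF destination even though a gather of eight
   floats through 64-bit indices only defines the low half, hence the
   gen_vec_extract_lo_v16sf step above.  At the source level this
   corresponds to something like:

     __m256
     gather8 (const float *base, __m512i idx64)
     {
       return _mm512_i64gather_ps (idx64, base, 4);
     }
*/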
38168 scatter_gen:
38169 arg0 = CALL_EXPR_ARG (exp, 0);
38170 arg1 = CALL_EXPR_ARG (exp, 1);
38171 arg2 = CALL_EXPR_ARG (exp, 2);
38172 arg3 = CALL_EXPR_ARG (exp, 3);
38173 arg4 = CALL_EXPR_ARG (exp, 4);
38174 op0 = expand_normal (arg0);
38175 op1 = expand_normal (arg1);
38176 op2 = expand_normal (arg2);
38177 op3 = expand_normal (arg3);
38178 op4 = expand_normal (arg4);
38179 mode1 = insn_data[icode].operand[1].mode;
38180 mode2 = insn_data[icode].operand[2].mode;
38181 mode3 = insn_data[icode].operand[3].mode;
38182 mode4 = insn_data[icode].operand[4].mode;
38184 /* Scatter instruction stores operand op3 to memory with
38185 indices from op2 and scale from op4 under writemask op1.
38186 If the index operand op2 has more elements than the source operand
38187 op3, one needs to use only its low half. And vice versa. */
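/* Illustrative sketch only, assuming the standard AVX-512F intrinsics
   from <immintrin.h> (none of these names are used by this expander):
   IX86_BUILTIN_SCATTERALTSIV8DF scatters eight doubles through 32-bit
   indices, so only the low eight lanes of its V16SI index operand
   matter, which is exactly what the gen_vec_extract_lo_v16si step
   below picks out:

     void
     scatter8 (double *base, __m512i idx32, __m512d val)
     {
       _mm512_i32scatter_pd (base, _mm512_castsi512_si256 (idx32), val, 8);
     }
*/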
38188 switch (fcode)
38190 case IX86_BUILTIN_SCATTERALTSIV8DF:
38191 case IX86_BUILTIN_SCATTERALTSIV8DI:
38192 half = gen_reg_rtx (V8SImode);
38193 if (!nonimmediate_operand (op2, V16SImode))
38194 op2 = copy_to_mode_reg (V16SImode, op2);
38195 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38196 op2 = half;
38197 break;
38198 case IX86_BUILTIN_SCATTERALTDIV16SF:
38199 case IX86_BUILTIN_SCATTERALTDIV16SI:
38200 half = gen_reg_rtx (mode3);
38201 if (mode3 == V8SFmode)
38202 gen = gen_vec_extract_lo_v16sf;
38203 else
38204 gen = gen_vec_extract_lo_v16si;
38205 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38206 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38207 emit_insn (gen (half, op3));
38208 op3 = half;
38209 break;
38210 default:
38211 break;
38214 /* Force the memory operand to use only a base register here; we
38215 don't want to do this for the memory operands of other builtin
38216 functions. */
38217 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38219 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38220 op0 = copy_to_mode_reg (Pmode, op0);
38222 op1 = fixup_modeless_constant (op1, mode1);
38224 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38226 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38227 op1 = copy_to_mode_reg (mode1, op1);
38229 else
38231 op1 = copy_to_reg (op1);
38232 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38235 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38236 op2 = copy_to_mode_reg (mode2, op2);
38238 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38239 op3 = copy_to_mode_reg (mode3, op3);
38241 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38243 error ("the last argument must be scale 1, 2, 4, 8");
38244 return const0_rtx;
38247 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38248 if (! pat)
38249 return const0_rtx;
38251 emit_insn (pat);
38252 return 0;
38254 vec_prefetch_gen:
38255 arg0 = CALL_EXPR_ARG (exp, 0);
38256 arg1 = CALL_EXPR_ARG (exp, 1);
38257 arg2 = CALL_EXPR_ARG (exp, 2);
38258 arg3 = CALL_EXPR_ARG (exp, 3);
38259 arg4 = CALL_EXPR_ARG (exp, 4);
38260 op0 = expand_normal (arg0);
38261 op1 = expand_normal (arg1);
38262 op2 = expand_normal (arg2);
38263 op3 = expand_normal (arg3);
38264 op4 = expand_normal (arg4);
38265 mode0 = insn_data[icode].operand[0].mode;
38266 mode1 = insn_data[icode].operand[1].mode;
38267 mode3 = insn_data[icode].operand[3].mode;
38268 mode4 = insn_data[icode].operand[4].mode;
38270 op0 = fixup_modeless_constant (op0, mode0);
38272 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38274 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38275 op0 = copy_to_mode_reg (mode0, op0);
38277 else
38279 op0 = copy_to_reg (op0);
38280 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38283 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38284 op1 = copy_to_mode_reg (mode1, op1);
38286 /* Force the memory operand to use only a base register here; we
38287 don't want to do this for the memory operands of other builtin
38288 functions. */
38289 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38291 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38292 op2 = copy_to_mode_reg (Pmode, op2);
38294 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38296 error ("the fourth argument must be scale 1, 2, 4, 8");
38297 return const0_rtx;
38300 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38302 error ("incorrect hint operand");
38303 return const0_rtx;
38306 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38307 if (! pat)
38308 return const0_rtx;
38310 emit_insn (pat);
38312 return 0;
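/* Illustrative sketch only (assuming the AVX-512PF prefetch intrinsics
   from <immintrin.h>; not used by this expander): the scale and hint
   operands validated above come from calls such as

     _mm512_prefetch_i32gather_ps (idx, base, 4, _MM_HINT_T0);

   where the scale must be 1, 2, 4 or 8 and the hint selects between
   the T0 and T1 prefetch levels.  */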
38314 case IX86_BUILTIN_XABORT:
38315 icode = CODE_FOR_xabort;
38316 arg0 = CALL_EXPR_ARG (exp, 0);
38317 op0 = expand_normal (arg0);
38318 mode0 = insn_data[icode].operand[0].mode;
38319 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38321 error ("the xabort's argument must be an 8-bit immediate");
38322 return const0_rtx;
38324 emit_insn (gen_xabort (op0));
38325 return 0;
38327 case IX86_BUILTIN_RSTORSSP:
38328 case IX86_BUILTIN_CLRSSBSY:
38329 arg0 = CALL_EXPR_ARG (exp, 0);
38330 op0 = expand_normal (arg0);
38331 icode = (fcode == IX86_BUILTIN_RSTORSSP
38332 ? CODE_FOR_rstorssp
38333 : CODE_FOR_clrssbsy);
38334 if (!address_operand (op0, VOIDmode))
38336 op1 = convert_memory_address (Pmode, op0);
38337 op0 = copy_addr_to_reg (op1);
38339 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38340 return 0;
38342 case IX86_BUILTIN_WRSSD:
38343 case IX86_BUILTIN_WRSSQ:
38344 case IX86_BUILTIN_WRUSSD:
38345 case IX86_BUILTIN_WRUSSQ:
38346 arg0 = CALL_EXPR_ARG (exp, 0);
38347 op0 = expand_normal (arg0);
38348 arg1 = CALL_EXPR_ARG (exp, 1);
38349 op1 = expand_normal (arg1);
38350 switch (fcode)
38352 case IX86_BUILTIN_WRSSD:
38353 icode = CODE_FOR_wrsssi;
38354 mode = SImode;
38355 break;
38356 case IX86_BUILTIN_WRSSQ:
38357 icode = CODE_FOR_wrssdi;
38358 mode = DImode;
38359 break;
38360 case IX86_BUILTIN_WRUSSD:
38361 icode = CODE_FOR_wrusssi;
38362 mode = SImode;
38363 break;
38364 case IX86_BUILTIN_WRUSSQ:
38365 icode = CODE_FOR_wrussdi;
38366 mode = DImode;
38367 break;
38369 op0 = force_reg (mode, op0);
38370 if (!address_operand (op1, VOIDmode))
38372 op2 = convert_memory_address (Pmode, op1);
38373 op1 = copy_addr_to_reg (op2);
38375 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38376 return 0;
38378 default:
38379 break;
38382 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38383 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38385 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38386 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38387 target);
38390 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
38391 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
38393 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
38394 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
38395 target);
38398 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38399 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38401 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38402 switch (fcode)
38404 case IX86_BUILTIN_FABSQ:
38405 case IX86_BUILTIN_COPYSIGNQ:
38406 if (!TARGET_SSE)
38407 /* Emit a normal call if SSE isn't available. */
38408 return expand_call (exp, target, ignore);
38409 /* FALLTHRU */
38410 default:
38411 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38415 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38416 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38418 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38419 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38420 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38421 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38422 int masked = 1;
38423 machine_mode mode, wide_mode, nar_mode;
38425 nar_mode = V4SFmode;
38426 mode = V16SFmode;
38427 wide_mode = V64SFmode;
38428 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38429 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38431 switch (fcode)
38433 case IX86_BUILTIN_4FMAPS:
38434 fcn = gen_avx5124fmaddps_4fmaddps;
38435 masked = 0;
38436 goto v4fma_expand;
38438 case IX86_BUILTIN_4DPWSSD:
38439 nar_mode = V4SImode;
38440 mode = V16SImode;
38441 wide_mode = V64SImode;
38442 fcn = gen_avx5124vnniw_vp4dpwssd;
38443 masked = 0;
38444 goto v4fma_expand;
38446 case IX86_BUILTIN_4DPWSSDS:
38447 nar_mode = V4SImode;
38448 mode = V16SImode;
38449 wide_mode = V64SImode;
38450 fcn = gen_avx5124vnniw_vp4dpwssds;
38451 masked = 0;
38452 goto v4fma_expand;
38454 case IX86_BUILTIN_4FNMAPS:
38455 fcn = gen_avx5124fmaddps_4fnmaddps;
38456 masked = 0;
38457 goto v4fma_expand;
38459 case IX86_BUILTIN_4FNMAPS_MASK:
38460 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38461 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38462 goto v4fma_expand;
38464 case IX86_BUILTIN_4DPWSSD_MASK:
38465 nar_mode = V4SImode;
38466 mode = V16SImode;
38467 wide_mode = V64SImode;
38468 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38469 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38470 goto v4fma_expand;
38472 case IX86_BUILTIN_4DPWSSDS_MASK:
38473 nar_mode = V4SImode;
38474 mode = V16SImode;
38475 wide_mode = V64SImode;
38476 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38477 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38478 goto v4fma_expand;
38480 case IX86_BUILTIN_4FMAPS_MASK:
38482 tree args[4];
38483 rtx ops[4];
38484 rtx wide_reg;
38485 rtx accum;
38486 rtx addr;
38487 rtx mem;
38489 v4fma_expand:
38490 wide_reg = gen_reg_rtx (wide_mode);
38491 for (i = 0; i < 4; i++)
38493 args[i] = CALL_EXPR_ARG (exp, i);
38494 ops[i] = expand_normal (args[i]);
38496 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38497 ops[i]);
38500 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38501 accum = force_reg (mode, accum);
38503 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38504 addr = force_reg (Pmode, addr);
38506 mem = gen_rtx_MEM (nar_mode, addr);
38508 target = gen_reg_rtx (mode);
38510 emit_move_insn (target, accum);
38512 if (! masked)
38513 emit_insn (fcn (target, accum, wide_reg, mem));
38514 else
38516 rtx merge, mask;
38517 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38519 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38521 if (CONST_INT_P (mask))
38522 mask = fixup_modeless_constant (mask, HImode);
38524 mask = force_reg (HImode, mask);
38526 if (GET_MODE (mask) != HImode)
38527 mask = gen_rtx_SUBREG (HImode, mask, 0);
38529 /* If merge is 0 then we're about to emit z-masked variant. */
38530 if (const0_operand (merge, mode))
38531 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38532 /* If merge is the same as accum then emit merge-masked variant. */
38533 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38535 merge = force_reg (mode, merge);
38536 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38538 /* Merging with something unknown can happen if we z-mask with -O0. */
38539 else
38541 target = gen_reg_rtx (mode);
38542 emit_move_insn (target, merge);
38543 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38546 return target;
38549 case IX86_BUILTIN_4FNMASS:
38550 fcn = gen_avx5124fmaddps_4fnmaddss;
38551 masked = 0;
38552 goto s4fma_expand;
38554 case IX86_BUILTIN_4FMASS:
38555 fcn = gen_avx5124fmaddps_4fmaddss;
38556 masked = 0;
38557 goto s4fma_expand;
38559 case IX86_BUILTIN_4FNMASS_MASK:
38560 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38561 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38562 goto s4fma_expand;
38564 case IX86_BUILTIN_4FMASS_MASK:
38566 tree args[4];
38567 rtx ops[4];
38568 rtx wide_reg;
38569 rtx accum;
38570 rtx addr;
38571 rtx mem;
38573 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38574 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38576 s4fma_expand:
38577 mode = V4SFmode;
38578 wide_reg = gen_reg_rtx (V64SFmode);
38579 for (i = 0; i < 4; i++)
38581 rtx tmp;
38582 args[i] = CALL_EXPR_ARG (exp, i);
38583 ops[i] = expand_normal (args[i]);
38585 tmp = gen_reg_rtx (SFmode);
38586 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38588 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38589 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38592 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38593 accum = force_reg (V4SFmode, accum);
38595 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38596 addr = force_reg (Pmode, addr);
38598 mem = gen_rtx_MEM (V4SFmode, addr);
38600 target = gen_reg_rtx (V4SFmode);
38602 emit_move_insn (target, accum);
38604 if (! masked)
38605 emit_insn (fcn (target, accum, wide_reg, mem));
38606 else
38608 rtx merge, mask;
38609 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38611 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38613 if (CONST_INT_P (mask))
38614 mask = fixup_modeless_constant (mask, QImode);
38616 mask = force_reg (QImode, mask);
38618 if (GET_MODE (mask) != QImode)
38619 mask = gen_rtx_SUBREG (QImode, mask, 0);
38621 /* If merge is 0 then we're about to emit z-masked variant. */
38622 if (const0_operand (merge, mode))
38623 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38624 /* If merge is the same as accum then emit merge-masked
38625 variant. */
38626 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38628 merge = force_reg (mode, merge);
38629 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38631 /* Merging with something unknown can happen if we z-mask
38632 with -O0. */
38633 else
38635 target = gen_reg_rtx (mode);
38636 emit_move_insn (target, merge);
38637 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38640 return target;
38642 case IX86_BUILTIN_RDPID:
38643 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38644 target);
38645 default:
38646 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38650 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38651 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38653 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38654 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38657 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38658 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38660 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38661 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38664 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38665 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38667 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38668 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38671 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38672 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38674 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38675 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38678 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38679 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38681 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38682 const struct builtin_description *d = bdesc_multi_arg + i;
38683 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38684 (enum ix86_builtin_func_type)
38685 d->flag, d->comparison);
38688 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38689 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38691 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38692 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38693 target);
38696 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38697 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38699 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38700 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
38701 target);
38704 gcc_unreachable ();
38707 /* This returns the target-specific builtin with code CODE if
38708 current_function_decl has visibility on this builtin, which is checked
38709 using isa flags. Returns NULL_TREE otherwise. */
38711 static tree ix86_get_builtin (enum ix86_builtins code)
38713 struct cl_target_option *opts;
38714 tree target_tree = NULL_TREE;
38716 /* Determine the isa flags of current_function_decl. */
38718 if (current_function_decl)
38719 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38721 if (target_tree == NULL)
38722 target_tree = target_option_default_node;
38724 opts = TREE_TARGET_OPTION (target_tree);
38726 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38727 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38728 return ix86_builtin_decl (code, true);
38729 else
38730 return NULL_TREE;
38733 /* Return the function decl for the target-specific builtin
38734 corresponding to the given MPX builtin passed in FCODE. */
38735 static tree
38736 ix86_builtin_mpx_function (unsigned fcode)
38738 switch (fcode)
38740 case BUILT_IN_CHKP_BNDMK:
38741 return ix86_builtins[IX86_BUILTIN_BNDMK];
38743 case BUILT_IN_CHKP_BNDSTX:
38744 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38746 case BUILT_IN_CHKP_BNDLDX:
38747 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38749 case BUILT_IN_CHKP_BNDCL:
38750 return ix86_builtins[IX86_BUILTIN_BNDCL];
38752 case BUILT_IN_CHKP_BNDCU:
38753 return ix86_builtins[IX86_BUILTIN_BNDCU];
38755 case BUILT_IN_CHKP_BNDRET:
38756 return ix86_builtins[IX86_BUILTIN_BNDRET];
38758 case BUILT_IN_CHKP_INTERSECT:
38759 return ix86_builtins[IX86_BUILTIN_BNDINT];
38761 case BUILT_IN_CHKP_NARROW:
38762 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38764 case BUILT_IN_CHKP_SIZEOF:
38765 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38767 case BUILT_IN_CHKP_EXTRACT_LOWER:
38768 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38770 case BUILT_IN_CHKP_EXTRACT_UPPER:
38771 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38773 default:
38774 return NULL_TREE;
38777 gcc_unreachable ();
38780 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38782 Return an address to be used to load/store bounds for pointer
38783 passed in SLOT.
38785 SLOT_NO is an integer constant holding the number of a target-
38786 dependent special slot to be used in case SLOT is not a memory.
38788 SPECIAL_BASE is a pointer to be used as a base of fake address
38789 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38790 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38792 static rtx
38793 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38795 rtx addr = NULL;
38797 /* A NULL slot means we pass bounds for a pointer that is not passed
38798 to the function at all. A register slot means the pointer is passed
38799 in a register. In both cases the bounds are passed via the Bounds
38800 Table. Since we do not have an actual pointer stored in memory,
38801 we have to use fake addresses to access the Bounds Table. We
38802 start with (special_base - sizeof (void *)) and decrease this
38803 address by the pointer size to get addresses for other slots. */
38804 if (!slot || REG_P (slot))
38806 gcc_assert (CONST_INT_P (slot_no));
38807 addr = plus_constant (Pmode, special_base,
38808 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38810 /* If pointer is passed in a memory then its address is used to
38811 access Bounds Table. */
38812 else if (MEM_P (slot))
38814 addr = XEXP (slot, 0);
38815 if (!register_operand (addr, Pmode))
38816 addr = copy_addr_to_reg (addr);
38818 else
38819 gcc_unreachable ();
38821 return addr;
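/* For example, with 64-bit Pmode the fake addresses are
   special_base - 8 for slot 0, special_base - 16 for slot 1, and so on,
   matching the SPECIAL_BASE[-1], SPECIAL_BASE[-2] description above.  */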
38824 /* Expand pass uses this hook to load bounds for function parameter
38825 PTR passed in SLOT in case its bounds are not passed in a register.
38827 If SLOT is a memory, then bounds are loaded as for regular pointer
38828 loaded from memory. PTR may be NULL in case SLOT is a memory.
38829 In such case value of PTR (if required) may be loaded from SLOT.
38831 If SLOT is NULL or a register then SLOT_NO is an integer constant
38832 holding number of the target dependent special slot which should be
38833 used to obtain bounds.
38835 Return loaded bounds. */
38837 static rtx
38838 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38840 rtx reg = gen_reg_rtx (BNDmode);
38841 rtx addr;
38843 /* Get address to be used to access Bounds Table. Special slots start
38844 at the location of return address of the current function. */
38845 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38847 /* Load pointer value from a memory if we don't have it. */
38848 if (!ptr)
38850 gcc_assert (MEM_P (slot));
38851 ptr = copy_addr_to_reg (slot);
38854 if (!register_operand (ptr, Pmode))
38855 ptr = ix86_zero_extend_to_Pmode (ptr);
38857 emit_insn (BNDmode == BND64mode
38858 ? gen_bnd64_ldx (reg, addr, ptr)
38859 : gen_bnd32_ldx (reg, addr, ptr));
38861 return reg;
38864 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38865 passed in SLOT in case BOUNDS are not passed in a register.
38867 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38868 stored in memory. PTR may be NULL in case SLOT is a memory.
38869 In such case value of PTR (if required) may be loaded from SLOT.
38871 If SLOT is NULL or a register then SLOT_NO is an integer constant
38872 holding number of the target dependent special slot which should be
38873 used to store BOUNDS. */
38875 static void
38876 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38878 rtx addr;
38880 /* Get address to be used to access Bounds Table. Special slots start
38881 at the location of return address of a called function. */
38882 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38884 /* Load pointer value from a memory if we don't have it. */
38885 if (!ptr)
38887 gcc_assert (MEM_P (slot));
38888 ptr = copy_addr_to_reg (slot);
38891 if (!register_operand (ptr, Pmode))
38892 ptr = ix86_zero_extend_to_Pmode (ptr);
38894 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38895 if (!register_operand (bounds, BNDmode))
38896 bounds = copy_to_mode_reg (BNDmode, bounds);
38898 emit_insn (BNDmode == BND64mode
38899 ? gen_bnd64_stx (addr, ptr, bounds)
38900 : gen_bnd32_stx (addr, ptr, bounds));
38903 /* Load and return bounds returned by function in SLOT. */
38905 static rtx
38906 ix86_load_returned_bounds (rtx slot)
38908 rtx res;
38910 gcc_assert (REG_P (slot));
38911 res = gen_reg_rtx (BNDmode);
38912 emit_move_insn (res, slot);
38914 return res;
38917 /* Store BOUNDS returned by function into SLOT. */
38919 static void
38920 ix86_store_returned_bounds (rtx slot, rtx bounds)
38922 gcc_assert (REG_P (slot));
38923 emit_move_insn (slot, bounds);
38926 /* Returns a function decl for a vectorized version of the combined function
38927 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38928 if it is not available. */
38930 static tree
38931 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38932 tree type_in)
38934 machine_mode in_mode, out_mode;
38935 int in_n, out_n;
38937 if (TREE_CODE (type_out) != VECTOR_TYPE
38938 || TREE_CODE (type_in) != VECTOR_TYPE)
38939 return NULL_TREE;
38941 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38942 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38943 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38944 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38946 switch (fn)
38948 CASE_CFN_EXP2:
38949 if (out_mode == SFmode && in_mode == SFmode)
38951 if (out_n == 16 && in_n == 16)
38952 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38954 break;
38956 CASE_CFN_IFLOOR:
38957 CASE_CFN_LFLOOR:
38958 CASE_CFN_LLFLOOR:
38959 /* The round insn does not trap on denormals. */
38960 if (flag_trapping_math || !TARGET_SSE4_1)
38961 break;
38963 if (out_mode == SImode && in_mode == DFmode)
38965 if (out_n == 4 && in_n == 2)
38966 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38967 else if (out_n == 8 && in_n == 4)
38968 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38969 else if (out_n == 16 && in_n == 8)
38970 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38972 if (out_mode == SImode && in_mode == SFmode)
38974 if (out_n == 4 && in_n == 4)
38975 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38976 else if (out_n == 8 && in_n == 8)
38977 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38978 else if (out_n == 16 && in_n == 16)
38979 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38981 break;
38983 CASE_CFN_ICEIL:
38984 CASE_CFN_LCEIL:
38985 CASE_CFN_LLCEIL:
38986 /* The round insn does not trap on denormals. */
38987 if (flag_trapping_math || !TARGET_SSE4_1)
38988 break;
38990 if (out_mode == SImode && in_mode == DFmode)
38992 if (out_n == 4 && in_n == 2)
38993 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38994 else if (out_n == 8 && in_n == 4)
38995 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38996 else if (out_n == 16 && in_n == 8)
38997 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38999 if (out_mode == SImode && in_mode == SFmode)
39001 if (out_n == 4 && in_n == 4)
39002 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39003 else if (out_n == 8 && in_n == 8)
39004 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39005 else if (out_n == 16 && in_n == 16)
39006 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39008 break;
39010 CASE_CFN_IRINT:
39011 CASE_CFN_LRINT:
39012 CASE_CFN_LLRINT:
39013 if (out_mode == SImode && in_mode == DFmode)
39015 if (out_n == 4 && in_n == 2)
39016 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39017 else if (out_n == 8 && in_n == 4)
39018 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39019 else if (out_n == 16 && in_n == 8)
39020 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39022 if (out_mode == SImode && in_mode == SFmode)
39024 if (out_n == 4 && in_n == 4)
39025 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39026 else if (out_n == 8 && in_n == 8)
39027 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39028 else if (out_n == 16 && in_n == 16)
39029 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39031 break;
39033 CASE_CFN_IROUND:
39034 CASE_CFN_LROUND:
39035 CASE_CFN_LLROUND:
39036 /* The round insn does not trap on denormals. */
39037 if (flag_trapping_math || !TARGET_SSE4_1)
39038 break;
39040 if (out_mode == SImode && in_mode == DFmode)
39042 if (out_n == 4 && in_n == 2)
39043 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39044 else if (out_n == 8 && in_n == 4)
39045 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39046 else if (out_n == 16 && in_n == 8)
39047 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39049 if (out_mode == SImode && in_mode == SFmode)
39051 if (out_n == 4 && in_n == 4)
39052 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39053 else if (out_n == 8 && in_n == 8)
39054 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39055 else if (out_n == 16 && in_n == 16)
39056 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39058 break;
39060 CASE_CFN_FLOOR:
39061 /* The round insn does not trap on denormals. */
39062 if (flag_trapping_math || !TARGET_SSE4_1)
39063 break;
39065 if (out_mode == DFmode && in_mode == DFmode)
39067 if (out_n == 2 && in_n == 2)
39068 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39069 else if (out_n == 4 && in_n == 4)
39070 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39071 else if (out_n == 8 && in_n == 8)
39072 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39074 if (out_mode == SFmode && in_mode == SFmode)
39076 if (out_n == 4 && in_n == 4)
39077 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39078 else if (out_n == 8 && in_n == 8)
39079 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39080 else if (out_n == 16 && in_n == 16)
39081 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39083 break;
39085 CASE_CFN_CEIL:
39086 /* The round insn does not trap on denormals. */
39087 if (flag_trapping_math || !TARGET_SSE4_1)
39088 break;
39090 if (out_mode == DFmode && in_mode == DFmode)
39092 if (out_n == 2 && in_n == 2)
39093 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39094 else if (out_n == 4 && in_n == 4)
39095 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39096 else if (out_n == 8 && in_n == 8)
39097 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39099 if (out_mode == SFmode && in_mode == SFmode)
39101 if (out_n == 4 && in_n == 4)
39102 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39103 else if (out_n == 8 && in_n == 8)
39104 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39105 else if (out_n == 16 && in_n == 16)
39106 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39108 break;
39110 CASE_CFN_TRUNC:
39111 /* The round insn does not trap on denormals. */
39112 if (flag_trapping_math || !TARGET_SSE4_1)
39113 break;
39115 if (out_mode == DFmode && in_mode == DFmode)
39117 if (out_n == 2 && in_n == 2)
39118 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39119 else if (out_n == 4 && in_n == 4)
39120 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39121 else if (out_n == 8 && in_n == 8)
39122 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39124 if (out_mode == SFmode && in_mode == SFmode)
39126 if (out_n == 4 && in_n == 4)
39127 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39128 else if (out_n == 8 && in_n == 8)
39129 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39130 else if (out_n == 16 && in_n == 16)
39131 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39133 break;
39135 CASE_CFN_RINT:
39136 /* The round insn does not trap on denormals. */
39137 if (flag_trapping_math || !TARGET_SSE4_1)
39138 break;
39140 if (out_mode == DFmode && in_mode == DFmode)
39142 if (out_n == 2 && in_n == 2)
39143 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39144 else if (out_n == 4 && in_n == 4)
39145 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39147 if (out_mode == SFmode && in_mode == SFmode)
39149 if (out_n == 4 && in_n == 4)
39150 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39151 else if (out_n == 8 && in_n == 8)
39152 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39154 break;
39156 CASE_CFN_FMA:
39157 if (out_mode == DFmode && in_mode == DFmode)
39159 if (out_n == 2 && in_n == 2)
39160 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39161 if (out_n == 4 && in_n == 4)
39162 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39164 if (out_mode == SFmode && in_mode == SFmode)
39166 if (out_n == 4 && in_n == 4)
39167 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39168 if (out_n == 8 && in_n == 8)
39169 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39171 break;
39173 default:
39174 break;
39177 /* Dispatch to a handler for a vectorization library. */
39178 if (ix86_veclib_handler)
39179 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39181 return NULL_TREE;
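/* For illustration: with -mavx -fno-trapping-math a loop such as

     for (int i = 0; i < n; i++)
       a[i] = floor (b[i]);

   over doubles is vectorized via IX86_BUILTIN_FLOORPD256 (vroundpd),
   which is why the CASE_CFN_FLOOR entry above gives up when
   flag_trapping_math is set or SSE4.1 is not enabled.  */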
39184 /* Handler for an SVML-style interface to
39185 a library with vectorized intrinsics. */
39187 static tree
39188 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39190 char name[20];
39191 tree fntype, new_fndecl, args;
39192 unsigned arity;
39193 const char *bname;
39194 machine_mode el_mode, in_mode;
39195 int n, in_n;
39197 /* The SVML library is suitable for unsafe math only. */
39198 if (!flag_unsafe_math_optimizations)
39199 return NULL_TREE;
39201 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39202 n = TYPE_VECTOR_SUBPARTS (type_out);
39203 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39204 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39205 if (el_mode != in_mode
39206 || n != in_n)
39207 return NULL_TREE;
39209 switch (fn)
39211 CASE_CFN_EXP:
39212 CASE_CFN_LOG:
39213 CASE_CFN_LOG10:
39214 CASE_CFN_POW:
39215 CASE_CFN_TANH:
39216 CASE_CFN_TAN:
39217 CASE_CFN_ATAN:
39218 CASE_CFN_ATAN2:
39219 CASE_CFN_ATANH:
39220 CASE_CFN_CBRT:
39221 CASE_CFN_SINH:
39222 CASE_CFN_SIN:
39223 CASE_CFN_ASINH:
39224 CASE_CFN_ASIN:
39225 CASE_CFN_COSH:
39226 CASE_CFN_COS:
39227 CASE_CFN_ACOSH:
39228 CASE_CFN_ACOS:
39229 if ((el_mode != DFmode || n != 2)
39230 && (el_mode != SFmode || n != 4))
39231 return NULL_TREE;
39232 break;
39234 default:
39235 return NULL_TREE;
39238 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39239 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39241 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39242 strcpy (name, "vmlsLn4");
39243 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39244 strcpy (name, "vmldLn2");
39245 else if (n == 4)
39247 sprintf (name, "vmls%s", bname+10);
39248 name[strlen (name)-1] = '4';
39250 else
39251 sprintf (name, "vmld%s2", bname+10);
39253 /* Convert to uppercase. */
39254 name[4] &= ~0x20;
39256 arity = 0;
39257 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39258 arity++;
39260 if (arity == 1)
39261 fntype = build_function_type_list (type_out, type_in, NULL);
39262 else
39263 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39265 /* Build a function declaration for the vectorized function. */
39266 new_fndecl = build_decl (BUILTINS_LOCATION,
39267 FUNCTION_DECL, get_identifier (name), fntype);
39268 TREE_PUBLIC (new_fndecl) = 1;
39269 DECL_EXTERNAL (new_fndecl) = 1;
39270 DECL_IS_NOVOPS (new_fndecl) = 1;
39271 TREE_READONLY (new_fndecl) = 1;
39273 return new_fndecl;
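/* Worked examples of the name mangling above: a V4SFmode sinf becomes
   "vmlsSin4", a V2DFmode sin becomes "vmldSin2", and logf/log are
   special-cased to "vmlsLn4"/"vmldLn2".  So a loop like

     for (int i = 0; i < n; i++)
       a[i] = sinf (b[i]);

   built with -O2 -ffast-math -mveclibabi=svml can be vectorized into
   calls to vmlsSin4.  */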
39276 /* Handler for an ACML-style interface to
39277 a library with vectorized intrinsics. */
39279 static tree
39280 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39282 char name[20] = "__vr.._";
39283 tree fntype, new_fndecl, args;
39284 unsigned arity;
39285 const char *bname;
39286 machine_mode el_mode, in_mode;
39287 int n, in_n;
39289 /* The ACML is 64-bit only and suitable for unsafe math only, as
39290 it does not correctly support parts of IEEE arithmetic with the
39291 required precision, such as denormals. */
39292 if (!TARGET_64BIT
39293 || !flag_unsafe_math_optimizations)
39294 return NULL_TREE;
39296 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39297 n = TYPE_VECTOR_SUBPARTS (type_out);
39298 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39299 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39300 if (el_mode != in_mode
39301 || n != in_n)
39302 return NULL_TREE;
39304 switch (fn)
39306 CASE_CFN_SIN:
39307 CASE_CFN_COS:
39308 CASE_CFN_EXP:
39309 CASE_CFN_LOG:
39310 CASE_CFN_LOG2:
39311 CASE_CFN_LOG10:
39312 if (el_mode == DFmode && n == 2)
39314 name[4] = 'd';
39315 name[5] = '2';
39317 else if (el_mode == SFmode && n == 4)
39319 name[4] = 's';
39320 name[5] = '4';
39322 else
39323 return NULL_TREE;
39324 break;
39326 default:
39327 return NULL_TREE;
39330 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39331 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39332 sprintf (name + 7, "%s", bname+10);
39334 arity = 0;
39335 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39336 arity++;
39338 if (arity == 1)
39339 fntype = build_function_type_list (type_out, type_in, NULL);
39340 else
39341 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39343 /* Build a function declaration for the vectorized function. */
39344 new_fndecl = build_decl (BUILTINS_LOCATION,
39345 FUNCTION_DECL, get_identifier (name), fntype);
39346 TREE_PUBLIC (new_fndecl) = 1;
39347 DECL_EXTERNAL (new_fndecl) = 1;
39348 DECL_IS_NOVOPS (new_fndecl) = 1;
39349 TREE_READONLY (new_fndecl) = 1;
39351 return new_fndecl;
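/* Likewise for ACML, the "__vr.._" template yields e.g. "__vrd2_sin"
   for a V2DFmode sin and "__vrs4_sinf" for a V4SFmode sinf; this
   handler is selected with -mveclibabi=acml.  */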
39354 /* Returns a decl of a function that implements gather load with
39355 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
39356 Return NULL_TREE if it is not available. */
39358 static tree
39359 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39360 const_tree index_type, int scale)
39362 bool si;
39363 enum ix86_builtins code;
39365 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39366 return NULL_TREE;
39368 if ((TREE_CODE (index_type) != INTEGER_TYPE
39369 && !POINTER_TYPE_P (index_type))
39370 || (TYPE_MODE (index_type) != SImode
39371 && TYPE_MODE (index_type) != DImode))
39372 return NULL_TREE;
39374 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39375 return NULL_TREE;
39377 /* v*gather* insn sign extends index to pointer mode. */
39378 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39379 && TYPE_UNSIGNED (index_type))
39380 return NULL_TREE;
39382 if (scale <= 0
39383 || scale > 8
39384 || (scale & (scale - 1)) != 0)
39385 return NULL_TREE;
39387 si = TYPE_MODE (index_type) == SImode;
39388 switch (TYPE_MODE (mem_vectype))
39390 case E_V2DFmode:
39391 if (TARGET_AVX512VL)
39392 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39393 else
39394 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39395 break;
39396 case E_V4DFmode:
39397 if (TARGET_AVX512VL)
39398 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39399 else
39400 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39401 break;
39402 case E_V2DImode:
39403 if (TARGET_AVX512VL)
39404 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39405 else
39406 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39407 break;
39408 case E_V4DImode:
39409 if (TARGET_AVX512VL)
39410 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39411 else
39412 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39413 break;
39414 case E_V4SFmode:
39415 if (TARGET_AVX512VL)
39416 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39417 else
39418 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39419 break;
39420 case E_V8SFmode:
39421 if (TARGET_AVX512VL)
39422 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39423 else
39424 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39425 break;
39426 case E_V4SImode:
39427 if (TARGET_AVX512VL)
39428 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39429 else
39430 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39431 break;
39432 case E_V8SImode:
39433 if (TARGET_AVX512VL)
39434 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39435 else
39436 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39437 break;
39438 case E_V8DFmode:
39439 if (TARGET_AVX512F)
39440 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39441 else
39442 return NULL_TREE;
39443 break;
39444 case E_V8DImode:
39445 if (TARGET_AVX512F)
39446 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39447 else
39448 return NULL_TREE;
39449 break;
39450 case E_V16SFmode:
39451 if (TARGET_AVX512F)
39452 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39453 else
39454 return NULL_TREE;
39455 break;
39456 case E_V16SImode:
39457 if (TARGET_AVX512F)
39458 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39459 else
39460 return NULL_TREE;
39461 break;
39462 default:
39463 return NULL_TREE;
39466 return ix86_get_builtin (code);
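/* For illustration: a loop such as

     for (int i = 0; i < n; i++)
       a[i] = b[idx[i]];

   with double A/B and 32-bit integer indices asks this hook for a
   V8DFmode gather with an SImode index type when vectorizing for
   AVX-512F, and the table above returns IX86_BUILTIN_GATHER3ALTSIV8DF.  */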
39469 /* Returns a decl of a function that implements scatter store with
39470 register type VECTYPE, index type INDEX_TYPE and scale SCALE.
39471 Return NULL_TREE if it is not available. */
39473 static tree
39474 ix86_vectorize_builtin_scatter (const_tree vectype,
39475 const_tree index_type, int scale)
39477 bool si;
39478 enum ix86_builtins code;
39480 if (!TARGET_AVX512F)
39481 return NULL_TREE;
39483 if ((TREE_CODE (index_type) != INTEGER_TYPE
39484 && !POINTER_TYPE_P (index_type))
39485 || (TYPE_MODE (index_type) != SImode
39486 && TYPE_MODE (index_type) != DImode))
39487 return NULL_TREE;
39489 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39490 return NULL_TREE;
39492 /* v*scatter* insn sign extends index to pointer mode. */
39493 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39494 && TYPE_UNSIGNED (index_type))
39495 return NULL_TREE;
39497 /* Scale can be 1, 2, 4 or 8. */
39498 if (scale <= 0
39499 || scale > 8
39500 || (scale & (scale - 1)) != 0)
39501 return NULL_TREE;
39503 si = TYPE_MODE (index_type) == SImode;
39504 switch (TYPE_MODE (vectype))
39506 case E_V8DFmode:
39507 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39508 break;
39509 case E_V8DImode:
39510 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39511 break;
39512 case E_V16SFmode:
39513 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39514 break;
39515 case E_V16SImode:
39516 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39517 break;
39518 default:
39519 return NULL_TREE;
39522 return ix86_builtins[code];
39525 /* Return true if it is safe to use the rsqrt optabs to optimize
39526 1.0/sqrt. */
39528 static bool
39529 use_rsqrt_p ()
39531 return (TARGET_SSE_MATH
39532 && flag_finite_math_only
39533 && !flag_trapping_math
39534 && flag_unsafe_math_optimizations);
39537 /* Returns a decl of a target-specific builtin that implements the
39538 reciprocal of the function FNDECL, or NULL_TREE if not available. */
39540 static tree
39541 ix86_builtin_reciprocal (tree fndecl)
39543 switch (DECL_FUNCTION_CODE (fndecl))
39545 /* Vectorized version of sqrt to rsqrt conversion. */
39546 case IX86_BUILTIN_SQRTPS_NR:
39547 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39549 case IX86_BUILTIN_SQRTPS_NR256:
39550 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39552 default:
39553 return NULL_TREE;
39557 /* Helper for avx_vpermilps256_operand et al. This is also used by
39558 the expansion functions to turn the parallel back into a mask.
39559 The return value is 0 for no match and the imm8+1 for a match. */
39562 avx_vpermilp_parallel (rtx par, machine_mode mode)
39564 unsigned i, nelt = GET_MODE_NUNITS (mode);
39565 unsigned mask = 0;
39566 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39568 if (XVECLEN (par, 0) != (int) nelt)
39569 return 0;
39571 /* Validate that all of the elements are constants, and not totally
39572 out of range. Copy the data into an integral array to make the
39573 subsequent checks easier. */
39574 for (i = 0; i < nelt; ++i)
39576 rtx er = XVECEXP (par, 0, i);
39577 unsigned HOST_WIDE_INT ei;
39579 if (!CONST_INT_P (er))
39580 return 0;
39581 ei = INTVAL (er);
39582 if (ei >= nelt)
39583 return 0;
39584 ipar[i] = ei;
39587 switch (mode)
39589 case E_V8DFmode:
39590 /* In the 512-bit DFmode case, we can only move elements within
39591 a 128-bit lane. First fill the second part of the mask,
39592 then fallthru. */
39593 for (i = 4; i < 6; ++i)
39595 if (ipar[i] < 4 || ipar[i] >= 6)
39596 return 0;
39597 mask |= (ipar[i] - 4) << i;
39599 for (i = 6; i < 8; ++i)
39601 if (ipar[i] < 6)
39602 return 0;
39603 mask |= (ipar[i] - 6) << i;
39605 /* FALLTHRU */
39607 case E_V4DFmode:
39608 /* In the 256-bit DFmode case, we can only move elements within
39609 a 128-bit lane. */
39610 for (i = 0; i < 2; ++i)
39612 if (ipar[i] >= 2)
39613 return 0;
39614 mask |= ipar[i] << i;
39616 for (i = 2; i < 4; ++i)
39618 if (ipar[i] < 2)
39619 return 0;
39620 mask |= (ipar[i] - 2) << i;
39622 break;
39624 case E_V16SFmode:
39625 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39626 must mirror the permutation in the lower 256 bits. */
39627 for (i = 0; i < 8; ++i)
39628 if (ipar[i] + 8 != ipar[i + 8])
39629 return 0;
39630 /* FALLTHRU */
39632 case E_V8SFmode:
39633 /* In the 256-bit SFmode case, we have full freedom of
39634 movement within the low 128-bit lane, but the high 128-bit
39635 lane must mirror exactly the same pattern. */
39636 for (i = 0; i < 4; ++i)
39637 if (ipar[i] + 4 != ipar[i + 4])
39638 return 0;
39639 nelt = 4;
39640 /* FALLTHRU */
39642 case E_V2DFmode:
39643 case E_V4SFmode:
39644 /* In the 128-bit case, we have full freedom in the placement of
39645 the elements from the source operand. */
39646 for (i = 0; i < nelt; ++i)
39647 mask |= ipar[i] << (i * (nelt / 2));
39648 break;
39650 default:
39651 gcc_unreachable ();
39654 /* Make sure success has a non-zero value by adding one. */
39655 return mask + 1;
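/* For example, in V4DFmode the parallel (1 0 3 2), which swaps the two
   doubles within each 128-bit lane, produces mask 0b0101, so the
   function returns 0x5 + 1 = 0x6; 0x5 is exactly the imm8 that
   vpermilpd would use for that shuffle.  */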
39658 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39659 the expansion functions to turn the parallel back into a mask.
39660 The return value is 0 for no match and the imm8+1 for a match. */
39663 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39665 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39666 unsigned mask = 0;
39667 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39669 if (XVECLEN (par, 0) != (int) nelt)
39670 return 0;
39672 /* Validate that all of the elements are constants, and not totally
39673 out of range. Copy the data into an integral array to make the
39674 subsequent checks easier. */
39675 for (i = 0; i < nelt; ++i)
39677 rtx er = XVECEXP (par, 0, i);
39678 unsigned HOST_WIDE_INT ei;
39680 if (!CONST_INT_P (er))
39681 return 0;
39682 ei = INTVAL (er);
39683 if (ei >= 2 * nelt)
39684 return 0;
39685 ipar[i] = ei;
39688 /* Validate that each half of the permute selects consecutive elements. */
39689 for (i = 0; i < nelt2 - 1; ++i)
39690 if (ipar[i] + 1 != ipar[i + 1])
39691 return 0;
39692 for (i = nelt2; i < nelt - 1; ++i)
39693 if (ipar[i] + 1 != ipar[i + 1])
39694 return 0;
39696 /* Reconstruct the mask. */
39697 for (i = 0; i < 2; ++i)
39699 unsigned e = ipar[i * nelt2];
39700 if (e % nelt2)
39701 return 0;
39702 e /= nelt2;
39703 mask |= e << (i * 4);
39706 /* Make sure success has a non-zero value by adding one. */
39707 return mask + 1;
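/* For example, in V4DFmode the parallel (2 3 4 5), i.e. the high lane
   of the first source followed by the low lane of the second, gives
   mask 0x21 and a return value of 0x22; 0x21 is the familiar
   _mm256_permute2f128_pd "swap lanes" selector.  */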
39710 /* Return a register priority for hard reg REGNO. */
39711 static int
39712 ix86_register_priority (int hard_regno)
39714 /* ebp and r13 as the base always want a displacement, and r12 as the
39715 base always wants an index. So discourage their use in an
39716 address. */
39717 if (hard_regno == R12_REG || hard_regno == R13_REG)
39718 return 0;
39719 if (hard_regno == BP_REG)
39720 return 1;
39721 /* New x86-64 int registers result in bigger code size. Discourage
39722 them. */
39723 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39724 return 2;
39725 /* New x86-64 SSE registers result in bigger code size. Discourage
39726 them. */
39727 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39728 return 2;
39729 /* Usage of AX register results in smaller code. Prefer it. */
39730 if (hard_regno == AX_REG)
39731 return 4;
39732 return 3;
39735 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39737 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39738 QImode must go into class Q_REGS.
39739 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39740 movdf to do mem-to-mem moves through integer regs. */
39742 static reg_class_t
39743 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39745 machine_mode mode = GET_MODE (x);
39747 /* We're only allowed to return a subclass of CLASS. Many of the
39748 following checks fail for NO_REGS, so eliminate that early. */
39749 if (regclass == NO_REGS)
39750 return NO_REGS;
39752 /* All classes can load zeros. */
39753 if (x == CONST0_RTX (mode))
39754 return regclass;
39756 /* Force constants into memory if we are loading a (nonzero) constant into
39757 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39758 instructions to load from a constant. */
39759 if (CONSTANT_P (x)
39760 && (MAYBE_MMX_CLASS_P (regclass)
39761 || MAYBE_SSE_CLASS_P (regclass)
39762 || MAYBE_MASK_CLASS_P (regclass)))
39763 return NO_REGS;
39765 /* Floating-point constants need more complex checks. */
39766 if (CONST_DOUBLE_P (x))
39768 /* General regs can load everything. */
39769 if (INTEGER_CLASS_P (regclass))
39770 return regclass;
39772 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39773 zero above. We only want to wind up preferring 80387 registers if
39774 we plan on doing computation with them. */
39775 if (IS_STACK_MODE (mode)
39776 && standard_80387_constant_p (x) > 0)
39778 /* Limit class to FP regs. */
39779 if (FLOAT_CLASS_P (regclass))
39780 return FLOAT_REGS;
39781 else if (regclass == FP_TOP_SSE_REGS)
39782 return FP_TOP_REG;
39783 else if (regclass == FP_SECOND_SSE_REGS)
39784 return FP_SECOND_REG;
39787 return NO_REGS;
39790 /* Prefer SSE regs only, if we can use them for math. */
39791 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39792 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39794 /* Generally when we see PLUS here, it's the function invariant
39795 (plus soft-fp const_int). Which can only be computed into general
39796 regs. */
39797 if (GET_CODE (x) == PLUS)
39798 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39800 /* QImode constants are easy to load, but non-constant QImode data
39801 must go into Q_REGS. */
39802 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39804 if (Q_CLASS_P (regclass))
39805 return regclass;
39806 else if (reg_class_subset_p (Q_REGS, regclass))
39807 return Q_REGS;
39808 else
39809 return NO_REGS;
39812 return regclass;
39815 /* Discourage putting floating-point values in SSE registers unless
39816 SSE math is being used, and likewise for the 387 registers. */
39817 static reg_class_t
39818 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39820 machine_mode mode = GET_MODE (x);
39822 /* Restrict the output reload class to the register bank that we are doing
39823 math on. Returning NO_REGS rather than a subset of CLASS rejects this
39824 alternative; if reload cannot honor that, it will still use its choice. */
39825 mode = GET_MODE (x);
39826 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39827 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39829 if (IS_STACK_MODE (mode))
39831 if (regclass == FP_TOP_SSE_REGS)
39832 return FP_TOP_REG;
39833 else if (regclass == FP_SECOND_SSE_REGS)
39834 return FP_SECOND_REG;
39835 else
39836 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39839 return regclass;
39842 static reg_class_t
39843 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39844 machine_mode mode, secondary_reload_info *sri)
39846 /* Double-word spills from general registers to non-offsettable memory
39847 references (zero-extended addresses) require special handling. */
39848 if (TARGET_64BIT
39849 && MEM_P (x)
39850 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39851 && INTEGER_CLASS_P (rclass)
39852 && !offsettable_memref_p (x))
39854 sri->icode = (in_p
39855 ? CODE_FOR_reload_noff_load
39856 : CODE_FOR_reload_noff_store);
39857 /* Add the cost of moving address to a temporary. */
39858 sri->extra_cost = 1;
39860 return NO_REGS;
39863 /* QImode spills from non-QI registers require an
39864 intermediate register on 32-bit targets. */
39865 if (mode == QImode
39866 && ((!TARGET_64BIT && !in_p
39867 && INTEGER_CLASS_P (rclass)
39868 && MAYBE_NON_Q_CLASS_P (rclass))
39869 || (!TARGET_AVX512DQ
39870 && MAYBE_MASK_CLASS_P (rclass))))
39872 int regno = true_regnum (x);
39874 /* Return Q_REGS if the operand is in memory. */
39875 if (regno == -1)
39876 return Q_REGS;
39878 return NO_REGS;
39881 /* This condition handles the corner case where an expression involving
39882 pointers gets vectorized. We're trying to use the address of a
39883 stack slot as a vector initializer.
39885 (set (reg:V2DI 74 [ vect_cst_.2 ])
39886 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39888 Eventually frame gets turned into sp+offset like this:
39890 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39891 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39892 (const_int 392 [0x188]))))
39894 That later gets turned into:
39896 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39897 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39898 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39900 We'll have the following reload recorded:
39902 Reload 0: reload_in (DI) =
39903 (plus:DI (reg/f:DI 7 sp)
39904 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39905 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39906 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39907 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39908 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39909 reload_reg_rtx: (reg:V2DI 22 xmm1)
39911 That isn't going to work, since SSE instructions can't handle scalar
39912 additions. Returning GENERAL_REGS forces the addition into an integer
39913 register, and reload can handle the subsequent reloads without problems. */
39915 if (in_p && GET_CODE (x) == PLUS
39916 && SSE_CLASS_P (rclass)
39917 && SCALAR_INT_MODE_P (mode))
39918 return GENERAL_REGS;
39920 return NO_REGS;
39923 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39925 static bool
39926 ix86_class_likely_spilled_p (reg_class_t rclass)
39928 switch (rclass)
39930 case AREG:
39931 case DREG:
39932 case CREG:
39933 case BREG:
39934 case AD_REGS:
39935 case SIREG:
39936 case DIREG:
39937 case SSE_FIRST_REG:
39938 case FP_TOP_REG:
39939 case FP_SECOND_REG:
39940 case BND_REGS:
39941 return true;
39943 default:
39944 break;
39947 return false;
39950 /* If we are copying between registers from different register sets
39951 (e.g. FP and integer), we may need a memory location.
39953 The function can't work reliably when one of the CLASSES is a class
39954 containing registers from multiple sets. We avoid this by never combining
39955 different sets in a single alternative in the machine description.
39956 Ensure that this constraint holds to avoid unexpected surprises.
39958 When STRICT is false, we are being called from REGISTER_MOVE_COST,
39959 so do not enforce these sanity checks.
39961 To keep register_move_cost fast, an inline variant is defined. */
39963 static inline bool
39964 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39965 reg_class_t class2, int strict)
39967 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39968 return false;
39970 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39971 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39972 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39973 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39974 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39975 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
39976 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
39977 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
39979 gcc_assert (!strict || lra_in_progress);
39980 return true;
39983 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39984 return true;
39986 /* Between mask and general registers, moves are no larger than word size. */
39987 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
39988 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39989 return true;
39991 /* ??? This is a lie. We do have moves between mmx/general and between
39992 mmx/sse2. But by saying we need secondary memory we discourage the
39993 register allocator from using the mmx registers unless needed. */
39994 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39995 return true;
39997 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39999 /* SSE1 doesn't have any direct moves from other classes. */
40000 if (!TARGET_SSE2)
40001 return true;
40003 /* If the target says that inter-unit moves are more expensive
40004 than moving through memory, then don't generate them. */
40005 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40006 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40007 return true;
40009 /* Between SSE and general registers, moves are no larger than word size. */
40010 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40011 return true;
40014 return false;
40017 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
40019 static bool
40020 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
40021 reg_class_t class2)
40023 return inline_secondary_memory_needed (mode, class1, class2, true);
40026 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
40028 get_secondary_mem widens integral modes to BITS_PER_WORD.
40029 There is no need to emit full 64 bit move on 64 bit targets
40030 for integral modes that can be moved using 32 bit move. */
40032 static machine_mode
40033 ix86_secondary_memory_needed_mode (machine_mode mode)
40035 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
40036 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
40037 return mode;
40040 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40042 On the 80386, this is the size of MODE in words,
40043 except in the FP regs, where a single reg is always enough. */
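/* For example, an XFmode value needs three general registers on ia32
(12 bytes) but only a single x87 register. */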
40045 static unsigned char
40046 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40048 if (MAYBE_INTEGER_CLASS_P (rclass))
40050 if (mode == XFmode)
40051 return (TARGET_64BIT ? 2 : 3);
40052 else if (mode == XCmode)
40053 return (TARGET_64BIT ? 4 : 6);
40054 else
40055 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40057 else
40059 if (COMPLEX_MODE_P (mode))
40060 return 2;
40061 else
40062 return 1;
40066 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
40068 static bool
40069 ix86_can_change_mode_class (machine_mode from, machine_mode to,
40070 reg_class_t regclass)
40072 if (from == to)
40073 return true;
40075 /* x87 registers can't do subreg at all, as all values are reformatted
40076 to extended precision. */
40077 if (MAYBE_FLOAT_CLASS_P (regclass))
40078 return false;
40080 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40082 /* Vector registers do not support QI or HImode loads. If we don't
40083 disallow a change to these modes, reload will assume it's ok to
40084 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40085 the vec_dupv4hi pattern. */
40086 if (GET_MODE_SIZE (from) < 4)
40087 return false;
40090 return true;
40093 /* Return index of MODE in the sse load/store tables. */
40095 static inline int
40096 sse_store_index (machine_mode mode)
40098 switch (GET_MODE_SIZE (mode))
40100 case 4:
40101 return 0;
40102 case 8:
40103 return 1;
40104 case 16:
40105 return 2;
40106 case 32:
40107 return 3;
40108 case 64:
40109 return 4;
40110 default:
40111 return -1;
40115 /* Return the cost of moving data of mode M between a
40116 register and memory. A value of 2 is the default; this cost is
40117 relative to those in `REGISTER_MOVE_COST'.
40119 This function is used extensively by register_move_cost, which is used to
40120 build tables at startup, so keep it inline.
40121 When IN is 2, return the maximum of the in and out move costs.
40123 If moving between registers and memory is more expensive than
40124 between two registers, you should define this macro to express the
40125 relative cost.
40127 Also model the increased cost of moving QImode values in non
40128 Q_REGS classes. */
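/* For example, with IN == 2 an SFmode value in FLOAT_REGS costs
MAX (fp_load[0], fp_store[0]), while IN == 1 costs just fp_load[0]. */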
40130 static inline int
40131 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40132 int in)
40134 int cost;
40135 if (FLOAT_CLASS_P (regclass))
40137 int index;
40138 switch (mode)
40140 case E_SFmode:
40141 index = 0;
40142 break;
40143 case E_DFmode:
40144 index = 1;
40145 break;
40146 case E_XFmode:
40147 index = 2;
40148 break;
40149 default:
40150 return 100;
40152 if (in == 2)
40153 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40154 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40156 if (SSE_CLASS_P (regclass))
40158 int index = sse_store_index (mode);
40159 if (index == -1)
40160 return 100;
40161 if (in == 2)
40162 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40163 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40165 if (MMX_CLASS_P (regclass))
40167 int index;
40168 switch (GET_MODE_SIZE (mode))
40170 case 4:
40171 index = 0;
40172 break;
40173 case 8:
40174 index = 1;
40175 break;
40176 default:
40177 return 100;
40179 if (in == 2)
40180 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40181 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40183 switch (GET_MODE_SIZE (mode))
40185 case 1:
40186 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40188 if (!in)
40189 return ix86_cost->int_store[0];
40190 if (TARGET_PARTIAL_REG_DEPENDENCY
40191 && optimize_function_for_speed_p (cfun))
40192 cost = ix86_cost->movzbl_load;
40193 else
40194 cost = ix86_cost->int_load[0];
40195 if (in == 2)
40196 return MAX (cost, ix86_cost->int_store[0]);
40197 return cost;
40199 else
40201 if (in == 2)
40202 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40203 if (in)
40204 return ix86_cost->movzbl_load;
40205 else
40206 return ix86_cost->int_store[0] + 4;
40208 break;
40209 case 2:
40210 if (in == 2)
40211 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40212 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40213 default:
40214 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
40215 if (mode == TFmode)
40216 mode = XFmode;
40217 if (in == 2)
40218 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40219 else if (in)
40220 cost = ix86_cost->int_load[2];
40221 else
40222 cost = ix86_cost->int_store[2];
40223 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40227 static int
40228 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40229 bool in)
40231 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
40235 /* Return the cost of moving data from a register in class CLASS1 to
40236 one in class CLASS2.
40238 It is not required that the cost always equal 2 when FROM is the same as TO;
40239 on some machines it is expensive to move between registers if they are not
40240 general registers. */
40242 static int
40243 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40244 reg_class_t class2_i)
40246 enum reg_class class1 = (enum reg_class) class1_i;
40247 enum reg_class class2 = (enum reg_class) class2_i;
40249 /* In case we require secondary memory, compute the cost of the store
40250 followed by the load. To avoid bad register allocation choices, this
40251 needs to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40253 if (inline_secondary_memory_needed (mode, class1, class2, false))
40255 int cost = 1;
40257 cost += inline_memory_move_cost (mode, class1, 2);
40258 cost += inline_memory_move_cost (mode, class2, 2);
40260 /* When copying from a general purpose register we may emit multiple
40261 stores followed by a single load, causing a memory size mismatch stall.
40262 Count this as an arbitrarily high cost of 20. */
40263 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40264 && TARGET_MEMORY_MISMATCH_STALL
40265 && targetm.class_max_nregs (class1, mode)
40266 > targetm.class_max_nregs (class2, mode))
40267 cost += 20;
40269 /* In the case of FP/MMX moves, the registers actually overlap, and we
40270 have to switch modes in order to treat them differently. */
40271 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40272 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40273 cost += 20;
40275 return cost;
40278 /* Moves between SSE/MMX and integer unit are expensive. */
40279 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40280 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40282 /* ??? By keeping the returned value relatively high, we limit the number
40283 of moves between integer and MMX/SSE registers for all targets.
40284 Additionally, a high value works around a problem with x86_modes_tieable_p(),
40285 where integer modes in MMX/SSE registers are not tieable
40286 because of missing QImode and HImode moves to, from or between
40287 MMX/SSE registers. */
40288 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40289 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40291 if (MAYBE_FLOAT_CLASS_P (class1))
40292 return ix86_cost->fp_move;
40293 if (MAYBE_SSE_CLASS_P (class1))
40295 if (GET_MODE_BITSIZE (mode) <= 128)
40296 return ix86_cost->xmm_move;
40297 if (GET_MODE_BITSIZE (mode) <= 256)
40298 return ix86_cost->ymm_move;
40299 return ix86_cost->zmm_move;
40301 if (MAYBE_MMX_CLASS_P (class1))
40302 return ix86_cost->mmx_move;
40303 return 2;
40306 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
40307 words of a value of mode MODE but can be less for certain modes in
40308 special long registers.
40310 Note that there are no two-word move instructions for consecutive
40311 registers, and only registers 0-3 may have mov-byte instructions
40312 applied to them. */
40314 static unsigned int
40315 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40317 if (GENERAL_REGNO_P (regno))
40319 if (mode == XFmode)
40320 return TARGET_64BIT ? 2 : 3;
40321 if (mode == XCmode)
40322 return TARGET_64BIT ? 4 : 6;
40323 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40325 if (COMPLEX_MODE_P (mode))
40326 return 2;
40327 if (mode == V64SFmode || mode == V64SImode)
40328 return 4;
40329 return 1;
40332 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40334 static bool
40335 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40337 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
40338 if (CC_REGNO_P (regno))
40339 return GET_MODE_CLASS (mode) == MODE_CC;
40340 if (GET_MODE_CLASS (mode) == MODE_CC
40341 || GET_MODE_CLASS (mode) == MODE_RANDOM
40342 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40343 return false;
40344 if (STACK_REGNO_P (regno))
40345 return VALID_FP_MODE_P (mode);
40346 if (MASK_REGNO_P (regno))
40347 return (VALID_MASK_REG_MODE (mode)
40348 || (TARGET_AVX512BW
40349 && VALID_MASK_AVX512BW_MODE (mode)));
40350 if (BND_REGNO_P (regno))
40351 return VALID_BND_REG_MODE (mode);
40352 if (SSE_REGNO_P (regno))
40354 /* We implement the move patterns for all vector modes into and
40355 out of SSE registers, even when no operation instructions
40356 are available. */
40358 /* For AVX-512 we allow, regardless of regno:
40359 - XI mode
40360 - any of 512-bit wide vector mode
40361 - any scalar mode. */
40362 if (TARGET_AVX512F
40363 && (mode == XImode
40364 || VALID_AVX512F_REG_MODE (mode)
40365 || VALID_AVX512F_SCALAR_MODE (mode)))
40366 return true;
40368 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40369 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40370 && MOD4_SSE_REGNO_P (regno)
40371 && mode == V64SFmode)
40372 return true;
40374 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40375 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40376 && MOD4_SSE_REGNO_P (regno)
40377 && mode == V64SImode)
40378 return true;
40380 /* TODO check for QI/HI scalars. */
40381 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
40382 if (TARGET_AVX512VL
40383 && (mode == OImode
40384 || mode == TImode
40385 || VALID_AVX256_REG_MODE (mode)
40386 || VALID_AVX512VL_128_REG_MODE (mode)))
40387 return true;
40389 /* xmm16-xmm31 are only available for AVX-512. */
40390 if (EXT_REX_SSE_REGNO_P (regno))
40391 return false;
40393 /* OImode and AVX modes are available only when AVX is enabled. */
40394 return ((TARGET_AVX
40395 && VALID_AVX256_REG_OR_OI_MODE (mode))
40396 || VALID_SSE_REG_MODE (mode)
40397 || VALID_SSE2_REG_MODE (mode)
40398 || VALID_MMX_REG_MODE (mode)
40399 || VALID_MMX_REG_MODE_3DNOW (mode));
40401 if (MMX_REGNO_P (regno))
40403 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40404 so if the register is available at all, then we can move data of
40405 the given mode into or out of it. */
40406 return (VALID_MMX_REG_MODE (mode)
40407 || VALID_MMX_REG_MODE_3DNOW (mode));
40410 if (mode == QImode)
40412 /* Take care for QImode values - they can be in non-QI regs,
40413 but then they do cause partial register stalls. */
40414 if (ANY_QI_REGNO_P (regno))
40415 return true;
40416 if (!TARGET_PARTIAL_REG_STALL)
40417 return true;
40418 /* LRA checks if the hard register is OK for the given mode.
40419 QImode values can live in non-QI regs, so we allow all
40420 registers here. */
40421 if (lra_in_progress)
40422 return true;
40423 return !can_create_pseudo_p ();
40425 /* We handle both integer and floats in the general purpose registers. */
40426 else if (VALID_INT_MODE_P (mode))
40427 return true;
40428 else if (VALID_FP_MODE_P (mode))
40429 return true;
40430 else if (VALID_DFP_MODE_P (mode))
40431 return true;
40432 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40433 on to use that value in smaller contexts, this can easily force a
40434 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40435 supporting DImode, allow it. */
40436 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40437 return true;
40439 return false;
40442 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40443 saves SSE registers across calls is Win64 (thus no need to check the
40444 current ABI here), and with AVX enabled Win64 only guarantees that
40445 the low 16 bytes are saved. */
40447 static bool
40448 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40450 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40453 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40454 tieable integer mode. */
40456 static bool
40457 ix86_tieable_integer_mode_p (machine_mode mode)
40459 switch (mode)
40461 case E_HImode:
40462 case E_SImode:
40463 return true;
40465 case E_QImode:
40466 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40468 case E_DImode:
40469 return TARGET_64BIT;
40471 default:
40472 return false;
40476 /* Implement TARGET_MODES_TIEABLE_P.
40478 Return true if MODE1 is accessible in a register that can hold MODE2
40479 without copying. That is, all register classes that can hold MODE2
40480 can also hold MODE1. */
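/* For instance, HImode and SImode tie with each other in general registers,
while two 16-byte vector modes tie only because both are valid in SSE
registers. */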
40482 static bool
40483 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40485 if (mode1 == mode2)
40486 return true;
40488 if (ix86_tieable_integer_mode_p (mode1)
40489 && ix86_tieable_integer_mode_p (mode2))
40490 return true;
40492 /* MODE2 being XFmode implies fp stack or general regs, which means we
40493 can tie any smaller floating point modes to it. Note that we do not
40494 tie this with TFmode. */
40495 if (mode2 == XFmode)
40496 return mode1 == SFmode || mode1 == DFmode;
40498 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40499 that we can tie it with SFmode. */
40500 if (mode2 == DFmode)
40501 return mode1 == SFmode;
40503 /* If MODE2 is only appropriate for an SSE register, then tie with
40504 any other mode acceptable to SSE registers. */
40505 if (GET_MODE_SIZE (mode2) == 32
40506 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40507 return (GET_MODE_SIZE (mode1) == 32
40508 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40509 if (GET_MODE_SIZE (mode2) == 16
40510 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40511 return (GET_MODE_SIZE (mode1) == 16
40512 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40514 /* If MODE2 is appropriate for an MMX register, then tie
40515 with any other mode acceptable to MMX registers. */
40516 if (GET_MODE_SIZE (mode2) == 8
40517 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40518 return (GET_MODE_SIZE (mode1) == 8
40519 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40521 return false;
40524 /* Return the cost of moving between two registers of mode MODE. */
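/* E.g. a DImode register-to-register move on a 32-bit target is costed as
two instructions, since it is performed in word-sized pieces. */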
40526 static int
40527 ix86_set_reg_reg_cost (machine_mode mode)
40529 unsigned int units = UNITS_PER_WORD;
40531 switch (GET_MODE_CLASS (mode))
40533 default:
40534 break;
40536 case MODE_CC:
40537 units = GET_MODE_SIZE (CCmode);
40538 break;
40540 case MODE_FLOAT:
40541 if ((TARGET_SSE && mode == TFmode)
40542 || (TARGET_80387 && mode == XFmode)
40543 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40544 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40545 units = GET_MODE_SIZE (mode);
40546 break;
40548 case MODE_COMPLEX_FLOAT:
40549 if ((TARGET_SSE && mode == TCmode)
40550 || (TARGET_80387 && mode == XCmode)
40551 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40552 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40553 units = GET_MODE_SIZE (mode);
40554 break;
40556 case MODE_VECTOR_INT:
40557 case MODE_VECTOR_FLOAT:
40558 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40559 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40560 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40561 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40562 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40563 units = GET_MODE_SIZE (mode);
40566 /* Return the cost of moving between two registers of mode MODE,
40567 assuming that the move will be in pieces of at most UNITS bytes. */
40568 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
40571 /* Return the cost of a vector operation in MODE given that the scalar
40572 version has cost COST. If PARALLEL is true, assume the CPU has more
40573 than one unit capable of performing the operation. */
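/* For example, when TARGET_AVX128_OPTIMAL is set, a 256-bit operation is
costed as two 128-bit operations, since it will be split into halves. */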
40575 static int
40576 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40578 if (!VECTOR_MODE_P (mode))
40579 return cost;
40581 if (!parallel)
40582 return cost * GET_MODE_NUNITS (mode);
40583 if (GET_MODE_BITSIZE (mode) == 128
40584 && TARGET_SSE_SPLIT_REGS)
40585 return cost * 2;
40586 if (GET_MODE_BITSIZE (mode) > 128
40587 && TARGET_AVX128_OPTIMAL)
40588 return cost * GET_MODE_BITSIZE (mode) / 128;
40589 return cost;
40592 /* Return cost of multiplication in MODE. */
40594 static int
40595 ix86_multiplication_cost (const struct processor_costs *cost,
40596 enum machine_mode mode)
40598 machine_mode inner_mode = mode;
40599 if (VECTOR_MODE_P (mode))
40600 inner_mode = GET_MODE_INNER (mode);
40602 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40603 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40604 else if (X87_FLOAT_MODE_P (mode))
40605 return cost->fmul;
40606 else if (FLOAT_MODE_P (mode))
40607 return ix86_vec_cost (mode,
40608 inner_mode == DFmode
40609 ? cost->mulsd : cost->mulss, true);
40610 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40612 /* vpmullq is used in this case. No emulation is needed. */
40613 if (TARGET_AVX512DQ)
40614 return ix86_vec_cost (mode, cost->mulss, true);
40616 /* V*QImode is emulated with 7-13 insns. */
40617 if (mode == V16QImode || mode == V32QImode)
40619 int extra = 11;
40620 if (TARGET_XOP && mode == V16QImode)
40621 extra = 5;
40622 else if (TARGET_SSSE3)
40623 extra = 6;
40624 return ix86_vec_cost (mode,
40625 cost->mulss * 2 + cost->sse_op * extra,
40626 true);
40628 /* V*DImode is emulated with 5-8 insns. */
40629 else if (mode == V2DImode || mode == V4DImode)
40631 if (TARGET_XOP && mode == V2DImode)
40632 return ix86_vec_cost (mode,
40633 cost->mulss * 2 + cost->sse_op * 3,
40634 true);
40635 else
40636 return ix86_vec_cost (mode,
40637 cost->mulss * 3 + cost->sse_op * 5,
40638 true);
40640 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40641 insns, including two PMULUDQ. */
40642 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40643 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40644 true);
40645 else
40646 return ix86_vec_cost (mode, cost->mulss, true);
40648 else
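/* Scalar integer multiply: charge the setup cost plus a per-bit cost for
an assumed multiplier of about 7 bits, matching the arbitrary default
used by ix86_rtx_costs when the multiplier is unknown. */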
40649 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
40652 /* Return cost of division in MODE. */
40654 static int
40655 ix86_division_cost (const struct processor_costs *cost,
40656 enum machine_mode mode)
40658 machine_mode inner_mode = mode;
40659 if (VECTOR_MODE_P (mode))
40660 inner_mode = GET_MODE_INNER (mode);
40662 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40663 return inner_mode == DFmode ? cost->divsd : cost->divss;
40664 else if (X87_FLOAT_MODE_P (mode))
40665 return cost->fdiv;
40666 else if (FLOAT_MODE_P (mode))
40667 return ix86_vec_cost (mode,
40668 inner_mode == DFmode ? cost->divsd : cost->divss,
40669 true);
40670 else
40671 return cost->divide[MODE_INDEX (mode)];
40674 /* Return cost of shift in MODE.
40675 If CONSTANT_OP1 is true, the op1 value is known and given in OP1_VAL.
40676 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40677 SHIFT_AND_TRUNCATE whether op1 is an AND wrapped in a SUBREG.
40679 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
40681 static int
40682 ix86_shift_rotate_cost (const struct processor_costs *cost,
40683 enum machine_mode mode, bool constant_op1,
40684 HOST_WIDE_INT op1_val,
40685 bool speed,
40686 bool and_in_op1,
40687 bool shift_and_truncate,
40688 bool *skip_op0, bool *skip_op1)
40690 if (skip_op0)
40691 *skip_op0 = *skip_op1 = false;
40692 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40694 /* V*QImode is emulated with 1-11 insns. */
40695 if (mode == V16QImode || mode == V32QImode)
40697 int count = 11;
40698 if (TARGET_XOP && mode == V16QImode)
40700 /* For XOP we use vpshab, which requires a broadcast of the
40701 shift value to the variable shift insn. For constants this
40702 means a V16QI constant in memory; even when we could do the
40703 shift with one insn, set the cost so that paddb is preferred. */
40704 if (constant_op1)
40706 if (skip_op1)
40707 *skip_op1 = true;
40708 return ix86_vec_cost (mode,
40709 cost->sse_op
40710 + (speed
40712 : COSTS_N_BYTES
40713 (GET_MODE_UNIT_SIZE (mode))), true);
40715 count = 3;
40717 else if (TARGET_SSSE3)
40718 count = 7;
40719 return ix86_vec_cost (mode, cost->sse_op * count, true);
40721 else
40722 return ix86_vec_cost (mode, cost->sse_op, true);
40724 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
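/* Shifts wider than a word are emulated with multiple single-word shifts;
a variable count is cheaper when it has already been masked by an AND. */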
40726 if (constant_op1)
40728 if (op1_val > 32)
40729 return cost->shift_const + COSTS_N_INSNS (2);
40730 else
40731 return cost->shift_const * 2;
40733 else
40735 if (and_in_op1)
40736 return cost->shift_var * 2;
40737 else
40738 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40741 else
40743 if (constant_op1)
40744 return cost->shift_const;
40745 else if (shift_and_truncate)
40747 if (skip_op0)
40748 *skip_op0 = *skip_op1 = true;
40749 /* Return the cost after shift-and truncation. */
40750 return cost->shift_var;
40752 else
40753 return cost->shift_var;
40755 return cost->shift_const;
40758 /* Compute a (partial) cost for rtx X. Return true if the complete
40759 cost has been computed, and false if subexpressions should be
40760 scanned. In either case, *TOTAL contains the cost result. */
40762 static bool
40763 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40764 int *total, bool speed)
40766 rtx mask;
40767 enum rtx_code code = GET_CODE (x);
40768 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40769 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40770 int src_cost;
40772 switch (code)
40774 case SET:
40775 if (register_operand (SET_DEST (x), VOIDmode)
40776 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40778 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40779 return true;
40782 if (register_operand (SET_SRC (x), VOIDmode))
40783 /* Avoid potentially incorrect high cost from rtx_costs
40784 for non-tieable SUBREGs. */
40785 src_cost = 0;
40786 else
40788 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40790 if (CONSTANT_P (SET_SRC (x)))
40791 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40792 a small value, possibly zero for cheap constants. */
40793 src_cost += COSTS_N_INSNS (1);
40796 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40797 return true;
40799 case CONST_INT:
40800 case CONST:
40801 case LABEL_REF:
40802 case SYMBOL_REF:
40803 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40804 *total = 3;
40805 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40806 *total = 2;
40807 else if (flag_pic && SYMBOLIC_CONST (x)
40808 && !(TARGET_64BIT
40809 && (GET_CODE (x) == LABEL_REF
40810 || (GET_CODE (x) == SYMBOL_REF
40811 && SYMBOL_REF_LOCAL_P (x))))
40812 /* Use 0 cost for CONST to improve its propagation. */
40813 && (TARGET_64BIT || GET_CODE (x) != CONST))
40814 *total = 1;
40815 else
40816 *total = 0;
40817 return true;
40819 case CONST_DOUBLE:
40820 if (IS_STACK_MODE (mode))
40821 switch (standard_80387_constant_p (x))
40823 case -1:
40824 case 0:
40825 break;
40826 case 1: /* 0.0 */
40827 *total = 1;
40828 return true;
40829 default: /* Other constants */
40830 *total = 2;
40831 return true;
40833 /* FALLTHRU */
40835 case CONST_VECTOR:
40836 switch (standard_sse_constant_p (x, mode))
40838 case 0:
40839 break;
40840 case 1: /* 0: xor eliminates false dependency */
40841 *total = 0;
40842 return true;
40843 default: /* -1: cmp contains false dependency */
40844 *total = 1;
40845 return true;
40847 /* FALLTHRU */
40849 case CONST_WIDE_INT:
40850 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40851 it'll probably end up. Add a penalty for size. */
40852 *total = (COSTS_N_INSNS (1)
40853 + (!TARGET_64BIT && flag_pic)
40854 + (GET_MODE_SIZE (mode) <= 4
40855 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40856 return true;
40858 case ZERO_EXTEND:
40859 /* The zero extension is often completely free on x86_64, so make
40860 it as cheap as possible. */
40861 if (TARGET_64BIT && mode == DImode
40862 && GET_MODE (XEXP (x, 0)) == SImode)
40863 *total = 1;
40864 else if (TARGET_ZERO_EXTEND_WITH_AND)
40865 *total = cost->add;
40866 else
40867 *total = cost->movzx;
40868 return false;
40870 case SIGN_EXTEND:
40871 *total = cost->movsx;
40872 return false;
40874 case ASHIFT:
40875 if (SCALAR_INT_MODE_P (mode)
40876 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40877 && CONST_INT_P (XEXP (x, 1)))
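/* A left shift by 1 is costed as an add; shifts by 2 or 3 as an lea when
that is no more expensive than a real shift. */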
40879 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40880 if (value == 1)
40882 *total = cost->add;
40883 return false;
40885 if ((value == 2 || value == 3)
40886 && cost->lea <= cost->shift_const)
40888 *total = cost->lea;
40889 return false;
40892 /* FALLTHRU */
40894 case ROTATE:
40895 case ASHIFTRT:
40896 case LSHIFTRT:
40897 case ROTATERT:
40898 bool skip_op0, skip_op1;
40899 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40900 CONST_INT_P (XEXP (x, 1))
40901 ? INTVAL (XEXP (x, 1)) : -1,
40902 speed,
40903 GET_CODE (XEXP (x, 1)) == AND,
40904 SUBREG_P (XEXP (x, 1))
40905 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40906 &skip_op0, &skip_op1);
40907 if (skip_op0 || skip_op1)
40909 if (!skip_op0)
40910 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40911 if (!skip_op1)
40912 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40913 return true;
40915 return false;
40917 case FMA:
40919 rtx sub;
40921 gcc_assert (FLOAT_MODE_P (mode));
40922 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40924 *total = ix86_vec_cost (mode,
40925 mode == SFmode ? cost->fmass : cost->fmasd,
40926 true);
40927 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40929 /* A negation in op0 or op2 is free: FMS, FNMA, FNMS. */
40930 sub = XEXP (x, 0);
40931 if (GET_CODE (sub) == NEG)
40932 sub = XEXP (sub, 0);
40933 *total += rtx_cost (sub, mode, FMA, 0, speed);
40935 sub = XEXP (x, 2);
40936 if (GET_CODE (sub) == NEG)
40937 sub = XEXP (sub, 0);
40938 *total += rtx_cost (sub, mode, FMA, 2, speed);
40939 return true;
40942 case MULT:
40943 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40945 rtx op0 = XEXP (x, 0);
40946 rtx op1 = XEXP (x, 1);
40947 int nbits;
40948 if (CONST_INT_P (XEXP (x, 1)))
40950 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40951 for (nbits = 0; value != 0; value &= value - 1)
40952 nbits++;
40954 else
40955 /* This is arbitrary. */
40956 nbits = 7;
40958 /* Compute costs correctly for widening multiplication. */
40959 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40960 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40961 == GET_MODE_SIZE (mode))
40963 int is_mulwiden = 0;
40964 machine_mode inner_mode = GET_MODE (op0);
40966 if (GET_CODE (op0) == GET_CODE (op1))
40967 is_mulwiden = 1, op1 = XEXP (op1, 0);
40968 else if (CONST_INT_P (op1))
40970 if (GET_CODE (op0) == SIGN_EXTEND)
40971 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40972 == INTVAL (op1);
40973 else
40974 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40977 if (is_mulwiden)
40978 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40981 *total = (cost->mult_init[MODE_INDEX (mode)]
40982 + nbits * cost->mult_bit
40983 + rtx_cost (op0, mode, outer_code, opno, speed)
40984 + rtx_cost (op1, mode, outer_code, opno, speed));
40986 return true;
40988 *total = ix86_multiplication_cost (cost, mode);
40989 return false;
40991 case DIV:
40992 case UDIV:
40993 case MOD:
40994 case UMOD:
40995 *total = ix86_division_cost (cost, mode);
40996 return false;
40998 case PLUS:
40999 if (GET_MODE_CLASS (mode) == MODE_INT
41000 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
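/* Expressions of the form base + index*scale (+ displacement) with a scale
of 2, 4 or 8 match the lea addressing modes, so cost them as a single lea
plus the costs of the remaining operands. */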
41002 if (GET_CODE (XEXP (x, 0)) == PLUS
41003 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41004 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41005 && CONSTANT_P (XEXP (x, 1)))
41007 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41008 if (val == 2 || val == 4 || val == 8)
41010 *total = cost->lea;
41011 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41012 outer_code, opno, speed);
41013 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41014 outer_code, opno, speed);
41015 *total += rtx_cost (XEXP (x, 1), mode,
41016 outer_code, opno, speed);
41017 return true;
41020 else if (GET_CODE (XEXP (x, 0)) == MULT
41021 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41023 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41024 if (val == 2 || val == 4 || val == 8)
41026 *total = cost->lea;
41027 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41028 outer_code, opno, speed);
41029 *total += rtx_cost (XEXP (x, 1), mode,
41030 outer_code, opno, speed);
41031 return true;
41034 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41036 /* Add with carry, ignore the cost of adding a carry flag. */
41037 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41038 *total = cost->add;
41039 else
41041 *total = cost->lea;
41042 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41043 outer_code, opno, speed);
41046 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41047 outer_code, opno, speed);
41048 *total += rtx_cost (XEXP (x, 1), mode,
41049 outer_code, opno, speed);
41050 return true;
41053 /* FALLTHRU */
41055 case MINUS:
41056 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41057 if (GET_MODE_CLASS (mode) == MODE_INT
41058 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41059 && GET_CODE (XEXP (x, 0)) == MINUS
41060 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41062 *total = cost->add;
41063 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41064 outer_code, opno, speed);
41065 *total += rtx_cost (XEXP (x, 1), mode,
41066 outer_code, opno, speed);
41067 return true;
41070 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41072 *total = cost->addss;
41073 return false;
41075 else if (X87_FLOAT_MODE_P (mode))
41077 *total = cost->fadd;
41078 return false;
41080 else if (FLOAT_MODE_P (mode))
41082 *total = ix86_vec_cost (mode, cost->addss, true);
41083 return false;
41085 /* FALLTHRU */
41087 case AND:
41088 case IOR:
41089 case XOR:
41090 if (GET_MODE_CLASS (mode) == MODE_INT
41091 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41093 *total = (cost->add * 2
41094 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41095 << (GET_MODE (XEXP (x, 0)) != DImode))
41096 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41097 << (GET_MODE (XEXP (x, 1)) != DImode)));
41098 return true;
41100 /* FALLTHRU */
41102 case NEG:
41103 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41105 *total = cost->sse_op;
41106 return false;
41108 else if (X87_FLOAT_MODE_P (mode))
41110 *total = cost->fchs;
41111 return false;
41113 else if (FLOAT_MODE_P (mode))
41115 *total = ix86_vec_cost (mode, cost->sse_op, true);
41116 return false;
41118 /* FALLTHRU */
41120 case NOT:
41121 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41122 *total = ix86_vec_cost (mode, cost->sse_op, true);
41123 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41124 *total = cost->add * 2;
41125 else
41126 *total = cost->add;
41127 return false;
41129 case COMPARE:
41130 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41131 && XEXP (XEXP (x, 0), 1) == const1_rtx
41132 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41133 && XEXP (x, 1) == const0_rtx)
41135 /* This kind of construct is implemented using test[bwl].
41136 Treat it as if we had an AND. */
41137 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41138 *total = (cost->add
41139 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41140 opno, speed)
41141 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41142 return true;
41145 /* The embedded comparison operand is completely free. */
41146 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41147 && XEXP (x, 1) == const0_rtx)
41148 *total = 0;
41150 return false;
41152 case FLOAT_EXTEND:
41153 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41154 *total = 0;
41155 else
41156 *total = ix86_vec_cost (mode, cost->addss, true);
41157 return false;
41159 case FLOAT_TRUNCATE:
41160 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41161 *total = cost->fadd;
41162 else
41163 *total = ix86_vec_cost (mode, cost->addss, true);
41164 return false;
41166 case ABS:
41167 /* SSE requires a memory load for the constant operand. It may make
41168 sense to account for this. Of course the constant operand may or
41169 may not be reused. */
41170 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41171 *total = cost->sse_op;
41172 else if (X87_FLOAT_MODE_P (mode))
41173 *total = cost->fabs;
41174 else if (FLOAT_MODE_P (mode))
41175 *total = ix86_vec_cost (mode, cost->sse_op, true);
41176 return false;
41178 case SQRT:
41179 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41180 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
41181 else if (X87_FLOAT_MODE_P (mode))
41182 *total = cost->fsqrt;
41183 else if (FLOAT_MODE_P (mode))
41184 *total = ix86_vec_cost (mode,
41185 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
41186 true);
41187 return false;
41189 case UNSPEC:
41190 if (XINT (x, 1) == UNSPEC_TP)
41191 *total = 0;
41192 return false;
41194 case VEC_SELECT:
41195 case VEC_CONCAT:
41196 case VEC_DUPLICATE:
41197 /* ??? Assume all of these vector manipulation patterns are
41198 recognizable, in which case they all have pretty much the
41199 same cost. */
41200 *total = cost->sse_op;
41201 return true;
41202 case VEC_MERGE:
41203 mask = XEXP (x, 2);
41204 /* This is a masked instruction; assume the same cost
41205 as the non-masked variant. */
41206 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41207 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41208 else
41209 *total = cost->sse_op;
41210 return true;
41212 default:
41213 return false;
41217 #if TARGET_MACHO
41219 static int current_machopic_label_num;
41221 /* Given a symbol name and its associated stub, write out the
41222 definition of the stub. */
41224 void
41225 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41227 unsigned int length;
41228 char *binder_name, *symbol_name, lazy_ptr_name[32];
41229 int label = ++current_machopic_label_num;
41231 /* For 64-bit we shouldn't get here. */
41232 gcc_assert (!TARGET_64BIT);
41234 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41235 symb = targetm.strip_name_encoding (symb);
41237 length = strlen (stub);
41238 binder_name = XALLOCAVEC (char, length + 32);
41239 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41241 length = strlen (symb);
41242 symbol_name = XALLOCAVEC (char, length + 32);
41243 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41245 sprintf (lazy_ptr_name, "L%d$lz", label);
41247 if (MACHOPIC_ATT_STUB)
41248 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41249 else if (MACHOPIC_PURE)
41250 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41251 else
41252 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41254 fprintf (file, "%s:\n", stub);
41255 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41257 if (MACHOPIC_ATT_STUB)
41259 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41261 else if (MACHOPIC_PURE)
41263 /* PIC stub. */
41264 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41265 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41266 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41267 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41268 label, lazy_ptr_name, label);
41269 fprintf (file, "\tjmp\t*%%ecx\n");
41271 else
41272 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41274 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41275 it needs no stub-binding-helper. */
41276 if (MACHOPIC_ATT_STUB)
41277 return;
41279 fprintf (file, "%s:\n", binder_name);
41281 if (MACHOPIC_PURE)
41283 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41284 fprintf (file, "\tpushl\t%%ecx\n");
41286 else
41287 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41289 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41291 /* N.B. Keep the correspondence of these
41292 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41293 old-pic/new-pic/non-pic stubs; altering this will break
41294 compatibility with existing dylibs. */
41295 if (MACHOPIC_PURE)
41297 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41298 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41300 else
41301 /* 16-byte -mdynamic-no-pic stub. */
41302 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41304 fprintf (file, "%s:\n", lazy_ptr_name);
41305 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41306 fprintf (file, ASM_LONG "%s\n", binder_name);
41308 #endif /* TARGET_MACHO */
41310 /* Order the registers for the register allocator. */
41312 void
41313 x86_order_regs_for_local_alloc (void)
41315 int pos = 0;
41316 int i;
41318 /* First allocate the local general purpose registers. */
41319 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41320 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41321 reg_alloc_order [pos++] = i;
41323 /* Global general purpose registers. */
41324 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41325 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41326 reg_alloc_order [pos++] = i;
41328 /* x87 registers come first in case we are doing FP math
41329 using them. */
41330 if (!TARGET_SSE_MATH)
41331 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41332 reg_alloc_order [pos++] = i;
41334 /* SSE registers. */
41335 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41336 reg_alloc_order [pos++] = i;
41337 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41338 reg_alloc_order [pos++] = i;
41340 /* Extended REX SSE registers. */
41341 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41342 reg_alloc_order [pos++] = i;
41344 /* Mask registers. */
41345 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41346 reg_alloc_order [pos++] = i;
41348 /* MPX bound registers. */
41349 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41350 reg_alloc_order [pos++] = i;
41352 /* x87 registers. */
41353 if (TARGET_SSE_MATH)
41354 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41355 reg_alloc_order [pos++] = i;
41357 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41358 reg_alloc_order [pos++] = i;
41360 /* Initialize the rest of the array, as we do not allocate some registers
41361 at all. */
41362 while (pos < FIRST_PSEUDO_REGISTER)
41363 reg_alloc_order [pos++] = 0;
41366 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41367 in struct attribute_spec.handler. */
41368 static tree
41369 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41370 bool *no_add_attrs)
41372 if (TREE_CODE (*node) != FUNCTION_TYPE
41373 && TREE_CODE (*node) != METHOD_TYPE
41374 && TREE_CODE (*node) != FIELD_DECL
41375 && TREE_CODE (*node) != TYPE_DECL)
41377 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41378 name);
41379 *no_add_attrs = true;
41380 return NULL_TREE;
41382 if (TARGET_64BIT)
41384 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41385 name);
41386 *no_add_attrs = true;
41387 return NULL_TREE;
41389 if (is_attribute_p ("callee_pop_aggregate_return", name))
41391 tree cst;
41393 cst = TREE_VALUE (args);
41394 if (TREE_CODE (cst) != INTEGER_CST)
41396 warning (OPT_Wattributes,
41397 "%qE attribute requires an integer constant argument",
41398 name);
41399 *no_add_attrs = true;
41401 else if (compare_tree_int (cst, 0) != 0
41402 && compare_tree_int (cst, 1) != 0)
41404 warning (OPT_Wattributes,
41405 "argument to %qE attribute is neither zero, nor one",
41406 name);
41407 *no_add_attrs = true;
41410 return NULL_TREE;
41413 return NULL_TREE;
41416 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
41417 struct attribute_spec.handler. */
41418 static tree
41419 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41420 bool *no_add_attrs)
41422 if (TREE_CODE (*node) != FUNCTION_TYPE
41423 && TREE_CODE (*node) != METHOD_TYPE
41424 && TREE_CODE (*node) != FIELD_DECL
41425 && TREE_CODE (*node) != TYPE_DECL)
41427 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41428 name);
41429 *no_add_attrs = true;
41430 return NULL_TREE;
41433 /* Can combine regparm with all attributes but fastcall. */
41434 if (is_attribute_p ("ms_abi", name))
41436 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41438 error ("ms_abi and sysv_abi attributes are not compatible");
41441 return NULL_TREE;
41443 else if (is_attribute_p ("sysv_abi", name))
41445 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41447 error ("ms_abi and sysv_abi attributes are not compatible");
41450 return NULL_TREE;
41453 return NULL_TREE;
41456 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41457 struct attribute_spec.handler. */
41458 static tree
41459 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41460 bool *no_add_attrs)
41462 tree *type = NULL;
41463 if (DECL_P (*node))
41465 if (TREE_CODE (*node) == TYPE_DECL)
41466 type = &TREE_TYPE (*node);
41468 else
41469 type = node;
41471 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41473 warning (OPT_Wattributes, "%qE attribute ignored",
41474 name);
41475 *no_add_attrs = true;
41478 else if ((is_attribute_p ("ms_struct", name)
41479 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41480 || ((is_attribute_p ("gcc_struct", name)
41481 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41483 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41484 name);
41485 *no_add_attrs = true;
41488 return NULL_TREE;
41491 static tree
41492 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41493 bool *no_add_attrs)
41495 if (TREE_CODE (*node) != FUNCTION_DECL)
41497 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41498 name);
41499 *no_add_attrs = true;
41502 if (is_attribute_p ("indirect_branch", name))
41504 tree cst = TREE_VALUE (args);
41505 if (TREE_CODE (cst) != STRING_CST)
41507 warning (OPT_Wattributes,
41508 "%qE attribute requires a string constant argument",
41509 name);
41510 *no_add_attrs = true;
41512 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41513 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41514 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41515 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41517 warning (OPT_Wattributes,
41518 "argument to %qE attribute is not "
41519 "(keep|thunk|thunk-inline|thunk-extern)", name);
41520 *no_add_attrs = true;
41524 if (is_attribute_p ("function_return", name))
41526 tree cst = TREE_VALUE (args);
41527 if (TREE_CODE (cst) != STRING_CST)
41529 warning (OPT_Wattributes,
41530 "%qE attribute requires a string constant argument",
41531 name);
41532 *no_add_attrs = true;
41534 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41535 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41536 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41537 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41539 warning (OPT_Wattributes,
41540 "argument to %qE attribute is not "
41541 "(keep|thunk|thunk-inline|thunk-extern)", name);
41542 *no_add_attrs = true;
41546 return NULL_TREE;
41549 static tree
41550 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41551 int, bool *)
41553 return NULL_TREE;
41556 static tree
41557 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41559 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
41560 but the function type contains args and return type data. */
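/* The checks below enforce the expected handler signature: a void return,
a pointer to the interrupt frame as the first argument and, optionally, a
word-mode integer error code as the second. */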
41561 tree func_type = *node;
41562 tree return_type = TREE_TYPE (func_type);
41564 int nargs = 0;
41565 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41566 while (current_arg_type
41567 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41569 if (nargs == 0)
41571 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41572 error ("interrupt service routine should have a pointer "
41573 "as the first argument");
41575 else if (nargs == 1)
41577 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41578 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41579 error ("interrupt service routine should have unsigned %s"
41580 "int as the second argument",
41581 TARGET_64BIT
41582 ? (TARGET_X32 ? "long long " : "long ")
41583 : "");
41585 nargs++;
41586 current_arg_type = TREE_CHAIN (current_arg_type);
41588 if (!nargs || nargs > 2)
41589 error ("interrupt service routine can only have a pointer argument "
41590 "and an optional integer argument");
41591 if (! VOID_TYPE_P (return_type))
41592 error ("interrupt service routine can't have non-void return value");
41594 return NULL_TREE;
41597 static bool
41598 ix86_ms_bitfield_layout_p (const_tree record_type)
41600 return ((TARGET_MS_BITFIELD_LAYOUT
41601 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41602 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41605 /* Returns an expression indicating where the this parameter is
41606 located on entry to the FUNCTION. */
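/* On 64-bit targets this is simply the first integer parameter register
(or the second one when the return value is an aggregate returned via a
hidden pointer); on ia32 it depends on the calling convention and may be
either a register or a stack slot, as worked out below. */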
41608 static rtx
41609 x86_this_parameter (tree function)
41611 tree type = TREE_TYPE (function);
41612 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41613 int nregs;
41615 if (TARGET_64BIT)
41617 const int *parm_regs;
41619 if (ix86_function_type_abi (type) == MS_ABI)
41620 parm_regs = x86_64_ms_abi_int_parameter_registers;
41621 else
41622 parm_regs = x86_64_int_parameter_registers;
41623 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41626 nregs = ix86_function_regparm (type, function);
41628 if (nregs > 0 && !stdarg_p (type))
41630 int regno;
41631 unsigned int ccvt = ix86_get_callcvt (type);
41633 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41634 regno = aggr ? DX_REG : CX_REG;
41635 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41637 regno = CX_REG;
41638 if (aggr)
41639 return gen_rtx_MEM (SImode,
41640 plus_constant (Pmode, stack_pointer_rtx, 4));
41642 else
41644 regno = AX_REG;
41645 if (aggr)
41647 regno = DX_REG;
41648 if (nregs == 1)
41649 return gen_rtx_MEM (SImode,
41650 plus_constant (Pmode,
41651 stack_pointer_rtx, 4));
41654 return gen_rtx_REG (SImode, regno);
41657 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41658 aggr ? 8 : 4));
41661 /* Determine whether x86_output_mi_thunk can succeed. */
41663 static bool
41664 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41665 const_tree function)
41667 /* 64-bit can handle anything. */
41668 if (TARGET_64BIT)
41669 return true;
41671 /* For 32-bit, everything's fine if we have one free register. */
41672 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41673 return true;
41675 /* Need a free register for vcall_offset. */
41676 if (vcall_offset)
41677 return false;
41679 /* Need a free register for GOT references. */
41680 if (flag_pic && !targetm.binds_local_p (function))
41681 return false;
41683 /* Otherwise ok. */
41684 return true;
41687 /* Output the assembler code for a thunk function. THUNK_DECL is the
41688 declaration for the thunk function itself, FUNCTION is the decl for
41689 the target function. DELTA is an immediate constant offset to be
41690 added to THIS. If VCALL_OFFSET is nonzero, the word at
41691 *(*this + vcall_offset) should be added to THIS. */
41693 static void
41694 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41695 HOST_WIDE_INT vcall_offset, tree function)
41697 rtx this_param = x86_this_parameter (function);
41698 rtx this_reg, tmp, fnaddr;
41699 unsigned int tmp_regno;
41700 rtx_insn *insn;
41702 if (TARGET_64BIT)
41703 tmp_regno = R10_REG;
41704 else
41706 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41707 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41708 tmp_regno = AX_REG;
41709 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41710 tmp_regno = DX_REG;
41711 else
41712 tmp_regno = CX_REG;
41715 emit_note (NOTE_INSN_PROLOGUE_END);
41717 /* If CET branch tracking is enabled, insert an ENDBR instruction. */
41718 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
41719 emit_insn (gen_nop_endbr ());
41721 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41722 pull it in now and let DELTA benefit. */
41723 if (REG_P (this_param))
41724 this_reg = this_param;
41725 else if (vcall_offset)
41727 /* Put the this parameter into %eax. */
41728 this_reg = gen_rtx_REG (Pmode, AX_REG);
41729 emit_move_insn (this_reg, this_param);
41731 else
41732 this_reg = NULL_RTX;
41734 /* Adjust the this parameter by a fixed constant. */
41735 if (delta)
41737 rtx delta_rtx = GEN_INT (delta);
41738 rtx delta_dst = this_reg ? this_reg : this_param;
41740 if (TARGET_64BIT)
41742 if (!x86_64_general_operand (delta_rtx, Pmode))
41744 tmp = gen_rtx_REG (Pmode, tmp_regno);
41745 emit_move_insn (tmp, delta_rtx);
41746 delta_rtx = tmp;
41750 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41753 /* Adjust the this parameter by a value stored in the vtable. */
41754 if (vcall_offset)
41756 rtx vcall_addr, vcall_mem, this_mem;
41758 tmp = gen_rtx_REG (Pmode, tmp_regno);
41760 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41761 if (Pmode != ptr_mode)
41762 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41763 emit_move_insn (tmp, this_mem);
41765 /* Adjust the this parameter. */
41766 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41767 if (TARGET_64BIT
41768 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41770 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41771 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41772 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41775 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41776 if (Pmode != ptr_mode)
41777 emit_insn (gen_addsi_1_zext (this_reg,
41778 gen_rtx_REG (ptr_mode,
41779 REGNO (this_reg)),
41780 vcall_mem));
41781 else
41782 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41785 /* If necessary, drop THIS back to its stack slot. */
41786 if (this_reg && this_reg != this_param)
41787 emit_move_insn (this_param, this_reg);
41789 fnaddr = XEXP (DECL_RTL (function), 0);
41790 if (TARGET_64BIT)
41792 if (!flag_pic || targetm.binds_local_p (function)
41793 || TARGET_PECOFF)
41795 else
41797 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41798 tmp = gen_rtx_CONST (Pmode, tmp);
41799 fnaddr = gen_const_mem (Pmode, tmp);
41802 else
41804 if (!flag_pic || targetm.binds_local_p (function))
41806 #if TARGET_MACHO
41807 else if (TARGET_MACHO)
41809 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41810 fnaddr = XEXP (fnaddr, 0);
41812 #endif /* TARGET_MACHO */
41813 else
41815 tmp = gen_rtx_REG (Pmode, CX_REG);
41816 output_set_got (tmp, NULL_RTX);
41818 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41819 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41820 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41821 fnaddr = gen_const_mem (Pmode, fnaddr);
41825 /* Our sibling call patterns do not allow memories, because we have no
41826 predicate that can distinguish between frame and non-frame memory.
41827 For our purposes here, we can get away with (ab)using a jump pattern,
41828 because we're going to do no optimization. */
41829 if (MEM_P (fnaddr))
41831 if (sibcall_insn_operand (fnaddr, word_mode))
41833 fnaddr = XEXP (DECL_RTL (function), 0);
41834 tmp = gen_rtx_MEM (QImode, fnaddr);
41835 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41836 tmp = emit_call_insn (tmp);
41837 SIBLING_CALL_P (tmp) = 1;
41839 else
41840 emit_jump_insn (gen_indirect_jump (fnaddr));
41842 else
41844 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41846 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41847 // uninitialized. Since FUNCTION is local and calling it
41848 // doesn't go through the PLT, we use the scratch register %r11 as
41849 // the PIC register and initialize it here.
41850 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41851 ix86_init_large_pic_reg (tmp_regno);
41852 fnaddr = legitimize_pic_address (fnaddr,
41853 gen_rtx_REG (Pmode, tmp_regno));
41856 if (!sibcall_insn_operand (fnaddr, word_mode))
41858 tmp = gen_rtx_REG (word_mode, tmp_regno);
41859 if (GET_MODE (fnaddr) != word_mode)
41860 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41861 emit_move_insn (tmp, fnaddr);
41862 fnaddr = tmp;
41865 tmp = gen_rtx_MEM (QImode, fnaddr);
41866 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41867 tmp = emit_call_insn (tmp);
41868 SIBLING_CALL_P (tmp) = 1;
41870 emit_barrier ();
41872 /* Emit just enough of rest_of_compilation to get the insns emitted.
41873 Note that use_thunk calls assemble_start_function et al. */
41874 insn = get_insns ();
41875 shorten_branches (insn);
41876 final_start_function (insn, file, 1);
41877 final (insn, file, 1);
41878 final_end_function ();
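/* Editorial sketch (illustrative only, not emitted or compiled by GCC): a
   plain-C rendition of the adjustment the thunk above performs before
   tail-calling FUNCTION, assuming `long' has pointer width.  The constant
   DELTA is applied first, then the vtable-relative adjustment
   *(*this + vcall_offset) is added to the already-adjusted pointer.  */
static void *
mi_thunk_adjust_sketch (void *this_ptr, long delta, long vcall_offset)
{
  char *p = (char *) this_ptr + delta;             /* Fixed adjustment.  */
  if (vcall_offset)
    p += *(long *) (*(char **) p + vcall_offset);  /* Vtable adjustment.  */
  return p;                                        /* Then jump to FUNCTION.  */
}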
41881 static void
41882 x86_file_start (void)
41884 default_file_start ();
41885 if (TARGET_16BIT)
41886 fputs ("\t.code16gcc\n", asm_out_file);
41887 #if TARGET_MACHO
41888 darwin_file_start ();
41889 #endif
41890 if (X86_FILE_START_VERSION_DIRECTIVE)
41891 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41892 if (X86_FILE_START_FLTUSED)
41893 fputs ("\t.global\t__fltused\n", asm_out_file);
41894 if (ix86_asm_dialect == ASM_INTEL)
41895 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41898 int
41899 x86_field_alignment (tree type, int computed)
41901 machine_mode mode;
41903 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41904 return computed;
41905 if (TARGET_IAMCU)
41906 return iamcu_alignment (type, computed);
41907 mode = TYPE_MODE (strip_array_types (type));
41908 if (mode == DFmode || mode == DCmode
41909 || GET_MODE_CLASS (mode) == MODE_INT
41910 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41911 return MIN (32, computed);
41912 return computed;
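/* Editorial illustration (hypothetical example, not part of GCC): with -m32
   on typical ELF targets and without -malign-double, the function above caps
   the alignment of double (and 64-bit integer) fields at 32 bits, so the
   struct below occupies 4 + 8 = 12 bytes; with -m64 the member D is 8-byte
   aligned and the struct grows to 16 bytes.  */
struct field_align_example
{
  int i;        /* offset 0 in both cases.  */
  double d;     /* offset 4 with -m32, offset 8 with -m64.  */
};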
41915 /* Print call to TARGET to FILE. */
41917 static void
41918 x86_print_call_or_nop (FILE *file, const char *target)
41920 if (flag_nop_mcount)
41921 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41922 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41923 else
41924 fprintf (file, "1:\tcall\t%s\n", target);
41927 /* Output assembler code to FILE to increment profiler label # LABELNO
41928 for profiling a function entry. */
41929 void
41930 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41932 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41933 : MCOUNT_NAME);
41934 if (TARGET_64BIT)
41936 #ifndef NO_PROFILE_COUNTERS
41937 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41938 #endif
41940 if (!TARGET_PECOFF && flag_pic)
41941 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41942 else
41943 x86_print_call_or_nop (file, mcount_name);
41945 else if (flag_pic)
41947 #ifndef NO_PROFILE_COUNTERS
41948 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41949 LPREFIX, labelno);
41950 #endif
41951 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41953 else
41955 #ifndef NO_PROFILE_COUNTERS
41956 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41957 LPREFIX, labelno);
41958 #endif
41959 x86_print_call_or_nop (file, mcount_name);
41962 if (flag_record_mcount)
41964 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41965 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41966 fprintf (file, "\t.previous\n");
41970 /* We don't have exact information about the insn sizes, but we may assume
41971 quite safely that we are informed about all 1 byte insns and memory
41972 address sizes. This is enough to eliminate unnecessary padding in
41973 99% of cases. */
41975 static int
41976 ix86_min_insn_size (rtx_insn *insn)
41978 int l = 0, len;
41980 if (!INSN_P (insn) || !active_insn_p (insn))
41981 return 0;
41983 /* Discard alignments we've emitted and jump instructions. */
41984 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41985 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41986 return 0;
41988 /* Important case: calls are always 5 bytes.
41989 It is common to have many calls in a row. */
41990 if (CALL_P (insn)
41991 && symbolic_reference_mentioned_p (PATTERN (insn))
41992 && !SIBLING_CALL_P (insn))
41993 return 5;
41994 len = get_attr_length (insn);
41995 if (len <= 1)
41996 return 1;
41998 /* For normal instructions we rely on get_attr_length being exact,
41999 with a few exceptions. */
42000 if (!JUMP_P (insn))
42002 enum attr_type type = get_attr_type (insn);
42004 switch (type)
42006 case TYPE_MULTI:
42007 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42008 || asm_noperands (PATTERN (insn)) >= 0)
42009 return 0;
42010 break;
42011 case TYPE_OTHER:
42012 case TYPE_FCMP:
42013 break;
42014 default:
42015 /* Otherwise trust get_attr_length. */
42016 return len;
42019 l = get_attr_length_address (insn);
42020 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42021 l = 4;
42023 if (l)
42024 return 1+l;
42025 else
42026 return 2;
42029 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42031 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42032 window. */
42034 static void
42035 ix86_avoid_jump_mispredicts (void)
42037 rtx_insn *insn, *start = get_insns ();
42038 int nbytes = 0, njumps = 0;
42039 bool isjump = false;
42041 /* Look for all minimal intervals of instructions containing 4 jumps.
42042 The intervals are bounded by START and INSN.  NBYTES is the total
42043 size of the instructions in the interval, including INSN and not including
42044 START.  When NBYTES is smaller than 16 bytes, it is possible that the
42045 end of START and the end of INSN fall into the same 16-byte page.
42047 The smallest offset in the page at which INSN can start is when START
42048 ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
42049 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
42051 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
42052 have to, control transfer to its label(s) can be performed through other
42053 means, and we also estimate the minimum length of all asm stmts as 0. */
42054 for (insn = start; insn; insn = NEXT_INSN (insn))
42056 int min_size;
42058 if (LABEL_P (insn))
42060 int align = label_to_alignment (insn);
42061 int max_skip = label_to_max_skip (insn);
42063 if (max_skip > 15)
42064 max_skip = 15;
42065 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42066 already in the current 16 byte page, because otherwise
42067 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42068 bytes to reach 16 byte boundary. */
42069 if (align <= 0
42070 || (align <= 3 && max_skip != (1 << align) - 1))
42071 max_skip = 0;
42072 if (dump_file)
42073 fprintf (dump_file, "Label %i with max_skip %i\n",
42074 INSN_UID (insn), max_skip);
42075 if (max_skip)
42077 while (nbytes + max_skip >= 16)
42079 start = NEXT_INSN (start);
42080 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42081 || CALL_P (start))
42082 njumps--, isjump = true;
42083 else
42084 isjump = false;
42085 nbytes -= ix86_min_insn_size (start);
42088 continue;
42091 min_size = ix86_min_insn_size (insn);
42092 nbytes += min_size;
42093 if (dump_file)
42094 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42095 INSN_UID (insn), min_size);
42096 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42097 || CALL_P (insn))
42098 njumps++;
42099 else
42100 continue;
42102 while (njumps > 3)
42104 start = NEXT_INSN (start);
42105 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42106 || CALL_P (start))
42107 njumps--, isjump = true;
42108 else
42109 isjump = false;
42110 nbytes -= ix86_min_insn_size (start);
42112 gcc_assert (njumps >= 0);
42113 if (dump_file)
42114 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42115 INSN_UID (start), INSN_UID (insn), nbytes);
42117 if (njumps == 3 && isjump && nbytes < 16)
42119 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
42121 if (dump_file)
42122 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42123 INSN_UID (insn), padsize);
42124 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42128 #endif
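/* Editorial sketch (illustrative only, not GCC code) of the test performed by
   ix86_avoid_jump_mispredicts above, assuming hypothetical per-insn byte
   estimates SIZE[] and jump/call flags ISJUMP[]: when insn I is a jump and
   the smallest interval ending at I that contains four jumps spans fewer
   than 16 bytes (not counting the first jump of the interval), return the
   padding needed to keep the four jumps out of one 16-byte window.  */
static int
jump_window_pad_sketch (const int *size, const unsigned char *isjump, int i)
{
  int nbytes = 0, njumps, j;

  if (!isjump[i])
    return 0;

  njumps = 1;
  for (j = i - 1; j >= 0 && njumps < 4; j--)
    {
      nbytes += size[j + 1];        /* Bytes of every insn after insn J.  */
      if (isjump[j])
        njumps++;
    }

  if (njumps < 4 || nbytes >= 16)
    return 0;
  return 15 - nbytes + size[i];     /* Same padding amount as above.  */
}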
42130 /* AMD Athlon works faster
42131 when RET is not the destination of a conditional jump or directly preceded
42132 by another jump instruction.  We avoid the penalty by inserting a NOP just
42133 before the RET instructions in such cases. */
42134 static void
42135 ix86_pad_returns (void)
42137 edge e;
42138 edge_iterator ei;
42140 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42142 basic_block bb = e->src;
42143 rtx_insn *ret = BB_END (bb);
42144 rtx_insn *prev;
42145 bool replace = false;
42147 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42148 || optimize_bb_for_size_p (bb))
42149 continue;
42150 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42151 if (active_insn_p (prev) || LABEL_P (prev))
42152 break;
42153 if (prev && LABEL_P (prev))
42155 edge e;
42156 edge_iterator ei;
42158 FOR_EACH_EDGE (e, ei, bb->preds)
42159 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42160 && !(e->flags & EDGE_FALLTHRU))
42162 replace = true;
42163 break;
42166 if (!replace)
42168 prev = prev_active_insn (ret);
42169 if (prev
42170 && ((JUMP_P (prev) && any_condjump_p (prev))
42171 || CALL_P (prev)))
42172 replace = true;
42173 /* Empty functions get branch mispredict even when
42174 the jump destination is not visible to us. */
42175 if (!prev && !optimize_function_for_size_p (cfun))
42176 replace = true;
42178 if (replace)
42180 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42181 delete_insn (ret);
42186 /* Count the minimum number of instructions in BB. Return 4 if the
42187 number of instructions >= 4. */
42189 static int
42190 ix86_count_insn_bb (basic_block bb)
42192 rtx_insn *insn;
42193 int insn_count = 0;
42195 /* Count number of instructions in this block. Return 4 if the number
42196 of instructions >= 4. */
42197 FOR_BB_INSNS (bb, insn)
42199 /* Return jumps only happen in exit blocks. */
42200 if (JUMP_P (insn)
42201 && ANY_RETURN_P (PATTERN (insn)))
42202 break;
42204 if (NONDEBUG_INSN_P (insn)
42205 && GET_CODE (PATTERN (insn)) != USE
42206 && GET_CODE (PATTERN (insn)) != CLOBBER)
42208 insn_count++;
42209 if (insn_count >= 4)
42210 return insn_count;
42214 return insn_count;
42218 /* Count the minimum number of instructions in code path in BB.
42219 Return 4 if the number of instructions >= 4. */
42221 static int
42222 ix86_count_insn (basic_block bb)
42224 edge e;
42225 edge_iterator ei;
42226 int min_prev_count;
42228 /* Only bother counting instructions along paths with no
42229 more than 2 basic blocks between entry and exit. Given
42230 that BB has an edge to exit, determine if a predecessor
42231 of BB has an edge from entry. If so, compute the number
42232 of instructions in the predecessor block. If there
42233 happen to be multiple such blocks, compute the minimum. */
42234 min_prev_count = 4;
42235 FOR_EACH_EDGE (e, ei, bb->preds)
42237 edge prev_e;
42238 edge_iterator prev_ei;
42240 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42242 min_prev_count = 0;
42243 break;
42245 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42247 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42249 int count = ix86_count_insn_bb (e->src);
42250 if (count < min_prev_count)
42251 min_prev_count = count;
42252 break;
42257 if (min_prev_count < 4)
42258 min_prev_count += ix86_count_insn_bb (bb);
42260 return min_prev_count;
42263 /* Pad short function to 4 instructions. */
42265 static void
42266 ix86_pad_short_function (void)
42268 edge e;
42269 edge_iterator ei;
42271 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42273 rtx_insn *ret = BB_END (e->src);
42274 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42276 int insn_count = ix86_count_insn (e->src);
42278 /* Pad short function. */
42279 if (insn_count < 4)
42281 rtx_insn *insn = ret;
42283 /* Find epilogue. */
42284 while (insn
42285 && (!NOTE_P (insn)
42286 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42287 insn = PREV_INSN (insn);
42289 if (!insn)
42290 insn = ret;
42292 /* Two NOPs count as one instruction. */
42293 insn_count = 2 * (4 - insn_count);
42294 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42300 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42301 the epilogue, the Windows system unwinder will apply epilogue logic and
42302 produce incorrect offsets. This can be avoided by adding a nop between
42303 the last insn that can throw and the first insn of the epilogue. */
42305 static void
42306 ix86_seh_fixup_eh_fallthru (void)
42308 edge e;
42309 edge_iterator ei;
42311 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42313 rtx_insn *insn, *next;
42315 /* Find the beginning of the epilogue. */
42316 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42317 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42318 break;
42319 if (insn == NULL)
42320 continue;
42322 /* We only care about preceding insns that can throw. */
42323 insn = prev_active_insn (insn);
42324 if (insn == NULL || !can_throw_internal (insn))
42325 continue;
42327 /* Do not separate calls from their debug information. */
42328 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42329 if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
42330 insn = next;
42331 else
42332 break;
42334 emit_insn_after (gen_nops (const1_rtx), insn);
42338 /* Given a register number BASE, the lowest of a group of registers, update
42339 regsets IN and OUT with the registers that should be avoided in input
42340 and output operands respectively when trying to avoid generating a modr/m
42341 byte for -mmitigate-rop. */
42343 static void
42344 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42346 SET_HARD_REG_BIT (out, base);
42347 SET_HARD_REG_BIT (out, base + 1);
42348 SET_HARD_REG_BIT (in, base + 2);
42349 SET_HARD_REG_BIT (in, base + 3);
42352 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42353 that certain encodings of modr/m bytes do not occur. */
42354 static void
42355 ix86_mitigate_rop (void)
42357 HARD_REG_SET input_risky;
42358 HARD_REG_SET output_risky;
42359 HARD_REG_SET inout_risky;
42361 CLEAR_HARD_REG_SET (output_risky);
42362 CLEAR_HARD_REG_SET (input_risky);
42363 SET_HARD_REG_BIT (output_risky, AX_REG);
42364 SET_HARD_REG_BIT (output_risky, CX_REG);
42365 SET_HARD_REG_BIT (input_risky, BX_REG);
42366 SET_HARD_REG_BIT (input_risky, DX_REG);
42367 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42368 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42369 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42370 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42371 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42372 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42373 COPY_HARD_REG_SET (inout_risky, input_risky);
42374 IOR_HARD_REG_SET (inout_risky, output_risky);
42376 df_note_add_problem ();
42377 /* Fix up what stack-regs did. */
42378 df_insn_rescan_all ();
42379 df_analyze ();
42381 regrename_init (true);
42382 regrename_analyze (NULL);
42384 auto_vec<du_head_p> cands;
42386 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42388 if (!NONDEBUG_INSN_P (insn))
42389 continue;
42391 if (GET_CODE (PATTERN (insn)) == USE
42392 || GET_CODE (PATTERN (insn)) == CLOBBER)
42393 continue;
42395 extract_insn (insn);
42397 int opno0, opno1;
42398 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42399 recog_data.n_operands, &opno0,
42400 &opno1);
42402 if (!ix86_rop_should_change_byte_p (modrm))
42403 continue;
42405 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42407 /* This happens when regrename has to fail a block. */
42408 if (!info->op_info)
42409 continue;
42411 if (info->op_info[opno0].n_chains != 0)
42413 gcc_assert (info->op_info[opno0].n_chains == 1);
42414 du_head_p op0c;
42415 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42416 if (op0c->target_data_1 + op0c->target_data_2 == 0
42417 && !op0c->cannot_rename)
42418 cands.safe_push (op0c);
42420 op0c->target_data_1++;
42422 if (info->op_info[opno1].n_chains != 0)
42424 gcc_assert (info->op_info[opno1].n_chains == 1);
42425 du_head_p op1c;
42426 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42427 if (op1c->target_data_1 + op1c->target_data_2 == 0
42428 && !op1c->cannot_rename)
42429 cands.safe_push (op1c);
42431 op1c->target_data_2++;
42435 int i;
42436 du_head_p head;
42437 FOR_EACH_VEC_ELT (cands, i, head)
42439 int old_reg, best_reg;
42440 HARD_REG_SET unavailable;
42442 CLEAR_HARD_REG_SET (unavailable);
42443 if (head->target_data_1)
42444 IOR_HARD_REG_SET (unavailable, output_risky);
42445 if (head->target_data_2)
42446 IOR_HARD_REG_SET (unavailable, input_risky);
42448 int n_uses;
42449 reg_class superclass = regrename_find_superclass (head, &n_uses,
42450 &unavailable);
42451 old_reg = head->regno;
42452 best_reg = find_rename_reg (head, superclass, &unavailable,
42453 old_reg, false);
42454 bool ok = regrename_do_replace (head, best_reg);
42455 gcc_assert (ok);
42456 if (dump_file)
42457 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42458 reg_names[best_reg], reg_class_names[superclass]);
42462 regrename_finish ();
42464 df_analyze ();
42466 basic_block bb;
42467 regset_head live;
42469 INIT_REG_SET (&live);
42471 FOR_EACH_BB_FN (bb, cfun)
42473 rtx_insn *insn;
42475 COPY_REG_SET (&live, DF_LR_OUT (bb));
42476 df_simulate_initialize_backwards (bb, &live);
42478 FOR_BB_INSNS_REVERSE (bb, insn)
42480 if (!NONDEBUG_INSN_P (insn))
42481 continue;
42483 df_simulate_one_insn_backwards (bb, insn, &live);
42485 if (GET_CODE (PATTERN (insn)) == USE
42486 || GET_CODE (PATTERN (insn)) == CLOBBER)
42487 continue;
42489 extract_insn (insn);
42490 constrain_operands_cached (insn, reload_completed);
42491 int opno0, opno1;
42492 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42493 recog_data.n_operands, &opno0,
42494 &opno1);
42495 if (modrm < 0
42496 || !ix86_rop_should_change_byte_p (modrm)
42497 || opno0 == opno1)
42498 continue;
42500 rtx oldreg = recog_data.operand[opno1];
42501 preprocess_constraints (insn);
42502 const operand_alternative *alt = which_op_alt ();
42504 int i;
42505 for (i = 0; i < recog_data.n_operands; i++)
42506 if (i != opno1
42507 && alt[i].earlyclobber
42508 && reg_overlap_mentioned_p (recog_data.operand[i],
42509 oldreg))
42510 break;
42512 if (i < recog_data.n_operands)
42513 continue;
42515 if (dump_file)
42516 fprintf (dump_file,
42517 "attempting to fix modrm byte in insn %d:"
42518 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42519 reg_class_names[alt[opno1].cl]);
42521 HARD_REG_SET unavailable;
42522 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42523 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42524 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42525 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42526 IOR_HARD_REG_SET (unavailable, output_risky);
42527 IOR_COMPL_HARD_REG_SET (unavailable,
42528 reg_class_contents[alt[opno1].cl]);
42530 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42531 if (!TEST_HARD_REG_BIT (unavailable, i))
42532 break;
42533 if (i == FIRST_PSEUDO_REGISTER)
42535 if (dump_file)
42536 fprintf (dump_file, ", none available\n");
42537 continue;
42539 if (dump_file)
42540 fprintf (dump_file, " -> %d\n", i);
42541 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42542 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42543 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42548 /* Implement machine specific optimizations. We implement padding of returns
42549 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
42550 static void
42551 ix86_reorg (void)
42553 /* We are freeing block_for_insn in the toplev to keep compatibility
42554 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42555 compute_bb_for_insn ();
42557 if (flag_mitigate_rop)
42558 ix86_mitigate_rop ();
42560 if (TARGET_SEH && current_function_has_exception_handlers ())
42561 ix86_seh_fixup_eh_fallthru ();
42563 if (optimize && optimize_function_for_speed_p (cfun))
42565 if (TARGET_PAD_SHORT_FUNCTION)
42566 ix86_pad_short_function ();
42567 else if (TARGET_PAD_RETURNS)
42568 ix86_pad_returns ();
42569 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42570 if (TARGET_FOUR_JUMP_LIMIT)
42571 ix86_avoid_jump_mispredicts ();
42572 #endif
42576 /* Return nonzero when a QImode register that must be represented via a REX
42577 prefix is used. */
42578 bool
42579 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42581 int i;
42582 extract_insn_cached (insn);
42583 for (i = 0; i < recog_data.n_operands; i++)
42584 if (GENERAL_REG_P (recog_data.operand[i])
42585 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42586 return true;
42587 return false;
42590 /* Return true when INSN mentions a register that must be encoded using a REX
42591 prefix. */
42592 bool
42593 x86_extended_reg_mentioned_p (rtx insn)
42595 subrtx_iterator::array_type array;
42596 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42598 const_rtx x = *iter;
42599 if (REG_P (x)
42600 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42601 return true;
42603 return false;
42606 /* If profitable, negate (without causing overflow) integer constant
42607 of mode MODE at location LOC. Return true in this case. */
42608 bool
42609 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42611 HOST_WIDE_INT val;
42613 if (!CONST_INT_P (*loc))
42614 return false;
42616 switch (mode)
42618 case E_DImode:
42619 /* DImode x86_64 constants must fit in 32 bits. */
42620 gcc_assert (x86_64_immediate_operand (*loc, mode));
42622 mode = SImode;
42623 break;
42625 case E_SImode:
42626 case E_HImode:
42627 case E_QImode:
42628 break;
42630 default:
42631 gcc_unreachable ();
42634 /* Avoid overflows. */
42635 if (mode_signbit_p (mode, *loc))
42636 return false;
42638 val = INTVAL (*loc);
42640 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42641 Exception: -128 encodes smaller than 128, so negate 128 but leave -128 alone. */
42642 if ((val < 0 && val != -128)
42643 || val == 128)
42645 *loc = GEN_INT (-val);
42646 return true;
42649 return false;
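/* Editorial worked example (not part of GCC): when this function returns
   true the caller also swaps the operation, so "addl $-4, %eax" ends up as
   "subl $4, %eax".  The +-128 exception preserves the short imm8 encoding:
   -128 fits in a signed byte while +128 does not, so "addl $128, %eax" is
   rewritten as "subl $-128, %eax", whereas an existing -128 is never
   negated.  */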
42652 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42653 optabs would emit if we didn't have TFmode patterns. */
42655 void
42656 x86_emit_floatuns (rtx operands[2])
42658 rtx_code_label *neglab, *donelab;
42659 rtx i0, i1, f0, in, out;
42660 machine_mode mode, inmode;
42662 inmode = GET_MODE (operands[1]);
42663 gcc_assert (inmode == SImode || inmode == DImode);
42665 out = operands[0];
42666 in = force_reg (inmode, operands[1]);
42667 mode = GET_MODE (out);
42668 neglab = gen_label_rtx ();
42669 donelab = gen_label_rtx ();
42670 f0 = gen_reg_rtx (mode);
42672 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42674 expand_float (out, in, 0);
42676 emit_jump_insn (gen_jump (donelab));
42677 emit_barrier ();
42679 emit_label (neglab);
42681 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42682 1, OPTAB_DIRECT);
42683 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42684 1, OPTAB_DIRECT);
42685 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42687 expand_float (f0, i0, 0);
42689 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42691 emit_label (donelab);
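/* Editorial sketch (illustrative only, not GCC code): the same unsigned to
   floating-point trick in plain C, assuming a 64-bit input and a double
   result.  Values with the sign bit set are halved, with the discarded low
   bit folded back in so rounding is not biased, converted with a signed
   conversion, and then doubled.  */
static double
floatuns_sketch (unsigned long long x)
{
  if ((long long) x >= 0)
    return (double) (long long) x;              /* Plain signed conversion.  */

  unsigned long long half = (x >> 1) | (x & 1); /* Keep the sticky bit.  */
  double d = (double) (long long) half;
  return d + d;                                 /* Undo the halving.  */
}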
42694 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42695 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42696 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42697 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42699 /* Get a vector mode of the same size as the original but with elements
42700 twice as wide. This is only guaranteed to apply to integral vectors. */
42702 static inline machine_mode
42703 get_mode_wider_vector (machine_mode o)
42705 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42706 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42707 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42708 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42709 return n;
42712 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42713 fill target with val via vec_duplicate. */
42715 static bool
42716 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42718 bool ok;
42719 rtx_insn *insn;
42720 rtx dup;
42722 /* First attempt to recognize VAL as-is. */
42723 dup = gen_vec_duplicate (mode, val);
42724 insn = emit_insn (gen_rtx_SET (target, dup));
42725 if (recog_memoized (insn) < 0)
42727 rtx_insn *seq;
42728 machine_mode innermode = GET_MODE_INNER (mode);
42729 rtx reg;
42731 /* If that fails, force VAL into a register. */
42733 start_sequence ();
42734 reg = force_reg (innermode, val);
42735 if (GET_MODE (reg) != innermode)
42736 reg = gen_lowpart (innermode, reg);
42737 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42738 seq = get_insns ();
42739 end_sequence ();
42740 if (seq)
42741 emit_insn_before (seq, insn);
42743 ok = recog_memoized (insn) >= 0;
42744 gcc_assert (ok);
42746 return true;
42749 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42750 with all elements equal to VAR. Return true if successful. */
42752 static bool
42753 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42754 rtx target, rtx val)
42756 bool ok;
42758 switch (mode)
42760 case E_V2SImode:
42761 case E_V2SFmode:
42762 if (!mmx_ok)
42763 return false;
42764 /* FALLTHRU */
42766 case E_V4DFmode:
42767 case E_V4DImode:
42768 case E_V8SFmode:
42769 case E_V8SImode:
42770 case E_V2DFmode:
42771 case E_V2DImode:
42772 case E_V4SFmode:
42773 case E_V4SImode:
42774 case E_V16SImode:
42775 case E_V8DImode:
42776 case E_V16SFmode:
42777 case E_V8DFmode:
42778 return ix86_vector_duplicate_value (mode, target, val);
42780 case E_V4HImode:
42781 if (!mmx_ok)
42782 return false;
42783 if (TARGET_SSE || TARGET_3DNOW_A)
42785 rtx x;
42787 val = gen_lowpart (SImode, val);
42788 x = gen_rtx_TRUNCATE (HImode, val);
42789 x = gen_rtx_VEC_DUPLICATE (mode, x);
42790 emit_insn (gen_rtx_SET (target, x));
42791 return true;
42793 goto widen;
42795 case E_V8QImode:
42796 if (!mmx_ok)
42797 return false;
42798 goto widen;
42800 case E_V8HImode:
42801 if (TARGET_AVX2)
42802 return ix86_vector_duplicate_value (mode, target, val);
42804 if (TARGET_SSE2)
42806 struct expand_vec_perm_d dperm;
42807 rtx tmp1, tmp2;
42809 permute:
42810 memset (&dperm, 0, sizeof (dperm));
42811 dperm.target = target;
42812 dperm.vmode = mode;
42813 dperm.nelt = GET_MODE_NUNITS (mode);
42814 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42815 dperm.one_operand_p = true;
42817 /* Extend to SImode using a paradoxical SUBREG. */
42818 tmp1 = gen_reg_rtx (SImode);
42819 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42821 /* Insert the SImode value as low element of a V4SImode vector. */
42822 tmp2 = gen_reg_rtx (V4SImode);
42823 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42824 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42826 ok = (expand_vec_perm_1 (&dperm)
42827 || expand_vec_perm_broadcast_1 (&dperm));
42828 gcc_assert (ok);
42829 return ok;
42831 goto widen;
42833 case E_V16QImode:
42834 if (TARGET_AVX2)
42835 return ix86_vector_duplicate_value (mode, target, val);
42837 if (TARGET_SSE2)
42838 goto permute;
42839 goto widen;
42841 widen:
42842 /* Replicate the value once into the next wider mode and recurse. */
42844 machine_mode smode, wsmode, wvmode;
42845 rtx x;
42847 smode = GET_MODE_INNER (mode);
42848 wvmode = get_mode_wider_vector (mode);
42849 wsmode = GET_MODE_INNER (wvmode);
42851 val = convert_modes (wsmode, smode, val, true);
42852 x = expand_simple_binop (wsmode, ASHIFT, val,
42853 GEN_INT (GET_MODE_BITSIZE (smode)),
42854 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42855 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42857 x = gen_reg_rtx (wvmode);
42858 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42859 gcc_assert (ok);
42860 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42861 return ok;
42864 case E_V16HImode:
42865 case E_V32QImode:
42866 if (TARGET_AVX2)
42867 return ix86_vector_duplicate_value (mode, target, val);
42868 else
42870 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42871 rtx x = gen_reg_rtx (hvmode);
42873 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42874 gcc_assert (ok);
42876 x = gen_rtx_VEC_CONCAT (mode, x, x);
42877 emit_insn (gen_rtx_SET (target, x));
42879 return true;
42881 case E_V64QImode:
42882 case E_V32HImode:
42883 if (TARGET_AVX512BW)
42884 return ix86_vector_duplicate_value (mode, target, val);
42885 else
42887 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42888 rtx x = gen_reg_rtx (hvmode);
42890 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42891 gcc_assert (ok);
42893 x = gen_rtx_VEC_CONCAT (mode, x, x);
42894 emit_insn (gen_rtx_SET (target, x));
42896 return true;
42898 default:
42899 return false;
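/* Editorial sketch (illustrative only, not GCC code) of one `widen' step in
   the function above: a QImode value is replicated into the next wider
   scalar with a shift and an IOR, after which the routine recurses on the
   vector of wider elements, e.g. 0xab becomes 0xabab, and a V4HImode splat
   of 0xabab is then reinterpreted as a V8QImode splat of 0xab.  */
static unsigned short
widen_splat_step_sketch (unsigned char val)
{
  return (unsigned short) (((unsigned int) val << 8) | val); /* val | (val << bits).  */
}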
42903 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42904 whose ONE_VAR element is VAR, and other elements are zero. Return true
42905 if successful. */
42907 static bool
42908 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42909 rtx target, rtx var, int one_var)
42911 machine_mode vsimode;
42912 rtx new_target;
42913 rtx x, tmp;
42914 bool use_vector_set = false;
42915 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42917 switch (mode)
42919 case E_V2DImode:
42920 /* For SSE4.1, we normally use vector set. But if the second
42921 element is zero and inter-unit moves are OK, we use movq
42922 instead. */
42923 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42924 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42925 && one_var == 0));
42926 break;
42927 case E_V16QImode:
42928 case E_V4SImode:
42929 case E_V4SFmode:
42930 use_vector_set = TARGET_SSE4_1;
42931 break;
42932 case E_V8HImode:
42933 use_vector_set = TARGET_SSE2;
42934 break;
42935 case E_V4HImode:
42936 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42937 break;
42938 case E_V32QImode:
42939 case E_V16HImode:
42940 use_vector_set = TARGET_AVX;
42941 break;
42942 case E_V8SImode:
42943 use_vector_set = TARGET_AVX;
42944 gen_vec_set_0 = gen_vec_setv8si_0;
42945 break;
42946 case E_V8SFmode:
42947 use_vector_set = TARGET_AVX;
42948 gen_vec_set_0 = gen_vec_setv8sf_0;
42949 break;
42950 case E_V4DFmode:
42951 use_vector_set = TARGET_AVX;
42952 gen_vec_set_0 = gen_vec_setv4df_0;
42953 break;
42954 case E_V4DImode:
42955 /* Use ix86_expand_vector_set in 64bit mode only. */
42956 use_vector_set = TARGET_AVX && TARGET_64BIT;
42957 gen_vec_set_0 = gen_vec_setv4di_0;
42958 break;
42959 case E_V16SImode:
42960 use_vector_set = TARGET_AVX512F && one_var == 0;
42961 gen_vec_set_0 = gen_vec_setv16si_0;
42962 break;
42963 case E_V16SFmode:
42964 use_vector_set = TARGET_AVX512F && one_var == 0;
42965 gen_vec_set_0 = gen_vec_setv16sf_0;
42966 break;
42967 case E_V8DFmode:
42968 use_vector_set = TARGET_AVX512F && one_var == 0;
42969 gen_vec_set_0 = gen_vec_setv8df_0;
42970 break;
42971 case E_V8DImode:
42972 /* Use ix86_expand_vector_set in 64bit mode only. */
42973 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
42974 gen_vec_set_0 = gen_vec_setv8di_0;
42975 break;
42976 default:
42977 break;
42980 if (use_vector_set)
42982 if (gen_vec_set_0 && one_var == 0)
42984 var = force_reg (GET_MODE_INNER (mode), var);
42985 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
42986 return true;
42988 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42989 var = force_reg (GET_MODE_INNER (mode), var);
42990 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42991 return true;
42994 switch (mode)
42996 case E_V2SFmode:
42997 case E_V2SImode:
42998 if (!mmx_ok)
42999 return false;
43000 /* FALLTHRU */
43002 case E_V2DFmode:
43003 case E_V2DImode:
43004 if (one_var != 0)
43005 return false;
43006 var = force_reg (GET_MODE_INNER (mode), var);
43007 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43008 emit_insn (gen_rtx_SET (target, x));
43009 return true;
43011 case E_V4SFmode:
43012 case E_V4SImode:
43013 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43014 new_target = gen_reg_rtx (mode);
43015 else
43016 new_target = target;
43017 var = force_reg (GET_MODE_INNER (mode), var);
43018 x = gen_rtx_VEC_DUPLICATE (mode, var);
43019 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43020 emit_insn (gen_rtx_SET (new_target, x));
43021 if (one_var != 0)
43023 /* We need to shuffle the value to the correct position, so
43024 create a new pseudo to store the intermediate result. */
43026 /* With SSE2, we can use the integer shuffle insns. */
43027 if (mode != V4SFmode && TARGET_SSE2)
43029 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43030 const1_rtx,
43031 GEN_INT (one_var == 1 ? 0 : 1),
43032 GEN_INT (one_var == 2 ? 0 : 1),
43033 GEN_INT (one_var == 3 ? 0 : 1)));
43034 if (target != new_target)
43035 emit_move_insn (target, new_target);
43036 return true;
43039 /* Otherwise convert the intermediate result to V4SFmode and
43040 use the SSE1 shuffle instructions. */
43041 if (mode != V4SFmode)
43043 tmp = gen_reg_rtx (V4SFmode);
43044 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43046 else
43047 tmp = new_target;
43049 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43050 const1_rtx,
43051 GEN_INT (one_var == 1 ? 0 : 1),
43052 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43053 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43055 if (mode != V4SFmode)
43056 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43057 else if (tmp != target)
43058 emit_move_insn (target, tmp);
43060 else if (target != new_target)
43061 emit_move_insn (target, new_target);
43062 return true;
43064 case E_V8HImode:
43065 case E_V16QImode:
43066 vsimode = V4SImode;
43067 goto widen;
43068 case E_V4HImode:
43069 case E_V8QImode:
43070 if (!mmx_ok)
43071 return false;
43072 vsimode = V2SImode;
43073 goto widen;
43074 widen:
43075 if (one_var != 0)
43076 return false;
43078 /* Zero extend the variable element to SImode and recurse. */
43079 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43081 x = gen_reg_rtx (vsimode);
43082 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43083 var, one_var))
43084 gcc_unreachable ();
43086 emit_move_insn (target, gen_lowpart (mode, x));
43087 return true;
43089 default:
43090 return false;
43094 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43095 consisting of the values in VALS. It is known that all elements
43096 except ONE_VAR are constants. Return true if successful. */
43098 static bool
43099 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43100 rtx target, rtx vals, int one_var)
43102 rtx var = XVECEXP (vals, 0, one_var);
43103 machine_mode wmode;
43104 rtx const_vec, x;
43106 const_vec = copy_rtx (vals);
43107 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43108 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43110 switch (mode)
43112 case E_V2DFmode:
43113 case E_V2DImode:
43114 case E_V2SFmode:
43115 case E_V2SImode:
43116 /* For the two element vectors, it's just as easy to use
43117 the general case. */
43118 return false;
43120 case E_V4DImode:
43121 /* Use ix86_expand_vector_set in 64bit mode only. */
43122 if (!TARGET_64BIT)
43123 return false;
43124 /* FALLTHRU */
43125 case E_V4DFmode:
43126 case E_V8SFmode:
43127 case E_V8SImode:
43128 case E_V16HImode:
43129 case E_V32QImode:
43130 case E_V4SFmode:
43131 case E_V4SImode:
43132 case E_V8HImode:
43133 case E_V4HImode:
43134 break;
43136 case E_V16QImode:
43137 if (TARGET_SSE4_1)
43138 break;
43139 wmode = V8HImode;
43140 goto widen;
43141 case E_V8QImode:
43142 wmode = V4HImode;
43143 goto widen;
43144 widen:
43145 /* There's no way to set one QImode entry easily. Combine
43146 the variable value with its adjacent constant value, and
43147 promote to an HImode set. */
43148 x = XVECEXP (vals, 0, one_var ^ 1);
43149 if (one_var & 1)
43151 var = convert_modes (HImode, QImode, var, true);
43152 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43153 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43154 x = GEN_INT (INTVAL (x) & 0xff);
43156 else
43158 var = convert_modes (HImode, QImode, var, true);
43159 x = gen_int_mode (INTVAL (x) << 8, HImode);
43161 if (x != const0_rtx)
43162 var = expand_simple_binop (HImode, IOR, var, x, var,
43163 1, OPTAB_LIB_WIDEN);
43165 x = gen_reg_rtx (wmode);
43166 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43167 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43169 emit_move_insn (target, gen_lowpart (mode, x));
43170 return true;
43172 default:
43173 return false;
43176 emit_move_insn (target, const_vec);
43177 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43178 return true;
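/* Editorial sketch (illustrative only, not GCC code) of the widening case
   above: the variable QImode element and its adjacent constant neighbour are
   combined into one HImode value before a single HImode vector set.
   ONE_VAR_ODD is a hypothetical flag saying whether the variable byte has
   the odd (high, on little-endian x86) index of the pair.  */
static unsigned short
qi_pair_to_hi_sketch (unsigned char var, unsigned char neighbour, int one_var_odd)
{
  return one_var_odd
         ? (unsigned short) (((unsigned int) var << 8) | neighbour)  /* Variable is the high byte.  */
         : (unsigned short) (((unsigned int) neighbour << 8) | var); /* Variable is the low byte.  */
}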
43181 /* A subroutine of ix86_expand_vector_init_general. Use vector
43182 concatenate to handle the most general case: all values variable,
43183 and none identical. */
43185 static void
43186 ix86_expand_vector_init_concat (machine_mode mode,
43187 rtx target, rtx *ops, int n)
43189 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43190 rtx first[16], second[8], third[4];
43191 rtvec v;
43192 int i, j;
43194 switch (n)
43196 case 2:
43197 switch (mode)
43199 case E_V16SImode:
43200 cmode = V8SImode;
43201 break;
43202 case E_V16SFmode:
43203 cmode = V8SFmode;
43204 break;
43205 case E_V8DImode:
43206 cmode = V4DImode;
43207 break;
43208 case E_V8DFmode:
43209 cmode = V4DFmode;
43210 break;
43211 case E_V8SImode:
43212 cmode = V4SImode;
43213 break;
43214 case E_V8SFmode:
43215 cmode = V4SFmode;
43216 break;
43217 case E_V4DImode:
43218 cmode = V2DImode;
43219 break;
43220 case E_V4DFmode:
43221 cmode = V2DFmode;
43222 break;
43223 case E_V4SImode:
43224 cmode = V2SImode;
43225 break;
43226 case E_V4SFmode:
43227 cmode = V2SFmode;
43228 break;
43229 case E_V2DImode:
43230 cmode = DImode;
43231 break;
43232 case E_V2SImode:
43233 cmode = SImode;
43234 break;
43235 case E_V2DFmode:
43236 cmode = DFmode;
43237 break;
43238 case E_V2SFmode:
43239 cmode = SFmode;
43240 break;
43241 default:
43242 gcc_unreachable ();
43245 if (!register_operand (ops[1], cmode))
43246 ops[1] = force_reg (cmode, ops[1]);
43247 if (!register_operand (ops[0], cmode))
43248 ops[0] = force_reg (cmode, ops[0]);
43249 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43250 ops[1])));
43251 break;
43253 case 4:
43254 switch (mode)
43256 case E_V4DImode:
43257 cmode = V2DImode;
43258 break;
43259 case E_V4DFmode:
43260 cmode = V2DFmode;
43261 break;
43262 case E_V4SImode:
43263 cmode = V2SImode;
43264 break;
43265 case E_V4SFmode:
43266 cmode = V2SFmode;
43267 break;
43268 default:
43269 gcc_unreachable ();
43271 goto half;
43273 case 8:
43274 switch (mode)
43276 case E_V8DImode:
43277 cmode = V2DImode;
43278 hmode = V4DImode;
43279 break;
43280 case E_V8DFmode:
43281 cmode = V2DFmode;
43282 hmode = V4DFmode;
43283 break;
43284 case E_V8SImode:
43285 cmode = V2SImode;
43286 hmode = V4SImode;
43287 break;
43288 case E_V8SFmode:
43289 cmode = V2SFmode;
43290 hmode = V4SFmode;
43291 break;
43292 default:
43293 gcc_unreachable ();
43295 goto half;
43297 case 16:
43298 switch (mode)
43300 case E_V16SImode:
43301 cmode = V2SImode;
43302 hmode = V4SImode;
43303 gmode = V8SImode;
43304 break;
43305 case E_V16SFmode:
43306 cmode = V2SFmode;
43307 hmode = V4SFmode;
43308 gmode = V8SFmode;
43309 break;
43310 default:
43311 gcc_unreachable ();
43313 goto half;
43315 half:
43316 /* FIXME: We process inputs backward to help RA. PR 36222. */
43317 i = n - 1;
43318 j = (n >> 1) - 1;
43319 for (; i > 0; i -= 2, j--)
43321 first[j] = gen_reg_rtx (cmode);
43322 v = gen_rtvec (2, ops[i - 1], ops[i]);
43323 ix86_expand_vector_init (false, first[j],
43324 gen_rtx_PARALLEL (cmode, v));
43327 n >>= 1;
43328 if (n > 4)
43330 gcc_assert (hmode != VOIDmode);
43331 gcc_assert (gmode != VOIDmode);
43332 for (i = j = 0; i < n; i += 2, j++)
43334 second[j] = gen_reg_rtx (hmode);
43335 ix86_expand_vector_init_concat (hmode, second [j],
43336 &first [i], 2);
43338 n >>= 1;
43339 for (i = j = 0; i < n; i += 2, j++)
43341 third[j] = gen_reg_rtx (gmode);
43342 ix86_expand_vector_init_concat (gmode, third[j],
43343 &second[i], 2);
43345 n >>= 1;
43346 ix86_expand_vector_init_concat (mode, target, third, n);
43348 else if (n > 2)
43350 gcc_assert (hmode != VOIDmode);
43351 for (i = j = 0; i < n; i += 2, j++)
43353 second[j] = gen_reg_rtx (hmode);
43354 ix86_expand_vector_init_concat (hmode, second [j],
43355 &first [i], 2);
43357 n >>= 1;
43358 ix86_expand_vector_init_concat (mode, target, second, n);
43360 else
43361 ix86_expand_vector_init_concat (mode, target, first, n);
43362 break;
43364 default:
43365 gcc_unreachable ();
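/* Editorial illustration (not part of GCC) of the recursion shape above,
   e.g. for an 8-element V8SImode initializer:
     first[]  = { {a0,a1}, {a2,a3}, {a4,a5}, {a6,a7} }    (four V2SImode)
     second[] = { first[0]:first[1], first[2]:first[3] }  (two V4SImode)
     target   = second[0]:second[1]                       (V8SImode)
   where ':' denotes a VEC_CONCAT; inputs are processed backwards purely to
   help the register allocator.  */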
43369 /* A subroutine of ix86_expand_vector_init_general. Use vector
43370 interleave to handle the most general case: all values variable,
43371 and none identical. */
43373 static void
43374 ix86_expand_vector_init_interleave (machine_mode mode,
43375 rtx target, rtx *ops, int n)
43377 machine_mode first_imode, second_imode, third_imode, inner_mode;
43378 int i, j;
43379 rtx op0, op1;
43380 rtx (*gen_load_even) (rtx, rtx, rtx);
43381 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43382 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43384 switch (mode)
43386 case E_V8HImode:
43387 gen_load_even = gen_vec_setv8hi;
43388 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43389 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43390 inner_mode = HImode;
43391 first_imode = V4SImode;
43392 second_imode = V2DImode;
43393 third_imode = VOIDmode;
43394 break;
43395 case E_V16QImode:
43396 gen_load_even = gen_vec_setv16qi;
43397 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43398 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43399 inner_mode = QImode;
43400 first_imode = V8HImode;
43401 second_imode = V4SImode;
43402 third_imode = V2DImode;
43403 break;
43404 default:
43405 gcc_unreachable ();
43408 for (i = 0; i < n; i++)
43410 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43411 op0 = gen_reg_rtx (SImode);
43412 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43414 /* Insert the SImode value as low element of V4SImode vector. */
43415 op1 = gen_reg_rtx (V4SImode);
43416 op0 = gen_rtx_VEC_MERGE (V4SImode,
43417 gen_rtx_VEC_DUPLICATE (V4SImode,
43418 op0),
43419 CONST0_RTX (V4SImode),
43420 const1_rtx);
43421 emit_insn (gen_rtx_SET (op1, op0));
43423 /* Cast the V4SImode vector back to a vector in the original mode. */
43424 op0 = gen_reg_rtx (mode);
43425 emit_move_insn (op0, gen_lowpart (mode, op1));
43427 /* Load even elements into the second position. */
43428 emit_insn (gen_load_even (op0,
43429 force_reg (inner_mode,
43430 ops [i + i + 1]),
43431 const1_rtx));
43433 /* Cast vector to FIRST_IMODE vector. */
43434 ops[i] = gen_reg_rtx (first_imode);
43435 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43438 /* Interleave low FIRST_IMODE vectors. */
43439 for (i = j = 0; i < n; i += 2, j++)
43441 op0 = gen_reg_rtx (first_imode);
43442 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43444 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43445 ops[j] = gen_reg_rtx (second_imode);
43446 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43449 /* Interleave low SECOND_IMODE vectors. */
43450 switch (second_imode)
43452 case E_V4SImode:
43453 for (i = j = 0; i < n / 2; i += 2, j++)
43455 op0 = gen_reg_rtx (second_imode);
43456 emit_insn (gen_interleave_second_low (op0, ops[i],
43457 ops[i + 1]));
43459 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43460 vector. */
43461 ops[j] = gen_reg_rtx (third_imode);
43462 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43464 second_imode = V2DImode;
43465 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43466 /* FALLTHRU */
43468 case E_V2DImode:
43469 op0 = gen_reg_rtx (second_imode);
43470 emit_insn (gen_interleave_second_low (op0, ops[0],
43471 ops[1]));
43473 /* Cast the SECOND_IMODE vector back to a vector in the original
43474 mode. */
43475 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43476 break;
43478 default:
43479 gcc_unreachable ();
43483 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43484 all values variable, and none identical. */
43486 static void
43487 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43488 rtx target, rtx vals)
43490 rtx ops[64], op0, op1, op2, op3, op4, op5;
43491 machine_mode half_mode = VOIDmode;
43492 machine_mode quarter_mode = VOIDmode;
43493 int n, i;
43495 switch (mode)
43497 case E_V2SFmode:
43498 case E_V2SImode:
43499 if (!mmx_ok && !TARGET_SSE)
43500 break;
43501 /* FALLTHRU */
43503 case E_V16SImode:
43504 case E_V16SFmode:
43505 case E_V8DFmode:
43506 case E_V8DImode:
43507 case E_V8SFmode:
43508 case E_V8SImode:
43509 case E_V4DFmode:
43510 case E_V4DImode:
43511 case E_V4SFmode:
43512 case E_V4SImode:
43513 case E_V2DFmode:
43514 case E_V2DImode:
43515 n = GET_MODE_NUNITS (mode);
43516 for (i = 0; i < n; i++)
43517 ops[i] = XVECEXP (vals, 0, i);
43518 ix86_expand_vector_init_concat (mode, target, ops, n);
43519 return;
43521 case E_V2TImode:
43522 for (i = 0; i < 2; i++)
43523 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43524 op0 = gen_reg_rtx (V4DImode);
43525 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43526 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43527 return;
43529 case E_V4TImode:
43530 for (i = 0; i < 4; i++)
43531 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43532 ops[4] = gen_reg_rtx (V4DImode);
43533 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43534 ops[5] = gen_reg_rtx (V4DImode);
43535 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43536 op0 = gen_reg_rtx (V8DImode);
43537 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43538 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43539 return;
43541 case E_V32QImode:
43542 half_mode = V16QImode;
43543 goto half;
43545 case E_V16HImode:
43546 half_mode = V8HImode;
43547 goto half;
43549 half:
43550 n = GET_MODE_NUNITS (mode);
43551 for (i = 0; i < n; i++)
43552 ops[i] = XVECEXP (vals, 0, i);
43553 op0 = gen_reg_rtx (half_mode);
43554 op1 = gen_reg_rtx (half_mode);
43555 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43556 n >> 2);
43557 ix86_expand_vector_init_interleave (half_mode, op1,
43558 &ops [n >> 1], n >> 2);
43559 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43560 return;
43562 case E_V64QImode:
43563 quarter_mode = V16QImode;
43564 half_mode = V32QImode;
43565 goto quarter;
43567 case E_V32HImode:
43568 quarter_mode = V8HImode;
43569 half_mode = V16HImode;
43570 goto quarter;
43572 quarter:
43573 n = GET_MODE_NUNITS (mode);
43574 for (i = 0; i < n; i++)
43575 ops[i] = XVECEXP (vals, 0, i);
43576 op0 = gen_reg_rtx (quarter_mode);
43577 op1 = gen_reg_rtx (quarter_mode);
43578 op2 = gen_reg_rtx (quarter_mode);
43579 op3 = gen_reg_rtx (quarter_mode);
43580 op4 = gen_reg_rtx (half_mode);
43581 op5 = gen_reg_rtx (half_mode);
43582 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43583 n >> 3);
43584 ix86_expand_vector_init_interleave (quarter_mode, op1,
43585 &ops [n >> 2], n >> 3);
43586 ix86_expand_vector_init_interleave (quarter_mode, op2,
43587 &ops [n >> 1], n >> 3);
43588 ix86_expand_vector_init_interleave (quarter_mode, op3,
43589 &ops [(n >> 1) | (n >> 2)], n >> 3);
43590 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43591 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43592 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43593 return;
43595 case E_V16QImode:
43596 if (!TARGET_SSE4_1)
43597 break;
43598 /* FALLTHRU */
43600 case E_V8HImode:
43601 if (!TARGET_SSE2)
43602 break;
43604 /* Don't use ix86_expand_vector_init_interleave if we can't
43605 move from GPR to SSE register directly. */
43606 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43607 break;
43609 n = GET_MODE_NUNITS (mode);
43610 for (i = 0; i < n; i++)
43611 ops[i] = XVECEXP (vals, 0, i);
43612 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43613 return;
43615 case E_V4HImode:
43616 case E_V8QImode:
43617 break;
43619 default:
43620 gcc_unreachable ();
43624 int i, j, n_elts, n_words, n_elt_per_word;
43625 machine_mode inner_mode;
43626 rtx words[4], shift;
43628 inner_mode = GET_MODE_INNER (mode);
43629 n_elts = GET_MODE_NUNITS (mode);
43630 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43631 n_elt_per_word = n_elts / n_words;
43632 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43634 for (i = 0; i < n_words; ++i)
43636 rtx word = NULL_RTX;
43638 for (j = 0; j < n_elt_per_word; ++j)
43640 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43641 elt = convert_modes (word_mode, inner_mode, elt, true);
43643 if (j == 0)
43644 word = elt;
43645 else
43647 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43648 word, 1, OPTAB_LIB_WIDEN);
43649 word = expand_simple_binop (word_mode, IOR, word, elt,
43650 word, 1, OPTAB_LIB_WIDEN);
43654 words[i] = word;
43657 if (n_words == 1)
43658 emit_move_insn (target, gen_lowpart (mode, words[0]));
43659 else if (n_words == 2)
43661 rtx tmp = gen_reg_rtx (mode);
43662 emit_clobber (tmp);
43663 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43664 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43665 emit_move_insn (target, tmp);
43667 else if (n_words == 4)
43669 rtx tmp = gen_reg_rtx (V4SImode);
43670 gcc_assert (word_mode == SImode);
43671 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43672 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43673 emit_move_insn (target, gen_lowpart (mode, tmp));
43675 else
43676 gcc_unreachable ();
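/* Editorial sketch (illustrative only, not GCC code) of the word-packing
   fallback above, for a V4HImode initializer on a 32-bit word target: each
   SImode word is built from its two elements, starting with the
   higher-indexed one, by a shift and an IOR, so the lower-indexed element
   lands in the low bits as little-endian layout requires.  */
static unsigned int
pack_two_hi_sketch (unsigned short lo_elt, unsigned short hi_elt)
{
  unsigned int word = hi_elt;          /* Element with the higher index first.  */
  word = (word << 16) | lo_elt;        /* Shift, then IOR in the lower one.  */
  return word;
}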
43680 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43681 instructions unless MMX_OK is true. */
43683 void
43684 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43686 machine_mode mode = GET_MODE (target);
43687 machine_mode inner_mode = GET_MODE_INNER (mode);
43688 int n_elts = GET_MODE_NUNITS (mode);
43689 int n_var = 0, one_var = -1;
43690 bool all_same = true, all_const_zero = true;
43691 int i;
43692 rtx x;
43694 /* First, handle initialization from vector elts. */
43695 if (n_elts != XVECLEN (vals, 0))
43697 rtx subtarget = target;
43698 x = XVECEXP (vals, 0, 0);
43699 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43700 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43702 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43703 if (inner_mode == QImode || inner_mode == HImode)
43705 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43706 mode = mode_for_vector (SImode, n_bits / 4).require ();
43707 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43708 ops[0] = gen_lowpart (inner_mode, ops[0]);
43709 ops[1] = gen_lowpart (inner_mode, ops[1]);
43710 subtarget = gen_reg_rtx (mode);
43712 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43713 if (subtarget != target)
43714 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43715 return;
43717 gcc_unreachable ();
43720 for (i = 0; i < n_elts; ++i)
43722 x = XVECEXP (vals, 0, i);
43723 if (!(CONST_SCALAR_INT_P (x)
43724 || CONST_DOUBLE_P (x)
43725 || CONST_FIXED_P (x)))
43726 n_var++, one_var = i;
43727 else if (x != CONST0_RTX (inner_mode))
43728 all_const_zero = false;
43729 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43730 all_same = false;
43733 /* Constants are best loaded from the constant pool. */
43734 if (n_var == 0)
43736 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43737 return;
43740 /* If all values are identical, broadcast the value. */
43741 if (all_same
43742 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43743 XVECEXP (vals, 0, 0)))
43744 return;
43746 /* Values where only one field is non-constant are best loaded from
43747 the pool and overwritten via move later. */
43748 if (n_var == 1)
43750 if (all_const_zero
43751 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43752 XVECEXP (vals, 0, one_var),
43753 one_var))
43754 return;
43756 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43757 return;
43760 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43763 void
43764 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43766 machine_mode mode = GET_MODE (target);
43767 machine_mode inner_mode = GET_MODE_INNER (mode);
43768 machine_mode half_mode;
43769 bool use_vec_merge = false;
43770 rtx tmp;
43771 static rtx (*gen_extract[6][2]) (rtx, rtx)
43773 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43774 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43775 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43776 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43777 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43778 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43780 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43782 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43783 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43784 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43785 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43786 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43787 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43789 int i, j, n;
43790 machine_mode mmode = VOIDmode;
43791 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43793 switch (mode)
43795 case E_V2SFmode:
43796 case E_V2SImode:
43797 if (mmx_ok)
43799 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43800 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43801 if (elt == 0)
43802 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43803 else
43804 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43805 emit_insn (gen_rtx_SET (target, tmp));
43806 return;
43808 break;
43810 case E_V2DImode:
43811 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43812 if (use_vec_merge)
43813 break;
43815 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43816 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43817 if (elt == 0)
43818 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43819 else
43820 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43821 emit_insn (gen_rtx_SET (target, tmp));
43822 return;
43824 case E_V2DFmode:
43826 rtx op0, op1;
43828 /* For the two element vectors, we implement a VEC_CONCAT with
43829 the extraction of the other element. */
43831 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43832 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43834 if (elt == 0)
43835 op0 = val, op1 = tmp;
43836 else
43837 op0 = tmp, op1 = val;
43839 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43840 emit_insn (gen_rtx_SET (target, tmp));
43842 return;
43844 case E_V4SFmode:
43845 use_vec_merge = TARGET_SSE4_1;
43846 if (use_vec_merge)
43847 break;
43849 switch (elt)
43851 case 0:
43852 use_vec_merge = true;
43853 break;
43855 case 1:
43856 /* tmp = target = A B C D */
43857 tmp = copy_to_reg (target);
43858 /* target = A A B B */
43859 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43860 /* target = X A B B */
43861 ix86_expand_vector_set (false, target, val, 0);
43862 /* target = A X C D */
43863 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43864 const1_rtx, const0_rtx,
43865 GEN_INT (2+4), GEN_INT (3+4)));
43866 return;
43868 case 2:
43869 /* tmp = target = A B C D */
43870 tmp = copy_to_reg (target);
43871 /* tmp = X B C D */
43872 ix86_expand_vector_set (false, tmp, val, 0);
43873 /* target = A B X D */
43874 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43875 const0_rtx, const1_rtx,
43876 GEN_INT (0+4), GEN_INT (3+4)));
43877 return;
43879 case 3:
43880 /* tmp = target = A B C D */
43881 tmp = copy_to_reg (target);
43882 /* tmp = X B C D */
43883 ix86_expand_vector_set (false, tmp, val, 0);
43884 /* target = A B C X */
43885 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43886 const0_rtx, const1_rtx,
43887 GEN_INT (2+4), GEN_INT (0+4)));
43888 return;
43890 default:
43891 gcc_unreachable ();
43893 break;
43895 case E_V4SImode:
43896 use_vec_merge = TARGET_SSE4_1;
43897 if (use_vec_merge)
43898 break;
43900 /* Element 0 handled by vec_merge below. */
43901 if (elt == 0)
43903 use_vec_merge = true;
43904 break;
43907 if (TARGET_SSE2)
43909 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43910 store into element 0, then shuffle them back. */
43912 rtx order[4];
43914 order[0] = GEN_INT (elt);
43915 order[1] = const1_rtx;
43916 order[2] = const2_rtx;
43917 order[3] = GEN_INT (3);
43918 order[elt] = const0_rtx;
43920 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43921 order[1], order[2], order[3]));
43923 ix86_expand_vector_set (false, target, val, 0);
43925 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43926 order[1], order[2], order[3]));
43928 else
43930 /* For SSE1, we have to reuse the V4SF code. */
43931 rtx t = gen_reg_rtx (V4SFmode);
43932 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43933 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43934 emit_move_insn (target, gen_lowpart (mode, t));
43936 return;
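/* Illustrative sketch of the SSE2 path in the V4SImode case above (not
   generated code): setting element ELT swaps lanes 0 and ELT, inserts
   into lane 0, then swaps back.  For ELT == 2:

     v = { A, B, C, D }
     v = pshufd (v, 2, 1, 0, 3)       ->  { C, B, A, D }
     v[0] = X                         ->  { X, B, A, D }
     v = pshufd (v, 2, 1, 0, 3)       ->  { A, B, X, D }

   The swap of lanes 0 and ELT is its own inverse, which is why the
   same ORDER selectors are reused for both pshufd insns.  */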
43938 case E_V8HImode:
43939 use_vec_merge = TARGET_SSE2;
43940 break;
43941 case E_V4HImode:
43942 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43943 break;
43945 case E_V16QImode:
43946 use_vec_merge = TARGET_SSE4_1;
43947 break;
43949 case E_V8QImode:
43950 break;
43952 case E_V32QImode:
43953 half_mode = V16QImode;
43954 j = 0;
43955 n = 16;
43956 goto half;
43958 case E_V16HImode:
43959 half_mode = V8HImode;
43960 j = 1;
43961 n = 8;
43962 goto half;
43964 case E_V8SImode:
43965 half_mode = V4SImode;
43966 j = 2;
43967 n = 4;
43968 goto half;
43970 case E_V4DImode:
43971 half_mode = V2DImode;
43972 j = 3;
43973 n = 2;
43974 goto half;
43976 case E_V8SFmode:
43977 half_mode = V4SFmode;
43978 j = 4;
43979 n = 4;
43980 goto half;
43982 case E_V4DFmode:
43983 half_mode = V2DFmode;
43984 j = 5;
43985 n = 2;
43986 goto half;
43988 half:
43989 /* Compute offset. */
43990 i = elt / n;
43991 elt %= n;
43993 gcc_assert (i <= 1);
43995 /* Extract the half. */
43996 tmp = gen_reg_rtx (half_mode);
43997 emit_insn (gen_extract[j][i] (tmp, target));
43999 /* Put val in tmp at elt. */
44000 ix86_expand_vector_set (false, tmp, val, elt);
44002 /* Put it back. */
44003 emit_insn (gen_insert[j][i] (target, target, tmp));
44004 return;
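/* For the 256-bit modes above, the element insertion is done on one
   128-bit half; schematically (illustrative only, with N elements per
   half):

     half = (elt / n == 0) ? extract_lo (target) : extract_hi (target)
     ix86_expand_vector_set (half, val, elt % n)
     target = half inserted back into the lane it came from

   so the recursion always bottoms out in one of the 128-bit cases.  */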
44006 case E_V8DFmode:
44007 if (TARGET_AVX512F)
44009 mmode = QImode;
44010 gen_blendm = gen_avx512f_blendmv8df;
44012 break;
44014 case E_V8DImode:
44015 if (TARGET_AVX512F)
44017 mmode = QImode;
44018 gen_blendm = gen_avx512f_blendmv8di;
44020 break;
44022 case E_V16SFmode:
44023 if (TARGET_AVX512F)
44025 mmode = HImode;
44026 gen_blendm = gen_avx512f_blendmv16sf;
44028 break;
44030 case E_V16SImode:
44031 if (TARGET_AVX512F)
44033 mmode = HImode;
44034 gen_blendm = gen_avx512f_blendmv16si;
44036 break;
44038 case E_V32HImode:
44039 if (TARGET_AVX512F && TARGET_AVX512BW)
44041 mmode = SImode;
44042 gen_blendm = gen_avx512bw_blendmv32hi;
44044 break;
44046 case E_V64QImode:
44047 if (TARGET_AVX512F && TARGET_AVX512BW)
44049 mmode = DImode;
44050 gen_blendm = gen_avx512bw_blendmv64qi;
44052 break;
44054 default:
44055 break;
44058 if (mmode != VOIDmode)
44060 tmp = gen_reg_rtx (mode);
44061 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44062 /* The avx512*_blendm<mode> expanders have a different operand order
44063 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44064 elements where the mask is set and the second input operand otherwise;
44065 in {sse,avx}*_*blend* the first input operand is used for elements
44066 where the mask is clear and the second input operand otherwise. */
44067 emit_insn (gen_blendm (target, target, tmp,
44068 force_reg (mmode,
44069 gen_int_mode (1 << elt, mmode))));
44071 else if (use_vec_merge)
44073 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44074 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44075 emit_insn (gen_rtx_SET (target, tmp));
44077 else
44079 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44081 emit_move_insn (mem, target);
44083 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44084 emit_move_insn (tmp, val);
44086 emit_move_insn (target, mem);
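/* The final fallback above goes through memory.  In pseudo-C it is
   roughly:

     union { VECTOR v; ELEMENT e[NELTS]; } u;
     u.v = target;
     u.e[elt] = val;
     target = u.v;

   i.e. spill the vector to a stack temporary, overwrite one element
   with a scalar store, and reload the whole vector.  */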
44090 void
44091 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44093 machine_mode mode = GET_MODE (vec);
44094 machine_mode inner_mode = GET_MODE_INNER (mode);
44095 bool use_vec_extr = false;
44096 rtx tmp;
44098 switch (mode)
44100 case E_V2SImode:
44101 case E_V2SFmode:
44102 if (!mmx_ok)
44103 break;
44104 /* FALLTHRU */
44106 case E_V2DFmode:
44107 case E_V2DImode:
44108 case E_V2TImode:
44109 case E_V4TImode:
44110 use_vec_extr = true;
44111 break;
44113 case E_V4SFmode:
44114 use_vec_extr = TARGET_SSE4_1;
44115 if (use_vec_extr)
44116 break;
44118 switch (elt)
44120 case 0:
44121 tmp = vec;
44122 break;
44124 case 1:
44125 case 3:
44126 tmp = gen_reg_rtx (mode);
44127 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44128 GEN_INT (elt), GEN_INT (elt),
44129 GEN_INT (elt+4), GEN_INT (elt+4)));
44130 break;
44132 case 2:
44133 tmp = gen_reg_rtx (mode);
44134 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44135 break;
44137 default:
44138 gcc_unreachable ();
44140 vec = tmp;
44141 use_vec_extr = true;
44142 elt = 0;
44143 break;
44145 case E_V4SImode:
44146 use_vec_extr = TARGET_SSE4_1;
44147 if (use_vec_extr)
44148 break;
44150 if (TARGET_SSE2)
44152 switch (elt)
44154 case 0:
44155 tmp = vec;
44156 break;
44158 case 1:
44159 case 3:
44160 tmp = gen_reg_rtx (mode);
44161 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44162 GEN_INT (elt), GEN_INT (elt),
44163 GEN_INT (elt), GEN_INT (elt)));
44164 break;
44166 case 2:
44167 tmp = gen_reg_rtx (mode);
44168 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44169 break;
44171 default:
44172 gcc_unreachable ();
44174 vec = tmp;
44175 use_vec_extr = true;
44176 elt = 0;
44178 else
44180 /* For SSE1, we have to reuse the V4SF code. */
44181 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44182 gen_lowpart (V4SFmode, vec), elt);
44183 return;
44185 break;
44187 case E_V8HImode:
44188 use_vec_extr = TARGET_SSE2;
44189 break;
44190 case E_V4HImode:
44191 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44192 break;
44194 case E_V16QImode:
44195 use_vec_extr = TARGET_SSE4_1;
44196 break;
44198 case E_V8SFmode:
44199 if (TARGET_AVX)
44201 tmp = gen_reg_rtx (V4SFmode);
44202 if (elt < 4)
44203 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44204 else
44205 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44206 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44207 return;
44209 break;
44211 case E_V4DFmode:
44212 if (TARGET_AVX)
44214 tmp = gen_reg_rtx (V2DFmode);
44215 if (elt < 2)
44216 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44217 else
44218 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44219 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44220 return;
44222 break;
44224 case E_V32QImode:
44225 if (TARGET_AVX)
44227 tmp = gen_reg_rtx (V16QImode);
44228 if (elt < 16)
44229 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44230 else
44231 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44232 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44233 return;
44235 break;
44237 case E_V16HImode:
44238 if (TARGET_AVX)
44240 tmp = gen_reg_rtx (V8HImode);
44241 if (elt < 8)
44242 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44243 else
44244 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44245 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44246 return;
44248 break;
44250 case E_V8SImode:
44251 if (TARGET_AVX)
44253 tmp = gen_reg_rtx (V4SImode);
44254 if (elt < 4)
44255 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44256 else
44257 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44258 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44259 return;
44261 break;
44263 case E_V4DImode:
44264 if (TARGET_AVX)
44266 tmp = gen_reg_rtx (V2DImode);
44267 if (elt < 2)
44268 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44269 else
44270 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44271 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44272 return;
44274 break;
44276 case E_V32HImode:
44277 if (TARGET_AVX512BW)
44279 tmp = gen_reg_rtx (V16HImode);
44280 if (elt < 16)
44281 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44282 else
44283 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44284 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44285 return;
44287 break;
44289 case E_V64QImode:
44290 if (TARGET_AVX512BW)
44292 tmp = gen_reg_rtx (V32QImode);
44293 if (elt < 32)
44294 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44295 else
44296 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44297 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44298 return;
44300 break;
44302 case E_V16SFmode:
44303 tmp = gen_reg_rtx (V8SFmode);
44304 if (elt < 8)
44305 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44306 else
44307 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44308 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44309 return;
44311 case E_V8DFmode:
44312 tmp = gen_reg_rtx (V4DFmode);
44313 if (elt < 4)
44314 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44315 else
44316 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44317 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44318 return;
44320 case E_V16SImode:
44321 tmp = gen_reg_rtx (V8SImode);
44322 if (elt < 8)
44323 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44324 else
44325 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44326 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44327 return;
44329 case E_V8DImode:
44330 tmp = gen_reg_rtx (V4DImode);
44331 if (elt < 4)
44332 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44333 else
44334 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44335 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44336 return;
44338 case E_V8QImode:
44339 /* ??? Could extract the appropriate HImode element and shift. */
44340 default:
44341 break;
44344 if (use_vec_extr)
44346 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44347 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44349 /* Let the rtl optimizers know about the zero extension performed. */
44350 if (inner_mode == QImode || inner_mode == HImode)
44352 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44353 target = gen_lowpart (SImode, target);
44356 emit_insn (gen_rtx_SET (target, tmp));
44358 else
44360 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44362 emit_move_insn (mem, vec);
44364 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44365 emit_move_insn (target, tmp);
44369 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44370 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44371 The upper bits of DEST are undefined, though they shouldn't cause
44372 exceptions (some bits from src or all zeros are ok). */
44374 static void
44375 emit_reduc_half (rtx dest, rtx src, int i)
44377 rtx tem, d = dest;
44378 switch (GET_MODE (src))
44380 case E_V4SFmode:
44381 if (i == 128)
44382 tem = gen_sse_movhlps (dest, src, src);
44383 else
44384 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44385 GEN_INT (1 + 4), GEN_INT (1 + 4));
44386 break;
44387 case E_V2DFmode:
44388 tem = gen_vec_interleave_highv2df (dest, src, src);
44389 break;
44390 case E_V16QImode:
44391 case E_V8HImode:
44392 case E_V4SImode:
44393 case E_V2DImode:
44394 d = gen_reg_rtx (V1TImode);
44395 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44396 GEN_INT (i / 2));
44397 break;
44398 case E_V8SFmode:
44399 if (i == 256)
44400 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44401 else
44402 tem = gen_avx_shufps256 (dest, src, src,
44403 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44404 break;
44405 case E_V4DFmode:
44406 if (i == 256)
44407 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44408 else
44409 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44410 break;
44411 case E_V32QImode:
44412 case E_V16HImode:
44413 case E_V8SImode:
44414 case E_V4DImode:
44415 if (i == 256)
44417 if (GET_MODE (dest) != V4DImode)
44418 d = gen_reg_rtx (V4DImode);
44419 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44420 gen_lowpart (V4DImode, src),
44421 const1_rtx);
44423 else
44425 d = gen_reg_rtx (V2TImode);
44426 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44427 GEN_INT (i / 2));
44429 break;
44430 case E_V64QImode:
44431 case E_V32HImode:
44432 case E_V16SImode:
44433 case E_V16SFmode:
44434 case E_V8DImode:
44435 case E_V8DFmode:
44436 if (i > 128)
44437 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44438 gen_lowpart (V16SImode, src),
44439 gen_lowpart (V16SImode, src),
44440 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44441 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44442 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44443 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44444 GEN_INT (0xC), GEN_INT (0xD),
44445 GEN_INT (0xE), GEN_INT (0xF),
44446 GEN_INT (0x10), GEN_INT (0x11),
44447 GEN_INT (0x12), GEN_INT (0x13),
44448 GEN_INT (0x14), GEN_INT (0x15),
44449 GEN_INT (0x16), GEN_INT (0x17));
44450 else
44451 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44452 gen_lowpart (V16SImode, src),
44453 GEN_INT (i == 128 ? 0x2 : 0x1),
44454 GEN_INT (0x3),
44455 GEN_INT (0x3),
44456 GEN_INT (0x3),
44457 GEN_INT (i == 128 ? 0x6 : 0x5),
44458 GEN_INT (0x7),
44459 GEN_INT (0x7),
44460 GEN_INT (0x7),
44461 GEN_INT (i == 128 ? 0xA : 0x9),
44462 GEN_INT (0xB),
44463 GEN_INT (0xB),
44464 GEN_INT (0xB),
44465 GEN_INT (i == 128 ? 0xE : 0xD),
44466 GEN_INT (0xF),
44467 GEN_INT (0xF),
44468 GEN_INT (0xF));
44469 break;
44470 default:
44471 gcc_unreachable ();
44473 emit_insn (tem);
44474 if (d != dest)
44475 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44478 /* Expand a vector reduction. FN is the binary pattern to reduce;
44479 DEST is the destination; IN is the input vector. */
44481 void
44482 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44484 rtx half, dst, vec = in;
44485 machine_mode mode = GET_MODE (in);
44486 int i;
44488 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
44489 if (TARGET_SSE4_1
44490 && mode == V8HImode
44491 && fn == gen_uminv8hi3)
44493 emit_insn (gen_sse4_1_phminposuw (dest, in));
44494 return;
44497 for (i = GET_MODE_BITSIZE (mode);
44498 i > GET_MODE_UNIT_BITSIZE (mode);
44499 i >>= 1)
44501 half = gen_reg_rtx (mode);
44502 emit_reduc_half (half, vec, i);
44503 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44504 dst = dest;
44505 else
44506 dst = gen_reg_rtx (mode);
44507 emit_insn (fn (dst, half, vec));
44508 vec = dst;
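/* The loop above halves the active width on every iteration.  For a
   V4SImode maximum reduction it expands to roughly (illustrative only):

     t = v shifted right by 64 bits      elements 2,3 move down
     v = max (v, t)                      lanes 0,1 hold pairwise maxima
     t = v shifted right by 32 bits      element 1 moves down
     v = max (v, t)                      lane 0 holds the overall maximum

   DEST receives the full vector; callers extract element 0.  */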
44512 /* Target hook for scalar_mode_supported_p. */
44513 static bool
44514 ix86_scalar_mode_supported_p (scalar_mode mode)
44516 if (DECIMAL_FLOAT_MODE_P (mode))
44517 return default_decimal_float_supported_p ();
44518 else if (mode == TFmode)
44519 return true;
44520 else
44521 return default_scalar_mode_supported_p (mode);
44524 /* Implements target hook vector_mode_supported_p. */
44525 static bool
44526 ix86_vector_mode_supported_p (machine_mode mode)
44528 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44529 return true;
44530 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44531 return true;
44532 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44533 return true;
44534 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44535 return true;
44536 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44537 return true;
44538 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44539 return true;
44540 return false;
44543 /* Target hook for c_mode_for_suffix. */
44544 static machine_mode
44545 ix86_c_mode_for_suffix (char suffix)
44547 if (suffix == 'q')
44548 return TFmode;
44549 if (suffix == 'w')
44550 return XFmode;
44552 return VOIDmode;
44555 /* Worker function for TARGET_MD_ASM_ADJUST.
44557 We implement asm flag outputs, and maintain source compatibility
44558 with the old cc0-based compiler. */
44560 static rtx_insn *
44561 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44562 vec<const char *> &constraints,
44563 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44565 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44566 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44568 bool saw_asm_flag = false;
44570 start_sequence ();
44571 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44573 const char *con = constraints[i];
44574 if (strncmp (con, "=@cc", 4) != 0)
44575 continue;
44576 con += 4;
44577 if (strchr (con, ',') != NULL)
44579 error ("alternatives not allowed in asm flag output");
44580 continue;
44583 bool invert = false;
44584 if (con[0] == 'n')
44585 invert = true, con++;
44587 machine_mode mode = CCmode;
44588 rtx_code code = UNKNOWN;
44590 switch (con[0])
44592 case 'a':
44593 if (con[1] == 0)
44594 mode = CCAmode, code = EQ;
44595 else if (con[1] == 'e' && con[2] == 0)
44596 mode = CCCmode, code = NE;
44597 break;
44598 case 'b':
44599 if (con[1] == 0)
44600 mode = CCCmode, code = EQ;
44601 else if (con[1] == 'e' && con[2] == 0)
44602 mode = CCAmode, code = NE;
44603 break;
44604 case 'c':
44605 if (con[1] == 0)
44606 mode = CCCmode, code = EQ;
44607 break;
44608 case 'e':
44609 if (con[1] == 0)
44610 mode = CCZmode, code = EQ;
44611 break;
44612 case 'g':
44613 if (con[1] == 0)
44614 mode = CCGCmode, code = GT;
44615 else if (con[1] == 'e' && con[2] == 0)
44616 mode = CCGCmode, code = GE;
44617 break;
44618 case 'l':
44619 if (con[1] == 0)
44620 mode = CCGCmode, code = LT;
44621 else if (con[1] == 'e' && con[2] == 0)
44622 mode = CCGCmode, code = LE;
44623 break;
44624 case 'o':
44625 if (con[1] == 0)
44626 mode = CCOmode, code = EQ;
44627 break;
44628 case 'p':
44629 if (con[1] == 0)
44630 mode = CCPmode, code = EQ;
44631 break;
44632 case 's':
44633 if (con[1] == 0)
44634 mode = CCSmode, code = EQ;
44635 break;
44636 case 'z':
44637 if (con[1] == 0)
44638 mode = CCZmode, code = EQ;
44639 break;
44641 if (code == UNKNOWN)
44643 error ("unknown asm flag output %qs", constraints[i]);
44644 continue;
44646 if (invert)
44647 code = reverse_condition (code);
44649 rtx dest = outputs[i];
44650 if (!saw_asm_flag)
44652 /* This is the first asm flag output. Here we put the flags
44653 register in as the real output and adjust the condition to
44654 allow it. */
44655 constraints[i] = "=Bf";
44656 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44657 saw_asm_flag = true;
44659 else
44661 /* We don't need the flags register as output twice. */
44662 constraints[i] = "=X";
44663 outputs[i] = gen_rtx_SCRATCH (SImode);
44666 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44667 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44669 machine_mode dest_mode = GET_MODE (dest);
44670 if (!SCALAR_INT_MODE_P (dest_mode))
44672 error ("invalid type for asm flag output");
44673 continue;
44676 if (dest_mode == DImode && !TARGET_64BIT)
44677 dest_mode = SImode;
44679 if (dest_mode != QImode)
44681 rtx destqi = gen_reg_rtx (QImode);
44682 emit_insn (gen_rtx_SET (destqi, x));
44684 if (TARGET_ZERO_EXTEND_WITH_AND
44685 && optimize_function_for_speed_p (cfun))
44687 x = force_reg (dest_mode, const0_rtx);
44689 emit_insn (gen_movstrictqi
44690 (gen_lowpart (QImode, x), destqi));
44692 else
44693 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44696 if (dest_mode != GET_MODE (dest))
44698 rtx tmp = gen_reg_rtx (SImode);
44700 emit_insn (gen_rtx_SET (tmp, x));
44701 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44703 else
44704 emit_insn (gen_rtx_SET (dest, x));
44706 rtx_insn *seq = get_insns ();
44707 end_sequence ();
44709 if (saw_asm_flag)
44710 return seq;
44711 else
44713 /* If we had no asm flag outputs, clobber the flags. */
44714 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44715 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44716 return NULL;
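/* Example of the asm flag outputs handled above, as written in user
   code (illustrative only; see the GCC extended asm documentation on
   flag output operands):

     unsigned long sum;
     int carry;
     asm ("add %2, %0"
          : "=r" (sum), "=@ccc" (carry)
          : "r" (b), "0" (a));

   The "=@ccc" output is rewritten by this hook into a read of the
   carry bit in the flags register, zero-extended to CARRY's mode.  */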
44720 /* Implements target vector targetm.asm.encode_section_info. */
44722 static void ATTRIBUTE_UNUSED
44723 ix86_encode_section_info (tree decl, rtx rtl, int first)
44725 default_encode_section_info (decl, rtl, first);
44727 if (ix86_in_large_data_p (decl))
44728 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44731 /* Worker function for REVERSE_CONDITION. */
44733 enum rtx_code
44734 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44736 return (mode == CCFPmode
44737 ? reverse_condition_maybe_unordered (code)
44738 : reverse_condition (code));
44741 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44742 to OPERANDS[0]. */
44744 const char *
44745 output_387_reg_move (rtx_insn *insn, rtx *operands)
44747 if (REG_P (operands[0]))
44749 if (REG_P (operands[1])
44750 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44752 if (REGNO (operands[0]) == FIRST_STACK_REG)
44753 return output_387_ffreep (operands, 0);
44754 return "fstp\t%y0";
44756 if (STACK_TOP_P (operands[0]))
44757 return "fld%Z1\t%y1";
44758 return "fst\t%y0";
44760 else if (MEM_P (operands[0]))
44762 gcc_assert (REG_P (operands[1]));
44763 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44764 return "fstp%Z0\t%y0";
44765 else
44767 /* There is no non-popping store to memory for XFmode.
44768 So if we need one, follow the store with a load. */
44769 if (GET_MODE (operands[0]) == XFmode)
44770 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44771 else
44772 return "fst%Z0\t%y0";
44775 else
44776 gcc_unreachable();
44779 /* Output code to perform a conditional jump to LABEL if the C2 flag
44780 in the x87 FP status register is set. */
44782 void
44783 ix86_emit_fp_unordered_jump (rtx label)
44785 rtx reg = gen_reg_rtx (HImode);
44786 rtx temp;
44788 emit_insn (gen_x86_fnstsw_1 (reg));
44790 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44792 emit_insn (gen_x86_sahf_1 (reg));
44794 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44795 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44797 else
44799 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
44801 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44802 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44805 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44806 gen_rtx_LABEL_REF (VOIDmode, label),
44807 pc_rtx);
44808 temp = gen_rtx_SET (pc_rtx, temp);
44810 emit_jump_insn (temp);
44811 predict_jump (REG_BR_PROB_BASE * 10 / 100);
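/* Note on the non-SAHF path above: fnstsw stores the 16-bit x87 status
   word and the test examines its high byte.  C2 is bit 10 of the
   status word, i.e. bit 2 of that byte, hence the 0x04 mask.  Roughly:

     fnstsw  %ax
     testb   $0x04, %ah
     jne     label                     */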
44814 /* Output code to perform a log1p XFmode calculation. */
44816 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44818 rtx_code_label *label1 = gen_label_rtx ();
44819 rtx_code_label *label2 = gen_label_rtx ();
44821 rtx tmp = gen_reg_rtx (XFmode);
44822 rtx tmp2 = gen_reg_rtx (XFmode);
44823 rtx test;
44825 emit_insn (gen_absxf2 (tmp, op1));
44826 test = gen_rtx_GE (VOIDmode, tmp,
44827 const_double_from_real_value (
44828 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44829 XFmode));
44830 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
44832 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44833 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44834 emit_jump (label2);
44836 emit_label (label1);
44837 emit_move_insn (tmp, CONST1_RTX (XFmode));
44838 emit_insn (gen_addxf3 (tmp, op1, tmp));
44839 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44840 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44842 emit_label (label2);
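/* The threshold used above is 1 - sqrt(2)/2 ~= 0.2928932...; fyl2xp1
   is only specified for |x| below that bound.  The expansion is
   roughly (illustrative only):

     if (fabs (x) < 1 - sqrt (2) / 2)
       result = fyl2xp1 (x, ln2);        ln2 * log2 (1 + x) == log (1 + x)
     else
       result = fyl2x (1.0 + x, ln2);    ln2 * log2 (1 + x) == log (1 + x)

   with fldln2 supplying the ln2 operand on both paths.  */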
44845 /* Output code to compute round (OP1) into OP0 using x87 instructions. */
44846 void ix86_emit_i387_round (rtx op0, rtx op1)
44848 machine_mode inmode = GET_MODE (op1);
44849 machine_mode outmode = GET_MODE (op0);
44850 rtx e1, e2, res, tmp, tmp1, half;
44851 rtx scratch = gen_reg_rtx (HImode);
44852 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44853 rtx_code_label *jump_label = gen_label_rtx ();
44854 rtx insn;
44855 rtx (*gen_abs) (rtx, rtx);
44856 rtx (*gen_neg) (rtx, rtx);
44858 switch (inmode)
44860 case E_SFmode:
44861 gen_abs = gen_abssf2;
44862 break;
44863 case E_DFmode:
44864 gen_abs = gen_absdf2;
44865 break;
44866 case E_XFmode:
44867 gen_abs = gen_absxf2;
44868 break;
44869 default:
44870 gcc_unreachable ();
44873 switch (outmode)
44875 case E_SFmode:
44876 gen_neg = gen_negsf2;
44877 break;
44878 case E_DFmode:
44879 gen_neg = gen_negdf2;
44880 break;
44881 case E_XFmode:
44882 gen_neg = gen_negxf2;
44883 break;
44884 case E_HImode:
44885 gen_neg = gen_neghi2;
44886 break;
44887 case E_SImode:
44888 gen_neg = gen_negsi2;
44889 break;
44890 case E_DImode:
44891 gen_neg = gen_negdi2;
44892 break;
44893 default:
44894 gcc_unreachable ();
44897 e1 = gen_reg_rtx (inmode);
44898 e2 = gen_reg_rtx (inmode);
44899 res = gen_reg_rtx (outmode);
44901 half = const_double_from_real_value (dconsthalf, inmode);
44903 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44905 /* scratch = fxam(op1) */
44906 emit_insn (gen_rtx_SET (scratch,
44907 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44908 UNSPEC_FXAM)));
44909 /* e1 = fabs(op1) */
44910 emit_insn (gen_abs (e1, op1));
44912 /* e2 = e1 + 0.5 */
44913 half = force_reg (inmode, half);
44914 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44916 /* res = floor(e2) */
44917 if (inmode != XFmode)
44919 tmp1 = gen_reg_rtx (XFmode);
44921 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44923 else
44924 tmp1 = e2;
44926 switch (outmode)
44928 case E_SFmode:
44929 case E_DFmode:
44931 rtx tmp0 = gen_reg_rtx (XFmode);
44933 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44935 emit_insn (gen_rtx_SET (res,
44936 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44937 UNSPEC_TRUNC_NOOP)));
44939 break;
44940 case E_XFmode:
44941 emit_insn (gen_frndintxf2_floor (res, tmp1));
44942 break;
44943 case E_HImode:
44944 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44945 break;
44946 case E_SImode:
44947 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44948 break;
44949 case E_DImode:
44950 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44951 break;
44952 default:
44953 gcc_unreachable ();
44956 /* flags = signbit(a) */
44957 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44959 /* if (flags) then res = -res */
44960 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44961 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44962 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44963 pc_rtx);
44964 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44965 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44966 JUMP_LABEL (insn) = jump_label;
44968 emit_insn (gen_neg (res, res));
44970 emit_label (jump_label);
44971 LABEL_NUSES (jump_label) = 1;
44973 emit_move_insn (op0, res);
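/* Note on the fxam/0x02 test above: fxam sets C1 (bit 9 of the status
   word, bit 1 of its high byte) to the sign of the operand, so the
   sequence implements, schematically:

     res = floor (fabs (op1) + 0.5);
     if (signbit (op1))
       res = -res;                       */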
44976 /* Output code to perform a Newton-Raphson approximation of a single precision
44977 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44979 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44981 rtx x0, x1, e0, e1;
44983 x0 = gen_reg_rtx (mode);
44984 e0 = gen_reg_rtx (mode);
44985 e1 = gen_reg_rtx (mode);
44986 x1 = gen_reg_rtx (mode);
44988 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
44990 b = force_reg (mode, b);
44992 /* x0 = rcp(b) estimate */
44993 if (mode == V16SFmode || mode == V8DFmode)
44995 if (TARGET_AVX512ER)
44997 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44998 UNSPEC_RCP28)));
44999 /* res = a * x0 */
45000 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45001 return;
45003 else
45004 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45005 UNSPEC_RCP14)));
45007 else
45008 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45009 UNSPEC_RCP)));
45011 /* e0 = x0 * b */
45012 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45014 /* e0 = x0 * e0 */
45015 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45017 /* e1 = x0 + x0 */
45018 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45020 /* x1 = e1 - e0 */
45021 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45023 /* res = a * x1 */
45024 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
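/* Derivation of the sequence above (illustrative only): one
   Newton-Raphson step for 1/b starting from the estimate x0 is

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which is exactly e1 - e0 as computed above; the final multiply by A
   then approximates a / b, roughly doubling the number of correct bits
   of the initial rcp/rcp14 estimate.  */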
45027 /* Output code to perform a Newton-Raphson approximation of a
45028 single precision floating point [reciprocal] square root. */
45030 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45032 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45033 REAL_VALUE_TYPE r;
45034 int unspec;
45036 x0 = gen_reg_rtx (mode);
45037 e0 = gen_reg_rtx (mode);
45038 e1 = gen_reg_rtx (mode);
45039 e2 = gen_reg_rtx (mode);
45040 e3 = gen_reg_rtx (mode);
45042 if (TARGET_AVX512ER && mode == V16SFmode)
45044 if (recip)
45045 /* res = rsqrt28(a) estimate */
45046 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45047 UNSPEC_RSQRT28)));
45048 else
45050 /* x0 = rsqrt28(a) estimate */
45051 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45052 UNSPEC_RSQRT28)));
45053 /* res = rcp28(x0) estimate */
45054 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45055 UNSPEC_RCP28)));
45057 return;
45060 real_from_integer (&r, VOIDmode, -3, SIGNED);
45061 mthree = const_double_from_real_value (r, SFmode);
45063 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45064 mhalf = const_double_from_real_value (r, SFmode);
45065 unspec = UNSPEC_RSQRT;
45067 if (VECTOR_MODE_P (mode))
45069 mthree = ix86_build_const_vector (mode, true, mthree);
45070 mhalf = ix86_build_const_vector (mode, true, mhalf);
45071 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45072 if (GET_MODE_SIZE (mode) == 64)
45073 unspec = UNSPEC_RSQRT14;
45076 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45077 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
45079 a = force_reg (mode, a);
45081 /* x0 = rsqrt(a) estimate */
45082 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45083 unspec)));
45085 /* If a == 0.0, filter out the infinite rsqrt(0.0) estimate to prevent a NaN result for sqrt(0.0). */
45086 if (!recip)
45088 rtx zero = force_reg (mode, CONST0_RTX(mode));
45089 rtx mask;
45091 /* Handle masked compare. */
45092 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45094 mask = gen_reg_rtx (HImode);
45095 /* Imm value 0x4 corresponds to not-equal comparison. */
45096 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45097 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45099 else
45101 mask = gen_reg_rtx (mode);
45102 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45103 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45107 /* e0 = x0 * a */
45108 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45109 /* e1 = e0 * x0 */
45110 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45112 /* e2 = e1 - 3. */
45113 mthree = force_reg (mode, mthree);
45114 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45116 mhalf = force_reg (mode, mhalf);
45117 if (recip)
45118 /* e3 = -.5 * x0 */
45119 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45120 else
45121 /* e3 = -.5 * e0 */
45122 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45123 /* ret = e2 * e3 */
45124 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
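/* Derivation of the sequence above (illustrative only): one
   Newton-Raphson step for 1/sqrt(a) starting from the estimate x0 is

     x1 = 0.5 * x0 * (3 - a * x0 * x0)
        = (-0.5 * x0) * (a * x0 * x0 - 3)

   which is e3 * e2 in the reciprocal case; using e0 = a * x0 instead
   of x0 in e3 multiplies the result by A and yields an approximation
   of sqrt(a) instead.  */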
45127 #ifdef TARGET_SOLARIS
45128 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45130 static void
45131 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45132 tree decl)
45134 /* With Binutils 2.15, the "@unwind" marker must be specified on
45135 every occurrence of the ".eh_frame" section, not just the first
45136 one. */
45137 if (TARGET_64BIT
45138 && strcmp (name, ".eh_frame") == 0)
45140 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45141 flags & SECTION_WRITE ? "aw" : "a");
45142 return;
45145 #ifndef USE_GAS
45146 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45148 solaris_elf_asm_comdat_section (name, flags, decl);
45149 return;
45151 #endif
45153 default_elf_asm_named_section (name, flags, decl);
45155 #endif /* TARGET_SOLARIS */
45157 /* Return the mangling of TYPE if it is an extended fundamental type. */
45159 static const char *
45160 ix86_mangle_type (const_tree type)
45162 type = TYPE_MAIN_VARIANT (type);
45164 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45165 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45166 return NULL;
45168 switch (TYPE_MODE (type))
45170 case E_TFmode:
45171 /* __float128 is "g". */
45172 return "g";
45173 case E_XFmode:
45174 /* "long double" or __float80 is "e". */
45175 return "e";
45176 default:
45177 return NULL;
45181 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45183 static tree
45184 ix86_stack_protect_guard (void)
45186 if (TARGET_SSP_TLS_GUARD)
45188 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45189 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45190 tree type = build_qualified_type (type_node, qual);
45191 tree t;
45193 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45195 t = ix86_tls_stack_chk_guard_decl;
45197 if (t == NULL)
45199 rtx x;
45201 t = build_decl
45202 (UNKNOWN_LOCATION, VAR_DECL,
45203 get_identifier (ix86_stack_protector_guard_symbol_str),
45204 type);
45205 TREE_STATIC (t) = 1;
45206 TREE_PUBLIC (t) = 1;
45207 DECL_EXTERNAL (t) = 1;
45208 TREE_USED (t) = 1;
45209 TREE_THIS_VOLATILE (t) = 1;
45210 DECL_ARTIFICIAL (t) = 1;
45211 DECL_IGNORED_P (t) = 1;
45213 /* Do not share RTL as the declaration is visible outside of
45214 the current function. */
45215 x = DECL_RTL (t);
45216 RTX_FLAG (x, used) = 1;
45218 ix86_tls_stack_chk_guard_decl = t;
45221 else
45223 tree asptrtype = build_pointer_type (type);
45225 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45226 t = build2 (MEM_REF, asptrtype, t,
45227 build_int_cst (asptrtype, 0));
45230 return t;
45233 return default_stack_protect_guard ();
45236 /* For 32-bit code we can save PIC register setup by using
45237 __stack_chk_fail_local hidden function instead of calling
45238 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
45239 register, so it is better to call __stack_chk_fail directly. */
45241 static tree ATTRIBUTE_UNUSED
45242 ix86_stack_protect_fail (void)
45244 return TARGET_64BIT
45245 ? default_external_stack_protect_fail ()
45246 : default_hidden_stack_protect_fail ();
45249 /* Select a format to encode pointers in exception handling data. CODE
45250 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45251 true if the symbol may be affected by dynamic relocations.
45253 ??? All x86 object file formats are capable of representing this.
45254 After all, the relocation needed is the same as for the call insn.
45255 Whether or not a particular assembler allows us to enter such, I
45256 guess we'll have to see. */
45257 int
45258 asm_preferred_eh_data_format (int code, int global)
45260 if (flag_pic)
45262 int type = DW_EH_PE_sdata8;
45263 if (!TARGET_64BIT
45264 || ix86_cmodel == CM_SMALL_PIC
45265 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45266 type = DW_EH_PE_sdata4;
45267 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45269 if (ix86_cmodel == CM_SMALL
45270 || (ix86_cmodel == CM_MEDIUM && code))
45271 return DW_EH_PE_udata4;
45272 return DW_EH_PE_absptr;
45275 /* Expand copysign from SIGN to the positive value ABS_VALUE,
45276 storing the result in RESULT. If MASK is non-null, it is a mask used to
45277 mask out the sign bit. */
45278 static void
45279 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45281 machine_mode mode = GET_MODE (sign);
45282 rtx sgn = gen_reg_rtx (mode);
45283 if (mask == NULL_RTX)
45285 machine_mode vmode;
45287 if (mode == SFmode)
45288 vmode = V4SFmode;
45289 else if (mode == DFmode)
45290 vmode = V2DFmode;
45291 else
45292 vmode = mode;
45294 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45295 if (!VECTOR_MODE_P (mode))
45297 /* We need to generate a scalar mode mask in this case. */
45298 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45299 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45300 mask = gen_reg_rtx (mode);
45301 emit_insn (gen_rtx_SET (mask, tmp));
45304 else
45305 mask = gen_rtx_NOT (mode, mask);
45306 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45307 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
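/* In scalar terms the helper above computes, roughly:

     result = abs_value | (sign & SIGN_BIT)

   When MASK is passed in it is the complement mask produced by
   ix86_expand_sse_fabs (~SIGN_BIT), so it is inverted here first;
   otherwise a fresh sign-bit mask is built.  */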
45310 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45311 mask for masking out the sign-bit is stored in *SMASK, if that is
45312 non-null. */
45313 static rtx
45314 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45316 machine_mode vmode, mode = GET_MODE (op0);
45317 rtx xa, mask;
45319 xa = gen_reg_rtx (mode);
45320 if (mode == SFmode)
45321 vmode = V4SFmode;
45322 else if (mode == DFmode)
45323 vmode = V2DFmode;
45324 else
45325 vmode = mode;
45326 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45327 if (!VECTOR_MODE_P (mode))
45329 /* We need to generate a scalar mode mask in this case. */
45330 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45331 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45332 mask = gen_reg_rtx (mode);
45333 emit_insn (gen_rtx_SET (mask, tmp));
45335 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45337 if (smask)
45338 *smask = mask;
45340 return xa;
45343 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45344 swapping the operands if SWAP_OPERANDS is true. The expanded
45345 code is a forward jump to a newly created label in case the
45346 comparison is true. The generated label rtx is returned. */
45347 static rtx_code_label *
45348 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45349 bool swap_operands)
45351 bool unordered_compare = ix86_unordered_fp_compare (code);
45352 rtx_code_label *label;
45353 rtx tmp, reg;
45355 if (swap_operands)
45356 std::swap (op0, op1);
45358 label = gen_label_rtx ();
45359 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45360 if (unordered_compare)
45361 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45362 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45363 emit_insn (gen_rtx_SET (reg, tmp));
45364 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45365 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45366 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45367 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45368 JUMP_LABEL (tmp) = label;
45370 return label;
45373 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45374 using comparison code CODE. Operands are swapped for the comparison if
45375 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45376 static rtx
45377 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45378 bool swap_operands)
45380 rtx (*insn)(rtx, rtx, rtx, rtx);
45381 machine_mode mode = GET_MODE (op0);
45382 rtx mask = gen_reg_rtx (mode);
45384 if (swap_operands)
45385 std::swap (op0, op1);
45387 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45389 emit_insn (insn (mask, op0, op1,
45390 gen_rtx_fmt_ee (code, mode, op0, op1)));
45391 return mask;
45394 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45395 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45396 static rtx
45397 ix86_gen_TWO52 (machine_mode mode)
45399 REAL_VALUE_TYPE TWO52r;
45400 rtx TWO52;
45402 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45403 TWO52 = const_double_from_real_value (TWO52r, mode);
45404 TWO52 = force_reg (mode, TWO52);
45406 return TWO52;
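/* The constant built here drives the "add and subtract 2**52" trick
   used by the rounding expanders below.  For a double X with
   0 <= X < 2**52 (illustrative only):

     t = X + 0x1p52;     the 52-bit mantissa cannot hold the fraction,
                         so it is rounded away in the current mode
     r = t - 0x1p52;     r is X rounded to an integer value

   For SFmode the same trick uses 2**23, matching the 23-bit mantissa.  */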
45409 /* Expand SSE sequence for computing lround from OP1 storing
45410 into OP0. */
45411 void
45412 ix86_expand_lround (rtx op0, rtx op1)
45414 /* C code for the stuff we're doing below:
45415 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45416 return (long)tmp;
45418 machine_mode mode = GET_MODE (op1);
45419 const struct real_format *fmt;
45420 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45421 rtx adj;
45423 /* load nextafter (0.5, 0.0) */
45424 fmt = REAL_MODE_FORMAT (mode);
45425 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45426 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45428 /* adj = copysign (0.5, op1) */
45429 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45430 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45432 /* adj = op1 + adj */
45433 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45435 /* op0 = (imode)adj */
45436 expand_fix (op0, adj, 0);
45439 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
45440 DO_FLOOR) from OPERAND1, storing into OPERAND0. */
45441 void
45442 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45444 /* C code for the stuff we're doing below (for do_floor):
45445 xi = (long)op1;
45446 xi -= (double)xi > op1 ? 1 : 0;
45447 return xi;
45449 machine_mode fmode = GET_MODE (op1);
45450 machine_mode imode = GET_MODE (op0);
45451 rtx ireg, freg, tmp;
45452 rtx_code_label *label;
45454 /* reg = (long)op1 */
45455 ireg = gen_reg_rtx (imode);
45456 expand_fix (ireg, op1, 0);
45458 /* freg = (double)reg */
45459 freg = gen_reg_rtx (fmode);
45460 expand_float (freg, ireg, 0);
45462 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45463 label = ix86_expand_sse_compare_and_jump (UNLE,
45464 freg, op1, !do_floor);
45465 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45466 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45467 emit_move_insn (ireg, tmp);
45469 emit_label (label);
45470 LABEL_NUSES (label) = 1;
45472 emit_move_insn (op0, ireg);
45475 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45476 void
45477 ix86_expand_rint (rtx operand0, rtx operand1)
45479 /* C code for the stuff we're doing below:
45480 xa = fabs (operand1);
45481 if (!isless (xa, 2**52))
45482 return operand1;
45483 two52 = 2**52;
45484 if (flag_rounding_math)
45486 two52 = copysign (two52, operand1);
45487 xa = operand1;
45489 xa = xa + two52 - two52;
45490 return copysign (xa, operand1);
45492 machine_mode mode = GET_MODE (operand0);
45493 rtx res, xa, TWO52, two52, mask;
45494 rtx_code_label *label;
45496 res = gen_reg_rtx (mode);
45497 emit_move_insn (res, operand1);
45499 /* xa = abs (operand1) */
45500 xa = ix86_expand_sse_fabs (res, &mask);
45502 /* if (!isless (xa, TWO52)) goto label; */
45503 TWO52 = ix86_gen_TWO52 (mode);
45504 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45506 two52 = TWO52;
45507 if (flag_rounding_math)
45509 two52 = gen_reg_rtx (mode);
45510 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45511 xa = res;
45514 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45515 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45517 ix86_sse_copysign_to_positive (res, xa, res, mask);
45519 emit_label (label);
45520 LABEL_NUSES (label) = 1;
45522 emit_move_insn (operand0, res);
45525 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45526 into OPERAND0. */
45527 void
45528 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45530 /* C code for the stuff we expand below.
45531 double xa = fabs (x), x2;
45532 if (!isless (xa, TWO52))
45533 return x;
45534 xa = xa + TWO52 - TWO52;
45535 x2 = copysign (xa, x);
45536 Compensate. Floor:
45537 if (x2 > x)
45538 x2 -= 1;
45539 Compensate. Ceil:
45540 if (x2 < x)
45541 x2 -= -1;
45542 return x2;
45544 machine_mode mode = GET_MODE (operand0);
45545 rtx xa, TWO52, tmp, one, res, mask;
45546 rtx_code_label *label;
45548 TWO52 = ix86_gen_TWO52 (mode);
45550 /* Temporary for holding the result, initialized to the input
45551 operand to ease control flow. */
45552 res = gen_reg_rtx (mode);
45553 emit_move_insn (res, operand1);
45555 /* xa = abs (operand1) */
45556 xa = ix86_expand_sse_fabs (res, &mask);
45558 /* if (!isless (xa, TWO52)) goto label; */
45559 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45561 /* xa = xa + TWO52 - TWO52; */
45562 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45563 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45565 /* xa = copysign (xa, operand1) */
45566 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45568 /* generate 1.0 or -1.0 */
45569 one = force_reg (mode,
45570 const_double_from_real_value (do_floor
45571 ? dconst1 : dconstm1, mode));
45573 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45574 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45575 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45576 /* We always need to subtract here to preserve signed zero. */
45577 tmp = expand_simple_binop (mode, MINUS,
45578 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45579 emit_move_insn (res, tmp);
45581 emit_label (label);
45582 LABEL_NUSES (label) = 1;
45584 emit_move_insn (operand0, res);
45587 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45588 into OPERAND0. */
45589 void
45590 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45592 /* C code for the stuff we expand below.
45593 double xa = fabs (x), x2;
45594 if (!isless (xa, TWO52))
45595 return x;
45596 x2 = (double)(long)x;
45597 Compensate. Floor:
45598 if (x2 > x)
45599 x2 -= 1;
45600 Compensate. Ceil:
45601 if (x2 < x)
45602 x2 += 1;
45603 if (HONOR_SIGNED_ZEROS (mode))
45604 return copysign (x2, x);
45605 return x2;
45607 machine_mode mode = GET_MODE (operand0);
45608 rtx xa, xi, TWO52, tmp, one, res, mask;
45609 rtx_code_label *label;
45611 TWO52 = ix86_gen_TWO52 (mode);
45613 /* Temporary for holding the result, initialized to the input
45614 operand to ease control flow. */
45615 res = gen_reg_rtx (mode);
45616 emit_move_insn (res, operand1);
45618 /* xa = abs (operand1) */
45619 xa = ix86_expand_sse_fabs (res, &mask);
45621 /* if (!isless (xa, TWO52)) goto label; */
45622 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45624 /* xa = (double)(long)x */
45625 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45626 expand_fix (xi, res, 0);
45627 expand_float (xa, xi, 0);
45629 /* generate 1.0 */
45630 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45632 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45633 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45634 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45635 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45636 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45637 emit_move_insn (res, tmp);
45639 if (HONOR_SIGNED_ZEROS (mode))
45640 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45642 emit_label (label);
45643 LABEL_NUSES (label) = 1;
45645 emit_move_insn (operand0, res);
45648 /* Expand SSE sequence for computing round from OPERAND1 storing
45649 into OPERAND0. This sequence works without relying on DImode truncation
45650 via cvttsd2siq, which is only available on 64-bit targets. */
45651 void
45652 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45654 /* C code for the stuff we expand below.
45655 double xa = fabs (x), xa2, x2;
45656 if (!isless (xa, TWO52))
45657 return x;
45658 Using the absolute value and copying back sign makes
45659 -0.0 -> -0.0 correct.
45660 xa2 = xa + TWO52 - TWO52;
45661 Compensate.
45662 dxa = xa2 - xa;
45663 if (dxa <= -0.5)
45664 xa2 += 1;
45665 else if (dxa > 0.5)
45666 xa2 -= 1;
45667 x2 = copysign (xa2, x);
45668 return x2;
45670 machine_mode mode = GET_MODE (operand0);
45671 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45672 rtx_code_label *label;
45674 TWO52 = ix86_gen_TWO52 (mode);
45676 /* Temporary for holding the result, initialized to the input
45677 operand to ease control flow. */
45678 res = gen_reg_rtx (mode);
45679 emit_move_insn (res, operand1);
45681 /* xa = abs (operand1) */
45682 xa = ix86_expand_sse_fabs (res, &mask);
45684 /* if (!isless (xa, TWO52)) goto label; */
45685 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45687 /* xa2 = xa + TWO52 - TWO52; */
45688 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45689 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45691 /* dxa = xa2 - xa; */
45692 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45694 /* generate 0.5, 1.0 and -0.5 */
45695 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45696 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45697 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45698 0, OPTAB_DIRECT);
45700 /* Compensate. */
45701 tmp = gen_reg_rtx (mode);
45702 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45703 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45704 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45705 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45706 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45707 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45708 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45709 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45711 /* res = copysign (xa2, operand1) */
45712 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45714 emit_label (label);
45715 LABEL_NUSES (label) = 1;
45717 emit_move_insn (operand0, res);
45720 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45721 into OPERAND0. */
45722 void
45723 ix86_expand_trunc (rtx operand0, rtx operand1)
45725 /* C code for SSE variant we expand below.
45726 double xa = fabs (x), x2;
45727 if (!isless (xa, TWO52))
45728 return x;
45729 x2 = (double)(long)x;
45730 if (HONOR_SIGNED_ZEROS (mode))
45731 return copysign (x2, x);
45732 return x2;
45734 machine_mode mode = GET_MODE (operand0);
45735 rtx xa, xi, TWO52, res, mask;
45736 rtx_code_label *label;
45738 TWO52 = ix86_gen_TWO52 (mode);
45740 /* Temporary for holding the result, initialized to the input
45741 operand to ease control flow. */
45742 res = gen_reg_rtx (mode);
45743 emit_move_insn (res, operand1);
45745 /* xa = abs (operand1) */
45746 xa = ix86_expand_sse_fabs (res, &mask);
45748 /* if (!isless (xa, TWO52)) goto label; */
45749 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45751 /* x = (double)(long)x */
45752 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45753 expand_fix (xi, res, 0);
45754 expand_float (res, xi, 0);
45756 if (HONOR_SIGNED_ZEROS (mode))
45757 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45759 emit_label (label);
45760 LABEL_NUSES (label) = 1;
45762 emit_move_insn (operand0, res);
45765 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45766 into OPERAND0. */
45767 void
45768 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45770 machine_mode mode = GET_MODE (operand0);
45771 rtx xa, mask, TWO52, one, res, smask, tmp;
45772 rtx_code_label *label;
45774 /* C code for SSE variant we expand below.
45775 double xa = fabs (x), x2;
45776 if (!isless (xa, TWO52))
45777 return x;
45778 xa2 = xa + TWO52 - TWO52;
45779 Compensate:
45780 if (xa2 > xa)
45781 xa2 -= 1.0;
45782 x2 = copysign (xa2, x);
45783 return x2;
45786 TWO52 = ix86_gen_TWO52 (mode);
45788 /* Temporary for holding the result, initialized to the input
45789 operand to ease control flow. */
45790 res = gen_reg_rtx (mode);
45791 emit_move_insn (res, operand1);
45793 /* xa = abs (operand1) */
45794 xa = ix86_expand_sse_fabs (res, &smask);
45796 /* if (!isless (xa, TWO52)) goto label; */
45797 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45799 /* res = xa + TWO52 - TWO52; */
45800 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45801 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45802 emit_move_insn (res, tmp);
45804 /* generate 1.0 */
45805 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45807 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45808 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45809 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45810 tmp = expand_simple_binop (mode, MINUS,
45811 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45812 emit_move_insn (res, tmp);
45814 /* res = copysign (res, operand1) */
45815 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45817 emit_label (label);
45818 LABEL_NUSES (label) = 1;
45820 emit_move_insn (operand0, res);
45823 /* Expand SSE sequence for computing round from OPERAND1 storing
45824 into OPERAND0. */
45825 void
45826 ix86_expand_round (rtx operand0, rtx operand1)
45828 /* C code for the stuff we're doing below:
45829 double xa = fabs (x);
45830 if (!isless (xa, TWO52))
45831 return x;
45832 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45833 return copysign (xa, x);
45835 machine_mode mode = GET_MODE (operand0);
45836 rtx res, TWO52, xa, xi, half, mask;
45837 rtx_code_label *label;
45838 const struct real_format *fmt;
45839 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45841 /* Temporary for holding the result, initialized to the input
45842 operand to ease control flow. */
45843 res = gen_reg_rtx (mode);
45844 emit_move_insn (res, operand1);
45846 TWO52 = ix86_gen_TWO52 (mode);
45847 xa = ix86_expand_sse_fabs (res, &mask);
45848 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45850 /* load nextafter (0.5, 0.0) */
45851 fmt = REAL_MODE_FORMAT (mode);
45852 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45853 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45855 /* xa = xa + 0.5 */
45856 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45857 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45859 /* xa = (double)(int64_t)xa */
45860 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45861 expand_fix (xi, xa, 0);
45862 expand_float (xa, xi, 0);
45864 /* res = copysign (xa, operand1) */
45865 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45867 emit_label (label);
45868 LABEL_NUSES (label) = 1;
45870 emit_move_insn (operand0, res);
45873 /* Expand SSE sequence for computing round
45874 from OP1 storing into OP0 using sse4 round insn. */
45875 void
45876 ix86_expand_round_sse4 (rtx op0, rtx op1)
45878 machine_mode mode = GET_MODE (op0);
45879 rtx e1, e2, res, half;
45880 const struct real_format *fmt;
45881 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45882 rtx (*gen_copysign) (rtx, rtx, rtx);
45883 rtx (*gen_round) (rtx, rtx, rtx);
45885 switch (mode)
45887 case E_SFmode:
45888 gen_copysign = gen_copysignsf3;
45889 gen_round = gen_sse4_1_roundsf2;
45890 break;
45891 case E_DFmode:
45892 gen_copysign = gen_copysigndf3;
45893 gen_round = gen_sse4_1_rounddf2;
45894 break;
45895 default:
45896 gcc_unreachable ();
45899 /* round (a) = trunc (a + copysign (0.5, a)) */
45901 /* load nextafter (0.5, 0.0) */
45902 fmt = REAL_MODE_FORMAT (mode);
45903 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45904 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45905 half = const_double_from_real_value (pred_half, mode);
45907 /* e1 = copysign (0.5, op1) */
45908 e1 = gen_reg_rtx (mode);
45909 emit_insn (gen_copysign (e1, half, op1));
45911 /* e2 = op1 + e1 */
45912 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45914 /* res = trunc (e2) */
45915 res = gen_reg_rtx (mode);
45916 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45918 emit_move_insn (op0, res);
45922 /* Table of valid machine attributes. */
45923 static const struct attribute_spec ix86_attribute_table[] =
45925 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
45926 affects_type_identity, handler, exclude } */
45927 /* Stdcall attribute says callee is responsible for popping arguments
45928 if they are not variable. */
45929 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45930 NULL },
45931 /* Fastcall attribute says callee is responsible for popping arguments
45932 if they are not variable. */
45933 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45934 NULL },
45935 /* Thiscall attribute says callee is responsible for popping arguments
45936 if they are not variable. */
45937 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45938 NULL },
45939 /* Cdecl attribute says the callee is a normal C declaration. */
45940 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45941 NULL },
45942 /* Regparm attribute specifies how many integer arguments are to be
45943 passed in registers. */
45944 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
45945 NULL },
45946 /* Sseregparm attribute says we are using x86_64 calling conventions
45947 for FP arguments. */
45948 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45949 NULL },
45950 /* The transactional memory builtins are implicitly regparm or fastcall
45951 depending on the ABI. Override the generic do-nothing attribute that
45952 these builtins were declared with. */
45953 { "*tm regparm", 0, 0, false, true, true, true,
45954 ix86_handle_tm_regparm_attribute, NULL },
45955 /* force_align_arg_pointer says this function realigns the stack at entry. */
45956 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45957 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
45958 NULL },
45959 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45960 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
45961 NULL },
45962 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
45963 NULL },
45964 { "shared", 0, 0, true, false, false, false,
45965 ix86_handle_shared_attribute, NULL },
45966 #endif
45967 { "ms_struct", 0, 0, false, false, false, false,
45968 ix86_handle_struct_attribute, NULL },
45969 { "gcc_struct", 0, 0, false, false, false, false,
45970 ix86_handle_struct_attribute, NULL },
45971 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45972 SUBTARGET_ATTRIBUTE_TABLE,
45973 #endif
45974 /* ms_abi and sysv_abi calling convention function attributes. */
45975 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
45976 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
45977 NULL },
45978 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45979 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45980 { "ms_hook_prologue", 0, 0, true, false, false, false,
45981 ix86_handle_fndecl_attribute, NULL },
45982 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
45983 ix86_handle_callee_pop_aggregate_return, NULL },
45984 { "interrupt", 0, 0, false, true, true, false,
45985 ix86_handle_interrupt_attribute, NULL },
45986 { "no_caller_saved_registers", 0, 0, false, true, true, false,
45987 ix86_handle_no_caller_saved_registers_attribute, NULL },
45988 { "naked", 0, 0, true, false, false, false,
45989 ix86_handle_fndecl_attribute, NULL },
45990 { "indirect_branch", 1, 1, true, false, false, false,
45991 ix86_handle_fndecl_attribute, NULL },
45992 { "function_return", 1, 1, true, false, false, false,
45993 ix86_handle_fndecl_attribute, NULL },
45995 /* End element. */
45996 { NULL, 0, 0, false, false, false, false, NULL, NULL }
45999 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46000 static int
46001 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46002 tree vectype, int)
46004 bool fp = false;
46005 machine_mode mode = TImode;
46006 int index;
46007 if (vectype != NULL)
46009 fp = FLOAT_TYPE_P (vectype);
46010 mode = TYPE_MODE (vectype);
46013 switch (type_of_cost)
46015 case scalar_stmt:
46016 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
46018 case scalar_load:
46019 /* Load/store costs are relative to a register move, which is 2. Recompute
46020 them to COSTS_N_INSNS so everything has the same base. */
46021 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
46022 : ix86_cost->int_load [2]) / 2;
46024 case scalar_store:
46025 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
46026 : ix86_cost->int_store [2]) / 2;
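/* For example, with this scaling a cost-table entry of 4 (twice the cost
   of a register move) becomes COSTS_N_INSNS (4) / 2, which is the same as
   COSTS_N_INSNS (2).  */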
46028 case vector_stmt:
46029 return ix86_vec_cost (mode,
46030 fp ? ix86_cost->addss : ix86_cost->sse_op,
46031 true);
46033 case vector_load:
46034 index = sse_store_index (mode);
46035 /* See PR82713 - we may end up being called on non-vector type. */
46036 if (index < 0)
46037 index = 2;
46038 return ix86_vec_cost (mode,
46039 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
46040 true);
46042 case vector_store:
46043 index = sse_store_index (mode);
46044 /* See PR82713 - we may end up being called on non-vector type. */
46045 if (index < 0)
46046 index = 2;
46047 return ix86_vec_cost (mode,
46048 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
46049 true);
46051 case vec_to_scalar:
46052 case scalar_to_vec:
46053 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
46055 /* We should have separate costs for unaligned loads and gather/scatter.
46056 Do that incrementally. */
46057 case unaligned_load:
46058 index = sse_store_index (mode);
46059 /* See PR82713 - we may end up being called on non-vector type. */
46060 if (index < 0)
46061 index = 2;
46062 return ix86_vec_cost (mode,
46063 COSTS_N_INSNS
46064 (ix86_cost->sse_unaligned_load[index]) / 2,
46065 true);
46067 case unaligned_store:
46068 index = sse_store_index (mode);
46069 /* See PR82713 - we may end up being called on non-vector type. */
46070 if (index < 0)
46071 index = 2;
46072 return ix86_vec_cost (mode,
46073 COSTS_N_INSNS
46074 (ix86_cost->sse_unaligned_store[index]) / 2,
46075 true);
46077 case vector_gather_load:
46078 return ix86_vec_cost (mode,
46079 COSTS_N_INSNS
46080 (ix86_cost->gather_static
46081 + ix86_cost->gather_per_elt
46082 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46083 true);
46085 case vector_scatter_store:
46086 return ix86_vec_cost (mode,
46087 COSTS_N_INSNS
46088 (ix86_cost->scatter_static
46089 + ix86_cost->scatter_per_elt
46090 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46091 true);
46093 case cond_branch_taken:
46094 return ix86_cost->cond_taken_branch_cost;
46096 case cond_branch_not_taken:
46097 return ix86_cost->cond_not_taken_branch_cost;
46099 case vec_perm:
46100 case vec_promote_demote:
46101 return ix86_vec_cost (mode,
46102 ix86_cost->sse_op, true);
46104 case vec_construct:
46106 /* N element inserts. */
46107 int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
46108 /* One vinserti128 for combining two SSE vectors for AVX256. */
46109 if (GET_MODE_BITSIZE (mode) == 256)
46110 cost += ix86_vec_cost (mode, ix86_cost->addss, true);
46111 /* One vinserti64x4 and two vinserti128 for combining SSE
46112 and AVX256 vectors to AVX512. */
46113 else if (GET_MODE_BITSIZE (mode) == 512)
46114 cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
46115 return cost;
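/* For example, constructing a 256-bit vector is charged the N element
   inserts plus one extra combining insert (costed via addss above), while
   a 512-bit vector is charged three extra combining inserts (one
   vinserti64x4 and two vinserti128).  */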
46118 default:
46119 gcc_unreachable ();
46123 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46124 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46125 insn every time. */
46127 static GTY(()) rtx_insn *vselect_insn;
46129 /* Initialize vselect_insn. */
46131 static void
46132 init_vselect_insn (void)
46134 unsigned i;
46135 rtx x;
46137 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46138 for (i = 0; i < MAX_VECT_LEN; ++i)
46139 XVECEXP (x, 0, i) = const0_rtx;
46140 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46141 const0_rtx), x);
46142 x = gen_rtx_SET (const0_rtx, x);
46143 start_sequence ();
46144 vselect_insn = emit_insn (x);
46145 end_sequence ();
46148 /* Construct (set target (vec_select op0 (parallel perm))) and
46149 return true if that's a valid instruction in the active ISA. */
46151 static bool
46152 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46153 unsigned nelt, bool testing_p)
46155 unsigned int i;
46156 rtx x, save_vconcat;
46157 int icode;
46159 if (vselect_insn == NULL_RTX)
46160 init_vselect_insn ();
46162 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46163 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46164 for (i = 0; i < nelt; ++i)
46165 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46166 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46167 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46168 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46169 SET_DEST (PATTERN (vselect_insn)) = target;
46170 icode = recog_memoized (vselect_insn);
46172 if (icode >= 0 && !testing_p)
46173 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46175 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46176 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46177 INSN_CODE (vselect_insn) = -1;
46179 return icode >= 0;
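/* For example, with nelt == 2 and perm == {1, 0}, the cached insn is
   patched into (set target (vec_select op0 (parallel [1 0]))) and
   recog_memoized reports whether some pattern in the active ISA matches
   it; when testing_p nothing is emitted.  */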
46182 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46184 static bool
46185 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46186 const unsigned char *perm, unsigned nelt,
46187 bool testing_p)
46189 machine_mode v2mode;
46190 rtx x;
46191 bool ok;
46193 if (vselect_insn == NULL_RTX)
46194 init_vselect_insn ();
46196 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46197 return false;
46198 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46199 PUT_MODE (x, v2mode);
46200 XEXP (x, 0) = op0;
46201 XEXP (x, 1) = op1;
46202 ok = expand_vselect (target, x, perm, nelt, testing_p);
46203 XEXP (x, 0) = const0_rtx;
46204 XEXP (x, 1) = const0_rtx;
46205 return ok;
46208 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46209 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46211 static bool
46212 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46214 machine_mode mmode, vmode = d->vmode;
46215 unsigned i, mask, nelt = d->nelt;
46216 rtx target, op0, op1, maskop, x;
46217 rtx rperm[32], vperm;
46219 if (d->one_operand_p)
46220 return false;
46221 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46222 && (TARGET_AVX512BW
46223 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46225 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46227 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46229 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46231 else
46232 return false;
46234 /* This is a blend, not a permute. Elements must stay in their
46235 respective lanes. */
46236 for (i = 0; i < nelt; ++i)
46238 unsigned e = d->perm[i];
46239 if (!(e == i || e == i + nelt))
46240 return false;
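/* For example, for V4SImode the permutation {0, 5, 2, 7} is a blend:
   element i of the result comes from slot i of either op0 or op1.  The
   permutation {1, 5, 2, 7} moves op0's element 1 into slot 0 and is
   rejected here.  */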
46243 if (d->testing_p)
46244 return true;
46246 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46247 decision should be extracted elsewhere, so that we only try that
46248 sequence once all budget==3 options have been tried. */
46249 target = d->target;
46250 op0 = d->op0;
46251 op1 = d->op1;
46252 mask = 0;
46254 switch (vmode)
46256 case E_V8DFmode:
46257 case E_V16SFmode:
46258 case E_V4DFmode:
46259 case E_V8SFmode:
46260 case E_V2DFmode:
46261 case E_V4SFmode:
46262 case E_V8HImode:
46263 case E_V8SImode:
46264 case E_V32HImode:
46265 case E_V64QImode:
46266 case E_V16SImode:
46267 case E_V8DImode:
46268 for (i = 0; i < nelt; ++i)
46269 mask |= (d->perm[i] >= nelt) << i;
46270 break;
46272 case E_V2DImode:
46273 for (i = 0; i < 2; ++i)
46274 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46275 vmode = V8HImode;
46276 goto do_subreg;
46278 case E_V4SImode:
46279 for (i = 0; i < 4; ++i)
46280 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46281 vmode = V8HImode;
46282 goto do_subreg;
46284 case E_V16QImode:
46285 /* See if bytes move in pairs so we can use pblendw with
46286 an immediate argument, rather than pblendvb with a vector
46287 argument. */
46288 for (i = 0; i < 16; i += 2)
46289 if (d->perm[i] + 1 != d->perm[i + 1])
46291 use_pblendvb:
46292 for (i = 0; i < nelt; ++i)
46293 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46295 finish_pblendvb:
46296 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46297 vperm = force_reg (vmode, vperm);
46299 if (GET_MODE_SIZE (vmode) == 16)
46300 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46301 else
46302 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46303 if (target != d->target)
46304 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46305 return true;
46308 for (i = 0; i < 8; ++i)
46309 mask |= (d->perm[i * 2] >= 16) << i;
46310 vmode = V8HImode;
46311 /* FALLTHRU */
46313 do_subreg:
46314 target = gen_reg_rtx (vmode);
46315 op0 = gen_lowpart (vmode, op0);
46316 op1 = gen_lowpart (vmode, op1);
46317 break;
46319 case E_V32QImode:
46320 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46321 for (i = 0; i < 32; i += 2)
46322 if (d->perm[i] + 1 != d->perm[i + 1])
46323 goto use_pblendvb;
46324 /* See if bytes move in quadruplets. If yes, vpblendd
46325 with immediate can be used. */
46326 for (i = 0; i < 32; i += 4)
46327 if (d->perm[i] + 2 != d->perm[i + 2])
46328 break;
46329 if (i < 32)
46331 /* See if bytes move the same in both lanes. If yes,
46332 vpblendw with immediate can be used. */
46333 for (i = 0; i < 16; i += 2)
46334 if (d->perm[i] + 16 != d->perm[i + 16])
46335 goto use_pblendvb;
46337 /* Use vpblendw. */
46338 for (i = 0; i < 16; ++i)
46339 mask |= (d->perm[i * 2] >= 32) << i;
46340 vmode = V16HImode;
46341 goto do_subreg;
46344 /* Use vpblendd. */
46345 for (i = 0; i < 8; ++i)
46346 mask |= (d->perm[i * 4] >= 32) << i;
46347 vmode = V8SImode;
46348 goto do_subreg;
46350 case E_V16HImode:
46351 /* See if words move in pairs. If yes, vpblendd can be used. */
46352 for (i = 0; i < 16; i += 2)
46353 if (d->perm[i] + 1 != d->perm[i + 1])
46354 break;
46355 if (i < 16)
46357 /* See if words move the same in both lanes. If not,
46358 vpblendvb must be used. */
46359 for (i = 0; i < 8; i++)
46360 if (d->perm[i] + 8 != d->perm[i + 8])
46362 /* Use vpblendvb. */
46363 for (i = 0; i < 32; ++i)
46364 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46366 vmode = V32QImode;
46367 nelt = 32;
46368 target = gen_reg_rtx (vmode);
46369 op0 = gen_lowpart (vmode, op0);
46370 op1 = gen_lowpart (vmode, op1);
46371 goto finish_pblendvb;
46374 /* Use vpblendw. */
46375 for (i = 0; i < 16; ++i)
46376 mask |= (d->perm[i] >= 16) << i;
46377 break;
46380 /* Use vpblendd. */
46381 for (i = 0; i < 8; ++i)
46382 mask |= (d->perm[i * 2] >= 16) << i;
46383 vmode = V8SImode;
46384 goto do_subreg;
46386 case E_V4DImode:
46387 /* Use vpblendd. */
46388 for (i = 0; i < 4; ++i)
46389 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46390 vmode = V8SImode;
46391 goto do_subreg;
46393 default:
46394 gcc_unreachable ();
46397 switch (vmode)
46399 case E_V8DFmode:
46400 case E_V8DImode:
46401 mmode = QImode;
46402 break;
46403 case E_V16SFmode:
46404 case E_V16SImode:
46405 mmode = HImode;
46406 break;
46407 case E_V32HImode:
46408 mmode = SImode;
46409 break;
46410 case E_V64QImode:
46411 mmode = DImode;
46412 break;
46413 default:
46414 mmode = VOIDmode;
46417 if (mmode != VOIDmode)
46418 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46419 else
46420 maskop = GEN_INT (mask);
46422 /* This matches five different patterns, depending on the mode. */
46423 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46424 x = gen_rtx_SET (target, x);
46425 emit_insn (x);
46426 if (target != d->target)
46427 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46429 return true;
46432 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46433 in terms of the variable form of vpermilps.
46435 Note that we will have already failed the immediate input vpermilps,
46436 which requires that the high and low part shuffle be identical; the
46437 variable form doesn't require that. */
46439 static bool
46440 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46442 rtx rperm[8], vperm;
46443 unsigned i;
46445 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46446 return false;
46448 /* We can only permute within the 128-bit lane. */
46449 for (i = 0; i < 8; ++i)
46451 unsigned e = d->perm[i];
46452 if (i < 4 ? e >= 4 : e < 4)
46453 return false;
46456 if (d->testing_p)
46457 return true;
46459 for (i = 0; i < 8; ++i)
46461 unsigned e = d->perm[i];
46463 /* Within each 128-bit lane, the elements of op0 are numbered
46464 from 0 and the elements of op1 are numbered from 4. */
46465 if (e >= 8 + 4)
46466 e -= 8;
46467 else if (e >= 4)
46468 e -= 4;
46470 rperm[i] = GEN_INT (e);
46473 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46474 vperm = force_reg (V8SImode, vperm);
46475 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46477 return true;
46480 /* Return true if permutation D can be performed as a VMODE permutation
46481 instead. */
46483 static bool
46484 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46486 unsigned int i, j, chunk;
46488 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46489 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46490 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46491 return false;
46493 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46494 return true;
46496 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46497 for (i = 0; i < d->nelt; i += chunk)
46498 if (d->perm[i] & (chunk - 1))
46499 return false;
46500 else
46501 for (j = 1; j < chunk; ++j)
46502 if (d->perm[i] + j != d->perm[i + j])
46503 return false;
46505 return true;
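/* For example, a V16QImode permutation whose bytes move in aligned pairs,
   such as {2, 3, 0, 1, 6, 7, 4, 5, ...}, is also valid as the V8HImode
   permutation {1, 0, 3, 2, ...}.  */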
46508 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46509 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46511 static bool
46512 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46514 unsigned i, nelt, eltsz, mask;
46515 unsigned char perm[64];
46516 machine_mode vmode = V16QImode;
46517 rtx rperm[64], vperm, target, op0, op1;
46519 nelt = d->nelt;
46521 if (!d->one_operand_p)
46523 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46525 if (TARGET_AVX2
46526 && valid_perm_using_mode_p (V2TImode, d))
46528 if (d->testing_p)
46529 return true;
46531 /* Use vperm2i128 insn. The pattern uses
46532 V4DImode instead of V2TImode. */
46533 target = d->target;
46534 if (d->vmode != V4DImode)
46535 target = gen_reg_rtx (V4DImode);
46536 op0 = gen_lowpart (V4DImode, d->op0);
46537 op1 = gen_lowpart (V4DImode, d->op1);
46538 rperm[0]
46539 = GEN_INT ((d->perm[0] / (nelt / 2))
46540 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46541 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46542 if (target != d->target)
46543 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46544 return true;
46546 return false;
46549 else
46551 if (GET_MODE_SIZE (d->vmode) == 16)
46553 if (!TARGET_SSSE3)
46554 return false;
46556 else if (GET_MODE_SIZE (d->vmode) == 32)
46558 if (!TARGET_AVX2)
46559 return false;
46561 /* V4DImode should already be handled through
46562 expand_vselect by the vpermq instruction. */
46563 gcc_assert (d->vmode != V4DImode);
46565 vmode = V32QImode;
46566 if (d->vmode == V8SImode
46567 || d->vmode == V16HImode
46568 || d->vmode == V32QImode)
46570 /* First see if vpermq can be used for
46571 V8SImode/V16HImode/V32QImode. */
46572 if (valid_perm_using_mode_p (V4DImode, d))
46574 for (i = 0; i < 4; i++)
46575 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46576 if (d->testing_p)
46577 return true;
46578 target = gen_reg_rtx (V4DImode);
46579 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46580 perm, 4, false))
46582 emit_move_insn (d->target,
46583 gen_lowpart (d->vmode, target));
46584 return true;
46586 return false;
46589 /* Next see if vpermd can be used. */
46590 if (valid_perm_using_mode_p (V8SImode, d))
46591 vmode = V8SImode;
46593 /* Or if vpermps can be used. */
46594 else if (d->vmode == V8SFmode)
46595 vmode = V8SImode;
46597 if (vmode == V32QImode)
46599 /* vpshufb only works within a lane; it is not
46600 possible to shuffle bytes between lanes. */
46601 for (i = 0; i < nelt; ++i)
46602 if ((d->perm[i] ^ i) & (nelt / 2))
46603 return false;
46606 else if (GET_MODE_SIZE (d->vmode) == 64)
46608 if (!TARGET_AVX512BW)
46609 return false;
46611 /* If vpermq didn't work, vpshufb won't work either. */
46612 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46613 return false;
46615 vmode = V64QImode;
46616 if (d->vmode == V16SImode
46617 || d->vmode == V32HImode
46618 || d->vmode == V64QImode)
46620 /* First see if vpermq can be used for
46621 V16SImode/V32HImode/V64QImode. */
46622 if (valid_perm_using_mode_p (V8DImode, d))
46624 for (i = 0; i < 8; i++)
46625 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46626 if (d->testing_p)
46627 return true;
46628 target = gen_reg_rtx (V8DImode);
46629 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46630 perm, 8, false))
46632 emit_move_insn (d->target,
46633 gen_lowpart (d->vmode, target));
46634 return true;
46636 return false;
46639 /* Next see if vpermd can be used. */
46640 if (valid_perm_using_mode_p (V16SImode, d))
46641 vmode = V16SImode;
46643 /* Or if vpermps can be used. */
46644 else if (d->vmode == V16SFmode)
46645 vmode = V16SImode;
46646 if (vmode == V64QImode)
46648 /* vpshufb only works within a lane; it is not
46649 possible to shuffle bytes between lanes. */
46650 for (i = 0; i < nelt; ++i)
46651 if ((d->perm[i] ^ i) & (nelt / 4))
46652 return false;
46655 else
46656 return false;
46659 if (d->testing_p)
46660 return true;
46662 if (vmode == V8SImode)
46663 for (i = 0; i < 8; ++i)
46664 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46665 else if (vmode == V16SImode)
46666 for (i = 0; i < 16; ++i)
46667 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46668 else
46670 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46671 if (!d->one_operand_p)
46672 mask = 2 * nelt - 1;
46673 else if (vmode == V16QImode)
46674 mask = nelt - 1;
46675 else if (vmode == V64QImode)
46676 mask = nelt / 4 - 1;
46677 else
46678 mask = nelt / 2 - 1;
46680 for (i = 0; i < nelt; ++i)
46682 unsigned j, e = d->perm[i] & mask;
46683 for (j = 0; j < eltsz; ++j)
46684 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46688 vperm = gen_rtx_CONST_VECTOR (vmode,
46689 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46690 vperm = force_reg (vmode, vperm);
46692 target = d->target;
46693 if (d->vmode != vmode)
46694 target = gen_reg_rtx (vmode);
46695 op0 = gen_lowpart (vmode, d->op0);
46696 if (d->one_operand_p)
46698 if (vmode == V16QImode)
46699 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46700 else if (vmode == V32QImode)
46701 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46702 else if (vmode == V64QImode)
46703 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46704 else if (vmode == V8SFmode)
46705 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46706 else if (vmode == V8SImode)
46707 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46708 else if (vmode == V16SFmode)
46709 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46710 else if (vmode == V16SImode)
46711 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46712 else
46713 gcc_unreachable ();
46715 else
46717 op1 = gen_lowpart (vmode, d->op1);
46718 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46720 if (target != d->target)
46721 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46723 return true;
46726 /* For V*[QHS]Imode permutations, check whether the same permutation
46727 can be performed in a 2x, 4x or 8x wider inner mode. */
46729 static bool
46730 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46731 struct expand_vec_perm_d *nd)
46733 int i;
46734 machine_mode mode = VOIDmode;
46736 switch (d->vmode)
46738 case E_V16QImode: mode = V8HImode; break;
46739 case E_V32QImode: mode = V16HImode; break;
46740 case E_V64QImode: mode = V32HImode; break;
46741 case E_V8HImode: mode = V4SImode; break;
46742 case E_V16HImode: mode = V8SImode; break;
46743 case E_V32HImode: mode = V16SImode; break;
46744 case E_V4SImode: mode = V2DImode; break;
46745 case E_V8SImode: mode = V4DImode; break;
46746 case E_V16SImode: mode = V8DImode; break;
46747 default: return false;
46749 for (i = 0; i < d->nelt; i += 2)
46750 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46751 return false;
46752 nd->vmode = mode;
46753 nd->nelt = d->nelt / 2;
46754 for (i = 0; i < nd->nelt; i++)
46755 nd->perm[i] = d->perm[2 * i] / 2;
46756 if (GET_MODE_INNER (mode) != DImode)
46757 canonicalize_vector_int_perm (nd, nd);
46758 if (nd != d)
46760 nd->one_operand_p = d->one_operand_p;
46761 nd->testing_p = d->testing_p;
46762 if (d->op0 == d->op1)
46763 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46764 else
46766 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46767 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46769 if (d->testing_p)
46770 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46771 else
46772 nd->target = gen_reg_rtx (nd->vmode);
46774 return true;
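/* For example, the V8SImode lane swap {4, 5, 6, 7, 0, 1, 2, 3} is narrowed
   here to the V4DImode permutation {2, 3, 0, 1}.  */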
46777 /* Try to expand one-operand permutation with constant mask. */
46779 static bool
46780 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46782 machine_mode mode = GET_MODE (d->op0);
46783 machine_mode maskmode = mode;
46784 rtx (*gen) (rtx, rtx, rtx) = NULL;
46785 rtx target, op0, mask;
46786 rtx vec[64];
46788 if (!rtx_equal_p (d->op0, d->op1))
46789 return false;
46791 if (!TARGET_AVX512F)
46792 return false;
46794 switch (mode)
46796 case E_V16SImode:
46797 gen = gen_avx512f_permvarv16si;
46798 break;
46799 case E_V16SFmode:
46800 gen = gen_avx512f_permvarv16sf;
46801 maskmode = V16SImode;
46802 break;
46803 case E_V8DImode:
46804 gen = gen_avx512f_permvarv8di;
46805 break;
46806 case E_V8DFmode:
46807 gen = gen_avx512f_permvarv8df;
46808 maskmode = V8DImode;
46809 break;
46810 default:
46811 return false;
46814 target = d->target;
46815 op0 = d->op0;
46816 for (int i = 0; i < d->nelt; ++i)
46817 vec[i] = GEN_INT (d->perm[i]);
46818 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46819 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46820 return true;
46823 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46824 in a single instruction. */
46826 static bool
46827 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46829 unsigned i, nelt = d->nelt;
46830 struct expand_vec_perm_d nd;
46832 /* Check plain VEC_SELECT first, because AVX has instructions that could
46833 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46834 input where SEL+CONCAT may not. */
46835 if (d->one_operand_p)
46837 int mask = nelt - 1;
46838 bool identity_perm = true;
46839 bool broadcast_perm = true;
46841 for (i = 0; i < nelt; i++)
46843 nd.perm[i] = d->perm[i] & mask;
46844 if (nd.perm[i] != i)
46845 identity_perm = false;
46846 if (nd.perm[i])
46847 broadcast_perm = false;
46850 if (identity_perm)
46852 if (!d->testing_p)
46853 emit_move_insn (d->target, d->op0);
46854 return true;
46856 else if (broadcast_perm && TARGET_AVX2)
46858 /* Use vpbroadcast{b,w,d}. */
46859 rtx (*gen) (rtx, rtx) = NULL;
46860 switch (d->vmode)
46862 case E_V64QImode:
46863 if (TARGET_AVX512BW)
46864 gen = gen_avx512bw_vec_dupv64qi_1;
46865 break;
46866 case E_V32QImode:
46867 gen = gen_avx2_pbroadcastv32qi_1;
46868 break;
46869 case E_V32HImode:
46870 if (TARGET_AVX512BW)
46871 gen = gen_avx512bw_vec_dupv32hi_1;
46872 break;
46873 case E_V16HImode:
46874 gen = gen_avx2_pbroadcastv16hi_1;
46875 break;
46876 case E_V16SImode:
46877 if (TARGET_AVX512F)
46878 gen = gen_avx512f_vec_dupv16si_1;
46879 break;
46880 case E_V8SImode:
46881 gen = gen_avx2_pbroadcastv8si_1;
46882 break;
46883 case E_V16QImode:
46884 gen = gen_avx2_pbroadcastv16qi;
46885 break;
46886 case E_V8HImode:
46887 gen = gen_avx2_pbroadcastv8hi;
46888 break;
46889 case E_V16SFmode:
46890 if (TARGET_AVX512F)
46891 gen = gen_avx512f_vec_dupv16sf_1;
46892 break;
46893 case E_V8SFmode:
46894 gen = gen_avx2_vec_dupv8sf_1;
46895 break;
46896 case E_V8DFmode:
46897 if (TARGET_AVX512F)
46898 gen = gen_avx512f_vec_dupv8df_1;
46899 break;
46900 case E_V8DImode:
46901 if (TARGET_AVX512F)
46902 gen = gen_avx512f_vec_dupv8di_1;
46903 break;
46904 /* For other modes prefer other shuffles this function creates. */
46905 default: break;
46907 if (gen != NULL)
46909 if (!d->testing_p)
46910 emit_insn (gen (d->target, d->op0));
46911 return true;
46915 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46916 return true;
46918 /* There are plenty of patterns in sse.md that are written for
46919 SEL+CONCAT and are not replicated for a single op. Perhaps
46920 that should be changed, to avoid the nastiness here. */
46922 /* Recognize interleave style patterns, which means incrementing
46923 every other permutation operand. */
46924 for (i = 0; i < nelt; i += 2)
46926 nd.perm[i] = d->perm[i] & mask;
46927 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46929 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46930 d->testing_p))
46931 return true;
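/* For example, with nelt == 4 and d->perm == {0, 0, 1, 1}, nd.perm becomes
   {0, 4, 1, 5}, i.e. the interleave-low pattern on (op0, op0).  */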
46933 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46934 if (nelt >= 4)
46936 for (i = 0; i < nelt; i += 4)
46938 nd.perm[i + 0] = d->perm[i + 0] & mask;
46939 nd.perm[i + 1] = d->perm[i + 1] & mask;
46940 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46941 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46944 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46945 d->testing_p))
46946 return true;
46950 /* Finally, try the fully general two operand permute. */
46951 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46952 d->testing_p))
46953 return true;
46955 /* Recognize interleave style patterns with reversed operands. */
46956 if (!d->one_operand_p)
46958 for (i = 0; i < nelt; ++i)
46960 unsigned e = d->perm[i];
46961 if (e >= nelt)
46962 e -= nelt;
46963 else
46964 e += nelt;
46965 nd.perm[i] = e;
46968 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46969 d->testing_p))
46970 return true;
46973 /* Try the SSE4.1 blend variable merge instructions. */
46974 if (expand_vec_perm_blend (d))
46975 return true;
46977 /* Try one of the AVX vpermil variable permutations. */
46978 if (expand_vec_perm_vpermil (d))
46979 return true;
46981 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46982 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46983 if (expand_vec_perm_pshufb (d))
46984 return true;
46986 /* Try the AVX2 vpalignr instruction. */
46987 if (expand_vec_perm_palignr (d, true))
46988 return true;
46990 /* Try the AVX512F vperm{s,d} instructions. */
46991 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46992 return true;
46994 /* Try the AVX512F vpermt2/vpermi2 instructions. */
46995 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46996 return true;
46998 /* See if we can get the same permutation in a different vector integer
46999 mode. */
47000 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47002 if (!d->testing_p)
47003 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47004 return true;
47006 return false;
47009 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47010 in terms of a pair of pshuflw + pshufhw instructions. */
47012 static bool
47013 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47015 unsigned char perm2[MAX_VECT_LEN];
47016 unsigned i;
47017 bool ok;
47019 if (d->vmode != V8HImode || !d->one_operand_p)
47020 return false;
47022 /* The two permutations only operate in 64-bit lanes. */
47023 for (i = 0; i < 4; ++i)
47024 if (d->perm[i] >= 4)
47025 return false;
47026 for (i = 4; i < 8; ++i)
47027 if (d->perm[i] < 4)
47028 return false;
47030 if (d->testing_p)
47031 return true;
47033 /* Emit the pshuflw. */
47034 memcpy (perm2, d->perm, 4);
47035 for (i = 4; i < 8; ++i)
47036 perm2[i] = i;
47037 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47038 gcc_assert (ok);
47040 /* Emit the pshufhw. */
47041 memcpy (perm2 + 4, d->perm + 4, 4);
47042 for (i = 0; i < 4; ++i)
47043 perm2[i] = i;
47044 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47045 gcc_assert (ok);
47047 return true;
47050 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47051 the permutation using the SSSE3 palignr instruction. This succeeds
47052 when all of the elements in PERM fit within one vector and we merely
47053 need to shift them down so that a single vector permutation has a
47054 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47055 the vpalignr instruction itself can perform the requested permutation. */
47057 static bool
47058 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47060 unsigned i, nelt = d->nelt;
47061 unsigned min, max, minswap, maxswap;
47062 bool in_order, ok, swap = false;
47063 rtx shift, target;
47064 struct expand_vec_perm_d dcopy;
47066 /* Even with AVX, palignr only operates on 128-bit vectors;
47067 with AVX2, palignr operates on both 128-bit lanes. */
47068 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47069 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47070 return false;
47072 min = 2 * nelt;
47073 max = 0;
47074 minswap = 2 * nelt;
47075 maxswap = 0;
47076 for (i = 0; i < nelt; ++i)
47078 unsigned e = d->perm[i];
47079 unsigned eswap = d->perm[i] ^ nelt;
47080 if (GET_MODE_SIZE (d->vmode) == 32)
47082 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47083 eswap = e ^ (nelt / 2);
47085 if (e < min)
47086 min = e;
47087 if (e > max)
47088 max = e;
47089 if (eswap < minswap)
47090 minswap = eswap;
47091 if (eswap > maxswap)
47092 maxswap = eswap;
47094 if (min == 0
47095 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47097 if (d->one_operand_p
47098 || minswap == 0
47099 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47100 ? nelt / 2 : nelt))
47101 return false;
47102 swap = true;
47103 min = minswap;
47104 max = maxswap;
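/* For example, for a two-operand V16QImode permutation whose selected
   elements all lie in the range 5..20, min is 5 and a palignr by 5 bytes
   moves every needed byte into a single vector, after which a one-operand
   shuffle (pshufb) can finish the job.  */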
47107 /* Given that we have SSSE3, we know we'll be able to implement the
47108 single operand permutation after the palignr with pshufb for
47109 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47110 first. */
47111 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47112 return true;
47114 dcopy = *d;
47115 if (swap)
47117 dcopy.op0 = d->op1;
47118 dcopy.op1 = d->op0;
47119 for (i = 0; i < nelt; ++i)
47120 dcopy.perm[i] ^= nelt;
47123 in_order = true;
47124 for (i = 0; i < nelt; ++i)
47126 unsigned e = dcopy.perm[i];
47127 if (GET_MODE_SIZE (d->vmode) == 32
47128 && e >= nelt
47129 && (e & (nelt / 2 - 1)) < min)
47130 e = e - min - (nelt / 2);
47131 else
47132 e = e - min;
47133 if (e != i)
47134 in_order = false;
47135 dcopy.perm[i] = e;
47137 dcopy.one_operand_p = true;
47139 if (single_insn_only_p && !in_order)
47140 return false;
47142 /* For AVX2, test whether we can permute the result in one instruction. */
47143 if (d->testing_p)
47145 if (in_order)
47146 return true;
47147 dcopy.op1 = dcopy.op0;
47148 return expand_vec_perm_1 (&dcopy);
47151 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47152 if (GET_MODE_SIZE (d->vmode) == 16)
47154 target = gen_reg_rtx (TImode);
47155 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47156 gen_lowpart (TImode, dcopy.op0), shift));
47158 else
47160 target = gen_reg_rtx (V2TImode);
47161 emit_insn (gen_avx2_palignrv2ti (target,
47162 gen_lowpart (V2TImode, dcopy.op1),
47163 gen_lowpart (V2TImode, dcopy.op0),
47164 shift));
47167 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47169 /* Test for the degenerate case where the alignment by itself
47170 produces the desired permutation. */
47171 if (in_order)
47173 emit_move_insn (d->target, dcopy.op0);
47174 return true;
47177 ok = expand_vec_perm_1 (&dcopy);
47178 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47180 return ok;
47183 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47184 the permutation using the SSE4_1 pblendv instruction. Potentially
47185 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and 1 pblendv. */
47187 static bool
47188 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47190 unsigned i, which, nelt = d->nelt;
47191 struct expand_vec_perm_d dcopy, dcopy1;
47192 machine_mode vmode = d->vmode;
47193 bool ok;
47195 /* Use the same checks as in expand_vec_perm_blend. */
47196 if (d->one_operand_p)
47197 return false;
47198 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47200 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47202 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47204 else
47205 return false;
47207 /* Figure out which permutation elements do not stay in their
47208 respective lanes. */
47209 for (i = 0, which = 0; i < nelt; ++i)
47211 unsigned e = d->perm[i];
47212 if (e != i)
47213 which |= (e < nelt ? 1 : 2);
47215 /* We can pblend the part whose elements do not stay in their
47216 respective lanes only when these elements all come from one
47217 half of the permutation.
47218 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47219 lanes, but both are >= 8.
47220 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47221 respective lanes, and 8 >= 8 but 2 is not. */
47222 if (which != 1 && which != 2)
47223 return false;
47224 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47225 return true;
47227 /* First apply a one-operand permutation to the part whose
47228 elements do not stay in their respective lanes. */
47229 dcopy = *d;
47230 if (which == 2)
47231 dcopy.op0 = dcopy.op1 = d->op1;
47232 else
47233 dcopy.op0 = dcopy.op1 = d->op0;
47234 if (!d->testing_p)
47235 dcopy.target = gen_reg_rtx (vmode);
47236 dcopy.one_operand_p = true;
47238 for (i = 0; i < nelt; ++i)
47239 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47241 ok = expand_vec_perm_1 (&dcopy);
47242 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47243 return false;
47244 else
47245 gcc_assert (ok);
47246 if (d->testing_p)
47247 return true;
47249 /* Next we put permuted elements into their positions. */
47250 dcopy1 = *d;
47251 if (which == 2)
47252 dcopy1.op1 = dcopy.target;
47253 else
47254 dcopy1.op0 = dcopy.target;
47256 for (i = 0; i < nelt; ++i)
47257 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47259 ok = expand_vec_perm_blend (&dcopy1);
47260 gcc_assert (ok);
47262 return true;
47265 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47267 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47268 a two vector permutation into a single vector permutation by using
47269 an interleave operation to merge the vectors. */
47271 static bool
47272 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47274 struct expand_vec_perm_d dremap, dfinal;
47275 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47276 unsigned HOST_WIDE_INT contents;
47277 unsigned char remap[2 * MAX_VECT_LEN];
47278 rtx_insn *seq;
47279 bool ok, same_halves = false;
47281 if (GET_MODE_SIZE (d->vmode) == 16)
47283 if (d->one_operand_p)
47284 return false;
47286 else if (GET_MODE_SIZE (d->vmode) == 32)
47288 if (!TARGET_AVX)
47289 return false;
47290 /* For 32-byte modes allow even d->one_operand_p.
47291 The lack of cross-lane shuffling in some instructions
47292 might prevent a single insn shuffle. */
47293 dfinal = *d;
47294 dfinal.testing_p = true;
47295 /* If expand_vec_perm_interleave3 can expand this into
47296 a 3-insn sequence, give up and let it be expanded that
47297 way. While that is one insn longer,
47298 it doesn't need a memory operand, and in the common
47299 case where the interleave-low and interleave-high permutations
47300 with the same operands are adjacent, it needs only 4 insns
47301 for both after CSE. */
47302 if (expand_vec_perm_interleave3 (&dfinal))
47303 return false;
47305 else
47306 return false;
47308 /* Examine from whence the elements come. */
47309 contents = 0;
47310 for (i = 0; i < nelt; ++i)
47311 contents |= HOST_WIDE_INT_1U << d->perm[i];
47313 memset (remap, 0xff, sizeof (remap));
47314 dremap = *d;
47316 if (GET_MODE_SIZE (d->vmode) == 16)
47318 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47320 /* Split the two input vectors into 4 halves. */
47321 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47322 h2 = h1 << nelt2;
47323 h3 = h2 << nelt2;
47324 h4 = h3 << nelt2;
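/* For example, with nelt == 8 (nelt2 == 4) the four masks are h1 == 0x000f,
   h2 == 0x00f0, h3 == 0x0f00 and h4 == 0xf000, covering the low/high halves
   of op0 and op1 respectively.  */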
47326 /* If the elements all come from the low halves, use interleave low; similarly
47327 for interleave high. If the elements are from mismatched halves, we
47328 can use shufps for V4SF/V4SI or do a DImode shuffle. */
47329 if ((contents & (h1 | h3)) == contents)
47331 /* punpckl* */
47332 for (i = 0; i < nelt2; ++i)
47334 remap[i] = i * 2;
47335 remap[i + nelt] = i * 2 + 1;
47336 dremap.perm[i * 2] = i;
47337 dremap.perm[i * 2 + 1] = i + nelt;
47339 if (!TARGET_SSE2 && d->vmode == V4SImode)
47340 dremap.vmode = V4SFmode;
47342 else if ((contents & (h2 | h4)) == contents)
47344 /* punpckh* */
47345 for (i = 0; i < nelt2; ++i)
47347 remap[i + nelt2] = i * 2;
47348 remap[i + nelt + nelt2] = i * 2 + 1;
47349 dremap.perm[i * 2] = i + nelt2;
47350 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47352 if (!TARGET_SSE2 && d->vmode == V4SImode)
47353 dremap.vmode = V4SFmode;
47355 else if ((contents & (h1 | h4)) == contents)
47357 /* shufps */
47358 for (i = 0; i < nelt2; ++i)
47360 remap[i] = i;
47361 remap[i + nelt + nelt2] = i + nelt2;
47362 dremap.perm[i] = i;
47363 dremap.perm[i + nelt2] = i + nelt + nelt2;
47365 if (nelt != 4)
47367 /* shufpd */
47368 dremap.vmode = V2DImode;
47369 dremap.nelt = 2;
47370 dremap.perm[0] = 0;
47371 dremap.perm[1] = 3;
47374 else if ((contents & (h2 | h3)) == contents)
47376 /* shufps */
47377 for (i = 0; i < nelt2; ++i)
47379 remap[i + nelt2] = i;
47380 remap[i + nelt] = i + nelt2;
47381 dremap.perm[i] = i + nelt2;
47382 dremap.perm[i + nelt2] = i + nelt;
47384 if (nelt != 4)
47386 /* shufpd */
47387 dremap.vmode = V2DImode;
47388 dremap.nelt = 2;
47389 dremap.perm[0] = 1;
47390 dremap.perm[1] = 2;
47393 else
47394 return false;
47396 else
47398 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47399 unsigned HOST_WIDE_INT q[8];
47400 unsigned int nonzero_halves[4];
47402 /* Split the two input vectors into 8 quarters. */
47403 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47404 for (i = 1; i < 8; ++i)
47405 q[i] = q[0] << (nelt4 * i);
47406 for (i = 0; i < 4; ++i)
47407 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47409 nonzero_halves[nzcnt] = i;
47410 ++nzcnt;
47413 if (nzcnt == 1)
47415 gcc_assert (d->one_operand_p);
47416 nonzero_halves[1] = nonzero_halves[0];
47417 same_halves = true;
47419 else if (d->one_operand_p)
47421 gcc_assert (nonzero_halves[0] == 0);
47422 gcc_assert (nonzero_halves[1] == 1);
47425 if (nzcnt <= 2)
47427 if (d->perm[0] / nelt2 == nonzero_halves[1])
47429 /* Attempt to increase the likelihood that dfinal
47430 shuffle will be intra-lane. */
47431 std::swap (nonzero_halves[0], nonzero_halves[1]);
47434 /* vperm2f128 or vperm2i128. */
47435 for (i = 0; i < nelt2; ++i)
47437 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47438 remap[i + nonzero_halves[0] * nelt2] = i;
47439 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47440 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47443 if (d->vmode != V8SFmode
47444 && d->vmode != V4DFmode
47445 && d->vmode != V8SImode)
47447 dremap.vmode = V8SImode;
47448 dremap.nelt = 8;
47449 for (i = 0; i < 4; ++i)
47451 dremap.perm[i] = i + nonzero_halves[0] * 4;
47452 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47456 else if (d->one_operand_p)
47457 return false;
47458 else if (TARGET_AVX2
47459 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47461 /* vpunpckl* */
47462 for (i = 0; i < nelt4; ++i)
47464 remap[i] = i * 2;
47465 remap[i + nelt] = i * 2 + 1;
47466 remap[i + nelt2] = i * 2 + nelt2;
47467 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47468 dremap.perm[i * 2] = i;
47469 dremap.perm[i * 2 + 1] = i + nelt;
47470 dremap.perm[i * 2 + nelt2] = i + nelt2;
47471 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47474 else if (TARGET_AVX2
47475 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47477 /* vpunpckh* */
47478 for (i = 0; i < nelt4; ++i)
47480 remap[i + nelt4] = i * 2;
47481 remap[i + nelt + nelt4] = i * 2 + 1;
47482 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47483 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47484 dremap.perm[i * 2] = i + nelt4;
47485 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47486 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47487 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47490 else
47491 return false;
47494 /* Use the remapping array set up above to move the elements from their
47495 swizzled locations into their final destinations. */
47496 dfinal = *d;
47497 for (i = 0; i < nelt; ++i)
47499 unsigned e = remap[d->perm[i]];
47500 gcc_assert (e < nelt);
47501 /* If same_halves is true, both halves of the remapped vector are the
47502 same. Avoid cross-lane accesses if possible. */
47503 if (same_halves && i >= nelt2)
47505 gcc_assert (e < nelt2);
47506 dfinal.perm[i] = e + nelt2;
47508 else
47509 dfinal.perm[i] = e;
47511 if (!d->testing_p)
47513 dremap.target = gen_reg_rtx (dremap.vmode);
47514 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47516 dfinal.op1 = dfinal.op0;
47517 dfinal.one_operand_p = true;
47519 /* Test if the final remap can be done with a single insn. For V4SFmode or
47520 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47521 start_sequence ();
47522 ok = expand_vec_perm_1 (&dfinal);
47523 seq = get_insns ();
47524 end_sequence ();
47526 if (!ok)
47527 return false;
47529 if (d->testing_p)
47530 return true;
47532 if (dremap.vmode != dfinal.vmode)
47534 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47535 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47538 ok = expand_vec_perm_1 (&dremap);
47539 gcc_assert (ok);
47541 emit_insn (seq);
47542 return true;
47545 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47546 a single vector cross-lane permutation into vpermq followed
47547 by any of the single insn permutations. */
47549 static bool
47550 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47552 struct expand_vec_perm_d dremap, dfinal;
47553 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47554 unsigned contents[2];
47555 bool ok;
47557 if (!(TARGET_AVX2
47558 && (d->vmode == V32QImode || d->vmode == V16HImode)
47559 && d->one_operand_p))
47560 return false;
47562 contents[0] = 0;
47563 contents[1] = 0;
47564 for (i = 0; i < nelt2; ++i)
47566 contents[0] |= 1u << (d->perm[i] / nelt4);
47567 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47570 for (i = 0; i < 2; ++i)
47572 unsigned int cnt = 0;
47573 for (j = 0; j < 4; ++j)
47574 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47575 return false;
47578 if (d->testing_p)
47579 return true;
47581 dremap = *d;
47582 dremap.vmode = V4DImode;
47583 dremap.nelt = 4;
47584 dremap.target = gen_reg_rtx (V4DImode);
47585 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47586 dremap.op1 = dremap.op0;
47587 dremap.one_operand_p = true;
47588 for (i = 0; i < 2; ++i)
47590 unsigned int cnt = 0;
47591 for (j = 0; j < 4; ++j)
47592 if ((contents[i] & (1u << j)) != 0)
47593 dremap.perm[2 * i + cnt++] = j;
47594 for (; cnt < 2; ++cnt)
47595 dremap.perm[2 * i + cnt] = 0;
47598 dfinal = *d;
47599 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47600 dfinal.op1 = dfinal.op0;
47601 dfinal.one_operand_p = true;
47602 for (i = 0, j = 0; i < nelt; ++i)
47604 if (i == nelt2)
47605 j = 2;
47606 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47607 if ((d->perm[i] / nelt4) == dremap.perm[j])
47609 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47610 dfinal.perm[i] |= nelt4;
47611 else
47612 gcc_unreachable ();
47615 ok = expand_vec_perm_1 (&dremap);
47616 gcc_assert (ok);
47618 ok = expand_vec_perm_1 (&dfinal);
47619 gcc_assert (ok);
47621 return true;
47624 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47625 a vector permutation using two instructions: vperm2f128 or
47626 vperm2i128 followed by any single in-lane permutation. */
47628 static bool
47629 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47631 struct expand_vec_perm_d dfirst, dsecond;
47632 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47633 bool ok;
47635 if (!TARGET_AVX
47636 || GET_MODE_SIZE (d->vmode) != 32
47637 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47638 return false;
47640 dsecond = *d;
47641 dsecond.one_operand_p = false;
47642 dsecond.testing_p = true;
47644 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47645 immediate. For perm < 16 the second permutation uses
47646 d->op0 as first operand, for perm >= 16 it uses d->op1
47647 as first operand. The second operand is the result of
47648 vperm2[fi]128. */
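/* For example, perm == 6 asks for the low half of d->op1 in the low result
   lane and the high half of d->op0 in the high result lane; per the formula
   above, the vperm2[fi]128 immediate is ((6 << 2) | 6) & 0x33 == 0x12.  */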
47649 for (perm = 0; perm < 32; perm++)
47651 /* Ignore permutations which do not move anything cross-lane. */
47652 if (perm < 16)
47654 /* The second shuffle for e.g. V4DFmode has
47655 0123 and ABCD operands.
47656 Ignore AB23, as 23 is already in the second lane
47657 of the first operand. */
47658 if ((perm & 0xc) == (1 << 2)) continue;
47659 /* And 01CD, as 01 is in the first lane of the first
47660 operand. */
47661 if ((perm & 3) == 0) continue;
47662 /* And 4567, as then the vperm2[fi]128 doesn't change
47663 anything on the original 4567 second operand. */
47664 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47666 else
47668 /* The second shuffle for e.g. V4DFmode has
47669 4567 and ABCD operands.
47670 Ignore AB67, as 67 is already in the second lane
47671 of the first operand. */
47672 if ((perm & 0xc) == (3 << 2)) continue;
47673 /* And 45CD, as 45 is in the first lane of the first
47674 operand. */
47675 if ((perm & 3) == 2) continue;
47676 /* And 0123, as then the vperm2[fi]128 doesn't change
47677 anything on the original 0123 first operand. */
47678 if ((perm & 0xf) == (1 << 2)) continue;
47681 for (i = 0; i < nelt; i++)
47683 j = d->perm[i] / nelt2;
47684 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47685 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47686 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47687 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47688 else
47689 break;
47692 if (i == nelt)
47694 start_sequence ();
47695 ok = expand_vec_perm_1 (&dsecond);
47696 end_sequence ();
47698 else
47699 ok = false;
47701 if (ok)
47703 if (d->testing_p)
47704 return true;
47706 /* Found a usable second shuffle. dfirst will be
47707 vperm2f128 on d->op0 and d->op1. */
47708 dsecond.testing_p = false;
47709 dfirst = *d;
47710 dfirst.target = gen_reg_rtx (d->vmode);
47711 for (i = 0; i < nelt; i++)
47712 dfirst.perm[i] = (i & (nelt2 - 1))
47713 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47715 canonicalize_perm (&dfirst);
47716 ok = expand_vec_perm_1 (&dfirst);
47717 gcc_assert (ok);
47719 /* And dsecond is some single insn shuffle, taking
47720 d->op0 and result of vperm2f128 (if perm < 16) or
47721 d->op1 and result of vperm2f128 (otherwise). */
47722 if (perm >= 16)
47723 dsecond.op0 = dsecond.op1;
47724 dsecond.op1 = dfirst.target;
47726 ok = expand_vec_perm_1 (&dsecond);
47727 gcc_assert (ok);
47729 return true;
47732 /* For one operand, the only useful vperm2f128 permutation is 0x01
47733 aka lanes swap. */
47734 if (d->one_operand_p)
47735 return false;
47738 return false;
47741 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47742 a two vector permutation using 2 intra-lane interleave insns
47743 and cross-lane shuffle for 32-byte vectors. */
47745 static bool
47746 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47748 unsigned i, nelt;
47749 rtx (*gen) (rtx, rtx, rtx);
47751 if (d->one_operand_p)
47752 return false;
47753 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47755 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47757 else
47758 return false;
47760 nelt = d->nelt;
47761 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47762 return false;
47763 for (i = 0; i < nelt; i += 2)
47764 if (d->perm[i] != d->perm[0] + i / 2
47765 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47766 return false;
47768 if (d->testing_p)
47769 return true;
47771 switch (d->vmode)
47773 case E_V32QImode:
47774 if (d->perm[0])
47775 gen = gen_vec_interleave_highv32qi;
47776 else
47777 gen = gen_vec_interleave_lowv32qi;
47778 break;
47779 case E_V16HImode:
47780 if (d->perm[0])
47781 gen = gen_vec_interleave_highv16hi;
47782 else
47783 gen = gen_vec_interleave_lowv16hi;
47784 break;
47785 case E_V8SImode:
47786 if (d->perm[0])
47787 gen = gen_vec_interleave_highv8si;
47788 else
47789 gen = gen_vec_interleave_lowv8si;
47790 break;
47791 case E_V4DImode:
47792 if (d->perm[0])
47793 gen = gen_vec_interleave_highv4di;
47794 else
47795 gen = gen_vec_interleave_lowv4di;
47796 break;
47797 case E_V8SFmode:
47798 if (d->perm[0])
47799 gen = gen_vec_interleave_highv8sf;
47800 else
47801 gen = gen_vec_interleave_lowv8sf;
47802 break;
47803 case E_V4DFmode:
47804 if (d->perm[0])
47805 gen = gen_vec_interleave_highv4df;
47806 else
47807 gen = gen_vec_interleave_lowv4df;
47808 break;
47809 default:
47810 gcc_unreachable ();
47813 emit_insn (gen (d->target, d->op0, d->op1));
47814 return true;
47817 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47818 a single vector permutation using a single intra-lane vector
47819 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47820 the non-swapped and swapped vectors together. */
47822 static bool
47823 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47825 struct expand_vec_perm_d dfirst, dsecond;
47826 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47827 rtx_insn *seq;
47828 bool ok;
47829 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47831 if (!TARGET_AVX
47832 || TARGET_AVX2
47833 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47834 || !d->one_operand_p)
47835 return false;
47837 dfirst = *d;
47838 for (i = 0; i < nelt; i++)
47839 dfirst.perm[i] = 0xff;
47840 for (i = 0, msk = 0; i < nelt; i++)
47842 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47843 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47844 return false;
47845 dfirst.perm[j] = d->perm[i];
47846 if (j != i)
47847 msk |= (1 << i);
47849 for (i = 0; i < nelt; i++)
47850 if (dfirst.perm[i] == 0xff)
47851 dfirst.perm[i] = i;
47853 if (!d->testing_p)
47854 dfirst.target = gen_reg_rtx (dfirst.vmode);
47856 start_sequence ();
47857 ok = expand_vec_perm_1 (&dfirst);
47858 seq = get_insns ();
47859 end_sequence ();
47861 if (!ok)
47862 return false;
47864 if (d->testing_p)
47865 return true;
47867 emit_insn (seq);
47869 dsecond = *d;
47870 dsecond.op0 = dfirst.target;
47871 dsecond.op1 = dfirst.target;
47872 dsecond.one_operand_p = true;
47873 dsecond.target = gen_reg_rtx (dsecond.vmode);
47874 for (i = 0; i < nelt; i++)
47875 dsecond.perm[i] = i ^ nelt2;
47877 ok = expand_vec_perm_1 (&dsecond);
47878 gcc_assert (ok);
47880 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47881 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47882 return true;
47885 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47886 permutation using two vperm2f128, followed by a vshufpd insn blending
47887 the two vectors together. */
47889 static bool
47890 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47892 struct expand_vec_perm_d dfirst, dsecond, dthird;
47893 bool ok;
47895 if (!TARGET_AVX || (d->vmode != V4DFmode))
47896 return false;
47898 if (d->testing_p)
47899 return true;
47901 dfirst = *d;
47902 dsecond = *d;
47903 dthird = *d;
47905 dfirst.perm[0] = (d->perm[0] & ~1);
47906 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47907 dfirst.perm[2] = (d->perm[2] & ~1);
47908 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47909 dsecond.perm[0] = (d->perm[1] & ~1);
47910 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47911 dsecond.perm[2] = (d->perm[3] & ~1);
47912 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47913 dthird.perm[0] = (d->perm[0] % 2);
47914 dthird.perm[1] = (d->perm[1] % 2) + 4;
47915 dthird.perm[2] = (d->perm[2] % 2) + 2;
47916 dthird.perm[3] = (d->perm[3] % 2) + 6;
47918 dfirst.target = gen_reg_rtx (dfirst.vmode);
47919 dsecond.target = gen_reg_rtx (dsecond.vmode);
47920 dthird.op0 = dfirst.target;
47921 dthird.op1 = dsecond.target;
47922 dthird.one_operand_p = false;
47924 canonicalize_perm (&dfirst);
47925 canonicalize_perm (&dsecond);
47927 ok = expand_vec_perm_1 (&dfirst)
47928 && expand_vec_perm_1 (&dsecond)
47929 && expand_vec_perm_1 (&dthird);
47931 gcc_assert (ok);
47933 return true;
47936 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47937 permutation with two pshufb insns and an ior. We should have already
47938 failed all two-instruction sequences. */
47940 static bool
47941 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47943 rtx rperm[2][16], vperm, l, h, op, m128;
47944 unsigned int i, nelt, eltsz;
47946 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47947 return false;
47948 gcc_assert (!d->one_operand_p);
47950 if (d->testing_p)
47951 return true;
47953 nelt = d->nelt;
47954 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47956 /* Generate two permutation masks. If the required element is within
47957 the given vector it is shuffled into the proper lane. If the required
47958 element is in the other vector, force a zero into the lane by setting
47959 bit 7 in the permutation mask. */
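/* E.g. for a V8HImode permutation with d->perm[0] == 9 (element 1 of
   op1), eltsz is 2, so rperm[1][0..1] become { 2, 3 } (bytes 2 and 3
   of op1) while rperm[0][0..1] become -128; the pshufb of op0 then
   writes zeros into bytes 0-1 and the final ior keeps the bytes taken
   from op1.  */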
47960 m128 = GEN_INT (-128);
47961 for (i = 0; i < nelt; ++i)
47963 unsigned j, e = d->perm[i];
47964 unsigned which = (e >= nelt);
47965 if (e >= nelt)
47966 e -= nelt;
47968 for (j = 0; j < eltsz; ++j)
47970 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47971 rperm[1-which][i*eltsz + j] = m128;
47975 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47976 vperm = force_reg (V16QImode, vperm);
47978 l = gen_reg_rtx (V16QImode);
47979 op = gen_lowpart (V16QImode, d->op0);
47980 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47982 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47983 vperm = force_reg (V16QImode, vperm);
47985 h = gen_reg_rtx (V16QImode);
47986 op = gen_lowpart (V16QImode, d->op1);
47987 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47989 op = d->target;
47990 if (d->vmode != V16QImode)
47991 op = gen_reg_rtx (V16QImode);
47992 emit_insn (gen_iorv16qi3 (op, l, h));
47993 if (op != d->target)
47994 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47996 return true;
47999 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48000 with two vpshufb insns, vpermq and vpor. We should have already failed
48001 all two or three instruction sequences. */
48003 static bool
48004 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48006 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48007 unsigned int i, nelt, eltsz;
48009 if (!TARGET_AVX2
48010 || !d->one_operand_p
48011 || (d->vmode != V32QImode && d->vmode != V16HImode))
48012 return false;
48014 if (d->testing_p)
48015 return true;
48017 nelt = d->nelt;
48018 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48020 /* Generate two permutation masks. If the required element is within
48021 the same lane, it is shuffled in. If the required element is from the
48022 other lane, force a zero by setting bit 7 in the permutation mask.
48023 The other mask has non-negative elements when the element is
48024 requested from the other lane, but the element is also moved to the
48025 other lane, so that the result of vpshufb has its two V2TImode
48026 halves swapped. */
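/* E.g. for V32QImode with d->perm[3] == 20 the wanted byte lives in
   the other 128-bit lane, so rperm[1][3 ^ 16] is set to 4 (the in-lane
   offset of element 20) and rperm[0][3] is set to -128.  The pshufb
   result built from rperm[1] has its lanes swapped by the vpermq below,
   so its byte 3 then holds op0[20], and the ior merges it with the
   zeroed byte of the rperm[0] result.  */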
48027 m128 = GEN_INT (-128);
48028 for (i = 0; i < nelt; ++i)
48030 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48031 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48033 for (j = 0; j < eltsz; ++j)
48035 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48036 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48040 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48041 vperm = force_reg (V32QImode, vperm);
48043 h = gen_reg_rtx (V32QImode);
48044 op = gen_lowpart (V32QImode, d->op0);
48045 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48047 /* Swap the 128-bit lanes of h into hp. */
48048 hp = gen_reg_rtx (V4DImode);
48049 op = gen_lowpart (V4DImode, h);
48050 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48051 const1_rtx));
48053 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48054 vperm = force_reg (V32QImode, vperm);
48056 l = gen_reg_rtx (V32QImode);
48057 op = gen_lowpart (V32QImode, d->op0);
48058 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48060 op = d->target;
48061 if (d->vmode != V32QImode)
48062 op = gen_reg_rtx (V32QImode);
48063 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48064 if (op != d->target)
48065 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48067 return true;
48070 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48071 and extract-odd permutations of two V32QImode or V16HImode operands
48072 with two vpshufb insns, vpor and vpermq. We should have already
48073 failed all two or three instruction sequences. */
48075 static bool
48076 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48078 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48079 unsigned int i, nelt, eltsz;
48081 if (!TARGET_AVX2
48082 || d->one_operand_p
48083 || (d->vmode != V32QImode && d->vmode != V16HImode))
48084 return false;
48086 for (i = 0; i < d->nelt; ++i)
48087 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48088 return false;
48090 if (d->testing_p)
48091 return true;
48093 nelt = d->nelt;
48094 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48096 /* Generate two permutation masks. In the first permutation mask
48097 the first quarter will contain indexes for the first half
48098 of the op0, the second quarter will contain bit 7 set, third quarter
48099 will contain indexes for the second half of the op0 and the
48100 last quarter bit 7 set. In the second permutation mask
48101 the first quarter will contain bit 7 set, the second quarter
48102 indexes for the first half of the op1, the third quarter bit 7 set
48103 and last quarter indexes for the second half of the op1.
48104 I.e. the first mask e.g. for V32QImode extract even will be:
48105 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48106 (all values masked with 0xf except for -128) and second mask
48107 for extract even will be
48108 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48109 m128 = GEN_INT (-128);
48110 for (i = 0; i < nelt; ++i)
48112 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48113 unsigned which = d->perm[i] >= nelt;
48114 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48116 for (j = 0; j < eltsz; ++j)
48118 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48119 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48123 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48124 vperm = force_reg (V32QImode, vperm);
48126 l = gen_reg_rtx (V32QImode);
48127 op = gen_lowpart (V32QImode, d->op0);
48128 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48130 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48131 vperm = force_reg (V32QImode, vperm);
48133 h = gen_reg_rtx (V32QImode);
48134 op = gen_lowpart (V32QImode, d->op1);
48135 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48137 ior = gen_reg_rtx (V32QImode);
48138 emit_insn (gen_iorv32qi3 (ior, l, h));
48140 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48141 op = gen_reg_rtx (V4DImode);
48142 ior = gen_lowpart (V4DImode, ior);
48143 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48144 const1_rtx, GEN_INT (3)));
48145 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48147 return true;
48150 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48151 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48152 with two "and" and "pack" or two "shift" and "pack" insns. We should
48153 have already failed all two instruction sequences. */
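/* E.g. for a V8HImode extract-even permutation both inputs are viewed
   as V4SImode and masked with 0xffff to clear the odd halfwords (or
   shifted right by 16 for extract-odd), and packusdw then narrows the
   dwords back to words; as every dword is at most 0xffff no saturation
   occurs.  For the 256-bit modes the pack works per 128-bit lane,
   hence the trailing cross-lane permute.  */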
48155 static bool
48156 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48158 rtx op, dop0, dop1, t;
48159 unsigned i, odd, c, s, nelt = d->nelt;
48160 bool end_perm = false;
48161 machine_mode half_mode;
48162 rtx (*gen_and) (rtx, rtx, rtx);
48163 rtx (*gen_pack) (rtx, rtx, rtx);
48164 rtx (*gen_shift) (rtx, rtx, rtx);
48166 if (d->one_operand_p)
48167 return false;
48169 switch (d->vmode)
48171 case E_V8HImode:
48172 /* Required for "pack". */
48173 if (!TARGET_SSE4_1)
48174 return false;
48175 c = 0xffff;
48176 s = 16;
48177 half_mode = V4SImode;
48178 gen_and = gen_andv4si3;
48179 gen_pack = gen_sse4_1_packusdw;
48180 gen_shift = gen_lshrv4si3;
48181 break;
48182 case E_V16QImode:
48183 /* No check as all instructions are SSE2. */
48184 c = 0xff;
48185 s = 8;
48186 half_mode = V8HImode;
48187 gen_and = gen_andv8hi3;
48188 gen_pack = gen_sse2_packuswb;
48189 gen_shift = gen_lshrv8hi3;
48190 break;
48191 case E_V16HImode:
48192 if (!TARGET_AVX2)
48193 return false;
48194 c = 0xffff;
48195 s = 16;
48196 half_mode = V8SImode;
48197 gen_and = gen_andv8si3;
48198 gen_pack = gen_avx2_packusdw;
48199 gen_shift = gen_lshrv8si3;
48200 end_perm = true;
48201 break;
48202 case E_V32QImode:
48203 if (!TARGET_AVX2)
48204 return false;
48205 c = 0xff;
48206 s = 8;
48207 half_mode = V16HImode;
48208 gen_and = gen_andv16hi3;
48209 gen_pack = gen_avx2_packuswb;
48210 gen_shift = gen_lshrv16hi3;
48211 end_perm = true;
48212 break;
48213 default:
48214 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48215 general shuffles. */
48216 return false;
48219 /* Check that permutation is even or odd. */
48220 odd = d->perm[0];
48221 if (odd > 1)
48222 return false;
48224 for (i = 1; i < nelt; ++i)
48225 if (d->perm[i] != 2 * i + odd)
48226 return false;
48228 if (d->testing_p)
48229 return true;
48231 dop0 = gen_reg_rtx (half_mode);
48232 dop1 = gen_reg_rtx (half_mode);
48233 if (odd == 0)
48235 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48236 t = force_reg (half_mode, t);
48237 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48238 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48240 else
48242 emit_insn (gen_shift (dop0,
48243 gen_lowpart (half_mode, d->op0),
48244 GEN_INT (s)));
48245 emit_insn (gen_shift (dop1,
48246 gen_lowpart (half_mode, d->op1),
48247 GEN_INT (s)));
48249 /* For the AVX2 256-bit case we need to permute the pack result. */
48250 if (TARGET_AVX2 && end_perm)
48252 op = gen_reg_rtx (d->vmode);
48253 t = gen_reg_rtx (V4DImode);
48254 emit_insn (gen_pack (op, dop0, dop1));
48255 emit_insn (gen_avx2_permv4di_1 (t,
48256 gen_lowpart (V4DImode, op),
48257 const0_rtx,
48258 const2_rtx,
48259 const1_rtx,
48260 GEN_INT (3)));
48261 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48263 else
48264 emit_insn (gen_pack (d->target, dop0, dop1));
48266 return true;
48269 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48270 and extract-odd permutations of two V64QI operands
48271 with two "shift", two "trunc" and one "concat" insn for "odd"
48272 and two "trunc" and one "concat" insn for "even".
48273 We should have already failed all two instruction sequences. */
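/* For "odd" the right shift by 8 moves the odd bytes into the low byte
   of each word and the V32HI->V32QI truncation then keeps exactly those
   bytes; for "even" the truncation alone keeps the even bytes.  The
   final vec_concat glues the two 32-byte halves back together.  */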
48275 static bool
48276 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48278 rtx t1, t2, t3, t4;
48279 unsigned i, odd, nelt = d->nelt;
48281 if (!TARGET_AVX512BW
48282 || d->one_operand_p
48283 || d->vmode != V64QImode)
48284 return false;
48286 /* Check that permutation is even or odd. */
48287 odd = d->perm[0];
48288 if (odd > 1)
48289 return false;
48291 for (i = 1; i < nelt; ++i)
48292 if (d->perm[i] != 2 * i + odd)
48293 return false;
48295 if (d->testing_p)
48296 return true;
48299 if (odd)
48301 t1 = gen_reg_rtx (V32HImode);
48302 t2 = gen_reg_rtx (V32HImode);
48303 emit_insn (gen_lshrv32hi3 (t1,
48304 gen_lowpart (V32HImode, d->op0),
48305 GEN_INT (8)));
48306 emit_insn (gen_lshrv32hi3 (t2,
48307 gen_lowpart (V32HImode, d->op1),
48308 GEN_INT (8)));
48310 else
48312 t1 = gen_lowpart (V32HImode, d->op0);
48313 t2 = gen_lowpart (V32HImode, d->op1);
48316 t3 = gen_reg_rtx (V32QImode);
48317 t4 = gen_reg_rtx (V32QImode);
48318 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48319 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48320 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48322 return true;
48325 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48326 and extract-odd permutations. */
48328 static bool
48329 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48331 rtx t1, t2, t3, t4, t5;
48333 switch (d->vmode)
48335 case E_V4DFmode:
48336 if (d->testing_p)
48337 break;
48338 t1 = gen_reg_rtx (V4DFmode);
48339 t2 = gen_reg_rtx (V4DFmode);
48341 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48342 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48343 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
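/* In the vperm2f128 immediate, bits 0-1 select the source of the low
   128-bit lane and bits 4-5 the source of the high lane (0/1 = lanes
   of the first operand, 2/3 = lanes of the second), so 0x20
   concatenates the two low lanes and 0x31 the two high lanes.  */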
48345 /* Now an unpck[lh]pd will produce the result required. */
48346 if (odd)
48347 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48348 else
48349 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48350 emit_insn (t3);
48351 break;
48353 case E_V8SFmode:
48355 int mask = odd ? 0xdd : 0x88;
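/* The shufps immediate holds four 2-bit element selectors, the first
   two indexing the first source and the last two the second source
   (per 128-bit lane): 0x88 encodes 0,2,0,2 and picks the even
   elements, 0xdd encodes 1,3,1,3 and picks the odd ones.  */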
48357 if (d->testing_p)
48358 break;
48359 t1 = gen_reg_rtx (V8SFmode);
48360 t2 = gen_reg_rtx (V8SFmode);
48361 t3 = gen_reg_rtx (V8SFmode);
48363 /* Shuffle within the 128-bit lanes to produce:
48364 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48365 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48366 GEN_INT (mask)));
48368 /* Shuffle the lanes around to produce:
48369 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48370 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48371 GEN_INT (0x3)));
48373 /* Shuffle within the 128-bit lanes to produce:
48374 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48375 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48377 /* Shuffle within the 128-bit lanes to produce:
48378 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48379 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48381 /* Shuffle the lanes around to produce:
48382 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48383 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48384 GEN_INT (0x20)));
48386 break;
48388 case E_V2DFmode:
48389 case E_V4SFmode:
48390 case E_V2DImode:
48391 case E_V4SImode:
48392 /* These are always directly implementable by expand_vec_perm_1. */
48393 gcc_unreachable ();
48395 case E_V8HImode:
48396 if (TARGET_SSE4_1)
48397 return expand_vec_perm_even_odd_pack (d);
48398 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48399 return expand_vec_perm_pshufb2 (d);
48400 else
48402 if (d->testing_p)
48403 break;
48404 /* We need 2*log2(N)-1 operations to achieve odd/even
48405 with interleave. */
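/* E.g. with inputs { 0 ... 7 } and { 8 ... f } the five interleaves go
   through { 0 8 1 9 2 a 3 b } / { 4 c 5 d 6 e 7 f }, then
   { 0 4 8 c 1 5 9 d } / { 2 6 a e 3 7 b f }, and the final interleave
   yields { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f }.  */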
48406 t1 = gen_reg_rtx (V8HImode);
48407 t2 = gen_reg_rtx (V8HImode);
48408 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48409 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48410 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48411 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48412 if (odd)
48413 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48414 else
48415 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48416 emit_insn (t3);
48418 break;
48420 case E_V16QImode:
48421 return expand_vec_perm_even_odd_pack (d);
48423 case E_V16HImode:
48424 case E_V32QImode:
48425 return expand_vec_perm_even_odd_pack (d);
48427 case E_V64QImode:
48428 return expand_vec_perm_even_odd_trunc (d);
48430 case E_V4DImode:
48431 if (!TARGET_AVX2)
48433 struct expand_vec_perm_d d_copy = *d;
48434 d_copy.vmode = V4DFmode;
48435 if (d->testing_p)
48436 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48437 else
48438 d_copy.target = gen_reg_rtx (V4DFmode);
48439 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48440 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48441 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48443 if (!d->testing_p)
48444 emit_move_insn (d->target,
48445 gen_lowpart (V4DImode, d_copy.target));
48446 return true;
48448 return false;
48451 if (d->testing_p)
48452 break;
48454 t1 = gen_reg_rtx (V4DImode);
48455 t2 = gen_reg_rtx (V4DImode);
48457 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48458 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48459 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48461 /* Now a vpunpck[lh]qdq will produce the result required. */
48462 if (odd)
48463 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48464 else
48465 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48466 emit_insn (t3);
48467 break;
48469 case E_V8SImode:
48470 if (!TARGET_AVX2)
48472 struct expand_vec_perm_d d_copy = *d;
48473 d_copy.vmode = V8SFmode;
48474 if (d->testing_p)
48475 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48476 else
48477 d_copy.target = gen_reg_rtx (V8SFmode);
48478 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48479 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48480 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48482 if (!d->testing_p)
48483 emit_move_insn (d->target,
48484 gen_lowpart (V8SImode, d_copy.target));
48485 return true;
48487 return false;
48490 if (d->testing_p)
48491 break;
48493 t1 = gen_reg_rtx (V8SImode);
48494 t2 = gen_reg_rtx (V8SImode);
48495 t3 = gen_reg_rtx (V4DImode);
48496 t4 = gen_reg_rtx (V4DImode);
48497 t5 = gen_reg_rtx (V4DImode);
48499 /* Shuffle the lanes around into
48500 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48501 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48502 gen_lowpart (V4DImode, d->op1),
48503 GEN_INT (0x20)));
48504 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48505 gen_lowpart (V4DImode, d->op1),
48506 GEN_INT (0x31)));
48508 /* Swap the 2nd and 3rd position in each lane into
48509 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48510 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48511 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48512 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48513 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
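/* The immediate 2*4 + 1*16 + 3*64 == 0xd8 encodes the pshufd element
   selectors 0,2,1,3, i.e. it swaps the two middle dwords of each
   128-bit lane.  */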
48515 /* Now a vpunpck[lh]qdq will produce
48516 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48517 if (odd)
48518 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48519 gen_lowpart (V4DImode, t2));
48520 else
48521 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48522 gen_lowpart (V4DImode, t2));
48523 emit_insn (t3);
48524 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48525 break;
48527 default:
48528 gcc_unreachable ();
48531 return true;
48534 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48535 extract-even and extract-odd permutations. */
48537 static bool
48538 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48540 unsigned i, odd, nelt = d->nelt;
48542 odd = d->perm[0];
48543 if (odd != 0 && odd != 1)
48544 return false;
48546 for (i = 1; i < nelt; ++i)
48547 if (d->perm[i] != 2 * i + odd)
48548 return false;
48550 return expand_vec_perm_even_odd_1 (d, odd);
48553 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48554 permutations. We assume that expand_vec_perm_1 has already failed. */
48556 static bool
48557 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48559 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48560 machine_mode vmode = d->vmode;
48561 unsigned char perm2[4];
48562 rtx op0 = d->op0, dest;
48563 bool ok;
48565 switch (vmode)
48567 case E_V4DFmode:
48568 case E_V8SFmode:
48569 /* These are special-cased in sse.md so that we can optionally
48570 use the vbroadcast instruction. They expand to two insns
48571 if the input happens to be in a register. */
48572 gcc_unreachable ();
48574 case E_V2DFmode:
48575 case E_V2DImode:
48576 case E_V4SFmode:
48577 case E_V4SImode:
48578 /* These are always implementable using standard shuffle patterns. */
48579 gcc_unreachable ();
48581 case E_V8HImode:
48582 case E_V16QImode:
48583 /* These can be implemented via interleave. We save one insn by
48584 stopping once we have promoted to V4SImode and then use pshufd. */
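/* E.g. broadcasting element 5 of a V8HImode vector uses a single
   punpckhwd of the operand with itself, which leaves the value
   duplicated in dword 1 of the V4SImode view, and then one pshufd
   with all selectors equal to 1 replicates that dword.  */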
48585 if (d->testing_p)
48586 return true;
48589 rtx dest;
48590 rtx (*gen) (rtx, rtx, rtx)
48591 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48592 : gen_vec_interleave_lowv8hi;
48594 if (elt >= nelt2)
48596 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48597 : gen_vec_interleave_highv8hi;
48598 elt -= nelt2;
48600 nelt2 /= 2;
48602 dest = gen_reg_rtx (vmode);
48603 emit_insn (gen (dest, op0, op0));
48604 vmode = get_mode_wider_vector (vmode);
48605 op0 = gen_lowpart (vmode, dest);
48607 while (vmode != V4SImode);
48609 memset (perm2, elt, 4);
48610 dest = gen_reg_rtx (V4SImode);
48611 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48612 gcc_assert (ok);
48613 if (!d->testing_p)
48614 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48615 return true;
48617 case E_V64QImode:
48618 case E_V32QImode:
48619 case E_V16HImode:
48620 case E_V8SImode:
48621 case E_V4DImode:
48622 /* For AVX2 broadcasts of the first element vpbroadcast* or
48623 vpermq should be used by expand_vec_perm_1. */
48624 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48625 return false;
48627 default:
48628 gcc_unreachable ();
48632 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48633 broadcast permutations. */
48635 static bool
48636 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48638 unsigned i, elt, nelt = d->nelt;
48640 if (!d->one_operand_p)
48641 return false;
48643 elt = d->perm[0];
48644 for (i = 1; i < nelt; ++i)
48645 if (d->perm[i] != elt)
48646 return false;
48648 return expand_vec_perm_broadcast_1 (d);
48651 /* Implement arbitrary permutations of two V64QImode operands
48652 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48653 static bool
48654 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48656 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48657 return false;
48659 if (d->testing_p)
48660 return true;
48662 struct expand_vec_perm_d ds[2];
48663 rtx rperm[128], vperm, target0, target1;
48664 unsigned int i, nelt;
48665 machine_mode vmode;
48667 nelt = d->nelt;
48668 vmode = V64QImode;
48670 for (i = 0; i < 2; i++)
48672 ds[i] = *d;
48673 ds[i].vmode = V32HImode;
48674 ds[i].nelt = 32;
48675 ds[i].target = gen_reg_rtx (V32HImode);
48676 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48677 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48680 /* Prepare permutations such that the first one takes care of
48681 putting the even bytes into the right positions or one position
48682 higher (ds[0]) and the second one takes care of
48683 putting the odd bytes into the right positions or one position
48684 lower (ds[1]). */
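/* E.g. if d->perm[5] == 17, ds[1].perm[2] becomes 8, so the vperm[it]2w
   places source word 8 (bytes 16-17) at word 2 of ds[1].target, and the
   vpshufb mask byte rperm[5 + 64] becomes (5 & 14) + 1 == 5, picking
   the high byte of that word; the corresponding byte of the other mask
   stays -1 (bit 7 set), so that byte of the other pshufb result is zero
   and the final vpor keeps the wanted value.  */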
48686 for (i = 0; i < nelt; i++)
48688 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48689 if (i & 1)
48691 rperm[i] = constm1_rtx;
48692 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48694 else
48696 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48697 rperm[i + 64] = constm1_rtx;
48701 bool ok = expand_vec_perm_1 (&ds[0]);
48702 gcc_assert (ok);
48703 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48705 ok = expand_vec_perm_1 (&ds[1]);
48706 gcc_assert (ok);
48707 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48709 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48710 vperm = force_reg (vmode, vperm);
48711 target0 = gen_reg_rtx (V64QImode);
48712 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48714 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48715 vperm = force_reg (vmode, vperm);
48716 target1 = gen_reg_rtx (V64QImode);
48717 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48719 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48720 return true;
48723 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
48724 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48725 all the shorter instruction sequences. */
48727 static bool
48728 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48730 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48731 unsigned int i, nelt, eltsz;
48732 bool used[4];
48734 if (!TARGET_AVX2
48735 || d->one_operand_p
48736 || (d->vmode != V32QImode && d->vmode != V16HImode))
48737 return false;
48739 if (d->testing_p)
48740 return true;
48742 nelt = d->nelt;
48743 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48745 /* Generate 4 permutation masks. If the required element is within
48746 the same lane, it is shuffled in. If the required element is from the
48747 other lane, force a zero by setting bit 7 in the permutation mask.
48748 The other mask has non-negative elements when the element is
48749 requested from the other lane, but the element is also moved to the
48750 other lane, so that the result of vpshufb has its two V2TImode
48751 halves swapped. */
48752 m128 = GEN_INT (-128);
48753 for (i = 0; i < 32; ++i)
48755 rperm[0][i] = m128;
48756 rperm[1][i] = m128;
48757 rperm[2][i] = m128;
48758 rperm[3][i] = m128;
48760 used[0] = false;
48761 used[1] = false;
48762 used[2] = false;
48763 used[3] = false;
48764 for (i = 0; i < nelt; ++i)
48766 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48767 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48768 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48770 for (j = 0; j < eltsz; ++j)
48771 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48772 used[which] = true;
48775 for (i = 0; i < 2; ++i)
48777 if (!used[2 * i + 1])
48779 h[i] = NULL_RTX;
48780 continue;
48782 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48783 gen_rtvec_v (32, rperm[2 * i + 1]));
48784 vperm = force_reg (V32QImode, vperm);
48785 h[i] = gen_reg_rtx (V32QImode);
48786 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48787 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48790 /* Swap the 128-bit lanes of h[X]. */
48791 for (i = 0; i < 2; ++i)
48793 if (h[i] == NULL_RTX)
48794 continue;
48795 op = gen_reg_rtx (V4DImode);
48796 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48797 const2_rtx, GEN_INT (3), const0_rtx,
48798 const1_rtx));
48799 h[i] = gen_lowpart (V32QImode, op);
48802 for (i = 0; i < 2; ++i)
48804 if (!used[2 * i])
48806 l[i] = NULL_RTX;
48807 continue;
48809 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48810 vperm = force_reg (V32QImode, vperm);
48811 l[i] = gen_reg_rtx (V32QImode);
48812 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48813 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48816 for (i = 0; i < 2; ++i)
48818 if (h[i] && l[i])
48820 op = gen_reg_rtx (V32QImode);
48821 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48822 l[i] = op;
48824 else if (h[i])
48825 l[i] = h[i];
48828 gcc_assert (l[0] && l[1]);
48829 op = d->target;
48830 if (d->vmode != V32QImode)
48831 op = gen_reg_rtx (V32QImode);
48832 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48833 if (op != d->target)
48834 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48835 return true;
48838 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48839 taken care of, perform the expansion in D and return true on success. */
48841 static bool
48842 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48844 /* Try a single instruction expansion. */
48845 if (expand_vec_perm_1 (d))
48846 return true;
48848 /* Try sequences of two instructions. */
48850 if (expand_vec_perm_pshuflw_pshufhw (d))
48851 return true;
48853 if (expand_vec_perm_palignr (d, false))
48854 return true;
48856 if (expand_vec_perm_interleave2 (d))
48857 return true;
48859 if (expand_vec_perm_broadcast (d))
48860 return true;
48862 if (expand_vec_perm_vpermq_perm_1 (d))
48863 return true;
48865 if (expand_vec_perm_vperm2f128 (d))
48866 return true;
48868 if (expand_vec_perm_pblendv (d))
48869 return true;
48871 /* Try sequences of three instructions. */
48873 if (expand_vec_perm_even_odd_pack (d))
48874 return true;
48876 if (expand_vec_perm_2vperm2f128_vshuf (d))
48877 return true;
48879 if (expand_vec_perm_pshufb2 (d))
48880 return true;
48882 if (expand_vec_perm_interleave3 (d))
48883 return true;
48885 if (expand_vec_perm_vperm2f128_vblend (d))
48886 return true;
48888 /* Try sequences of four instructions. */
48890 if (expand_vec_perm_even_odd_trunc (d))
48891 return true;
48892 if (expand_vec_perm_vpshufb2_vpermq (d))
48893 return true;
48895 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48896 return true;
48898 if (expand_vec_perm_vpermt2_vpshub2 (d))
48899 return true;
48901 /* ??? Look for narrow permutations whose element orderings would
48902 allow the promotion to a wider mode. */
48904 /* ??? Look for sequences of interleave or a wider permute that place
48905 the data into the correct lanes for a half-vector shuffle like
48906 pshuf[lh]w or vpermilps. */
48908 /* ??? Look for sequences of interleave that produce the desired results.
48909 The combinatorics of punpck[lh] get pretty ugly... */
48911 if (expand_vec_perm_even_odd (d))
48912 return true;
48914 /* Even longer sequences. */
48915 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48916 return true;
48918 /* See if we can get the same permutation in different vector integer
48919 mode. */
48920 struct expand_vec_perm_d nd;
48921 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48923 if (!d->testing_p)
48924 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48925 return true;
48928 return false;
48931 /* If a permutation only uses one operand, make it clear. Returns true
48932 if the permutation references both operands. */
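/* E.g. a V4SImode selector { 4 5 6 7 } references only the second
   operand (which == 2), so it is folded to { 0 1 2 3 } with op0
   replaced by op1 and one_operand_p left set.  */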
48934 static bool
48935 canonicalize_perm (struct expand_vec_perm_d *d)
48937 int i, which, nelt = d->nelt;
48939 for (i = which = 0; i < nelt; ++i)
48940 which |= (d->perm[i] < nelt ? 1 : 2);
48942 d->one_operand_p = true;
48943 switch (which)
48945 default:
48946 gcc_unreachable();
48948 case 3:
48949 if (!rtx_equal_p (d->op0, d->op1))
48951 d->one_operand_p = false;
48952 break;
48954 /* The elements of PERM do not suggest that only the first operand
48955 is used, but both operands are identical. Allow easier matching
48956 of the permutation by folding the permutation into the single
48957 input vector. */
48958 /* FALLTHRU */
48960 case 2:
48961 for (i = 0; i < nelt; ++i)
48962 d->perm[i] &= nelt - 1;
48963 d->op0 = d->op1;
48964 break;
48966 case 1:
48967 d->op1 = d->op0;
48968 break;
48971 return (which == 3);
48974 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
48976 static bool
48977 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
48978 rtx op1, const vec_perm_indices &sel)
48980 struct expand_vec_perm_d d;
48981 unsigned char perm[MAX_VECT_LEN];
48982 unsigned int i, nelt, which;
48983 bool two_args;
48985 d.target = target;
48986 d.op0 = op0;
48987 d.op1 = op1;
48989 d.vmode = vmode;
48990 gcc_assert (VECTOR_MODE_P (d.vmode));
48991 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48992 d.testing_p = !target;
48994 gcc_assert (sel.length () == nelt);
48995 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48997 /* Given sufficient ISA support we can just return true here
48998 for selected vector modes. */
48999 switch (d.vmode)
49001 case E_V16SFmode:
49002 case E_V16SImode:
49003 case E_V8DImode:
49004 case E_V8DFmode:
49005 if (!TARGET_AVX512F)
49006 return false;
49007 /* All implementable with a single vperm[it]2 insn. */
49008 if (d.testing_p)
49009 return true;
49010 break;
49011 case E_V32HImode:
49012 if (!TARGET_AVX512BW)
49013 return false;
49014 if (d.testing_p)
49015 /* All implementable with a single vperm[it]2 insn. */
49016 return true;
49017 break;
49018 case E_V64QImode:
49019 if (!TARGET_AVX512BW)
49020 return false;
49021 if (d.testing_p)
49022 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
49023 return true;
49024 break;
49025 case E_V8SImode:
49026 case E_V8SFmode:
49027 case E_V4DFmode:
49028 case E_V4DImode:
49029 if (!TARGET_AVX)
49030 return false;
49031 if (d.testing_p && TARGET_AVX512VL)
49032 /* All implementable with a single vperm[it]2 insn. */
49033 return true;
49034 break;
49035 case E_V16HImode:
49036 if (!TARGET_SSE2)
49037 return false;
49038 if (d.testing_p && TARGET_AVX2)
49039 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49040 return true;
49041 break;
49042 case E_V32QImode:
49043 if (!TARGET_SSE2)
49044 return false;
49045 if (d.testing_p && TARGET_AVX2)
49046 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49047 return true;
49048 break;
49049 case E_V8HImode:
49050 case E_V16QImode:
49051 if (!TARGET_SSE2)
49052 return false;
49053 /* Fall through. */
49054 case E_V4SImode:
49055 case E_V4SFmode:
49056 if (!TARGET_SSE)
49057 return false;
49058 /* All implementable with a single vpperm insn. */
49059 if (d.testing_p && TARGET_XOP)
49060 return true;
49061 /* All implementable with 2 pshufb + 1 ior. */
49062 if (d.testing_p && TARGET_SSSE3)
49063 return true;
49064 break;
49065 case E_V2DImode:
49066 case E_V2DFmode:
49067 if (!TARGET_SSE)
49068 return false;
49069 /* All implementable with shufpd or unpck[lh]pd. */
49070 if (d.testing_p)
49071 return true;
49072 break;
49073 default:
49074 return false;
49077 for (i = which = 0; i < nelt; ++i)
49079 unsigned char e = sel[i];
49080 gcc_assert (e < 2 * nelt);
49081 d.perm[i] = e;
49082 perm[i] = e;
49083 which |= (e < nelt ? 1 : 2);
49086 if (d.testing_p)
49088 /* For all elements from the second vector, fold the elements to the first. */
49089 if (which == 2)
49090 for (i = 0; i < nelt; ++i)
49091 d.perm[i] -= nelt;
49093 /* Check whether the mask can be applied to the vector type. */
49094 d.one_operand_p = (which != 3);
49096 /* Implementable with shufps or pshufd. */
49097 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49098 return true;
49100 /* Otherwise we have to go through the motions and see if we can
49101 figure out how to generate the requested permutation. */
49102 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49103 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49104 if (!d.one_operand_p)
49105 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49107 start_sequence ();
49108 bool ret = ix86_expand_vec_perm_const_1 (&d);
49109 end_sequence ();
49111 return ret;
49114 two_args = canonicalize_perm (&d);
49116 if (ix86_expand_vec_perm_const_1 (&d))
49117 return true;
49119 /* If the selector says both arguments are needed, but the operands are the
49120 same, the above tried to expand with one_operand_p and a flattened selector.
49121 If that didn't work, retry without one_operand_p; we succeeded with that
49122 during testing. */
49123 if (two_args && d.one_operand_p)
49125 d.one_operand_p = false;
49126 memcpy (d.perm, perm, sizeof (perm));
49127 return ix86_expand_vec_perm_const_1 (&d);
49130 return false;
49133 void
49134 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49136 struct expand_vec_perm_d d;
49137 unsigned i, nelt;
49139 d.target = targ;
49140 d.op0 = op0;
49141 d.op1 = op1;
49142 d.vmode = GET_MODE (targ);
49143 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49144 d.one_operand_p = false;
49145 d.testing_p = false;
49147 for (i = 0; i < nelt; ++i)
49148 d.perm[i] = i * 2 + odd;
49150 /* We'll either be able to implement the permutation directly... */
49151 if (expand_vec_perm_1 (&d))
49152 return;
49154 /* ... or we use the special-case patterns. */
49155 expand_vec_perm_even_odd_1 (&d, odd);
49158 static void
49159 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49161 struct expand_vec_perm_d d;
49162 unsigned i, nelt, base;
49163 bool ok;
49165 d.target = targ;
49166 d.op0 = op0;
49167 d.op1 = op1;
49168 d.vmode = GET_MODE (targ);
49169 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49170 d.one_operand_p = false;
49171 d.testing_p = false;
49173 base = high_p ? nelt / 2 : 0;
49174 for (i = 0; i < nelt / 2; ++i)
49176 d.perm[i * 2] = i + base;
49177 d.perm[i * 2 + 1] = i + base + nelt;
49180 /* Note that for AVX this isn't one instruction. */
49181 ok = ix86_expand_vec_perm_const_1 (&d);
49182 gcc_assert (ok);
49186 /* Expand a vector operation CODE for a V*QImode in terms of the
49187 same operation on V*HImode. */
49189 void
49190 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49192 machine_mode qimode = GET_MODE (dest);
49193 machine_mode himode;
49194 rtx (*gen_il) (rtx, rtx, rtx);
49195 rtx (*gen_ih) (rtx, rtx, rtx);
49196 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49197 struct expand_vec_perm_d d;
49198 bool ok, full_interleave;
49199 bool uns_p = false;
49200 int i;
49202 switch (qimode)
49204 case E_V16QImode:
49205 himode = V8HImode;
49206 gen_il = gen_vec_interleave_lowv16qi;
49207 gen_ih = gen_vec_interleave_highv16qi;
49208 break;
49209 case E_V32QImode:
49210 himode = V16HImode;
49211 gen_il = gen_avx2_interleave_lowv32qi;
49212 gen_ih = gen_avx2_interleave_highv32qi;
49213 break;
49214 case E_V64QImode:
49215 himode = V32HImode;
49216 gen_il = gen_avx512bw_interleave_lowv64qi;
49217 gen_ih = gen_avx512bw_interleave_highv64qi;
49218 break;
49219 default:
49220 gcc_unreachable ();
49223 op2_l = op2_h = op2;
49224 switch (code)
49226 case MULT:
49227 /* Unpack data such that we've got a source byte in each low byte of
49228 each word. We don't care what goes into the high byte of each word.
49229 Rather than trying to get zero in there, most convenient is to let
49230 it be a copy of the low byte. */
49231 op2_l = gen_reg_rtx (qimode);
49232 op2_h = gen_reg_rtx (qimode);
49233 emit_insn (gen_il (op2_l, op2, op2));
49234 emit_insn (gen_ih (op2_h, op2, op2));
49236 op1_l = gen_reg_rtx (qimode);
49237 op1_h = gen_reg_rtx (qimode);
49238 emit_insn (gen_il (op1_l, op1, op1));
49239 emit_insn (gen_ih (op1_h, op1, op1));
49240 full_interleave = qimode == V16QImode;
49241 break;
49243 case ASHIFT:
49244 case LSHIFTRT:
49245 uns_p = true;
49246 /* FALLTHRU */
49247 case ASHIFTRT:
49248 op1_l = gen_reg_rtx (himode);
49249 op1_h = gen_reg_rtx (himode);
49250 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49251 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49252 full_interleave = true;
49253 break;
49254 default:
49255 gcc_unreachable ();
49258 /* Perform the operation. */
49259 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49260 1, OPTAB_DIRECT);
49261 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49262 1, OPTAB_DIRECT);
49263 gcc_assert (res_l && res_h);
49265 /* Merge the data back into the right place. */
49266 d.target = dest;
49267 d.op0 = gen_lowpart (qimode, res_l);
49268 d.op1 = gen_lowpart (qimode, res_h);
49269 d.vmode = qimode;
49270 d.nelt = GET_MODE_NUNITS (qimode);
49271 d.one_operand_p = false;
49272 d.testing_p = false;
49274 if (full_interleave)
49276 /* For SSE2, we used a full interleave, so the desired
49277 results are in the even elements. */
49278 for (i = 0; i < d.nelt; ++i)
49279 d.perm[i] = i * 2;
49281 else
49283 /* For AVX, the interleave used above was not cross-lane. So the
49284 extraction is evens but with the second and third quarter swapped.
49285 Happily, that is even one insn shorter than even extraction.
49286 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49287 always first from the first and then from the second source operand,
49288 the index bits above the low 4 bits remain the same.
49289 Thus, for d.nelt == 32 we want permutation
49290 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49291 and for d.nelt == 64 we want permutation
49292 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49293 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49294 for (i = 0; i < d.nelt; ++i)
49295 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49298 ok = ix86_expand_vec_perm_const_1 (&d);
49299 gcc_assert (ok);
49301 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49302 gen_rtx_fmt_ee (code, qimode, op1, op2));
49305 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49306 if op is CONST_VECTOR with all odd elements equal to their
49307 preceding element. */
49309 static bool
49310 const_vector_equal_evenodd_p (rtx op)
49312 machine_mode mode = GET_MODE (op);
49313 int i, nunits = GET_MODE_NUNITS (mode);
49314 if (GET_CODE (op) != CONST_VECTOR
49315 || nunits != CONST_VECTOR_NUNITS (op))
49316 return false;
49317 for (i = 0; i < nunits; i += 2)
49318 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49319 return false;
49320 return true;
49323 void
49324 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49325 bool uns_p, bool odd_p)
49327 machine_mode mode = GET_MODE (op1);
49328 machine_mode wmode = GET_MODE (dest);
49329 rtx x;
49330 rtx orig_op1 = op1, orig_op2 = op2;
49332 if (!nonimmediate_operand (op1, mode))
49333 op1 = force_reg (mode, op1);
49334 if (!nonimmediate_operand (op2, mode))
49335 op2 = force_reg (mode, op2);
49337 /* We only play even/odd games with vectors of SImode. */
49338 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49340 /* If we're looking for the odd results, shift those members down to
49341 the even slots. For some cpus this is faster than a PSHUFD. */
49342 if (odd_p)
49344 /* For XOP use vpmacsdqh, but only for smult, as it is only
49345 signed. */
49346 if (TARGET_XOP && mode == V4SImode && !uns_p)
49348 x = force_reg (wmode, CONST0_RTX (wmode));
49349 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49350 return;
49353 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49354 if (!const_vector_equal_evenodd_p (orig_op1))
49355 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49356 x, NULL, 1, OPTAB_DIRECT);
49357 if (!const_vector_equal_evenodd_p (orig_op2))
49358 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49359 x, NULL, 1, OPTAB_DIRECT);
49360 op1 = gen_lowpart (mode, op1);
49361 op2 = gen_lowpart (mode, op2);
49364 if (mode == V16SImode)
49366 if (uns_p)
49367 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49368 else
49369 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49371 else if (mode == V8SImode)
49373 if (uns_p)
49374 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49375 else
49376 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49378 else if (uns_p)
49379 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49380 else if (TARGET_SSE4_1)
49381 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49382 else
49384 rtx s1, s2, t0, t1, t2;
49386 /* The easiest way to implement this without PMULDQ is to go through
49387 the motions as if we are performing a full 64-bit multiply. With
49388 the exception that we need to do less shuffling of the elements. */
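/* Modulo 2^64 each signed 32-bit input can be viewed as lo + 2^32 * hi
   with hi being 0 or -1 (the sign mask computed below), so the product
   is lo1*lo2 + 2^32 * (hi1*lo2 + lo1*hi2); the hi1*hi2 term drops out.
   Hence three unsigned even multiplies, two additions and one shift
   suffice.  */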
49390 /* Compute the sign-extension, aka highparts, of the two operands. */
49391 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49392 op1, pc_rtx, pc_rtx);
49393 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49394 op2, pc_rtx, pc_rtx);
49396 /* Multiply LO(A) * HI(B), and vice-versa. */
49397 t1 = gen_reg_rtx (wmode);
49398 t2 = gen_reg_rtx (wmode);
49399 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49400 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49402 /* Multiply LO(A) * LO(B). */
49403 t0 = gen_reg_rtx (wmode);
49404 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49406 /* Combine and shift the highparts into place. */
49407 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49408 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49409 1, OPTAB_DIRECT);
49411 /* Combine high and low parts. */
49412 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49413 return;
49415 emit_insn (x);
49418 void
49419 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49420 bool uns_p, bool high_p)
49422 machine_mode wmode = GET_MODE (dest);
49423 machine_mode mode = GET_MODE (op1);
49424 rtx t1, t2, t3, t4, mask;
49426 switch (mode)
49428 case E_V4SImode:
49429 t1 = gen_reg_rtx (mode);
49430 t2 = gen_reg_rtx (mode);
49431 if (TARGET_XOP && !uns_p)
49433 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49434 shuffle the elements once so that all elements are in the right
49435 place for immediate use: { A C B D }. */
49436 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49437 const1_rtx, GEN_INT (3)));
49438 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49439 const1_rtx, GEN_INT (3)));
49441 else
49443 /* Put the elements into place for the multiply. */
49444 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49445 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49446 high_p = false;
49448 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49449 break;
49451 case E_V8SImode:
49452 /* Shuffle the elements between the lanes. After this we
49453 have { A B E F | C D G H } for each operand. */
49454 t1 = gen_reg_rtx (V4DImode);
49455 t2 = gen_reg_rtx (V4DImode);
49456 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49457 const0_rtx, const2_rtx,
49458 const1_rtx, GEN_INT (3)));
49459 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49460 const0_rtx, const2_rtx,
49461 const1_rtx, GEN_INT (3)));
49463 /* Shuffle the elements within the lanes. After this we
49464 have { A A B B | C C D D } or { E E F F | G G H H }. */
49465 t3 = gen_reg_rtx (V8SImode);
49466 t4 = gen_reg_rtx (V8SImode);
49467 mask = GEN_INT (high_p
49468 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49469 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49470 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49471 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49473 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49474 break;
49476 case E_V8HImode:
49477 case E_V16HImode:
49478 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49479 uns_p, OPTAB_DIRECT);
49480 t2 = expand_binop (mode,
49481 uns_p ? umul_highpart_optab : smul_highpart_optab,
49482 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49483 gcc_assert (t1 && t2);
49485 t3 = gen_reg_rtx (mode);
49486 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49487 emit_move_insn (dest, gen_lowpart (wmode, t3));
49488 break;
49490 case E_V16QImode:
49491 case E_V32QImode:
49492 case E_V32HImode:
49493 case E_V16SImode:
49494 case E_V64QImode:
49495 t1 = gen_reg_rtx (wmode);
49496 t2 = gen_reg_rtx (wmode);
49497 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49498 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49500 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49501 break;
49503 default:
49504 gcc_unreachable ();
49508 void
49509 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49511 rtx res_1, res_2, res_3, res_4;
49513 res_1 = gen_reg_rtx (V4SImode);
49514 res_2 = gen_reg_rtx (V4SImode);
49515 res_3 = gen_reg_rtx (V2DImode);
49516 res_4 = gen_reg_rtx (V2DImode);
49517 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49518 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49520 /* Move the results in element 2 down to element 1; we don't care
49521 what goes in elements 2 and 3. Then we can merge the parts
49522 back together with an interleave.
49524 Note that two other sequences were tried:
49525 (1) Use interleaves at the start instead of psrldq, which allows
49526 us to use a single shufps to merge things back at the end.
49527 (2) Use shufps here to combine the two vectors, then pshufd to
49528 put the elements in the correct order.
49529 In both cases the cost of the reformatting stall was too high
49530 and the overall sequence slower. */
49532 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49533 const0_rtx, const2_rtx,
49534 const0_rtx, const0_rtx));
49535 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49536 const0_rtx, const2_rtx,
49537 const0_rtx, const0_rtx));
49538 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49540 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49543 void
49544 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49546 machine_mode mode = GET_MODE (op0);
49547 rtx t1, t2, t3, t4, t5, t6;
49549 if (TARGET_AVX512DQ && mode == V8DImode)
49550 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49551 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49552 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49553 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49554 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49555 else if (TARGET_XOP && mode == V2DImode)
49557 /* op1: A,B,C,D, op2: E,F,G,H */
49558 op1 = gen_lowpart (V4SImode, op1);
49559 op2 = gen_lowpart (V4SImode, op2);
49561 t1 = gen_reg_rtx (V4SImode);
49562 t2 = gen_reg_rtx (V4SImode);
49563 t3 = gen_reg_rtx (V2DImode);
49564 t4 = gen_reg_rtx (V2DImode);
49566 /* t1: B,A,D,C */
49567 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49568 GEN_INT (1),
49569 GEN_INT (0),
49570 GEN_INT (3),
49571 GEN_INT (2)));
49573 /* t2: (B*E),(A*F),(D*G),(C*H) */
49574 emit_insn (gen_mulv4si3 (t2, t1, op2));
49576 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49577 emit_insn (gen_xop_phadddq (t3, t2));
49579 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49580 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49582 /* Multiply the low parts and add everything together. */
49583 t5 = gen_reg_rtx (V2DImode);
49584 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49585 gen_lowpart (V4SImode, op1),
49586 gen_lowpart (V4SImode, op2)));
49587 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49590 else
49592 machine_mode nmode;
49593 rtx (*umul) (rtx, rtx, rtx);
49595 if (mode == V2DImode)
49597 umul = gen_vec_widen_umult_even_v4si;
49598 nmode = V4SImode;
49600 else if (mode == V4DImode)
49602 umul = gen_vec_widen_umult_even_v8si;
49603 nmode = V8SImode;
49605 else if (mode == V8DImode)
49607 umul = gen_vec_widen_umult_even_v16si;
49608 nmode = V16SImode;
49610 else
49611 gcc_unreachable ();
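/* Writing each 64-bit input as lo + 2^32 * hi, the product modulo 2^64
   is lo1*lo2 + 2^32 * (hi1*lo2 + hi2*lo1); the hi1*hi2 term does not
   contribute, so three widening unsigned multiplies of the even dwords
   plus two additions and one shift are enough.  */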
49614 /* Multiply low parts. */
49615 t1 = gen_reg_rtx (mode);
49616 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49618 /* Shift input vectors right 32 bits so we can multiply high parts. */
49619 t6 = GEN_INT (32);
49620 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49621 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49623 /* Multiply high parts by low parts. */
49624 t4 = gen_reg_rtx (mode);
49625 t5 = gen_reg_rtx (mode);
49626 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49627 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49629 /* Combine and shift the highparts back. */
49630 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49631 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49633 /* Combine high and low parts. */
49634 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49637 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49638 gen_rtx_MULT (mode, op1, op2));
49641 /* Return 1 if control transfer instruction INSN
49642 should be encoded with bnd prefix.
49643 If insn is NULL then return 1 when control
49644 transfer instructions should be prefixed with
49645 bnd by default for current function. */
49647 bool
49648 ix86_bnd_prefixed_insn_p (rtx insn)
49650 /* For call insns check special flag. */
49651 if (insn && CALL_P (insn))
49653 rtx call = get_call_rtx_from (insn);
49654 if (call)
49655 return CALL_EXPR_WITH_BOUNDS_P (call);
49658 /* All other insns are prefixed only if function is instrumented. */
49659 return chkp_function_instrumented_p (current_function_decl);
49662 /* Return 1 if control transfer instruction INSN
49663 should be encoded with notrack prefix. */
49665 static bool
49666 ix86_notrack_prefixed_insn_p (rtx insn)
49668 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
49669 return false;
49671 if (CALL_P (insn))
49673 rtx call = get_call_rtx_from (insn);
49674 gcc_assert (call != NULL_RTX);
49675 rtx addr = XEXP (call, 0);
49677 /* Do not emit 'notrack' if it's not an indirect call. */
49678 if (MEM_P (addr)
49679 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49680 return false;
49681 else
49682 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49685 if (JUMP_P (insn) && !flag_cet_switch)
49687 rtx target = JUMP_LABEL (insn);
49688 if (target == NULL_RTX || ANY_RETURN_P (target))
49689 return false;
49691 /* Check whether the jump uses a switch table. */
49692 rtx_insn *label = as_a<rtx_insn *> (target);
49693 rtx_insn *table = next_insn (label);
49694 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49695 return false;
49696 else
49697 return true;
49699 return false;
49702 /* Calculate integer abs() using only SSE2 instructions. */
49704 void
49705 ix86_expand_sse2_abs (rtx target, rtx input)
49707 machine_mode mode = GET_MODE (target);
49708 rtx tmp0, tmp1, x;
49710 switch (mode)
49712 /* For 32-bit signed integer X, the best way to calculate the absolute
49713 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
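/* E.g. for X = -5: X >> 31 is -1, (-1 ^ -5) is 4 and 4 - (-1) is 5;
   for non-negative X the shift gives 0 and X is returned unchanged.  */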
49714 case E_V4SImode:
49715 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49716 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49717 NULL, 0, OPTAB_DIRECT);
49718 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49719 NULL, 0, OPTAB_DIRECT);
49720 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49721 target, 0, OPTAB_DIRECT);
49722 break;
49724 /* For 16-bit signed integer X, the best way to calculate the absolute
49725 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49726 case E_V8HImode:
49727 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49729 x = expand_simple_binop (mode, SMAX, tmp0, input,
49730 target, 0, OPTAB_DIRECT);
49731 break;
49733 /* For 8-bit signed integer X, the best way to calculate the absolute
49734 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49735 as SSE2 provides the PMINUB insn. */
49736 case E_V16QImode:
49737 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49739 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49740 target, 0, OPTAB_DIRECT);
49741 break;
49743 default:
49744 gcc_unreachable ();
49747 if (x != target)
49748 emit_move_insn (target, x);
49751 /* Expand an extract from a vector register through pextr insn.
49752 Return true if successful. */
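/* E.g. a 32-bit extraction at bit position 64 from a TImode source
   becomes (given SSE4.1) a pextrd of element pos / size == 2 of the
   V4SImode view of the source.  */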
49754 bool
49755 ix86_expand_pextr (rtx *operands)
49757 rtx dst = operands[0];
49758 rtx src = operands[1];
49760 unsigned int size = INTVAL (operands[2]);
49761 unsigned int pos = INTVAL (operands[3]);
49763 if (SUBREG_P (dst))
49765 /* Reject non-lowpart subregs. */
49766 if (SUBREG_BYTE (dst) > 0)
49767 return false;
49768 dst = SUBREG_REG (dst);
49771 if (SUBREG_P (src))
49773 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49774 src = SUBREG_REG (src);
49777 switch (GET_MODE (src))
49779 case E_V16QImode:
49780 case E_V8HImode:
49781 case E_V4SImode:
49782 case E_V2DImode:
49783 case E_V1TImode:
49784 case E_TImode:
49786 machine_mode srcmode, dstmode;
49787 rtx d, pat;
49789 if (!int_mode_for_size (size, 0).exists (&dstmode))
49790 return false;
49792 switch (dstmode)
49794 case E_QImode:
49795 if (!TARGET_SSE4_1)
49796 return false;
49797 srcmode = V16QImode;
49798 break;
49800 case E_HImode:
49801 if (!TARGET_SSE2)
49802 return false;
49803 srcmode = V8HImode;
49804 break;
49806 case E_SImode:
49807 if (!TARGET_SSE4_1)
49808 return false;
49809 srcmode = V4SImode;
49810 break;
49812 case E_DImode:
49813 gcc_assert (TARGET_64BIT);
49814 if (!TARGET_SSE4_1)
49815 return false;
49816 srcmode = V2DImode;
49817 break;
49819 default:
49820 return false;
49823 /* Reject extractions from misaligned positions. */
49824 if (pos & (size-1))
49825 return false;
49827 if (GET_MODE (dst) == dstmode)
49828 d = dst;
49829 else
49830 d = gen_reg_rtx (dstmode);
49832 /* Construct insn pattern. */
49833 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49834 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49836 /* Let the rtl optimizers know about the zero extension performed. */
49837 if (dstmode == QImode || dstmode == HImode)
49839 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49840 d = gen_lowpart (SImode, d);
49843 emit_insn (gen_rtx_SET (d, pat));
49845 if (d != dst)
49846 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49847 return true;
49850 default:
49851 return false;
49855 /* Expand an insert into a vector register through pinsr insn.
49856 Return true if successful. */
49858 bool
49859 ix86_expand_pinsr (rtx *operands)
49861 rtx dst = operands[0];
49862 rtx src = operands[3];
49864 unsigned int size = INTVAL (operands[1]);
49865 unsigned int pos = INTVAL (operands[2]);
49867 if (SUBREG_P (dst))
49869 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49870 dst = SUBREG_REG (dst);
49873 switch (GET_MODE (dst))
49875 case E_V16QImode:
49876 case E_V8HImode:
49877 case E_V4SImode:
49878 case E_V2DImode:
49879 case E_V1TImode:
49880 case E_TImode:
49882 machine_mode srcmode, dstmode;
49883 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49884 rtx d;
49886 if (!int_mode_for_size (size, 0).exists (&srcmode))
49887 return false;
49889 switch (srcmode)
49891 case E_QImode:
49892 if (!TARGET_SSE4_1)
49893 return false;
49894 dstmode = V16QImode;
49895 pinsr = gen_sse4_1_pinsrb;
49896 break;
49898 case E_HImode:
49899 if (!TARGET_SSE2)
49900 return false;
49901 dstmode = V8HImode;
49902 pinsr = gen_sse2_pinsrw;
49903 break;
49905 case E_SImode:
49906 if (!TARGET_SSE4_1)
49907 return false;
49908 dstmode = V4SImode;
49909 pinsr = gen_sse4_1_pinsrd;
49910 break;
49912 case E_DImode:
49913 gcc_assert (TARGET_64BIT);
49914 if (!TARGET_SSE4_1)
49915 return false;
49916 dstmode = V2DImode;
49917 pinsr = gen_sse4_1_pinsrq;
49918 break;
49920 default:
49921 return false;
49924 /* Reject insertions to misaligned positions. */
49925 if (pos & (size - 1))
49926 return false;
49928 if (SUBREG_P (src))
49930 unsigned int srcpos = SUBREG_BYTE (src);
49932 if (srcpos > 0)
49934 rtx extr_ops[4];
49936 extr_ops[0] = gen_reg_rtx (srcmode);
49937 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49938 extr_ops[2] = GEN_INT (size);
49939 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49941 if (!ix86_expand_pextr (extr_ops))
49942 return false;
49944 src = extr_ops[0];
49946 else
49947 src = gen_lowpart (srcmode, SUBREG_REG (src));
49950 if (GET_MODE (dst) == dstmode)
49951 d = dst;
49952 else
49953 d = gen_reg_rtx (dstmode);
49955 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49956 gen_lowpart (srcmode, src),
49957 GEN_INT (1 << (pos / size))));
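/* Illustration of the merge mask just computed (a hedged example, not tied
   to a specific caller): inserting a 16-bit value at bit position 32 of a
   V8HImode destination has srcmode == HImode, so the mask is
   1 << (32 / 16) == 4, i.e. element 2 of the vector is replaced.  */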
49958 if (d != dst)
49959 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49960 return true;
49963 default:
49964 return false;
49968 /* This function returns the calling-ABI-specific va_list type node,
49969 i.e. the va_list type specific to FNDECL. */
49971 static tree
49972 ix86_fn_abi_va_list (tree fndecl)
49974 if (!TARGET_64BIT)
49975 return va_list_type_node;
49976 gcc_assert (fndecl != NULL_TREE);
49978 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49979 return ms_va_list_type_node;
49980 else
49981 return sysv_va_list_type_node;
49984 /* Returns the canonical va_list type specified by TYPE. If there
49985 is no valid TYPE provided, it returns NULL_TREE. */
49987 static tree
49988 ix86_canonical_va_list_type (tree type)
49990 if (TARGET_64BIT)
49992 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49993 return ms_va_list_type_node;
49995 if ((TREE_CODE (type) == ARRAY_TYPE
49996 && integer_zerop (array_type_nelts (type)))
49997 || POINTER_TYPE_P (type))
49999 tree elem_type = TREE_TYPE (type);
50000 if (TREE_CODE (elem_type) == RECORD_TYPE
50001 && lookup_attribute ("sysv_abi va_list",
50002 TYPE_ATTRIBUTES (elem_type)))
50003 return sysv_va_list_type_node;
50006 return NULL_TREE;
50009 return std_canonical_va_list_type (type);
50012 /* Iterate through the target-specific builtin types for va_list.
50013 IDX denotes the iterator, *PTREE is set to the result type of
50014 the va_list builtin, and *PNAME to its internal type.
50015 Returns zero if there is no element for this index, otherwise
50016 IDX should be increased upon the next call.
50017 Note, do not iterate a base builtin's name like __builtin_va_list.
50018 Used from c_common_nodes_and_builtins. */
50020 static int
50021 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50023 if (TARGET_64BIT)
50025 switch (idx)
50027 default:
50028 break;
50030 case 0:
50031 *ptree = ms_va_list_type_node;
50032 *pname = "__builtin_ms_va_list";
50033 return 1;
50035 case 1:
50036 *ptree = sysv_va_list_type_node;
50037 *pname = "__builtin_sysv_va_list";
50038 return 1;
50042 return 0;
50045 #undef TARGET_SCHED_DISPATCH
50046 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
50047 #undef TARGET_SCHED_DISPATCH_DO
50048 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
50049 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50050 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50051 #undef TARGET_SCHED_REORDER
50052 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
50053 #undef TARGET_SCHED_ADJUST_PRIORITY
50054 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50055 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50056 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50057 ix86_dependencies_evaluation_hook
50060 /* Implementation of the reassociation_width target hook, used by the
50061 reassoc phase to identify the parallelism level in a reassociated
50062 tree. The statement's tree_code is passed in OP. The arguments' type
50063 is passed in MODE. */
50065 static int
50066 ix86_reassociation_width (unsigned int op, machine_mode mode)
50068 int width = 1;
50069 /* Vector part. */
50070 if (VECTOR_MODE_P (mode))
50072 int div = 1;
50073 if (INTEGRAL_MODE_P (mode))
50074 width = ix86_cost->reassoc_vec_int;
50075 else if (FLOAT_MODE_P (mode))
50076 width = ix86_cost->reassoc_vec_fp;
50078 if (width == 1)
50079 return 1;
50081 /* Integer vector instructions execute in the FP unit
50082 and can execute 3 additions and one multiplication per cycle. */
50083 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
50084 && op != PLUS && op != MINUS)
50085 return 1;
50087 /* Account for targets that split wide vectors into multiple parts. */
50088 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
50089 div = GET_MODE_BITSIZE (mode) / 128;
50090 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
50091 div = GET_MODE_BITSIZE (mode) / 64;
50092 width = (width + div - 1) / div;
50094 /* Scalar part. */
50095 else if (INTEGRAL_MODE_P (mode))
50096 width = ix86_cost->reassoc_int;
50097 else if (FLOAT_MODE_P (mode))
50098 width = ix86_cost->reassoc_fp;
50100 /* Avoid using too many registers in 32bit mode. */
50101 if (!TARGET_64BIT && width > 2)
50102 width = 2;
50103 return width;
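/* For illustration (the numbers are only an example, not from a specific
   cost table): with reassoc_vec_fp == 4 and a 256-bit vector mode on a
   TARGET_AVX128_OPTIMAL tuning, div == 256 / 128 == 2, so the reported
   width is (4 + 2 - 1) / 2 == 2, reflecting that each 256-bit operation
   is internally split into two 128-bit halves.  */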
50106 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50107 place emms and femms instructions. */
50109 static machine_mode
50110 ix86_preferred_simd_mode (scalar_mode mode)
50112 if (!TARGET_SSE)
50113 return word_mode;
50115 switch (mode)
50117 case E_QImode:
50118 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50119 return V64QImode;
50120 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50121 return V32QImode;
50122 else
50123 return V16QImode;
50125 case E_HImode:
50126 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50127 return V32HImode;
50128 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50129 return V16HImode;
50130 else
50131 return V8HImode;
50133 case E_SImode:
50134 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50135 return V16SImode;
50136 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50137 return V8SImode;
50138 else
50139 return V4SImode;
50141 case E_DImode:
50142 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50143 return V8DImode;
50144 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50145 return V4DImode;
50146 else
50147 return V2DImode;
50149 case E_SFmode:
50150 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50151 return V16SFmode;
50152 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50153 return V8SFmode;
50154 else
50155 return V4SFmode;
50157 case E_DFmode:
50158 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50159 return V8DFmode;
50160 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50161 return V4DFmode;
50162 else if (TARGET_SSE2)
50163 return V2DFmode;
50164 /* FALLTHRU */
50166 default:
50167 return word_mode;
50171 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
50172 of the upper halves against the lower halves, down to SSE reg size. */
50174 static machine_mode
50175 ix86_split_reduction (machine_mode mode)
50177 /* Reduce lowpart against highpart until we reach SSE reg width to
50178 avoid cross-lane operations. */
50179 switch (mode)
50181 case E_V8DImode:
50182 case E_V4DImode:
50183 return V2DImode;
50184 case E_V16SImode:
50185 case E_V8SImode:
50186 return V4SImode;
50187 case E_V32HImode:
50188 case E_V16HImode:
50189 return V8HImode;
50190 case E_V64QImode:
50191 case E_V32QImode:
50192 return V16QImode;
50193 case E_V16SFmode:
50194 case E_V8SFmode:
50195 return V4SFmode;
50196 case E_V8DFmode:
50197 case E_V4DFmode:
50198 return V2DFmode;
50199 default:
50200 return mode;
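/* For example, a V8SFmode reduction is first narrowed to V4SFmode: the
   upper 128-bit half is combined with the lower half, and only then are
   the remaining in-lane steps performed, so no cross-lane shuffles are
   needed.  */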
50204 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50205 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50206 256bit and 128bit vectors. */
50208 static void
50209 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
50211 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50213 sizes->safe_push (64);
50214 sizes->safe_push (32);
50215 sizes->safe_push (16);
50217 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50219 sizes->safe_push (32);
50220 sizes->safe_push (16);
50224 /* Implementation of targetm.vectorize.get_mask_mode. */
50226 static opt_machine_mode
50227 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50229 unsigned elem_size = vector_size / nunits;
50231 /* Scalar mask case. */
50232 if ((TARGET_AVX512F && vector_size == 64)
50233 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50235 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50236 return smallest_int_mode_for_size (nunits);
50239 scalar_int_mode elem_mode
50240 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50242 gcc_assert (elem_size * nunits == vector_size);
50244 return mode_for_vector (elem_mode, nunits);
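/* Hedged examples of the two cases above: with AVX512BW, a 64-byte
   V64QImode vector (nunits == 64) takes the scalar-mask branch and yields
   DImode, one mask bit per element; without AVX-512, a 32-byte V8SFmode
   vector (nunits == 8, elem_size == 4) falls through to the vector-mask
   case and yields V8SImode, one 32-bit mask element per lane.  */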
50249 /* Return the class of registers which could be used for a pseudo of MODE
50250 and of class RCLASS for spilling instead of memory. Return NO_REGS
50251 if it is not possible or not profitable. */
50253 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50255 static reg_class_t
50256 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50258 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50259 && TARGET_SSE2
50260 && TARGET_INTER_UNIT_MOVES_TO_VEC
50261 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50262 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50263 && INTEGER_CLASS_P (rclass))
50264 return ALL_SSE_REGS;
50265 return NO_REGS;
50268 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50269 but returns a lower bound. */
50271 static unsigned int
50272 ix86_max_noce_ifcvt_seq_cost (edge e)
50274 bool predictable_p = predictable_edge_p (e);
50276 enum compiler_param param
50277 = (predictable_p
50278 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50279 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50281 /* If we have a parameter set, use that; otherwise take a guess using
50282 BRANCH_COST. */
50283 if (global_options_set.x_param_values[param])
50284 return PARAM_VALUE (param);
50285 else
50286 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50289 /* Return true if SEQ is a good candidate as a replacement for the
50290 if-convertible sequence described in IF_INFO. */
50292 static bool
50293 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50295 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50297 int cmov_cnt = 0;
50298 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50299 Maybe we should allow even more conditional moves as long as they
50300 are used far enough not to stall the CPU, or also consider
50301 IF_INFO->TEST_BB succ edge probabilities. */
50302 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50304 rtx set = single_set (insn);
50305 if (!set)
50306 continue;
50307 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50308 continue;
50309 rtx src = SET_SRC (set);
50310 machine_mode mode = GET_MODE (src);
50311 if (GET_MODE_CLASS (mode) != MODE_INT
50312 && GET_MODE_CLASS (mode) != MODE_FLOAT)
50313 continue;
50314 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50315 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50316 continue;
50317 /* insn is CMOV or FCMOV. */
50318 if (++cmov_cnt > 1)
50319 return false;
50322 return default_noce_conversion_profitable_p (seq, if_info);
50325 /* Implement targetm.vectorize.init_cost. */
50327 static void *
50328 ix86_init_cost (struct loop *)
50330 unsigned *cost = XNEWVEC (unsigned, 3);
50331 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50332 return cost;
50335 /* Implement targetm.vectorize.add_stmt_cost. */
50337 static unsigned
50338 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50339 struct _stmt_vec_info *stmt_info, int misalign,
50340 enum vect_cost_model_location where)
50342 unsigned *cost = (unsigned *) data;
50343 unsigned retval = 0;
50345 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50346 int stmt_cost = -1;
50348 if ((kind == vector_stmt || kind == scalar_stmt)
50349 && stmt_info
50350 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50352 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50353 bool fp = false;
50354 machine_mode mode = TImode;
50356 if (vectype != NULL)
50358 fp = FLOAT_TYPE_P (vectype);
50359 mode = TYPE_MODE (vectype);
50361 /*machine_mode inner_mode = mode;
50362 if (VECTOR_MODE_P (mode))
50363 inner_mode = GET_MODE_INNER (mode);*/
50365 switch (subcode)
50367 case PLUS_EXPR:
50368 case POINTER_PLUS_EXPR:
50369 case MINUS_EXPR:
50370 if (kind == scalar_stmt)
50372 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50373 stmt_cost = ix86_cost->addss;
50374 else if (X87_FLOAT_MODE_P (mode))
50375 stmt_cost = ix86_cost->fadd;
50376 else
50377 stmt_cost = ix86_cost->add;
50379 else
50380 stmt_cost = ix86_vec_cost (mode,
50381 fp ? ix86_cost->addss
50382 : ix86_cost->sse_op,
50383 true);
50384 break;
50386 case MULT_EXPR:
50387 case WIDEN_MULT_EXPR:
50388 case MULT_HIGHPART_EXPR:
50389 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50390 break;
50391 case FMA_EXPR:
50392 stmt_cost = ix86_vec_cost (mode,
50393 mode == SFmode ? ix86_cost->fmass
50394 : ix86_cost->fmasd,
50395 true);
50396 break;
50397 case NEGATE_EXPR:
50398 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50399 stmt_cost = ix86_cost->sse_op;
50400 else if (X87_FLOAT_MODE_P (mode))
50401 stmt_cost = ix86_cost->fchs;
50402 else if (VECTOR_MODE_P (mode))
50403 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50404 else
50405 stmt_cost = ix86_cost->add;
50406 break;
50407 case TRUNC_DIV_EXPR:
50408 case CEIL_DIV_EXPR:
50409 case FLOOR_DIV_EXPR:
50410 case ROUND_DIV_EXPR:
50411 case TRUNC_MOD_EXPR:
50412 case CEIL_MOD_EXPR:
50413 case FLOOR_MOD_EXPR:
50414 case RDIV_EXPR:
50415 case ROUND_MOD_EXPR:
50416 case EXACT_DIV_EXPR:
50417 stmt_cost = ix86_division_cost (ix86_cost, mode);
50418 break;
50420 case RSHIFT_EXPR:
50421 case LSHIFT_EXPR:
50422 case LROTATE_EXPR:
50423 case RROTATE_EXPR:
50425 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50426 stmt_cost = ix86_shift_rotate_cost
50427 (ix86_cost, mode,
50428 TREE_CODE (op2) == INTEGER_CST,
50429 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50430 true, false, false, NULL, NULL);
50432 break;
50433 case NOP_EXPR:
50434 stmt_cost = 0;
50435 break;
50437 case BIT_IOR_EXPR:
50438 case ABS_EXPR:
50439 case MIN_EXPR:
50440 case MAX_EXPR:
50441 case BIT_XOR_EXPR:
50442 case BIT_AND_EXPR:
50443 case BIT_NOT_EXPR:
50444 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50445 stmt_cost = ix86_cost->sse_op;
50446 else if (VECTOR_MODE_P (mode))
50447 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50448 else
50449 stmt_cost = ix86_cost->add;
50450 break;
50451 default:
50452 break;
50455 /* If we do elementwise loads into a vector then we are bound by
50456 latency and execution resources for the many scalar loads
50457 (AGU and load ports). Try to account for this by scaling the
50458 construction cost by the number of elements involved. */
50459 if (kind == vec_construct
50460 && stmt_info
50461 && stmt_info->type == load_vec_info_type
50462 && stmt_info->memory_access_type == VMAT_ELEMENTWISE)
50464 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50465 stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
50467 if (stmt_cost == -1)
50468 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50470 /* Penalize DFmode vector operations for Bonnell. */
50471 if (TARGET_BONNELL && kind == vector_stmt
50472 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50473 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50475 /* Statements in an inner loop relative to the loop being
50476 vectorized are weighted more heavily. The value here is
50477 arbitrary and could potentially be improved with analysis. */
50478 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50479 count *= 50; /* FIXME. */
50481 retval = (unsigned) (count * stmt_cost);
50483 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
50484 for Silvermont, as it has an out-of-order integer pipeline and can execute
50485 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50486 if ((TARGET_SILVERMONT || TARGET_INTEL)
50487 && stmt_info && stmt_info->stmt)
50489 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50490 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50491 retval = (retval * 17) / 10;
50494 cost[where] += retval;
50496 return retval;
50499 /* Implement targetm.vectorize.finish_cost. */
50501 static void
50502 ix86_finish_cost (void *data, unsigned *prologue_cost,
50503 unsigned *body_cost, unsigned *epilogue_cost)
50505 unsigned *cost = (unsigned *) data;
50506 *prologue_cost = cost[vect_prologue];
50507 *body_cost = cost[vect_body];
50508 *epilogue_cost = cost[vect_epilogue];
50511 /* Implement targetm.vectorize.destroy_cost_data. */
50513 static void
50514 ix86_destroy_cost_data (void *data)
50516 free (data);
50519 /* Validate target specific memory model bits in VAL. */
50521 static unsigned HOST_WIDE_INT
50522 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50524 enum memmodel model = memmodel_from_int (val);
50525 bool strong;
50527 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50528 |MEMMODEL_MASK)
50529 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50531 warning (OPT_Winvalid_memory_model,
50532 "unknown architecture specific memory model");
50533 return MEMMODEL_SEQ_CST;
50535 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50536 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50538 warning (OPT_Winvalid_memory_model,
50539 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50540 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50542 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50544 warning (OPT_Winvalid_memory_model,
50545 "HLE_RELEASE not used with RELEASE or stronger memory model");
50546 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50548 return val;
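/* The HLE bits validated here are expected to be used roughly as follows
   (a sketch of typical user code with the standard __atomic built-ins,
   not an interface of this file):

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     ... critical section ...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Pairing HLE_ACQUIRE with a model weaker than ACQUIRE (or HLE_RELEASE
   with one weaker than RELEASE) triggers the warnings above, and the model
   is upgraded to SEQ_CST while the HLE bit is kept.  */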
50551 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50552 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50553 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50554 or number of vecsize_mangle variants that should be emitted. */
50556 static int
50557 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50558 struct cgraph_simd_clone *clonei,
50559 tree base_type, int num)
50561 int ret = 1;
50563 if (clonei->simdlen
50564 && (clonei->simdlen < 2
50565 || clonei->simdlen > 1024
50566 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50568 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50569 "unsupported simdlen %d", clonei->simdlen);
50570 return 0;
50573 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50574 if (TREE_CODE (ret_type) != VOID_TYPE)
50575 switch (TYPE_MODE (ret_type))
50577 case E_QImode:
50578 case E_HImode:
50579 case E_SImode:
50580 case E_DImode:
50581 case E_SFmode:
50582 case E_DFmode:
50583 /* case E_SCmode: */
50584 /* case E_DCmode: */
50585 break;
50586 default:
50587 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50588 "unsupported return type %qT for simd", ret_type);
50589 return 0;
50592 tree t;
50593 int i;
50595 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50596 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50597 switch (TYPE_MODE (TREE_TYPE (t)))
50599 case E_QImode:
50600 case E_HImode:
50601 case E_SImode:
50602 case E_DImode:
50603 case E_SFmode:
50604 case E_DFmode:
50605 /* case E_SCmode: */
50606 /* case E_DCmode: */
50607 break;
50608 default:
50609 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50610 "unsupported argument type %qT for simd", TREE_TYPE (t));
50611 return 0;
50614 if (!TREE_PUBLIC (node->decl))
50616 /* If the function isn't exported, we can pick just one ISA
50617 for the clones. */
50618 if (TARGET_AVX512F)
50619 clonei->vecsize_mangle = 'e';
50620 else if (TARGET_AVX2)
50621 clonei->vecsize_mangle = 'd';
50622 else if (TARGET_AVX)
50623 clonei->vecsize_mangle = 'c';
50624 else
50625 clonei->vecsize_mangle = 'b';
50626 ret = 1;
50628 else
50630 clonei->vecsize_mangle = "bcde"[num];
50631 ret = 4;
50633 clonei->mask_mode = VOIDmode;
50634 switch (clonei->vecsize_mangle)
50636 case 'b':
50637 clonei->vecsize_int = 128;
50638 clonei->vecsize_float = 128;
50639 break;
50640 case 'c':
50641 clonei->vecsize_int = 128;
50642 clonei->vecsize_float = 256;
50643 break;
50644 case 'd':
50645 clonei->vecsize_int = 256;
50646 clonei->vecsize_float = 256;
50647 break;
50648 case 'e':
50649 clonei->vecsize_int = 512;
50650 clonei->vecsize_float = 512;
50651 if (TYPE_MODE (base_type) == QImode)
50652 clonei->mask_mode = DImode;
50653 else
50654 clonei->mask_mode = SImode;
50655 break;
50657 if (clonei->simdlen == 0)
50659 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50660 clonei->simdlen = clonei->vecsize_int;
50661 else
50662 clonei->simdlen = clonei->vecsize_float;
50663 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50665 else if (clonei->simdlen > 16)
50667 /* For compatibility with ICC, use the same upper bounds
50668 for simdlen. In particular, for CTYPE below, use the return type,
50669 unless the function returns void, in which case use the characteristic
50670 type. If it is possible for the given SIMDLEN to pass a CTYPE value
50671 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50672 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
50673 emit the corresponding clone. */
50674 tree ctype = ret_type;
50675 if (TREE_CODE (ret_type) == VOID_TYPE)
50676 ctype = base_type;
50677 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50678 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50679 cnt /= clonei->vecsize_int;
50680 else
50681 cnt /= clonei->vecsize_float;
50682 if (cnt > (TARGET_64BIT ? 16 : 8))
50684 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50685 "unsupported simdlen %d", clonei->simdlen);
50686 return 0;
50689 return ret;
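/* Worked example (assuming an exported function, i.e. the "bcde" path):
   with a float characteristic type and simdlen 0, the 'e' (AVX-512)
   variant gets vecsize_float == 512 and hence simdlen == 512 / 32 == 16,
   while the 'b' (SSE2) variant gets simdlen == 128 / 32 == 4.  */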
50692 /* Add target attribute to SIMD clone NODE if needed. */
50694 static void
50695 ix86_simd_clone_adjust (struct cgraph_node *node)
50697 const char *str = NULL;
50698 gcc_assert (node->decl == cfun->decl);
50699 switch (node->simdclone->vecsize_mangle)
50701 case 'b':
50702 if (!TARGET_SSE2)
50703 str = "sse2";
50704 break;
50705 case 'c':
50706 if (!TARGET_AVX)
50707 str = "avx";
50708 break;
50709 case 'd':
50710 if (!TARGET_AVX2)
50711 str = "avx2";
50712 break;
50713 case 'e':
50714 if (!TARGET_AVX512F)
50715 str = "avx512f";
50716 break;
50717 default:
50718 gcc_unreachable ();
50720 if (str == NULL)
50721 return;
50722 push_cfun (NULL);
50723 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50724 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50725 gcc_assert (ok);
50726 pop_cfun ();
50727 ix86_reset_previous_fndecl ();
50728 ix86_set_current_function (node->decl);
50731 /* If SIMD clone NODE can't be used in a vectorized loop
50732 in the current function, return -1; otherwise return the badness of using it
50733 (0 if it is most desirable from the vecsize_mangle point of view, 1
50734 slightly less desirable, etc.). */
50736 static int
50737 ix86_simd_clone_usable (struct cgraph_node *node)
50739 switch (node->simdclone->vecsize_mangle)
50741 case 'b':
50742 if (!TARGET_SSE2)
50743 return -1;
50744 if (!TARGET_AVX)
50745 return 0;
50746 return TARGET_AVX2 ? 2 : 1;
50747 case 'c':
50748 if (!TARGET_AVX)
50749 return -1;
50750 return TARGET_AVX2 ? 1 : 0;
50751 case 'd':
50752 if (!TARGET_AVX2)
50753 return -1;
50754 return 0;
50755 case 'e':
50756 if (!TARGET_AVX512F)
50757 return -1;
50758 return 0;
50759 default:
50760 gcc_unreachable ();
50764 /* This function adjusts the unroll factor based on
50765 the hardware capabilities. For example, bdver3 has
50766 a loop buffer which makes unrolling of smaller
50767 loops less important. This function decides the
50768 unroll factor using the number of memory references
50769 (the value 32 is used) as a heuristic. */
50771 static unsigned
50772 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50774 basic_block *bbs;
50775 rtx_insn *insn;
50776 unsigned i;
50777 unsigned mem_count = 0;
50779 if (!TARGET_ADJUST_UNROLL)
50780 return nunroll;
50782 /* Count the number of memory references within the loop body.
50783 This value determines the unrolling factor for bdver3 and bdver4
50784 architectures. */
50785 subrtx_iterator::array_type array;
50786 bbs = get_loop_body (loop);
50787 for (i = 0; i < loop->num_nodes; i++)
50788 FOR_BB_INSNS (bbs[i], insn)
50789 if (NONDEBUG_INSN_P (insn))
50790 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50791 if (const_rtx x = *iter)
50792 if (MEM_P (x))
50794 machine_mode mode = GET_MODE (x);
50795 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50796 if (n_words > 4)
50797 mem_count += 2;
50798 else
50799 mem_count += 1;
50801 free (bbs);
50803 if (mem_count && mem_count <= 32)
50804 return MIN (nunroll, 32 / mem_count);
50806 return nunroll;
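/* For instance, a loop body with 8 single-word memory references has its
   unroll factor capped at 32 / 8 == 4; with zero or more than 32 memory
   references the generic NUNROLL is returned unchanged.  */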
50810 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50812 static bool
50813 ix86_float_exceptions_rounding_supported_p (void)
50815 /* For x87 floating point with standard excess precision handling,
50816 there is no adddf3 pattern (since x87 floating point only has
50817 XFmode operations) so the default hook implementation gets this
50818 wrong. */
50819 return TARGET_80387 || TARGET_SSE_MATH;
50822 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50824 static void
50825 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50827 if (!TARGET_80387 && !TARGET_SSE_MATH)
50828 return;
50829 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50830 if (TARGET_80387)
50832 tree fenv_index_type = build_index_type (size_int (6));
50833 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
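/* The x87 environment stored by fnstenv is 28 bytes in 32-bit protected
   mode, i.e. seven 32-bit words, hence the 0..6 index type above.  */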
50834 tree fenv_var = create_tmp_var_raw (fenv_type);
50835 TREE_ADDRESSABLE (fenv_var) = 1;
50836 tree fenv_ptr = build_pointer_type (fenv_type);
50837 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50838 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50839 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50840 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50841 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50842 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50843 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50844 tree hold_fnclex = build_call_expr (fnclex, 0);
50845 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50846 NULL_TREE, NULL_TREE);
50847 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50848 hold_fnclex);
50849 *clear = build_call_expr (fnclex, 0);
50850 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50851 tree fnstsw_call = build_call_expr (fnstsw, 0);
50852 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50853 sw_var, fnstsw_call);
50854 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50855 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50856 exceptions_var, exceptions_x87);
50857 *update = build2 (COMPOUND_EXPR, integer_type_node,
50858 sw_mod, update_mod);
50859 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50860 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50862 if (TARGET_SSE_MATH)
50864 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50865 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50866 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50867 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50868 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50869 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50870 mxcsr_orig_var, stmxcsr_hold_call);
50871 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50872 mxcsr_orig_var,
50873 build_int_cst (unsigned_type_node, 0x1f80));
50874 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50875 build_int_cst (unsigned_type_node, 0xffffffc0));
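/* In MXCSR, bits 7-12 (0x1f80) are the exception mask bits and bits 0-5
   are the exception flag bits, so the value loaded for the hold sequence
   masks all exceptions and clears any pending exception flags.  */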
50876 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50877 mxcsr_mod_var, hold_mod_val);
50878 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50879 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50880 hold_assign_orig, hold_assign_mod);
50881 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50882 ldmxcsr_hold_call);
50883 if (*hold)
50884 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50885 else
50886 *hold = hold_all;
50887 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50888 if (*clear)
50889 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50890 ldmxcsr_clear_call);
50891 else
50892 *clear = ldmxcsr_clear_call;
50893 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50894 tree exceptions_sse = fold_convert (integer_type_node,
50895 stxmcsr_update_call);
50896 if (*update)
50898 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50899 exceptions_var, exceptions_sse);
50900 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50901 exceptions_var, exceptions_mod);
50902 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50903 exceptions_assign);
50905 else
50906 *update = build2 (MODIFY_EXPR, integer_type_node,
50907 exceptions_var, exceptions_sse);
50908 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50909 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50910 ldmxcsr_update_call);
50912 tree atomic_feraiseexcept
50913 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50914 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50915 1, exceptions_var);
50916 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50917 atomic_feraiseexcept_call);
50920 /* Return the mode to be used for bounds, or VOIDmode
50921 if bounds are not supported. */
50923 static machine_mode
50924 ix86_mpx_bound_mode ()
50926 /* Do not support pointer checker if MPX
50927 is not enabled. */
50928 if (!TARGET_MPX)
50930 if (flag_check_pointer_bounds)
50931 warning (0, "Pointer Checker requires MPX support on this target."
50932 " Use the -mmpx option to enable MPX.");
50933 return VOIDmode;
50936 return BNDmode;
50939 /* Return constant used to statically initialize constant bounds.
50941 This function is used to create special bound values. For now
50942 only INIT bounds and NONE bounds are expected. More special
50943 values may be added later. */
50945 static tree
50946 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50948 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50949 : build_zero_cst (pointer_sized_int_node);
50950 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50951 : build_minus_one_cst (pointer_sized_int_node);
50953 /* This function is supposed to be used to create INIT and
50954 NONE bounds only. */
50955 gcc_assert ((lb == 0 && ub == -1)
50956 || (lb == -1 && ub == 0));
50958 return build_complex (NULL, low, high);
50961 /* Generate a list of statements STMTS to initialize pointer bounds
50962 variable VAR with bounds LB and UB. Return the number of generated
50963 statements. */
50965 static int
50966 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50968 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50969 tree lhs, modify, var_p;
50971 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50972 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50974 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50975 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50976 append_to_statement_list (modify, stmts);
50978 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50979 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50980 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50981 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50982 append_to_statement_list (modify, stmts);
50984 return 2;
50987 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50988 /* For i386, a common symbol is local only for non-PIE binaries. For
50989 x86-64, a common symbol is local only for non-PIE binaries or if the
50990 linker supports copy relocs in PIE binaries. */
50992 static bool
50993 ix86_binds_local_p (const_tree exp)
50995 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50996 (!flag_pic
50997 || (TARGET_64BIT
50998 && HAVE_LD_PIE_COPYRELOC != 0)));
51000 #endif
51002 /* If MEM is in the form of [base+offset], extract the two parts of the
51003 address into BASE and OFFSET and return true; otherwise return false. */
51005 static bool
51006 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51008 rtx addr;
51010 gcc_assert (MEM_P (mem));
51012 addr = XEXP (mem, 0);
51014 if (GET_CODE (addr) == CONST)
51015 addr = XEXP (addr, 0);
51017 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51019 *base = addr;
51020 *offset = const0_rtx;
51021 return true;
51024 if (GET_CODE (addr) == PLUS
51025 && (REG_P (XEXP (addr, 0))
51026 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51027 && CONST_INT_P (XEXP (addr, 1)))
51029 *base = XEXP (addr, 0);
51030 *offset = XEXP (addr, 1);
51031 return true;
51034 return false;
51037 /* Given OPERANDS of consecutive load/store instructions, check if we
51038 can merge them into a move-multiple. LOAD is true if they are load
51039 instructions. MODE is the mode of the memory operands. */
51041 bool
51042 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51043 machine_mode mode)
51045 HOST_WIDE_INT offval_1, offval_2, msize;
51046 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51048 if (load)
51050 mem_1 = operands[1];
51051 mem_2 = operands[3];
51052 reg_1 = operands[0];
51053 reg_2 = operands[2];
51055 else
51057 mem_1 = operands[0];
51058 mem_2 = operands[2];
51059 reg_1 = operands[1];
51060 reg_2 = operands[3];
51063 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
51065 if (REGNO (reg_1) != REGNO (reg_2))
51066 return false;
51068 /* Check if the addresses are in the form of [base+offset]. */
51069 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
51070 return false;
51071 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
51072 return false;
51074 /* Check if the bases are the same. */
51075 if (!rtx_equal_p (base_1, base_2))
51076 return false;
51078 offval_1 = INTVAL (offset_1);
51079 offval_2 = INTVAL (offset_2);
51080 msize = GET_MODE_SIZE (mode);
51081 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
51082 if (offval_1 + msize != offval_2)
51083 return false;
51085 return true;
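/* Illustrative case (hypothetical operands, for exposition only): two
   DImode accesses at [base + 16] and [base + 24] share the same base and
   differ by exactly GET_MODE_SIZE (DImode) == 8, so they pass the
   adjacency check above.  */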
51088 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
51090 static bool
51091 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
51092 optimization_type opt_type)
51094 switch (op)
51096 case asin_optab:
51097 case acos_optab:
51098 case log1p_optab:
51099 case exp_optab:
51100 case exp10_optab:
51101 case exp2_optab:
51102 case expm1_optab:
51103 case ldexp_optab:
51104 case scalb_optab:
51105 case round_optab:
51106 return opt_type == OPTIMIZE_FOR_SPEED;
51108 case rint_optab:
51109 if (SSE_FLOAT_MODE_P (mode1)
51110 && TARGET_SSE_MATH
51111 && !flag_trapping_math
51112 && !TARGET_SSE4_1)
51113 return opt_type == OPTIMIZE_FOR_SPEED;
51114 return true;
51116 case floor_optab:
51117 case ceil_optab:
51118 case btrunc_optab:
51119 if (SSE_FLOAT_MODE_P (mode1)
51120 && TARGET_SSE_MATH
51121 && !flag_trapping_math
51122 && TARGET_SSE4_1)
51123 return true;
51124 return opt_type == OPTIMIZE_FOR_SPEED;
51126 case rsqrt_optab:
51127 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51129 default:
51130 return true;
51134 /* Address space support.
51136 This is not "far pointers" in the 16-bit sense, but an easy way
51137 to use %fs and %gs segment prefixes. Therefore:
51139 (a) All address spaces have the same modes,
51140 (b) All address spaces have the same address forms,
51141 (c) While %fs and %gs are technically subsets of the generic
51142 address space, they are probably not subsets of each other.
51143 (d) Since we have no access to the segment base register values
51144 without resorting to a system call, we cannot convert a
51145 non-default address space to a default address space.
51146 Therefore we do not claim %fs or %gs are subsets of generic.
51148 Therefore we can (mostly) use the default hooks. */
51150 /* All use of segmentation is assumed to make address 0 valid. */
51152 static bool
51153 ix86_addr_space_zero_address_valid (addr_space_t as)
51155 return as != ADDR_SPACE_GENERIC;
51158 static void
51159 ix86_init_libfuncs (void)
51161 if (TARGET_64BIT)
51163 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51164 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51166 else
51168 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51169 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51172 #if TARGET_MACHO
51173 darwin_rename_builtins ();
51174 #endif
51177 /* Generate call to __divmoddi4. */
51179 static void
51180 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51181 rtx op0, rtx op1,
51182 rtx *quot_p, rtx *rem_p)
51184 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51186 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51187 mode,
51188 op0, GET_MODE (op0),
51189 op1, GET_MODE (op1),
51190 XEXP (rem, 0), Pmode);
51191 *quot_p = quot;
51192 *rem_p = rem;
51195 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51196 FPU, assume that the fpcw is set to extended precision; when using
51197 only SSE, rounding is correct; when using both SSE and the FPU,
51198 the rounding precision is indeterminate, since either may be chosen
51199 apparently at random. */
51201 static enum flt_eval_method
51202 ix86_excess_precision (enum excess_precision_type type)
51204 switch (type)
51206 case EXCESS_PRECISION_TYPE_FAST:
51207 /* The fastest type to promote to will always be the native type,
51208 whether that occurs with implicit excess precision or
51209 otherwise. */
51210 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51211 case EXCESS_PRECISION_TYPE_STANDARD:
51212 case EXCESS_PRECISION_TYPE_IMPLICIT:
51213 /* Otherwise, the excess precision we want when we are
51214 in a standards compliant mode, and the implicit precision we
51215 provide would be identical were it not for the unpredictable
51216 cases. */
51217 if (!TARGET_80387)
51218 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51219 else if (!TARGET_MIX_SSE_I387)
51221 if (!TARGET_SSE_MATH)
51222 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51223 else if (TARGET_SSE2)
51224 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51227 /* If we are in standards compliant mode, but we know we will
51228 calculate in unpredictable precision, return
51229 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
51230 excess precision if the target can't guarantee it will honor
51231 it. */
51232 return (type == EXCESS_PRECISION_TYPE_STANDARD
51233 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51234 : FLT_EVAL_METHOD_UNPREDICTABLE);
51235 default:
51236 gcc_unreachable ();
51239 return FLT_EVAL_METHOD_UNPREDICTABLE;
51242 /* Implement PUSH_ROUNDING. On the 386, we have a pushw instruction that
51243 decrements by exactly 2 no matter what the position was; there is no pushb.
51245 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51246 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51247 are a multiple of 4 for 32-bit targets and 8 for 64-bit targets. */
51249 poly_int64
51250 ix86_push_rounding (poly_int64 bytes)
51252 return ROUND_UP (bytes, UNITS_PER_WORD);
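/* E.g. pushing a 2-byte value rounds the stack adjustment up to 4 bytes
   for 32-bit targets and to 8 bytes for 64-bit targets, keeping all
   adjustments compatible with the CIE alignment factors noted above.  */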
51255 /* Target-specific selftests. */
51257 #if CHECKING_P
51259 namespace selftest {
51261 /* Verify that hard regs are dumped as expected (in compact mode). */
51263 static void
51264 ix86_test_dumping_hard_regs ()
51266 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51267 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51270 /* Test dumping an insn with repeated references to the same SCRATCH,
51271 to verify the rtx_reuse code. */
51273 static void
51274 ix86_test_dumping_memory_blockage ()
51276 set_new_first_and_last_insn (NULL, NULL);
51278 rtx pat = gen_memory_blockage ();
51279 rtx_reuse_manager r;
51280 r.preprocess (pat);
51282 /* Verify that the repeated references to the SCRATCH show the use
51283 of reuse IDs. The first should be prefixed with a reuse ID,
51284 and the second should be dumped as a "reuse_rtx" of that ID.
51285 The expected string assumes Pmode == DImode. */
51286 if (Pmode == DImode)
51287 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51288 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51289 " (unspec:BLK [\n"
51290 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51291 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51294 /* Verify loading an RTL dump; specifically a dump of copying
51295 a param on x86_64 from a hard reg into the frame.
51296 This test is target-specific since the dump contains target-specific
51297 hard reg names. */
51299 static void
51300 ix86_test_loading_dump_fragment_1 ()
51302 rtl_dump_test t (SELFTEST_LOCATION,
51303 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51305 rtx_insn *insn = get_insn_by_uid (1);
51307 /* The block structure and indentation here are purely for
51308 readability; they mirror the structure of the rtx. */
51309 tree mem_expr;
51311 rtx pat = PATTERN (insn);
51312 ASSERT_EQ (SET, GET_CODE (pat));
51314 rtx dest = SET_DEST (pat);
51315 ASSERT_EQ (MEM, GET_CODE (dest));
51316 /* Verify the "/c" was parsed. */
51317 ASSERT_TRUE (RTX_FLAG (dest, call));
51318 ASSERT_EQ (SImode, GET_MODE (dest));
51320 rtx addr = XEXP (dest, 0);
51321 ASSERT_EQ (PLUS, GET_CODE (addr));
51322 ASSERT_EQ (DImode, GET_MODE (addr));
51324 rtx lhs = XEXP (addr, 0);
51325 /* Verify that the "frame" REG was consolidated. */
51326 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51329 rtx rhs = XEXP (addr, 1);
51330 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51331 ASSERT_EQ (-4, INTVAL (rhs));
51334 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51335 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51336 /* "i" should have been handled by synthesizing a global int
51337 variable named "i". */
51338 mem_expr = MEM_EXPR (dest);
51339 ASSERT_NE (mem_expr, NULL);
51340 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51341 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51342 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51343 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51344 /* "+0". */
51345 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51346 ASSERT_EQ (0, MEM_OFFSET (dest));
51347 /* "S4". */
51348 ASSERT_EQ (4, MEM_SIZE (dest));
51349 /* "A32. */
51350 ASSERT_EQ (32, MEM_ALIGN (dest));
51353 rtx src = SET_SRC (pat);
51354 ASSERT_EQ (REG, GET_CODE (src));
51355 ASSERT_EQ (SImode, GET_MODE (src));
51356 ASSERT_EQ (5, REGNO (src));
51357 tree reg_expr = REG_EXPR (src);
51358 /* "i" here should point to the same var as for the MEM_EXPR. */
51359 ASSERT_EQ (reg_expr, mem_expr);
51364 /* Verify that the RTL loader copes with a call_insn dump.
51365 This test is target-specific since the dump contains a target-specific
51366 hard reg name. */
51368 static void
51369 ix86_test_loading_call_insn ()
51371 /* The test dump includes register "xmm0", which requires TARGET_SSE
51372 to exist. */
51373 if (!TARGET_SSE)
51374 return;
51376 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51378 rtx_insn *insn = get_insns ();
51379 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51381 /* "/j". */
51382 ASSERT_TRUE (RTX_FLAG (insn, jump));
51384 rtx pat = PATTERN (insn);
51385 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51387 /* Verify REG_NOTES. */
51389 /* "(expr_list:REG_CALL_DECL". */
51390 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51391 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51392 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51394 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51395 rtx_expr_list *note1 = note0->next ();
51396 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51398 ASSERT_EQ (NULL, note1->next ());
51401 /* Verify CALL_INSN_FUNCTION_USAGE. */
51403 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51404 rtx_expr_list *usage
51405 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51406 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51407 ASSERT_EQ (DFmode, GET_MODE (usage));
51408 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51409 ASSERT_EQ (NULL, usage->next ());
51413 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51414 This test is target-specific since the dump contains target-specific
51415 hard reg names. */
51417 static void
51418 ix86_test_loading_full_dump ()
51420 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51422 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51424 rtx_insn *insn_1 = get_insn_by_uid (1);
51425 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51427 rtx_insn *insn_7 = get_insn_by_uid (7);
51428 ASSERT_EQ (INSN, GET_CODE (insn_7));
51429 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51431 rtx_insn *insn_15 = get_insn_by_uid (15);
51432 ASSERT_EQ (INSN, GET_CODE (insn_15));
51433 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51435 /* Verify crtl->return_rtx. */
51436 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51437 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51438 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51441 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51442 In particular, verify that it correctly loads the 2nd operand.
51443 This test is target-specific since these are machine-specific
51444 operands (and enums). */
51446 static void
51447 ix86_test_loading_unspec ()
51449 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51451 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51453 ASSERT_TRUE (cfun);
51455 /* Test of an UNSPEC. */
51456 rtx_insn *insn = get_insns ();
51457 ASSERT_EQ (INSN, GET_CODE (insn));
51458 rtx set = single_set (insn);
51459 ASSERT_NE (NULL, set);
51460 rtx dst = SET_DEST (set);
51461 ASSERT_EQ (MEM, GET_CODE (dst));
51462 rtx src = SET_SRC (set);
51463 ASSERT_EQ (UNSPEC, GET_CODE (src));
51464 ASSERT_EQ (BLKmode, GET_MODE (src));
51465 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51467 rtx v0 = XVECEXP (src, 0, 0);
51469 /* Verify that the two uses of the first SCRATCH have pointer
51470 equality. */
51471 rtx scratch_a = XEXP (dst, 0);
51472 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51474 rtx scratch_b = XEXP (v0, 0);
51475 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51477 ASSERT_EQ (scratch_a, scratch_b);
51479 /* Verify that the two mems are thus treated as equal. */
51480 ASSERT_TRUE (rtx_equal_p (dst, v0));
51482 /* Verify that the insn is recognized. */
51483 ASSERT_NE (-1, recog_memoized (insn));
51485 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51486 insn = NEXT_INSN (insn);
51487 ASSERT_EQ (INSN, GET_CODE (insn));
51489 set = single_set (insn);
51490 ASSERT_NE (NULL, set);
51492 src = SET_SRC (set);
51493 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51494 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51497 /* Run all target-specific selftests. */
51499 static void
51500 ix86_run_selftests (void)
51502 ix86_test_dumping_hard_regs ();
51503 ix86_test_dumping_memory_blockage ();
51505 /* Various tests of loading RTL dumps, here because they contain
51506 ix86-isms (e.g. names of hard regs). */
51507 ix86_test_loading_dump_fragment_1 ();
51508 ix86_test_loading_call_insn ();
51509 ix86_test_loading_full_dump ();
51510 ix86_test_loading_unspec ();
51513 } // namespace selftest
51515 #endif /* CHECKING_P */
51517 /* Initialize the GCC target structure. */
51518 #undef TARGET_RETURN_IN_MEMORY
51519 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51521 #undef TARGET_LEGITIMIZE_ADDRESS
51522 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51524 #undef TARGET_ATTRIBUTE_TABLE
51525 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51526 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51527 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51528 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51529 # undef TARGET_MERGE_DECL_ATTRIBUTES
51530 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51531 #endif
51533 #undef TARGET_COMP_TYPE_ATTRIBUTES
51534 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51536 #undef TARGET_INIT_BUILTINS
51537 #define TARGET_INIT_BUILTINS ix86_init_builtins
51538 #undef TARGET_BUILTIN_DECL
51539 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51540 #undef TARGET_EXPAND_BUILTIN
51541 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51543 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51544 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51545 ix86_builtin_vectorized_function
51547 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51548 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51550 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51551 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51553 #undef TARGET_BUILTIN_RECIPROCAL
51554 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51556 #undef TARGET_ASM_FUNCTION_EPILOGUE
51557 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51559 #undef TARGET_ENCODE_SECTION_INFO
51560 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51561 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51562 #else
51563 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51564 #endif
51566 #undef TARGET_ASM_OPEN_PAREN
51567 #define TARGET_ASM_OPEN_PAREN ""
51568 #undef TARGET_ASM_CLOSE_PAREN
51569 #define TARGET_ASM_CLOSE_PAREN ""
51571 #undef TARGET_ASM_BYTE_OP
51572 #define TARGET_ASM_BYTE_OP ASM_BYTE
51574 #undef TARGET_ASM_ALIGNED_HI_OP
51575 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51576 #undef TARGET_ASM_ALIGNED_SI_OP
51577 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51578 #ifdef ASM_QUAD
51579 #undef TARGET_ASM_ALIGNED_DI_OP
51580 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51581 #endif
51583 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51584 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51586 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51587 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51589 #undef TARGET_ASM_UNALIGNED_HI_OP
51590 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51591 #undef TARGET_ASM_UNALIGNED_SI_OP
51592 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51593 #undef TARGET_ASM_UNALIGNED_DI_OP
51594 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51596 #undef TARGET_PRINT_OPERAND
51597 #define TARGET_PRINT_OPERAND ix86_print_operand
51598 #undef TARGET_PRINT_OPERAND_ADDRESS
51599 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51600 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51601 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51602 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51603 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51605 #undef TARGET_SCHED_INIT_GLOBAL
51606 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51607 #undef TARGET_SCHED_ADJUST_COST
51608 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51609 #undef TARGET_SCHED_ISSUE_RATE
51610 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51611 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51612 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51613 ia32_multipass_dfa_lookahead
51614 #undef TARGET_SCHED_MACRO_FUSION_P
51615 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51616 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51617 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51619 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51620 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51622 #undef TARGET_MEMMODEL_CHECK
51623 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51625 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51626 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51628 #ifdef HAVE_AS_TLS
51629 #undef TARGET_HAVE_TLS
51630 #define TARGET_HAVE_TLS true
51631 #endif
51632 #undef TARGET_CANNOT_FORCE_CONST_MEM
51633 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51634 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51635 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51637 #undef TARGET_DELEGITIMIZE_ADDRESS
51638 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51640 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
51641 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
51643 #undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#else
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P ix86_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION ix86_excess_precision
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_INIT_PIC_REG
#define TARGET_INIT_PIC_REG ix86_init_pic_reg
#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
#undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
#define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
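
/* Vectorizer target hooks: cost model, constant permutations and
   preferred/supported SIMD modes.  */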
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_SPLIT_REDUCTION
#define TARGET_VECTORIZE_SPLIT_REDUCTION \
  ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
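
/* Hooks consumed by the mode-switching pass (optimize_mode_switching).  */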
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
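
/* Pointer Bounds Checker (CHKP/MPX) hooks.  */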
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */
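
/* TARGET_INITIALIZER (from target-def.h) collects the TARGET_* macros
   defined above into the target hook vector used by the rest of the
   compiler.  */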
struct gcc_target targetm = TARGET_INITIALIZER;
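
/* Garbage-collector root tables for this file, generated by gengtype.  */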
#include "gt-i386.h"