i386: Skip DF_REF_INSN if DF_REF_INSN_INFO is false
gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 #include "x86-tune-costs.h"
97 static rtx legitimize_dllimport_symbol (rtx, bool);
98 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
99 static rtx legitimize_pe_coff_symbol (rtx, bool);
100 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
101 static bool ix86_save_reg (unsigned int, bool, bool);
102 static bool ix86_function_naked (const_tree);
103 static bool ix86_notrack_prefixed_insn_p (rtx);
105 #ifndef CHECK_STACK_LIMIT
106 #define CHECK_STACK_LIMIT (-1)
107 #endif
109 /* Return index of given mode in mult and division cost tables. */
110 #define MODE_INDEX(mode) \
111 ((mode) == QImode ? 0 \
112 : (mode) == HImode ? 1 \
113 : (mode) == SImode ? 2 \
114 : (mode) == DImode ? 3 \
115 : 4)
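/* Illustration (added for exposition, not part of the original source):
   MODE_INDEX maps the scalar integer modes onto rows of the per-processor
   cost tables, e.g. MODE_INDEX (QImode) == 0, MODE_INDEX (SImode) == 2 and
   MODE_INDEX (DImode) == 3, so a multiply-cost lookup is typically spelled
   something like ix86_cost->mult_init[MODE_INDEX (mode)].  The exact cost
   field used here is illustrative.  */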
118 /* Set by -mtune. */
119 const struct processor_costs *ix86_tune_cost = NULL;
121 /* Set by -mtune or -Os. */
122 const struct processor_costs *ix86_cost = NULL;
124 /* Processor feature/optimization bitmasks. */
125 #define m_386 (1U<<PROCESSOR_I386)
126 #define m_486 (1U<<PROCESSOR_I486)
127 #define m_PENT (1U<<PROCESSOR_PENTIUM)
128 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
129 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
130 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
131 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
132 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
133 #define m_CORE2 (1U<<PROCESSOR_CORE2)
134 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
135 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
136 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
137 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
138 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
139 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
140 #define m_KNL (1U<<PROCESSOR_KNL)
141 #define m_KNM (1U<<PROCESSOR_KNM)
142 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
143 #define m_INTEL (1U<<PROCESSOR_INTEL)
145 #define m_GEODE (1U<<PROCESSOR_GEODE)
146 #define m_K6 (1U<<PROCESSOR_K6)
147 #define m_K6_GEODE (m_K6 | m_GEODE)
148 #define m_K8 (1U<<PROCESSOR_K8)
149 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
150 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
151 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
152 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
153 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
154 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
155 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
156 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
157 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
158 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
159 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
160 #define m_BTVER (m_BTVER1 | m_BTVER2)
161 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
162 | m_ZNVER1)
164 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
166 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
167 #undef DEF_TUNE
168 #define DEF_TUNE(tune, name, selector) name,
169 #include "x86-tune.def"
170 #undef DEF_TUNE
173 /* Feature tests against the various tunings. */
174 unsigned char ix86_tune_features[X86_TUNE_LAST];
176 /* Feature tests against the various tunings used to create ix86_tune_features
177 based on the processor mask. */
178 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
179 #undef DEF_TUNE
180 #define DEF_TUNE(tune, name, selector) selector,
181 #include "x86-tune.def"
182 #undef DEF_TUNE
185 /* Feature tests against the various architecture variations. */
186 unsigned char ix86_arch_features[X86_ARCH_LAST];
188 /* Feature tests against the various architecture variations, used to create
189 ix86_arch_features based on the processor mask. */
190 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
191 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
192 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
194 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
195 ~m_386,
197 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
198 ~(m_386 | m_486),
200 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
201 ~m_386,
203 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
204 ~m_386,
207 /* In case the average insn count for single function invocation is
208 lower than this constant, emit fast (but longer) prologue and
209 epilogue code. */
210 #define FAST_PROLOGUE_INSN_COUNT 20
212 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
213 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
214 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
215 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
217 /* Array of the smallest class containing reg number REGNO, indexed by
218 REGNO. Used by REGNO_REG_CLASS in i386.h. */
220 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
222 /* ax, dx, cx, bx */
223 AREG, DREG, CREG, BREG,
224 /* si, di, bp, sp */
225 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
226 /* FP registers */
227 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
228 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
229 /* arg pointer */
230 NON_Q_REGS,
231 /* flags, fpsr, fpcr, frame */
232 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
233 /* SSE registers */
234 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
235 SSE_REGS, SSE_REGS,
236 /* MMX registers */
237 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
238 MMX_REGS, MMX_REGS,
239 /* REX registers */
240 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
241 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
242 /* SSE REX registers */
243 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
244 SSE_REGS, SSE_REGS,
245 /* AVX-512 SSE registers */
246 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
247 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
248 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
249 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
250 /* Mask registers. */
251 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
252 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
253 /* MPX bound registers */
254 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
257 /* The "default" register map used in 32bit mode. */
259 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
261 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
262 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
263 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
264 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
265 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
267 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
268 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
269 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
270 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
271 101, 102, 103, 104, /* bound registers */
274 /* The "default" register map used in 64bit mode. */
276 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
278 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
279 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
280 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
281 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
282 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
283     8, 9, 10, 11, 12, 13, 14, 15,		/* extended integer registers */
284 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
285 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
286 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
287 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
288 126, 127, 128, 129, /* bound registers */
291 /* Define the register numbers to be used in Dwarf debugging information.
292 The SVR4 reference port C compiler uses the following register numbers
293 in its Dwarf output code:
294 0 for %eax (gcc regno = 0)
295 1 for %ecx (gcc regno = 2)
296 2 for %edx (gcc regno = 1)
297 3 for %ebx (gcc regno = 3)
298 4 for %esp (gcc regno = 7)
299 5 for %ebp (gcc regno = 6)
300 6 for %esi (gcc regno = 4)
301 7 for %edi (gcc regno = 5)
302 The following three DWARF register numbers are never generated by
303 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
304 believes these numbers have these meanings.
305 8 for %eip (no gcc equivalent)
306 9 for %eflags (gcc regno = 17)
307 10 for %trapno (no gcc equivalent)
308 It is not at all clear how we should number the FP stack registers
309 for the x86 architecture. If the version of SDB on x86/svr4 were
310 a bit less brain dead with respect to floating-point then we would
311 have a precedent to follow with respect to DWARF register numbers
312 for x86 FP registers, but the SDB on x86/svr4 is so completely
313 broken with respect to FP registers that it is hardly worth thinking
314 of it as something to strive for compatibility with.
315 The version of x86/svr4 SDB I have at the moment does (partially)
316 seem to believe that DWARF register number 11 is associated with
317 the x86 register %st(0), but that's about all. Higher DWARF
318 register numbers don't seem to be associated with anything in
319 particular, and even for DWARF regno 11, SDB only seems to under-
320 stand that it should say that a variable lives in %st(0) (when
321 asked via an `=' command) if we said it was in DWARF regno 11,
322 but SDB still prints garbage when asked for the value of the
323 variable in question (via a `/' command).
324 (Also note that the labels SDB prints for various FP stack regs
325 when doing an `x' command are all wrong.)
326 Note that these problems generally don't affect the native SVR4
327 C compiler because it doesn't allow the use of -O with -g and
328 because when it is *not* optimizing, it allocates a memory
329 location for each floating-point variable, and the memory
330 location is what gets described in the DWARF AT_location
331 attribute for the variable in question.
332 Regardless of the severe mental illness of the x86/svr4 SDB, we
333 do something sensible here and we use the following DWARF
334 register numbers. Note that these are all stack-top-relative
335 numbers.
336 11 for %st(0) (gcc regno = 8)
337 12 for %st(1) (gcc regno = 9)
338 13 for %st(2) (gcc regno = 10)
339 14 for %st(3) (gcc regno = 11)
340 15 for %st(4) (gcc regno = 12)
341 16 for %st(5) (gcc regno = 13)
342 17 for %st(6) (gcc regno = 14)
343 18 for %st(7) (gcc regno = 15)
345 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
347 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
348 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
349 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
350 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
351 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
352 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
353 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
354 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
355 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
356 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
357 101, 102, 103, 104, /* bound registers */
360 /* Define parameter passing and return registers. */
362 static int const x86_64_int_parameter_registers[6] =
364 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
367 static int const x86_64_ms_abi_int_parameter_registers[4] =
369 CX_REG, DX_REG, R8_REG, R9_REG
372 static int const x86_64_int_return_registers[4] =
374 AX_REG, DX_REG, DI_REG, SI_REG
377 /* Additional registers that are clobbered by SYSV calls. */
379 #define NUM_X86_64_MS_CLOBBERED_REGS 12
380 static int const x86_64_ms_sysv_extra_clobbered_registers
381 [NUM_X86_64_MS_CLOBBERED_REGS] =
383 SI_REG, DI_REG,
384 XMM6_REG, XMM7_REG,
385 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
386 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
389 enum xlogue_stub {
390 XLOGUE_STUB_SAVE,
391 XLOGUE_STUB_RESTORE,
392 XLOGUE_STUB_RESTORE_TAIL,
393 XLOGUE_STUB_SAVE_HFP,
394 XLOGUE_STUB_RESTORE_HFP,
395 XLOGUE_STUB_RESTORE_HFP_TAIL,
397 XLOGUE_STUB_COUNT
400 enum xlogue_stub_sets {
401 XLOGUE_SET_ALIGNED,
402 XLOGUE_SET_ALIGNED_PLUS_8,
403 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
404 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
406 XLOGUE_SET_COUNT
409 /* Register save/restore layout used by out-of-line stubs. */
410 class xlogue_layout {
411 public:
412 struct reginfo
414 unsigned regno;
415 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
416 rsi) to where each register is stored. */
419 unsigned get_nregs () const {return m_nregs;}
420 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
422 const reginfo &get_reginfo (unsigned reg) const
424 gcc_assert (reg < m_nregs);
425 return m_regs[reg];
428 static const char *get_stub_name (enum xlogue_stub stub,
429 unsigned n_extra_args);
431 /* Returns an rtx for the stub's symbol based upon
432 1.) the specified stub (save, restore or restore_ret) and
433 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
434 	3.) whether or not stack alignment is being performed.  */
435 static rtx get_stub_rtx (enum xlogue_stub stub);
437 /* Returns the amount of stack space (including padding) that the stub
438 needs to store registers based upon data in the machine_function. */
439 HOST_WIDE_INT get_stack_space_used () const
441 const struct machine_function *m = cfun->machine;
442 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
444 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
445 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
448 /* Returns the offset for the base pointer used by the stub. */
449 HOST_WIDE_INT get_stub_ptr_offset () const
451 return STUB_INDEX_OFFSET + m_stack_align_off_in;
454 static const struct xlogue_layout &get_instance ();
455 static unsigned count_stub_managed_regs ();
456 static bool is_stub_managed_reg (unsigned regno, unsigned count);
458 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
459 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
460 static const unsigned MAX_REGS = 18;
461 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
462 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
463 static const unsigned STUB_NAME_MAX_LEN = 20;
464 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
465 static const unsigned REG_ORDER[MAX_REGS];
466 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
468 private:
469 xlogue_layout ();
470 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
471 xlogue_layout (const xlogue_layout &);
473 /* True if hard frame pointer is used. */
474 bool m_hfp;
476   /* Max number of registers this layout manages.  */
477 unsigned m_nregs;
479 /* Incoming offset from 16-byte alignment. */
480 HOST_WIDE_INT m_stack_align_off_in;
482 /* Register order and offsets. */
483 struct reginfo m_regs[MAX_REGS];
485 /* Lazy-inited cache of symbol names for stubs. */
486 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
487 [STUB_NAME_MAX_LEN];
489 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
492 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
493 "savms64",
494 "resms64",
495 "resms64x",
496 "savms64f",
497 "resms64f",
498 "resms64fx"
501 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
502 /* The below offset values are where each register is stored for the layout
503 relative to incoming stack pointer. The value of each m_regs[].offset will
504 be relative to the incoming base pointer (rax or rsi) used by the stub.
506 s_instances: 0 1 2 3
507 Offset: realigned or aligned + 8
508 Register aligned aligned + 8 aligned w/HFP w/HFP */
509 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
510 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
511 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
512 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
513 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
514 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
515 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
516 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
517 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
518 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
519 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
520 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
521 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
522 BP_REG, /* 0xc0 0xc8 N/A N/A */
523 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
524 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
525 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
526 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
529 /* Instantiate static const values. */
530 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
531 const unsigned xlogue_layout::MIN_REGS;
532 const unsigned xlogue_layout::MAX_REGS;
533 const unsigned xlogue_layout::MAX_EXTRA_REGS;
534 const unsigned xlogue_layout::VARIANT_COUNT;
535 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
537 /* Initialize xlogue_layout::s_stub_names to zero. */
538 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
539 [STUB_NAME_MAX_LEN];
541 /* Instantiates all xlogue_layout instances. */
542 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
543 xlogue_layout (0, false),
544 xlogue_layout (8, false),
545 xlogue_layout (0, true),
546 xlogue_layout (8, true)
549 /* Return an appropriate const instance of xlogue_layout based upon values
550 in cfun->machine and crtl. */
551 const struct xlogue_layout &
552 xlogue_layout::get_instance ()
554 enum xlogue_stub_sets stub_set;
555 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
557 if (stack_realign_fp)
558 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
559 else if (frame_pointer_needed)
560 stub_set = aligned_plus_8
561 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
562 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
563 else
564 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
566 return s_instances[stub_set];
569 /* Determine how many clobbered registers can be saved by the stub.
570 Returns the count of registers the stub will save and restore. */
571 unsigned
572 xlogue_layout::count_stub_managed_regs ()
574 bool hfp = frame_pointer_needed || stack_realign_fp;
575 unsigned i, count;
576 unsigned regno;
578 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
580 regno = REG_ORDER[i];
581 if (regno == BP_REG && hfp)
582 continue;
583 if (!ix86_save_reg (regno, false, false))
584 break;
585 ++count;
587 return count;
590 /* Determine if register REGNO is a stub managed register given the
591 total COUNT of stub managed registers. */
592 bool
593 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
595 bool hfp = frame_pointer_needed || stack_realign_fp;
596 unsigned i;
598 for (i = 0; i < count; ++i)
600 gcc_assert (i < MAX_REGS);
601 if (REG_ORDER[i] == BP_REG && hfp)
602 ++count;
603 else if (REG_ORDER[i] == regno)
604 return true;
606 return false;
609 /* Constructor for xlogue_layout. */
610 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
611 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
612 m_stack_align_off_in (stack_align_off_in)
614 HOST_WIDE_INT offset = stack_align_off_in;
615 unsigned i, j;
617 for (i = j = 0; i < MAX_REGS; ++i)
619 unsigned regno = REG_ORDER[i];
621 if (regno == BP_REG && hfp)
622 continue;
623 if (SSE_REGNO_P (regno))
625 offset += 16;
626 /* Verify that SSE regs are always aligned. */
627 gcc_assert (!((stack_align_off_in + offset) & 15));
629 else
630 offset += 8;
632 m_regs[j].regno = regno;
633 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
635 gcc_assert (j == m_nregs);
638 const char *
639 xlogue_layout::get_stub_name (enum xlogue_stub stub,
640 unsigned n_extra_regs)
642 const int have_avx = TARGET_AVX;
643 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
645 /* Lazy init */
646 if (!*name)
648 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
649 (have_avx ? "avx" : "sse"),
650 STUB_BASE_NAMES[stub],
651 MIN_REGS + n_extra_regs);
652 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
655 return name;
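/* Illustrative note (added for exposition): with the format string above,
   the generated stub names look like "__sse_savms64_12" or
   "__avx_resms64x_17", i.e. "__" + ("avx"|"sse") + "_" + a base name from
   STUB_BASE_NAMES + "_" + the number of registers handled
   (MIN_REGS + n_extra_regs).  */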
658 /* Return rtx of a symbol ref for the entry point (based upon
659 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
661 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
663 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
664 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
665 gcc_assert (stub < XLOGUE_STUB_COUNT);
666 gcc_assert (crtl->stack_realign_finalized);
668 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
671 /* Define the structure for the machine field in struct function. */
673 struct GTY(()) stack_local_entry {
674 unsigned short mode;
675 unsigned short n;
676 rtx rtl;
677 struct stack_local_entry *next;
680 /* Which cpu are we scheduling for. */
681 enum attr_cpu ix86_schedule;
683 /* Which cpu are we optimizing for. */
684 enum processor_type ix86_tune;
686 /* Which instruction set architecture to use. */
687 enum processor_type ix86_arch;
689 /* True if processor has SSE prefetch instruction. */
690 unsigned char x86_prefetch_sse;
692 /* -mstackrealign option */
693 static const char ix86_force_align_arg_pointer_string[]
694 = "force_align_arg_pointer";
696 static rtx (*ix86_gen_leave) (void);
697 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
698 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
699 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
700 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
701 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
702 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
703 static rtx (*ix86_gen_clzero) (rtx);
704 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
706 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
709 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
711 /* Preferred alignment for stack boundary in bits. */
712 unsigned int ix86_preferred_stack_boundary;
714 /* Alignment for incoming stack boundary in bits specified at
715 command line. */
716 static unsigned int ix86_user_incoming_stack_boundary;
718 /* Default alignment for incoming stack boundary in bits. */
719 static unsigned int ix86_default_incoming_stack_boundary;
721 /* Alignment for incoming stack boundary in bits. */
722 unsigned int ix86_incoming_stack_boundary;
724 /* Calling abi specific va_list type nodes. */
725 static GTY(()) tree sysv_va_list_type_node;
726 static GTY(()) tree ms_va_list_type_node;
728 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
729 char internal_label_prefix[16];
730 int internal_label_prefix_len;
732 /* Fence to use after loop using movnt. */
733 tree x86_mfence;
735 /* Register class used for passing given 64bit part of the argument.
736 These represent classes as documented by the PS ABI, with the exception
737    of SSESF and SSEDF classes, which are basically the SSE class, except that
738    gcc will use SF or DFmode moves instead of DImode to avoid reformatting penalties.
740 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
741 whenever possible (upper half does contain padding). */
742 enum x86_64_reg_class
744 X86_64_NO_CLASS,
745 X86_64_INTEGER_CLASS,
746 X86_64_INTEGERSI_CLASS,
747 X86_64_SSE_CLASS,
748 X86_64_SSESF_CLASS,
749 X86_64_SSEDF_CLASS,
750 X86_64_SSEUP_CLASS,
751 X86_64_X87_CLASS,
752 X86_64_X87UP_CLASS,
753 X86_64_COMPLEX_X87_CLASS,
754 X86_64_MEMORY_CLASS
757 #define MAX_CLASSES 8
759 /* Table of constants used by fldpi, fldln2, etc.... */
760 static REAL_VALUE_TYPE ext_80387_constants_table [5];
761 static bool ext_80387_constants_init;
764 static struct machine_function * ix86_init_machine_status (void);
765 static rtx ix86_function_value (const_tree, const_tree, bool);
766 static bool ix86_function_value_regno_p (const unsigned int);
767 static unsigned int ix86_function_arg_boundary (machine_mode,
768 const_tree);
769 static rtx ix86_static_chain (const_tree, bool);
770 static int ix86_function_regparm (const_tree, const_tree);
771 static void ix86_compute_frame_layout (void);
772 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
773 rtx, rtx, int);
774 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
775 static tree ix86_canonical_va_list_type (tree);
776 static void predict_jump (int);
777 static unsigned int split_stack_prologue_scratch_regno (void);
778 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
780 enum ix86_function_specific_strings
782 IX86_FUNCTION_SPECIFIC_ARCH,
783 IX86_FUNCTION_SPECIFIC_TUNE,
784 IX86_FUNCTION_SPECIFIC_MAX
787 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
788 const char *, const char *, enum fpmath_unit,
789 bool);
790 static void ix86_function_specific_save (struct cl_target_option *,
791 struct gcc_options *opts);
792 static void ix86_function_specific_restore (struct gcc_options *opts,
793 struct cl_target_option *);
794 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
795 static void ix86_function_specific_print (FILE *, int,
796 struct cl_target_option *);
797 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
798 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
799 struct gcc_options *,
800 struct gcc_options *,
801 struct gcc_options *);
802 static bool ix86_can_inline_p (tree, tree);
803 static void ix86_set_current_function (tree);
804 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
806 static enum calling_abi ix86_function_abi (const_tree);
809 #ifndef SUBTARGET32_DEFAULT_CPU
810 #define SUBTARGET32_DEFAULT_CPU "i386"
811 #endif
813 /* Whether -mtune= or -march= were specified */
814 static int ix86_tune_defaulted;
815 static int ix86_arch_specified;
817 /* Vectorization library interface and handlers. */
818 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
820 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
821 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
823 /* Processor target table, indexed by processor number */
824 struct ptt
826 const char *const name; /* processor name */
827 const struct processor_costs *cost; /* Processor costs */
828 const int align_loop; /* Default alignments. */
829 const int align_loop_max_skip;
830 const int align_jump;
831 const int align_jump_max_skip;
832 const int align_func;
835 /* This table must be in sync with enum processor_type in i386.h. */
836 static const struct ptt processor_target_table[PROCESSOR_max] =
838 {"generic", &generic_cost, 16, 10, 16, 10, 16},
839 {"i386", &i386_cost, 4, 3, 4, 3, 4},
840 {"i486", &i486_cost, 16, 15, 16, 15, 16},
841 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
842 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
843 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
844 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
845 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
846 {"core2", &core_cost, 16, 10, 16, 10, 16},
847 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
848 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
849 {"haswell", &core_cost, 16, 10, 16, 10, 16},
850 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
851 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
852 {"knl", &slm_cost, 16, 15, 16, 7, 16},
853 {"knm", &slm_cost, 16, 15, 16, 7, 16},
854 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
855 {"intel", &intel_cost, 16, 15, 16, 7, 16},
856 {"geode", &geode_cost, 0, 0, 0, 0, 0},
857 {"k6", &k6_cost, 32, 7, 32, 7, 32},
858 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
859 {"k8", &k8_cost, 16, 7, 16, 7, 16},
860 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
861 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
862 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
863 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
864 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
865 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
866 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
867 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
870 static unsigned int
871 rest_of_handle_insert_vzeroupper (void)
873 int i;
875 /* vzeroupper instructions are inserted immediately after reload to
876 account for possible spills from 256bit registers. The pass
877 reuses mode switching infrastructure by re-running mode insertion
878 pass, so disable entities that have already been processed. */
879 for (i = 0; i < MAX_386_ENTITIES; i++)
880 ix86_optimize_mode_switching[i] = 0;
882 ix86_optimize_mode_switching[AVX_U128] = 1;
884 /* Call optimize_mode_switching. */
885 g->get_passes ()->execute_pass_mode_switching ();
886 return 0;
889 /* Return 1 if INSN uses or defines a hard register.
890 Hard register uses in a memory address are ignored.
891 Clobbers and flags definitions are ignored. */
893 static bool
894 has_non_address_hard_reg (rtx_insn *insn)
896 df_ref ref;
897 FOR_EACH_INSN_DEF (ref, insn)
898 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
899 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
900 && DF_REF_REGNO (ref) != FLAGS_REG)
901 return true;
903 FOR_EACH_INSN_USE (ref, insn)
904 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
905 return true;
907 return false;
910 /* Check if comparison INSN may be transformed
911 into vector comparison. Currently we transform
912 zero checks only which look like:
914 (set (reg:CCZ 17 flags)
915 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
916 (subreg:SI (reg:DI x) 0))
917 (const_int 0 [0]))) */
919 static bool
920 convertible_comparison_p (rtx_insn *insn)
922 if (!TARGET_SSE4_1)
923 return false;
925 rtx def_set = single_set (insn);
927 gcc_assert (def_set);
929 rtx src = SET_SRC (def_set);
930 rtx dst = SET_DEST (def_set);
932 gcc_assert (GET_CODE (src) == COMPARE);
934 if (GET_CODE (dst) != REG
935 || REGNO (dst) != FLAGS_REG
936 || GET_MODE (dst) != CCZmode)
937 return false;
939 rtx op1 = XEXP (src, 0);
940 rtx op2 = XEXP (src, 1);
942 if (op2 != CONST0_RTX (GET_MODE (op2)))
943 return false;
945 if (GET_CODE (op1) != IOR)
946 return false;
948 op2 = XEXP (op1, 1);
949 op1 = XEXP (op1, 0);
951 if (!SUBREG_P (op1)
952 || !SUBREG_P (op2)
953 || GET_MODE (op1) != SImode
954 || GET_MODE (op2) != SImode
955 || ((SUBREG_BYTE (op1) != 0
956 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
957 && (SUBREG_BYTE (op2) != 0
958 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
959 return false;
961 op1 = SUBREG_REG (op1);
962 op2 = SUBREG_REG (op2);
964 if (op1 != op2
965 || !REG_P (op1)
966 || GET_MODE (op1) != DImode)
967 return false;
969 return true;
972 /* The DImode version of scalar_to_vector_candidate_p. */
974 static bool
975 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
977 rtx def_set = single_set (insn);
979 if (!def_set)
980 return false;
982 if (has_non_address_hard_reg (insn))
983 return false;
985 rtx src = SET_SRC (def_set);
986 rtx dst = SET_DEST (def_set);
988 if (GET_CODE (src) == COMPARE)
989 return convertible_comparison_p (insn);
991 /* We are interested in DImode promotion only. */
992 if ((GET_MODE (src) != DImode
993 && !CONST_INT_P (src))
994 || GET_MODE (dst) != DImode)
995 return false;
997 if (!REG_P (dst) && !MEM_P (dst))
998 return false;
1000 switch (GET_CODE (src))
1002 case ASHIFTRT:
1003 if (!TARGET_AVX512VL)
1004 return false;
1005 /* FALLTHRU */
1007 case ASHIFT:
1008 case LSHIFTRT:
1009 if (!REG_P (XEXP (src, 1))
1010 && (!SUBREG_P (XEXP (src, 1))
1011 || SUBREG_BYTE (XEXP (src, 1)) != 0
1012 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1013 && (!CONST_INT_P (XEXP (src, 1))
1014 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1015 return false;
1017 if (GET_MODE (XEXP (src, 1)) != QImode
1018 && !CONST_INT_P (XEXP (src, 1)))
1019 return false;
1020 break;
1022 case PLUS:
1023 case MINUS:
1024 case IOR:
1025 case XOR:
1026 case AND:
1027 if (!REG_P (XEXP (src, 1))
1028 && !MEM_P (XEXP (src, 1))
1029 && !CONST_INT_P (XEXP (src, 1)))
1030 return false;
1032 if (GET_MODE (XEXP (src, 1)) != DImode
1033 && !CONST_INT_P (XEXP (src, 1)))
1034 return false;
1035 break;
1037 case NEG:
1038 case NOT:
1039 break;
1041 case REG:
1042 return true;
1044 case MEM:
1045 case CONST_INT:
1046 return REG_P (dst);
1048 default:
1049 return false;
1052 if (!REG_P (XEXP (src, 0))
1053 && !MEM_P (XEXP (src, 0))
1054 && !CONST_INT_P (XEXP (src, 0))
1055 /* Check for andnot case. */
1056 && (GET_CODE (src) != AND
1057 || GET_CODE (XEXP (src, 0)) != NOT
1058 || !REG_P (XEXP (XEXP (src, 0), 0))))
1059 return false;
1061 if (GET_MODE (XEXP (src, 0)) != DImode
1062 && !CONST_INT_P (XEXP (src, 0)))
1063 return false;
1065 return true;
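/* Example (added for exposition, not from the original source): a typical
   insn accepted by dimode_scalar_to_vector_candidate_p on 32-bit targets is

     (set (reg:DI 90) (plus:DI (reg:DI 91) (mem:DI ...)))

   i.e. a DImode arithmetic or logic operation whose operands are registers,
   memory or suitable constants; the STV pass can later rewrite such a chain
   to operate on V2DImode subregs in SSE registers.  */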
1068 /* The TImode version of scalar_to_vector_candidate_p. */
1070 static bool
1071 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1073 rtx def_set = single_set (insn);
1075 if (!def_set)
1076 return false;
1078 if (has_non_address_hard_reg (insn))
1079 return false;
1081 rtx src = SET_SRC (def_set);
1082 rtx dst = SET_DEST (def_set);
1084 /* Only TImode load and store are allowed. */
1085 if (GET_MODE (dst) != TImode)
1086 return false;
1088 if (MEM_P (dst))
1090       /* Check for a store.  The memory must be aligned, or unaligned stores
1091 	 must be optimal.  Only support stores from a register, a standard SSE
1092 	 constant, or a CONST_WIDE_INT generated from a piecewise store.
1094 ??? Verify performance impact before enabling CONST_INT for
1095 __int128 store. */
1096 if (misaligned_operand (dst, TImode)
1097 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1098 return false;
1100 switch (GET_CODE (src))
1102 default:
1103 return false;
1105 case REG:
1106 case CONST_WIDE_INT:
1107 return true;
1109 case CONST_INT:
1110 return standard_sse_constant_p (src, TImode);
1113 else if (MEM_P (src))
1115       /* Check for a load.  The memory must be aligned, or unaligned loads
1116 	 must be optimal.  */
1117 return (REG_P (dst)
1118 && (!misaligned_operand (src, TImode)
1119 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1122 return false;
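/* Sketch (added for exposition): on 64-bit targets the candidates are thus
   plain TImode moves, e.g.

     (set (reg:TI 100) (mem:TI ...))	;; load
     (set (mem:TI ...) (reg:TI 101))	;; store

   which the TImode chain converts to V1TImode so that a single SSE
   load/store can be used instead of a pair of 64-bit moves.  */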
1125 /* Return 1 if INSN may be converted into a vector
1126    instruction.  */
1128 static bool
1129 scalar_to_vector_candidate_p (rtx_insn *insn)
1131 if (TARGET_64BIT)
1132 return timode_scalar_to_vector_candidate_p (insn);
1133 else
1134 return dimode_scalar_to_vector_candidate_p (insn);
1137 /* The DImode version of remove_non_convertible_regs. */
1139 static void
1140 dimode_remove_non_convertible_regs (bitmap candidates)
1142 bitmap_iterator bi;
1143 unsigned id;
1144 bitmap regs = BITMAP_ALLOC (NULL);
1146 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1148 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1149 rtx reg = SET_DEST (def_set);
1151 if (!REG_P (reg)
1152 || bitmap_bit_p (regs, REGNO (reg))
1153 || HARD_REGISTER_P (reg))
1154 continue;
1156 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1157 def;
1158 def = DF_REF_NEXT_REG (def))
1160 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1162 if (dump_file)
1163 fprintf (dump_file,
1164 "r%d has non convertible definition in insn %d\n",
1165 REGNO (reg), DF_REF_INSN_UID (def));
1167 bitmap_set_bit (regs, REGNO (reg));
1168 break;
1173 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1175 for (df_ref def = DF_REG_DEF_CHAIN (id);
1176 def;
1177 def = DF_REF_NEXT_REG (def))
1178 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1180 if (dump_file)
1181 fprintf (dump_file, "Removing insn %d from candidates list\n",
1182 DF_REF_INSN_UID (def));
1184 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1188 BITMAP_FREE (regs);
1191 /* For a register REGNO, scan instructions for its defs and uses.
1192 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1194 static void
1195 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1196 unsigned int regno)
1198 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1199 def;
1200 def = DF_REF_NEXT_REG (def))
1202 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1204 if (dump_file)
1205 fprintf (dump_file,
1206 "r%d has non convertible def in insn %d\n",
1207 regno, DF_REF_INSN_UID (def));
1209 bitmap_set_bit (regs, regno);
1210 break;
1214 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1215 ref;
1216 ref = DF_REF_NEXT_REG (ref))
1218 /* Debug instructions are skipped. */
1219 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1220 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1222 if (dump_file)
1223 fprintf (dump_file,
1224 "r%d has non convertible use in insn %d\n",
1225 regno, DF_REF_INSN_UID (ref));
1227 bitmap_set_bit (regs, regno);
1228 break;
1233 /* The TImode version of remove_non_convertible_regs. */
1235 static void
1236 timode_remove_non_convertible_regs (bitmap candidates)
1238 bitmap_iterator bi;
1239 unsigned id;
1240 bitmap regs = BITMAP_ALLOC (NULL);
1242 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1244 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1245 rtx dest = SET_DEST (def_set);
1246 rtx src = SET_SRC (def_set);
1248 if ((!REG_P (dest)
1249 || bitmap_bit_p (regs, REGNO (dest))
1250 || HARD_REGISTER_P (dest))
1251 && (!REG_P (src)
1252 || bitmap_bit_p (regs, REGNO (src))
1253 || HARD_REGISTER_P (src)))
1254 continue;
1256 if (REG_P (dest))
1257 timode_check_non_convertible_regs (candidates, regs,
1258 REGNO (dest));
1260 if (REG_P (src))
1261 timode_check_non_convertible_regs (candidates, regs,
1262 REGNO (src));
1265 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1267 for (df_ref def = DF_REG_DEF_CHAIN (id);
1268 def;
1269 def = DF_REF_NEXT_REG (def))
1270 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1272 if (dump_file)
1273 fprintf (dump_file, "Removing insn %d from candidates list\n",
1274 DF_REF_INSN_UID (def));
1276 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1279 for (df_ref ref = DF_REG_USE_CHAIN (id);
1280 ref;
1281 ref = DF_REF_NEXT_REG (ref))
1282 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1284 if (dump_file)
1285 fprintf (dump_file, "Removing insn %d from candidates list\n",
1286 DF_REF_INSN_UID (ref));
1288 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1292 BITMAP_FREE (regs);
1295 /* For a given bitmap of insn UIDs, scan all instructions and
1296    remove an insn from CANDIDATES in case it has both convertible
1297    and non-convertible definitions.
1299 All insns in a bitmap are conversion candidates according to
1300 scalar_to_vector_candidate_p. Currently it implies all insns
1301 are single_set. */
1303 static void
1304 remove_non_convertible_regs (bitmap candidates)
1306 if (TARGET_64BIT)
1307 timode_remove_non_convertible_regs (candidates);
1308 else
1309 dimode_remove_non_convertible_regs (candidates);
1312 class scalar_chain
1314 public:
1315 scalar_chain ();
1316 virtual ~scalar_chain ();
1318 static unsigned max_id;
1320 /* ID of a chain. */
1321 unsigned int chain_id;
1322 /* A queue of instructions to be included into a chain. */
1323 bitmap queue;
1324 /* Instructions included into a chain. */
1325 bitmap insns;
1326 /* All registers defined by a chain. */
1327 bitmap defs;
1328   /* Registers used in both vector and scalar modes.  */
1329 bitmap defs_conv;
1331 void build (bitmap candidates, unsigned insn_uid);
1332 virtual int compute_convert_gain () = 0;
1333 int convert ();
1335 protected:
1336 void add_to_queue (unsigned insn_uid);
1337 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1339 private:
1340 void add_insn (bitmap candidates, unsigned insn_uid);
1341 void analyze_register_chain (bitmap candidates, df_ref ref);
1342 virtual void mark_dual_mode_def (df_ref def) = 0;
1343 virtual void convert_insn (rtx_insn *insn) = 0;
1344 virtual void convert_registers () = 0;
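/* Usage sketch (added for exposition, based on the interface above): the
   STV pass builds a chain starting from a candidate insn and converts it
   only when the estimated gain is positive, roughly:

     scalar_chain *chain = ...;		// a dimode_ or timode_scalar_chain
     chain->build (candidates, insn_uid);
     if (chain->compute_convert_gain () > 0)
       chain->convert ();

   The driver code that actually does this lives elsewhere in this file.  */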
1347 class dimode_scalar_chain : public scalar_chain
1349 public:
1350 int compute_convert_gain ();
1351 private:
1352 void mark_dual_mode_def (df_ref def);
1353 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1354 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1355 void convert_insn (rtx_insn *insn);
1356 void convert_op (rtx *op, rtx_insn *insn);
1357 void convert_reg (unsigned regno);
1358 void make_vector_copies (unsigned regno);
1359 void convert_registers ();
1360 int vector_const_cost (rtx exp);
1363 class timode_scalar_chain : public scalar_chain
1365 public:
1366   /* Converting from TImode to V1TImode is always faster.  */
1367 int compute_convert_gain () { return 1; }
1369 private:
1370 void mark_dual_mode_def (df_ref def);
1371 void fix_debug_reg_uses (rtx reg);
1372 void convert_insn (rtx_insn *insn);
1373   /* We don't convert registers to a different size.  */
1374 void convert_registers () {}
1377 unsigned scalar_chain::max_id = 0;
1379 /* Initialize new chain. */
1381 scalar_chain::scalar_chain ()
1383 chain_id = ++max_id;
1385 if (dump_file)
1386 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1388 bitmap_obstack_initialize (NULL);
1389 insns = BITMAP_ALLOC (NULL);
1390 defs = BITMAP_ALLOC (NULL);
1391 defs_conv = BITMAP_ALLOC (NULL);
1392 queue = NULL;
1395 /* Free chain's data. */
1397 scalar_chain::~scalar_chain ()
1399 BITMAP_FREE (insns);
1400 BITMAP_FREE (defs);
1401 BITMAP_FREE (defs_conv);
1402 bitmap_obstack_release (NULL);
1405 /* Add instruction into the chain's queue.  */
1407 void
1408 scalar_chain::add_to_queue (unsigned insn_uid)
1410 if (bitmap_bit_p (insns, insn_uid)
1411 || bitmap_bit_p (queue, insn_uid))
1412 return;
1414 if (dump_file)
1415 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1416 insn_uid, chain_id);
1417 bitmap_set_bit (queue, insn_uid);
1420 /* For DImode conversion, mark register defined by DEF as requiring
1421 conversion. */
1423 void
1424 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1426 gcc_assert (DF_REF_REG_DEF_P (def));
1428 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1429 return;
1431 if (dump_file)
1432 fprintf (dump_file,
1433 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1434 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1436 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1439 /* For TImode conversion, it is unused. */
1441 void
1442 timode_scalar_chain::mark_dual_mode_def (df_ref)
1444 gcc_unreachable ();
1447 /* Check REF's chain to add new insns into a queue
1448 and find registers requiring conversion. */
1450 void
1451 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1453 df_link *chain;
1455 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1456 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1457 add_to_queue (DF_REF_INSN_UID (ref));
1459 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1461 unsigned uid = DF_REF_INSN_UID (chain->ref);
1463 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1464 continue;
1466 if (!DF_REF_REG_MEM_P (chain->ref))
1468 if (bitmap_bit_p (insns, uid))
1469 continue;
1471 if (bitmap_bit_p (candidates, uid))
1473 add_to_queue (uid);
1474 continue;
1478 if (DF_REF_REG_DEF_P (chain->ref))
1480 if (dump_file)
1481 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1482 DF_REF_REGNO (chain->ref), uid);
1483 mark_dual_mode_def (chain->ref);
1485 else
1487 if (dump_file)
1488 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1489 DF_REF_REGNO (chain->ref), uid);
1490 mark_dual_mode_def (ref);
1495 /* Add instruction into a chain. */
1497 void
1498 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1500 if (bitmap_bit_p (insns, insn_uid))
1501 return;
1503 if (dump_file)
1504 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1506 bitmap_set_bit (insns, insn_uid);
1508 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1509 rtx def_set = single_set (insn);
1510 if (def_set && REG_P (SET_DEST (def_set))
1511 && !HARD_REGISTER_P (SET_DEST (def_set)))
1512 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1514 df_ref ref;
1515 df_ref def;
1516 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1517 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1518 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1519 def;
1520 def = DF_REF_NEXT_REG (def))
1521 analyze_register_chain (candidates, def);
1522 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1523 if (!DF_REF_REG_MEM_P (ref))
1524 analyze_register_chain (candidates, ref);
1527 /* Build new chain starting from insn INSN_UID recursively
1528 adding all dependent uses and definitions. */
1530 void
1531 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1533 queue = BITMAP_ALLOC (NULL);
1534 bitmap_set_bit (queue, insn_uid);
1536 if (dump_file)
1537 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1539 while (!bitmap_empty_p (queue))
1541 insn_uid = bitmap_first_set_bit (queue);
1542 bitmap_clear_bit (queue, insn_uid);
1543 bitmap_clear_bit (candidates, insn_uid);
1544 add_insn (candidates, insn_uid);
1547 if (dump_file)
1549 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1550 fprintf (dump_file, " insns: ");
1551 dump_bitmap (dump_file, insns);
1552 if (!bitmap_empty_p (defs_conv))
1554 bitmap_iterator bi;
1555 unsigned id;
1556 const char *comma = "";
1557 fprintf (dump_file, " defs to convert: ");
1558 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1560 fprintf (dump_file, "%sr%d", comma, id);
1561 comma = ", ";
1563 fprintf (dump_file, "\n");
1567 BITMAP_FREE (queue);
1570 /* Return the cost of building a vector constant
1571 instead of using a scalar one. */
1574 dimode_scalar_chain::vector_const_cost (rtx exp)
1576 gcc_assert (CONST_INT_P (exp));
1578 if (standard_sse_constant_p (exp, V2DImode))
1579 return COSTS_N_INSNS (1);
1580 return ix86_cost->sse_load[1];
1583 /* Compute a gain for chain conversion. */
1586 dimode_scalar_chain::compute_convert_gain ()
1588 bitmap_iterator bi;
1589 unsigned insn_uid;
1590 int gain = 0;
1591 int cost = 0;
1593 if (dump_file)
1594 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1596 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1598 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1599 rtx def_set = single_set (insn);
1600 rtx src = SET_SRC (def_set);
1601 rtx dst = SET_DEST (def_set);
1603 if (REG_P (src) && REG_P (dst))
1604 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1605 else if (REG_P (src) && MEM_P (dst))
1606 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1607 else if (MEM_P (src) && REG_P (dst))
1608 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1609 else if (GET_CODE (src) == ASHIFT
1610 || GET_CODE (src) == ASHIFTRT
1611 || GET_CODE (src) == LSHIFTRT)
1613 if (CONST_INT_P (XEXP (src, 0)))
1614 gain -= vector_const_cost (XEXP (src, 0));
1615 if (CONST_INT_P (XEXP (src, 1)))
1617 gain += ix86_cost->shift_const;
1618 if (INTVAL (XEXP (src, 1)) >= 32)
1619 gain -= COSTS_N_INSNS (1);
1621 else
1622 /* Additional gain for omitting two CMOVs. */
1623 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1625 else if (GET_CODE (src) == PLUS
1626 || GET_CODE (src) == MINUS
1627 || GET_CODE (src) == IOR
1628 || GET_CODE (src) == XOR
1629 || GET_CODE (src) == AND)
1631 gain += ix86_cost->add;
1632 /* Additional gain for andnot for targets without BMI. */
1633 if (GET_CODE (XEXP (src, 0)) == NOT
1634 && !TARGET_BMI)
1635 gain += 2 * ix86_cost->add;
1637 if (CONST_INT_P (XEXP (src, 0)))
1638 gain -= vector_const_cost (XEXP (src, 0));
1639 if (CONST_INT_P (XEXP (src, 1)))
1640 gain -= vector_const_cost (XEXP (src, 1));
1642 else if (GET_CODE (src) == NEG
1643 || GET_CODE (src) == NOT)
1644 gain += ix86_cost->add - COSTS_N_INSNS (1);
1645 else if (GET_CODE (src) == COMPARE)
1647 /* Assume comparison cost is the same. */
1649 else if (CONST_INT_P (src))
1651 if (REG_P (dst))
1652 gain += COSTS_N_INSNS (2);
1653 else if (MEM_P (dst))
1654 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1655 gain -= vector_const_cost (src);
1657 else
1658 gcc_unreachable ();
1661 if (dump_file)
1662 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1664 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1665 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1667 if (dump_file)
1668 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1670 gain -= cost;
1672 if (dump_file)
1673 fprintf (dump_file, " Total gain: %d\n", gain);
1675 return gain;
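/* Worked example (added for exposition): for a chain consisting of a single
   DImode register-to-register move, the loop above credits
   COSTS_N_INSNS (2) - ix86_cost->xmm_move (two 32-bit scalar moves replaced
   by one xmm move), and the second loop then subtracts
   DF_REG_DEF_COUNT * mmxsse_to_integer for every register that must stay
   live in both scalar and vector form.  The conversion is done only if the
   resulting total gain is positive.  */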
1678 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1681 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1683 if (x == reg)
1684 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1686 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1687 int i, j;
1688 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1690 if (fmt[i] == 'e')
1691 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1692 else if (fmt[i] == 'E')
1693 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1694 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1695 reg, new_reg);
1698 return x;
1701 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1703 void
1704 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1705 rtx reg, rtx new_reg)
1707 replace_with_subreg (single_set (insn), reg, new_reg);
1710 /* Insert generated conversion instruction sequence INSNS
1711    after instruction AFTER.  A new BB may be required in case
1712    the instruction has an EH region attached.  */
1714 void
1715 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1717 if (!control_flow_insn_p (after))
1719 emit_insn_after (insns, after);
1720 return;
1723 basic_block bb = BLOCK_FOR_INSN (after);
1724 edge e = find_fallthru_edge (bb->succs);
1725 gcc_assert (e);
1727 basic_block new_bb = split_edge (e);
1728 emit_insn_after (insns, BB_HEAD (new_bb));
1731 /* Make vector copies for all definitions of register REGNO
1732    and replace its uses in the chain.  */
1734 void
1735 dimode_scalar_chain::make_vector_copies (unsigned regno)
1737 rtx reg = regno_reg_rtx[regno];
1738 rtx vreg = gen_reg_rtx (DImode);
1739 bool count_reg = false;
1740 df_ref ref;
1742 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1743 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1745 df_ref use;
1747 /* Detect the count register of a shift instruction. */
1748 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1749 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1751 rtx_insn *insn = DF_REF_INSN (use);
1752 rtx def_set = single_set (insn);
1754 gcc_assert (def_set);
1756 rtx src = SET_SRC (def_set);
1758 if ((GET_CODE (src) == ASHIFT
1759 || GET_CODE (src) == ASHIFTRT
1760 || GET_CODE (src) == LSHIFTRT)
1761 && !CONST_INT_P (XEXP (src, 1))
1762 && reg_or_subregno (XEXP (src, 1)) == regno)
1763 count_reg = true;
1766 start_sequence ();
1767 if (count_reg)
1769 rtx qreg = gen_lowpart (QImode, reg);
1770 rtx tmp = gen_reg_rtx (SImode);
1772 if (TARGET_ZERO_EXTEND_WITH_AND
1773 && optimize_function_for_speed_p (cfun))
1775 emit_move_insn (tmp, const0_rtx);
1776 emit_insn (gen_movstrictqi
1777 (gen_lowpart (QImode, tmp), qreg));
1779 else
1780 emit_insn (gen_rtx_SET
1781 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1783 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1785 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1786 emit_move_insn (slot, tmp);
1787 tmp = copy_rtx (slot);
1790 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1792 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1794 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1795 emit_move_insn (adjust_address (tmp, SImode, 0),
1796 gen_rtx_SUBREG (SImode, reg, 0));
1797 emit_move_insn (adjust_address (tmp, SImode, 4),
1798 gen_rtx_SUBREG (SImode, reg, 4));
1799 emit_move_insn (vreg, tmp);
1801 else if (TARGET_SSE4_1)
1803 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1804 CONST0_RTX (V4SImode),
1805 gen_rtx_SUBREG (SImode, reg, 0)));
1806 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1807 gen_rtx_SUBREG (V4SImode, vreg, 0),
1808 gen_rtx_SUBREG (SImode, reg, 4),
1809 GEN_INT (2)));
1811 else
1813 rtx tmp = gen_reg_rtx (DImode);
1814 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1815 CONST0_RTX (V4SImode),
1816 gen_rtx_SUBREG (SImode, reg, 0)));
1817 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1818 CONST0_RTX (V4SImode),
1819 gen_rtx_SUBREG (SImode, reg, 4)));
1820 emit_insn (gen_vec_interleave_lowv4si
1821 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1822 gen_rtx_SUBREG (V4SImode, vreg, 0),
1823 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1825 rtx_insn *seq = get_insns ();
1826 end_sequence ();
1827 rtx_insn *insn = DF_REF_INSN (ref);
1828 emit_conversion_insns (seq, insn);
1830 if (dump_file)
1831 fprintf (dump_file,
1832 " Copied r%d to a vector register r%d for insn %d\n",
1833 regno, REGNO (vreg), INSN_UID (insn));
1836 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1837 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1839 rtx_insn *insn = DF_REF_INSN (ref);
1840 if (count_reg)
1842 rtx def_set = single_set (insn);
1843 gcc_assert (def_set);
1845 rtx src = SET_SRC (def_set);
1847 if ((GET_CODE (src) == ASHIFT
1848 || GET_CODE (src) == ASHIFTRT
1849 || GET_CODE (src) == LSHIFTRT)
1850 && !CONST_INT_P (XEXP (src, 1))
1851 && reg_or_subregno (XEXP (src, 1)) == regno)
1852 XEXP (src, 1) = vreg;
1854 else
1855 replace_with_subreg_in_insn (insn, reg, vreg);
1857 if (dump_file)
1858 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1859 regno, REGNO (vreg), INSN_UID (insn));
1863 /* Convert all definitions of register REGNO
1864 and fix its uses. Scalar copies may be created
1865    in case the register is used in a non-convertible insn.  */
1867 void
1868 dimode_scalar_chain::convert_reg (unsigned regno)
1870 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1871 rtx reg = regno_reg_rtx[regno];
1872 rtx scopy = NULL_RTX;
1873 df_ref ref;
1874 bitmap conv;
1876 conv = BITMAP_ALLOC (NULL);
1877 bitmap_copy (conv, insns);
1879 if (scalar_copy)
1880 scopy = gen_reg_rtx (DImode);
1882 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1884 rtx_insn *insn = DF_REF_INSN (ref);
1885 rtx def_set = single_set (insn);
1886 rtx src = SET_SRC (def_set);
1887 rtx reg = DF_REF_REG (ref);
1889 if (!MEM_P (src))
1891 replace_with_subreg_in_insn (insn, reg, reg);
1892 bitmap_clear_bit (conv, INSN_UID (insn));
1895 if (scalar_copy)
1897 start_sequence ();
1898 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1900 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1901 emit_move_insn (tmp, reg);
1902 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1903 adjust_address (tmp, SImode, 0));
1904 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1905 adjust_address (tmp, SImode, 4));
1907 else if (TARGET_SSE4_1)
1909 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1910 emit_insn
1911 (gen_rtx_SET
1912 (gen_rtx_SUBREG (SImode, scopy, 0),
1913 gen_rtx_VEC_SELECT (SImode,
1914 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1916 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1917 emit_insn
1918 (gen_rtx_SET
1919 (gen_rtx_SUBREG (SImode, scopy, 4),
1920 gen_rtx_VEC_SELECT (SImode,
1921 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1923 else
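/* Without SSE4.1, copy the vector value, store its low 32 bits,
   then shift right by 32 and store the former high 32 bits.  */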
1925 rtx vcopy = gen_reg_rtx (V2DImode);
1926 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1927 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1928 gen_rtx_SUBREG (SImode, vcopy, 0));
1929 emit_move_insn (vcopy,
1930 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1931 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1932 gen_rtx_SUBREG (SImode, vcopy, 0));
1934 rtx_insn *seq = get_insns ();
1935 end_sequence ();
1936 emit_conversion_insns (seq, insn);
1938 if (dump_file)
1939 fprintf (dump_file,
1940 " Copied r%d to a scalar register r%d for insn %d\n",
1941 regno, REGNO (scopy), INSN_UID (insn));
1945 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1946 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1948 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1950 rtx_insn *insn = DF_REF_INSN (ref);
1952 rtx def_set = single_set (insn);
1953 gcc_assert (def_set);
1955 rtx src = SET_SRC (def_set);
1956 rtx dst = SET_DEST (def_set);
1958 if ((GET_CODE (src) == ASHIFT
1959 || GET_CODE (src) == ASHIFTRT
1960 || GET_CODE (src) == LSHIFTRT)
1961 && !CONST_INT_P (XEXP (src, 1))
1962 && reg_or_subregno (XEXP (src, 1)) == regno)
1964 rtx tmp2 = gen_reg_rtx (V2DImode);
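/* Materialize the variable shift count in a vector register,
   keeping only its low 8 bits.  */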
1966 start_sequence ();
1968 if (TARGET_SSE4_1)
1969 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1970 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1971 else
1973 rtx vec_cst
1974 = gen_rtx_CONST_VECTOR (V2DImode,
1975 gen_rtvec (2, GEN_INT (0xff),
1976 const0_rtx));
1977 vec_cst
1978 = validize_mem (force_const_mem (V2DImode, vec_cst));
1980 emit_insn (gen_rtx_SET
1981 (tmp2,
1982 gen_rtx_AND (V2DImode,
1983 gen_rtx_SUBREG (V2DImode, reg, 0),
1984 vec_cst)));
1986 rtx_insn *seq = get_insns ();
1987 end_sequence ();
1989 emit_insn_before (seq, insn);
1991 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1993 else if (!MEM_P (dst) || !REG_P (src))
1994 replace_with_subreg_in_insn (insn, reg, reg);
1996 bitmap_clear_bit (conv, INSN_UID (insn));
1999 /* Skip debug insns and uninitialized uses. */
2000 else if (DF_REF_CHAIN (ref)
2001 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2003 gcc_assert (scopy);
2004 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2005 df_insn_rescan (DF_REF_INSN (ref));
2008 BITMAP_FREE (conv);
2011 /* Convert operand OP in INSN. We should handle
2012 memory operands and uninitialized registers.
2013 All other register uses are converted during
2014 register conversion. */
2016 void
2017 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2019 *op = copy_rtx_if_shared (*op);
2021 if (GET_CODE (*op) == NOT)
2023 convert_op (&XEXP (*op, 0), insn);
2024 PUT_MODE (*op, V2DImode);
2026 else if (MEM_P (*op))
2028 rtx tmp = gen_reg_rtx (DImode);
2030 emit_insn_before (gen_move_insn (tmp, *op), insn);
2031 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2033 if (dump_file)
2034 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2035 INSN_UID (insn), REGNO (tmp));
2037 else if (REG_P (*op))
2039 /* We may not have converted this register use in case
2040 the register has no definition. Otherwise it
2041 should have been converted in convert_reg. */
2042 df_ref ref;
2043 FOR_EACH_INSN_USE (ref, insn)
2044 if (DF_REF_REGNO (ref) == REGNO (*op))
2046 gcc_assert (!DF_REF_CHAIN (ref));
2047 break;
2049 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2051 else if (CONST_INT_P (*op))
2053 rtx vec_cst;
2054 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2056 /* Prefer all ones vector in case of -1. */
2057 if (constm1_operand (*op, GET_MODE (*op)))
2058 vec_cst = CONSTM1_RTX (V2DImode);
2059 else
2060 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2061 gen_rtvec (2, *op, const0_rtx));
2063 if (!standard_sse_constant_p (vec_cst, V2DImode))
2065 start_sequence ();
2066 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2067 rtx_insn *seq = get_insns ();
2068 end_sequence ();
2069 emit_insn_before (seq, insn);
2072 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2073 *op = tmp;
2075 else
2077 gcc_assert (SUBREG_P (*op));
2078 gcc_assert (GET_MODE (*op) == V2DImode);
2082 /* Convert INSN to vector mode. */
2084 void
2085 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2087 rtx def_set = single_set (insn);
2088 rtx src = SET_SRC (def_set);
2089 rtx dst = SET_DEST (def_set);
2090 rtx subreg;
2092 if (MEM_P (dst) && !REG_P (src))
2094 /* There are no scalar integer instructions and therefore
2095 temporary register usage is required. */
2096 rtx tmp = gen_reg_rtx (DImode);
2097 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2098 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2101 switch (GET_CODE (src))
2103 case ASHIFT:
2104 case ASHIFTRT:
2105 case LSHIFTRT:
2106 convert_op (&XEXP (src, 0), insn);
2107 PUT_MODE (src, V2DImode);
2108 break;
2110 case PLUS:
2111 case MINUS:
2112 case IOR:
2113 case XOR:
2114 case AND:
2115 convert_op (&XEXP (src, 0), insn);
2116 convert_op (&XEXP (src, 1), insn);
2117 PUT_MODE (src, V2DImode);
2118 break;
2120 case NEG:
2121 src = XEXP (src, 0);
2122 convert_op (&src, insn);
2123 subreg = gen_reg_rtx (V2DImode);
2124 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2125 src = gen_rtx_MINUS (V2DImode, subreg, src);
2126 break;
2128 case NOT:
2129 src = XEXP (src, 0);
2130 convert_op (&src, insn);
2131 subreg = gen_reg_rtx (V2DImode);
2132 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2133 src = gen_rtx_XOR (V2DImode, src, subreg);
2134 break;
2136 case MEM:
2137 if (!REG_P (dst))
2138 convert_op (&src, insn);
2139 break;
2141 case REG:
2142 if (!MEM_P (dst))
2143 convert_op (&src, insn);
2144 break;
2146 case SUBREG:
2147 gcc_assert (GET_MODE (src) == V2DImode);
2148 break;
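/* A DImode comparison against zero is rewritten as a PTEST.  The low
   quadword is duplicated into the high one first, so the whole 128-bit
   register reflects only the 64-bit value.  */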
2150 case COMPARE:
2151 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2153 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2154 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2156 if (REG_P (src))
2157 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2158 else
2159 subreg = copy_rtx_if_shared (src);
2160 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2161 copy_rtx_if_shared (subreg),
2162 copy_rtx_if_shared (subreg)),
2163 insn);
2164 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2165 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2166 copy_rtx_if_shared (src)),
2167 UNSPEC_PTEST);
2168 break;
2170 case CONST_INT:
2171 convert_op (&src, insn);
2172 break;
2174 default:
2175 gcc_unreachable ();
2178 SET_SRC (def_set) = src;
2179 SET_DEST (def_set) = dst;
2181 /* Drop possible dead definitions. */
2182 PATTERN (insn) = def_set;
2184 INSN_CODE (insn) = -1;
2185 recog_memoized (insn);
2186 df_insn_rescan (insn);
2189 /* Fix uses of converted REG in debug insns. */
2191 void
2192 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2194 if (!flag_var_tracking)
2195 return;
2197 df_ref ref, next;
2198 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2200 rtx_insn *insn = DF_REF_INSN (ref);
2201 /* Make sure the next ref is for a different instruction,
2202 so that we're not affected by the rescan. */
2203 next = DF_REF_NEXT_REG (ref);
2204 while (next && DF_REF_INSN (next) == insn)
2205 next = DF_REF_NEXT_REG (next);
2207 if (DEBUG_INSN_P (insn))
2209 /* It may be a debug insn with a TImode variable in
2210 register. */
2211 bool changed = false;
2212 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2214 rtx *loc = DF_REF_LOC (ref);
2215 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2217 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2218 changed = true;
2221 if (changed)
2222 df_insn_rescan (insn);
2227 /* Convert INSN from TImode to V1TImode. */
2229 void
2230 timode_scalar_chain::convert_insn (rtx_insn *insn)
2232 rtx def_set = single_set (insn);
2233 rtx src = SET_SRC (def_set);
2234 rtx dst = SET_DEST (def_set);
2236 switch (GET_CODE (dst))
2238 case REG:
2240 rtx tmp = find_reg_equal_equiv_note (insn);
2241 if (tmp)
2242 PUT_MODE (XEXP (tmp, 0), V1TImode);
2243 PUT_MODE (dst, V1TImode);
2244 fix_debug_reg_uses (dst);
2246 break;
2247 case MEM:
2248 PUT_MODE (dst, V1TImode);
2249 break;
2251 default:
2252 gcc_unreachable ();
2255 switch (GET_CODE (src))
2257 case REG:
2258 PUT_MODE (src, V1TImode);
2259 /* Call fix_debug_reg_uses only if SRC is never defined. */
2260 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2261 fix_debug_reg_uses (src);
2262 break;
2264 case MEM:
2265 PUT_MODE (src, V1TImode);
2266 break;
2268 case CONST_WIDE_INT:
2269 if (NONDEBUG_INSN_P (insn))
2271 /* Since there are no instructions to store a 128-bit constant,
2272 temporary register usage is required. */
2273 rtx tmp = gen_reg_rtx (V1TImode);
2274 start_sequence ();
2275 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2276 src = validize_mem (force_const_mem (V1TImode, src));
2277 rtx_insn *seq = get_insns ();
2278 end_sequence ();
2279 if (seq)
2280 emit_insn_before (seq, insn);
2281 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2282 dst = tmp;
2284 break;
2286 case CONST_INT:
2287 switch (standard_sse_constant_p (src, TImode))
2289 case 1:
2290 src = CONST0_RTX (GET_MODE (dst));
2291 break;
2292 case 2:
2293 src = CONSTM1_RTX (GET_MODE (dst));
2294 break;
2295 default:
2296 gcc_unreachable ();
2298 if (NONDEBUG_INSN_P (insn))
2300 rtx tmp = gen_reg_rtx (V1TImode);
2301 /* Since there are no instructions to store a standard SSE
2302 constant, temporary register usage is required. */
2303 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2304 dst = tmp;
2306 break;
2308 default:
2309 gcc_unreachable ();
2312 SET_SRC (def_set) = src;
2313 SET_DEST (def_set) = dst;
2315 /* Drop possible dead definitions. */
2316 PATTERN (insn) = def_set;
2318 INSN_CODE (insn) = -1;
2319 recog_memoized (insn);
2320 df_insn_rescan (insn);
2323 void
2324 dimode_scalar_chain::convert_registers ()
2326 bitmap_iterator bi;
2327 unsigned id;
2329 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2330 convert_reg (id);
2332 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2333 make_vector_copies (id);
2336 /* Convert the whole chain, creating the required register
2337 conversions and copies. */
2340 scalar_chain::convert ()
2342 bitmap_iterator bi;
2343 unsigned id;
2344 int converted_insns = 0;
2346 if (!dbg_cnt (stv_conversion))
2347 return 0;
2349 if (dump_file)
2350 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2352 convert_registers ();
2354 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2356 convert_insn (DF_INSN_UID_GET (id)->insn);
2357 converted_insns++;
2360 return converted_insns;
2363 /* Main STV pass function. Find and convert scalar
2364 instructions into vector mode when profitable. */
2366 static unsigned int
2367 convert_scalars_to_vector ()
2369 basic_block bb;
2370 bitmap candidates;
2371 int converted_insns = 0;
2373 bitmap_obstack_initialize (NULL);
2374 candidates = BITMAP_ALLOC (NULL);
2376 calculate_dominance_info (CDI_DOMINATORS);
2377 df_set_flags (DF_DEFER_INSN_RESCAN);
2378 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2379 df_md_add_problem ();
2380 df_analyze ();
2382 /* Find all instructions we want to convert into vector mode. */
2383 if (dump_file)
2384 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2386 FOR_EACH_BB_FN (bb, cfun)
2388 rtx_insn *insn;
2389 FOR_BB_INSNS (bb, insn)
2390 if (scalar_to_vector_candidate_p (insn))
2392 if (dump_file)
2393 fprintf (dump_file, " insn %d is marked as a candidate\n",
2394 INSN_UID (insn));
2396 bitmap_set_bit (candidates, INSN_UID (insn));
2400 remove_non_convertible_regs (candidates);
2402 if (bitmap_empty_p (candidates))
2403 if (dump_file)
2404 fprintf (dump_file, "There are no candidates for optimization.\n");
2406 while (!bitmap_empty_p (candidates))
2408 unsigned uid = bitmap_first_set_bit (candidates);
2409 scalar_chain *chain;
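/* In 64-bit mode the candidates are TImode computations (converted to
   V1TImode); in 32-bit mode they are DImode computations (converted to
   V2DImode).  */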
2411 if (TARGET_64BIT)
2412 chain = new timode_scalar_chain;
2413 else
2414 chain = new dimode_scalar_chain;
2416 /* Find the instruction chain we want to convert to vector mode.
2417 Check all uses and definitions to estimate all required
2418 conversions. */
2419 chain->build (candidates, uid);
2421 if (chain->compute_convert_gain () > 0)
2422 converted_insns += chain->convert ();
2423 else
2424 if (dump_file)
2425 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2426 chain->chain_id);
2428 delete chain;
2431 if (dump_file)
2432 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2434 BITMAP_FREE (candidates);
2435 bitmap_obstack_release (NULL);
2436 df_process_deferred_rescans ();
2438 /* Conversion means we may have 128-bit register spills/fills,
2439 which require an aligned stack. */
2440 if (converted_insns)
2442 if (crtl->stack_alignment_needed < 128)
2443 crtl->stack_alignment_needed = 128;
2444 if (crtl->stack_alignment_estimated < 128)
2445 crtl->stack_alignment_estimated = 128;
2446 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2447 if (TARGET_64BIT)
2448 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2449 parm; parm = DECL_CHAIN (parm))
2451 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2452 continue;
2453 if (DECL_RTL_SET_P (parm)
2454 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2456 rtx r = DECL_RTL (parm);
2457 if (REG_P (r))
2458 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2460 if (DECL_INCOMING_RTL (parm)
2461 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2463 rtx r = DECL_INCOMING_RTL (parm);
2464 if (REG_P (r))
2465 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2470 return 0;
2473 namespace {
2475 const pass_data pass_data_insert_vzeroupper =
2477 RTL_PASS, /* type */
2478 "vzeroupper", /* name */
2479 OPTGROUP_NONE, /* optinfo_flags */
2480 TV_MACH_DEP, /* tv_id */
2481 0, /* properties_required */
2482 0, /* properties_provided */
2483 0, /* properties_destroyed */
2484 0, /* todo_flags_start */
2485 TODO_df_finish, /* todo_flags_finish */
2488 class pass_insert_vzeroupper : public rtl_opt_pass
2490 public:
2491 pass_insert_vzeroupper(gcc::context *ctxt)
2492 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2495 /* opt_pass methods: */
2496 virtual bool gate (function *)
2498 return TARGET_AVX && !TARGET_AVX512F
2499 && TARGET_VZEROUPPER && flag_expensive_optimizations
2500 && !optimize_size;
2503 virtual unsigned int execute (function *)
2505 return rest_of_handle_insert_vzeroupper ();
2508 }; // class pass_insert_vzeroupper
2510 const pass_data pass_data_stv =
2512 RTL_PASS, /* type */
2513 "stv", /* name */
2514 OPTGROUP_NONE, /* optinfo_flags */
2515 TV_MACH_DEP, /* tv_id */
2516 0, /* properties_required */
2517 0, /* properties_provided */
2518 0, /* properties_destroyed */
2519 0, /* todo_flags_start */
2520 TODO_df_finish, /* todo_flags_finish */
2523 class pass_stv : public rtl_opt_pass
2525 public:
2526 pass_stv (gcc::context *ctxt)
2527 : rtl_opt_pass (pass_data_stv, ctxt),
2528 timode_p (false)
2531 /* opt_pass methods: */
2532 virtual bool gate (function *)
2534 return (timode_p == !!TARGET_64BIT
2535 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2538 virtual unsigned int execute (function *)
2540 return convert_scalars_to_vector ();
2543 opt_pass *clone ()
2545 return new pass_stv (m_ctxt);
2548 void set_pass_param (unsigned int n, bool param)
2550 gcc_assert (n == 0);
2551 timode_p = param;
2554 private:
2555 bool timode_p;
2556 }; // class pass_stv
2558 } // anon namespace
2560 rtl_opt_pass *
2561 make_pass_insert_vzeroupper (gcc::context *ctxt)
2563 return new pass_insert_vzeroupper (ctxt);
2566 rtl_opt_pass *
2567 make_pass_stv (gcc::context *ctxt)
2569 return new pass_stv (ctxt);
2572 /* Inserting ENDBRANCH instructions. */
2574 static unsigned int
2575 rest_of_insert_endbranch (void)
2577 timevar_push (TV_MACH_DEP);
2579 rtx cet_eb;
2580 rtx_insn *insn;
2581 basic_block bb;
2583 /* Currently emit ENDBRANCH if this is a tracked function, i.e. 'nocf_check'
2584 is absent among the function attributes. Later an optimization will be
2585 introduced to analyze whether the address of a static function is
2586 taken. A static function whose address is not taken will get the
2587 nocf_check attribute. This will allow reducing the number of ENDBRANCH insns. */
2589 if (!lookup_attribute ("nocf_check",
2590 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))))
2592 cet_eb = gen_nop_endbr ();
2594 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2595 insn = BB_HEAD (bb);
2596 emit_insn_before (cet_eb, insn);
2599 bb = 0;
2600 FOR_EACH_BB_FN (bb, cfun)
2602 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2603 insn = NEXT_INSN (insn))
2605 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2607 rtx_insn *next_insn = insn;
2609 while ((next_insn != BB_END (bb))
2610 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2611 || NOTE_P (NEXT_INSN (next_insn))
2612 || BARRIER_P (NEXT_INSN (next_insn))))
2613 next_insn = NEXT_INSN (next_insn);
2615 /* Generate ENDBRANCH after a CALL that can return more than
2616 once (setjmp-like functions). */
2617 if (find_reg_note (insn, REG_SETJMP, NULL) != NULL)
2619 cet_eb = gen_nop_endbr ();
2620 emit_insn_after (cet_eb, next_insn);
2622 continue;
2625 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2627 rtx target = JUMP_LABEL (insn);
2628 if (target == NULL_RTX || ANY_RETURN_P (target))
2629 continue;
2631 /* Check whether the jump is a switch-table jump. */
2632 rtx_insn *label = as_a<rtx_insn *> (target);
2633 rtx_insn *table = next_insn (label);
2634 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2635 continue;
2637 /* For the indirect jump, find all places it can jump to and insert
2638 an ENDBRANCH there. This is done under a special flag to
2639 control ENDBRANCH generation for switch statements. */
2640 edge_iterator ei;
2641 edge e;
2642 basic_block dest_blk;
2644 FOR_EACH_EDGE (e, ei, bb->succs)
2646 rtx_insn *insn;
2648 dest_blk = e->dest;
2649 insn = BB_HEAD (dest_blk);
2650 gcc_assert (LABEL_P (insn));
2651 cet_eb = gen_nop_endbr ();
2652 emit_insn_after (cet_eb, insn);
2654 continue;
2657 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2658 || (NOTE_P (insn)
2659 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2660 /* TODO. Check /s bit also. */
2662 cet_eb = gen_nop_endbr ();
2663 emit_insn_after (cet_eb, insn);
2664 continue;
2669 timevar_pop (TV_MACH_DEP);
2670 return 0;
2673 namespace {
2675 const pass_data pass_data_insert_endbranch =
2677 RTL_PASS, /* type. */
2678 "cet", /* name. */
2679 OPTGROUP_NONE, /* optinfo_flags. */
2680 TV_MACH_DEP, /* tv_id. */
2681 0, /* properties_required. */
2682 0, /* properties_provided. */
2683 0, /* properties_destroyed. */
2684 0, /* todo_flags_start. */
2685 0, /* todo_flags_finish. */
2688 class pass_insert_endbranch : public rtl_opt_pass
2690 public:
2691 pass_insert_endbranch (gcc::context *ctxt)
2692 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2695 /* opt_pass methods: */
2696 virtual bool gate (function *)
2698 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2701 virtual unsigned int execute (function *)
2703 return rest_of_insert_endbranch ();
2706 }; // class pass_insert_endbranch
2708 } // anon namespace
2710 rtl_opt_pass *
2711 make_pass_insert_endbranch (gcc::context *ctxt)
2713 return new pass_insert_endbranch (ctxt);
2716 /* Return true if a red-zone is in use. */
2718 bool
2719 ix86_using_red_zone (void)
2721 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2724 /* Return a string that documents the current -m options. The caller is
2725 responsible for freeing the string. */
2727 static char *
2728 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2729 int flags, int flags2,
2730 const char *arch, const char *tune,
2731 enum fpmath_unit fpmath, bool add_nl_p)
2733 struct ix86_target_opts
2735 const char *option; /* option string */
2736 HOST_WIDE_INT mask; /* isa mask options */
2739 /* This table is ordered so that options like -msse4.2 that imply other
2740 ISAs come first. The target string will be displayed in the same order. */
2741 static struct ix86_target_opts isa2_opts[] =
2743 { "-mgfni", OPTION_MASK_ISA_GFNI },
2744 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2745 { "-msgx", OPTION_MASK_ISA_SGX },
2746 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2747 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2748 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2749 { "-mibt", OPTION_MASK_ISA_IBT },
2750 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2752 static struct ix86_target_opts isa_opts[] =
2754 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2755 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2756 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2757 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2758 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2759 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2760 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2761 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2762 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2763 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2764 { "-mfma", OPTION_MASK_ISA_FMA },
2765 { "-mxop", OPTION_MASK_ISA_XOP },
2766 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2767 { "-mf16c", OPTION_MASK_ISA_F16C },
2768 { "-mavx", OPTION_MASK_ISA_AVX },
2769 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2770 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2771 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2772 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2773 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2774 { "-msse3", OPTION_MASK_ISA_SSE3 },
2775 { "-maes", OPTION_MASK_ISA_AES },
2776 { "-msha", OPTION_MASK_ISA_SHA },
2777 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2778 { "-msse2", OPTION_MASK_ISA_SSE2 },
2779 { "-msse", OPTION_MASK_ISA_SSE },
2780 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2781 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2782 { "-mmmx", OPTION_MASK_ISA_MMX },
2783 { "-mrtm", OPTION_MASK_ISA_RTM },
2784 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2785 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2786 { "-madx", OPTION_MASK_ISA_ADX },
2787 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2788 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2789 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2790 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2791 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2792 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2793 { "-mabm", OPTION_MASK_ISA_ABM },
2794 { "-mbmi", OPTION_MASK_ISA_BMI },
2795 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2796 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2797 { "-mtbm", OPTION_MASK_ISA_TBM },
2798 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2799 { "-mcx16", OPTION_MASK_ISA_CX16 },
2800 { "-msahf", OPTION_MASK_ISA_SAHF },
2801 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2802 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2803 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2804 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2805 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2806 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2807 { "-mpku", OPTION_MASK_ISA_PKU },
2808 { "-mlwp", OPTION_MASK_ISA_LWP },
2809 { "-mhle", OPTION_MASK_ISA_HLE },
2810 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2811 { "-mmpx", OPTION_MASK_ISA_MPX },
2812 { "-mclwb", OPTION_MASK_ISA_CLWB }
2815 /* Flag options. */
2816 static struct ix86_target_opts flag_opts[] =
2818 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2819 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2820 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2821 { "-m80387", MASK_80387 },
2822 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2823 { "-malign-double", MASK_ALIGN_DOUBLE },
2824 { "-mcld", MASK_CLD },
2825 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2826 { "-mieee-fp", MASK_IEEE_FP },
2827 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2828 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2829 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2830 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2831 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2832 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2833 { "-mno-red-zone", MASK_NO_RED_ZONE },
2834 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2835 { "-mrecip", MASK_RECIP },
2836 { "-mrtd", MASK_RTD },
2837 { "-msseregparm", MASK_SSEREGPARM },
2838 { "-mstack-arg-probe", MASK_STACK_PROBE },
2839 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2840 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2841 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2842 { "-mvzeroupper", MASK_VZEROUPPER },
2843 { "-mstv", MASK_STV },
2844 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2845 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2846 { "-mprefer-avx128", MASK_PREFER_AVX128 },
2847 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2850 /* Additional flag options. */
2851 static struct ix86_target_opts flag2_opts[] =
2853 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
2854 { "-mprefer-avx256", OPTION_MASK_PREFER_AVX256 },
2857 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2858 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2860 char isa_other[40];
2861 char isa2_other[40];
2862 char flags_other[40];
2863 char flags2_other[40];
2864 unsigned num = 0;
2865 unsigned i, j;
2866 char *ret;
2867 char *ptr;
2868 size_t len;
2869 size_t line_len;
2870 size_t sep_len;
2871 const char *abi;
2873 memset (opts, '\0', sizeof (opts));
2875 /* Add -march= option. */
2876 if (arch)
2878 opts[num][0] = "-march=";
2879 opts[num++][1] = arch;
2882 /* Add -mtune= option. */
2883 if (tune)
2885 opts[num][0] = "-mtune=";
2886 opts[num++][1] = tune;
2889 /* Add -m32/-m64/-mx32. */
2890 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2892 if ((isa & OPTION_MASK_ABI_64) != 0)
2893 abi = "-m64";
2894 else
2895 abi = "-mx32";
2896 isa &= ~ (OPTION_MASK_ISA_64BIT
2897 | OPTION_MASK_ABI_64
2898 | OPTION_MASK_ABI_X32);
2900 else
2901 abi = "-m32";
2902 opts[num++][0] = abi;
2904 /* Pick out the options in isa2 options. */
2905 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2907 if ((isa2 & isa2_opts[i].mask) != 0)
2909 opts[num++][0] = isa2_opts[i].option;
2910 isa2 &= ~ isa2_opts[i].mask;
2914 if (isa2 && add_nl_p)
2916 opts[num++][0] = isa2_other;
2917 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2920 /* Pick out the options in isa options. */
2921 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2923 if ((isa & isa_opts[i].mask) != 0)
2925 opts[num++][0] = isa_opts[i].option;
2926 isa &= ~ isa_opts[i].mask;
2930 if (isa && add_nl_p)
2932 opts[num++][0] = isa_other;
2933 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2936 /* Add flag options. */
2937 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2939 if ((flags & flag_opts[i].mask) != 0)
2941 opts[num++][0] = flag_opts[i].option;
2942 flags &= ~ flag_opts[i].mask;
2946 if (flags && add_nl_p)
2948 opts[num++][0] = flags_other;
2949 sprintf (flags_other, "(other flags: %#x)", flags);
2952 /* Add additional flag options. */
2953 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2955 if ((flags2 & flag2_opts[i].mask) != 0)
2957 opts[num++][0] = flag2_opts[i].option;
2958 flags2 &= ~ flag2_opts[i].mask;
2962 if (flags2 && add_nl_p)
2964 opts[num++][0] = flags2_other;
2965 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2968 /* Add -fpmath= option. */
2969 if (fpmath)
2971 opts[num][0] = "-mfpmath=";
2972 switch ((int) fpmath)
2974 case FPMATH_387:
2975 opts[num++][1] = "387";
2976 break;
2978 case FPMATH_SSE:
2979 opts[num++][1] = "sse";
2980 break;
2982 case FPMATH_387 | FPMATH_SSE:
2983 opts[num++][1] = "sse+387";
2984 break;
2986 default:
2987 gcc_unreachable ();
2991 /* Any options? */
2992 if (num == 0)
2993 return NULL;
2995 gcc_assert (num < ARRAY_SIZE (opts));
2997 /* Size the string. */
2998 len = 0;
2999 sep_len = (add_nl_p) ? 3 : 1;
3000 for (i = 0; i < num; i++)
3002 len += sep_len;
3003 for (j = 0; j < 2; j++)
3004 if (opts[i][j])
3005 len += strlen (opts[i][j]);
3008 /* Build the string. */
3009 ret = ptr = (char *) xmalloc (len);
3010 line_len = 0;
3012 for (i = 0; i < num; i++)
3014 size_t len2[2];
3016 for (j = 0; j < 2; j++)
3017 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3019 if (i != 0)
3021 *ptr++ = ' ';
3022 line_len++;
3024 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3026 *ptr++ = '\\';
3027 *ptr++ = '\n';
3028 line_len = 0;
3032 for (j = 0; j < 2; j++)
3033 if (opts[i][j])
3035 memcpy (ptr, opts[i][j], len2[j]);
3036 ptr += len2[j];
3037 line_len += len2[j];
3041 *ptr = '\0';
3042 gcc_assert (ret + len >= ptr);
3044 return ret;
3047 /* Return true if profiling code should be emitted before
3048 the prologue and false otherwise.
3049 Note: for x86 with "hotfix" a sorry() is issued instead. */
3050 static bool
3051 ix86_profile_before_prologue (void)
3053 return flag_fentry != 0;
3056 /* Function that is callable from the debugger to print the current
3057 options. */
3058 void ATTRIBUTE_UNUSED
3059 ix86_debug_options (void)
3061 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3062 target_flags, ix86_target_flags,
3063 ix86_arch_string,ix86_tune_string,
3064 ix86_fpmath, true);
3066 if (opts)
3068 fprintf (stderr, "%s\n\n", opts);
3069 free (opts);
3071 else
3072 fputs ("<no options>\n\n", stderr);
3074 return;
3077 /* Return true if T is one of the bytes we should avoid with
3078 -fmitigate-rop. */
3080 static bool
3081 ix86_rop_should_change_byte_p (int t)
3083 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3086 static const char *stringop_alg_names[] = {
3087 #define DEF_ENUM
3088 #define DEF_ALG(alg, name) #name,
3089 #include "stringop.def"
3090 #undef DEF_ENUM
3091 #undef DEF_ALG
3094 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3095 The string is of the following form (or comma separated list of it):
3097 strategy_alg:max_size:[align|noalign]
3099 where the full size range for the strategy is either [0, max_size] or
3100 [min_size, max_size], in which min_size is the max_size + 1 of the
3101 preceding range. The last size range must have max_size == -1.
3103 Examples:
3106 -mmemcpy-strategy=libcall:-1:noalign
3108 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3112 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3114 This is to tell the compiler to use the following strategy for memset
3115 1) when the expected size is between [1, 16], use rep_8byte strategy;
3116 2) when the size is between [17, 2048], use vector_loop;
3117 3) when the size is > 2048, use libcall. */
3119 struct stringop_size_range
3121 int max;
3122 stringop_alg alg;
3123 bool noalign;
3126 static void
3127 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3129 const struct stringop_algs *default_algs;
3130 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3131 char *curr_range_str, *next_range_str;
3132 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3133 int i = 0, n = 0;
3135 if (is_memset)
3136 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3137 else
3138 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3140 curr_range_str = strategy_str;
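/* Walk the comma-separated list of strategy_alg:max_size:[align|noalign]
   triples.  */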
3144 int maxs;
3145 char alg_name[128];
3146 char align[16];
3147 next_range_str = strchr (curr_range_str, ',');
3148 if (next_range_str)
3149 *next_range_str++ = '\0';
3151 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3152 alg_name, &maxs, align))
3154 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3155 return;
3158 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3160 error ("size ranges of option %qs should be increasing", opt);
3161 return;
3164 for (i = 0; i < last_alg; i++)
3165 if (!strcmp (alg_name, stringop_alg_names[i]))
3166 break;
3168 if (i == last_alg)
3170 error ("wrong strategy name %qs specified for option %qs",
3171 alg_name, opt);
3173 auto_vec <const char *> candidates;
3174 for (i = 0; i < last_alg; i++)
3175 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3176 candidates.safe_push (stringop_alg_names[i]);
3178 char *s;
3179 const char *hint
3180 = candidates_list_and_hint (alg_name, s, candidates);
3181 if (hint)
3182 inform (input_location,
3183 "valid arguments to %qs are: %s; did you mean %qs?",
3184 opt, s, hint);
3185 else
3186 inform (input_location, "valid arguments to %qs are: %s",
3187 opt, s);
3188 XDELETEVEC (s);
3189 return;
3192 if ((stringop_alg) i == rep_prefix_8_byte
3193 && !TARGET_64BIT)
3195 /* rep; movq isn't available in 32-bit code. */
3196 error ("strategy name %qs specified for option %qs "
3197 "not supported for 32-bit code", alg_name, opt);
3198 return;
3201 input_ranges[n].max = maxs;
3202 input_ranges[n].alg = (stringop_alg) i;
3203 if (!strcmp (align, "align"))
3204 input_ranges[n].noalign = false;
3205 else if (!strcmp (align, "noalign"))
3206 input_ranges[n].noalign = true;
3207 else
3209 error ("unknown alignment %qs specified for option %qs", align, opt);
3210 return;
3212 n++;
3213 curr_range_str = next_range_str;
3215 while (curr_range_str);
3217 if (input_ranges[n - 1].max != -1)
3219 error ("the max value for the last size range should be -1"
3220 " for option %qs", opt);
3221 return;
3224 if (n > MAX_STRINGOP_ALGS)
3226 error ("too many size ranges specified in option %qs", opt);
3227 return;
3230 /* Now override the default algs array. */
3231 for (i = 0; i < n; i++)
3233 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3234 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3235 = input_ranges[i].alg;
3236 *const_cast<int *>(&default_algs->size[i].noalign)
3237 = input_ranges[i].noalign;
3242 /* Parse the -mtune-ctrl= option. When DUMP is true,
3243 print the features that are explicitly set. */
3245 static void
3246 parse_mtune_ctrl_str (bool dump)
3248 if (!ix86_tune_ctrl_string)
3249 return;
3251 char *next_feature_string = NULL;
3252 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3253 char *orig = curr_feature_string;
3254 int i;
3257 bool clear = false;
3259 next_feature_string = strchr (curr_feature_string, ',');
3260 if (next_feature_string)
3261 *next_feature_string++ = '\0';
3262 if (*curr_feature_string == '^')
3264 curr_feature_string++;
3265 clear = true;
3267 for (i = 0; i < X86_TUNE_LAST; i++)
3269 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3271 ix86_tune_features[i] = !clear;
3272 if (dump)
3273 fprintf (stderr, "Explicitly %s feature %s\n",
3274 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3275 break;
3278 if (i == X86_TUNE_LAST)
3279 error ("Unknown parameter to option -mtune-ctrl: %s",
3280 clear ? curr_feature_string - 1 : curr_feature_string);
3281 curr_feature_string = next_feature_string;
3283 while (curr_feature_string);
3284 free (orig);
3287 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3288 processor type. */
3290 static void
3291 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3293 unsigned int ix86_tune_mask = 1u << ix86_tune;
3294 int i;
3296 for (i = 0; i < X86_TUNE_LAST; ++i)
3298 if (ix86_tune_no_default)
3299 ix86_tune_features[i] = 0;
3300 else
3301 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3304 if (dump)
3306 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3307 for (i = 0; i < X86_TUNE_LAST; i++)
3308 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3309 ix86_tune_features[i] ? "on" : "off");
3312 parse_mtune_ctrl_str (dump);
3316 /* Default align_* from the processor table. */
3318 static void
3319 ix86_default_align (struct gcc_options *opts)
3321 if (opts->x_align_loops == 0)
3323 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3324 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3326 if (opts->x_align_jumps == 0)
3328 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3329 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3331 if (opts->x_align_functions == 0)
3333 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3337 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3339 static void
3340 ix86_override_options_after_change (void)
3342 ix86_default_align (&global_options);
3345 /* Override various settings based on options. If MAIN_ARGS_P, the
3346 options are from the command line, otherwise they are from
3347 attributes. Return true if there's an error related to march
3348 option. */
3350 static bool
3351 ix86_option_override_internal (bool main_args_p,
3352 struct gcc_options *opts,
3353 struct gcc_options *opts_set)
3355 int i;
3356 unsigned int ix86_arch_mask;
3357 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3359 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3360 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3361 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3362 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3363 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3364 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3365 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3366 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3367 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3368 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3369 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3370 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3371 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3372 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3373 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3374 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3375 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3376 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3377 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3378 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3379 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3380 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3381 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3382 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3383 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3384 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3385 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3386 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3387 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3388 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3389 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3390 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3391 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3392 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3393 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3394 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3395 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3396 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3397 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3398 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3399 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3400 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3401 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3402 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3403 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3404 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3405 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3406 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3407 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3408 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3409 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3410 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3411 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3412 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3413 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3414 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3415 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3416 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3417 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3418 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3419 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3420 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3421 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3422 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3424 #define PTA_CORE2 \
3425 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3426 | PTA_CX16 | PTA_FXSR)
3427 #define PTA_NEHALEM \
3428 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3429 #define PTA_WESTMERE \
3430 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3431 #define PTA_SANDYBRIDGE \
3432 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3433 #define PTA_IVYBRIDGE \
3434 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3435 #define PTA_HASWELL \
3436 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3437 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3438 #define PTA_BROADWELL \
3439 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3440 #define PTA_SKYLAKE \
3441 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3442 #define PTA_SKYLAKE_AVX512 \
3443 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3444 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
3445 #define PTA_KNL \
3446 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3447 #define PTA_BONNELL \
3448 (PTA_CORE2 | PTA_MOVBE)
3449 #define PTA_SILVERMONT \
3450 (PTA_WESTMERE | PTA_MOVBE)
3451 #define PTA_KNM \
3452 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3454 /* If this reaches 64, we need to widen the struct pta flags below. */
3456 static struct pta
3458 const char *const name; /* processor name or nickname. */
3459 const enum processor_type processor;
3460 const enum attr_cpu schedule;
3461 const unsigned HOST_WIDE_INT flags;
3463 const processor_alias_table[] =
3465 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3466 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3467 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3468 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3469 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3470 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3471 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3472 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3473 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3474 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3475 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3476 PTA_MMX | PTA_SSE | PTA_FXSR},
3477 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3478 PTA_MMX | PTA_SSE | PTA_FXSR},
3479 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3480 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3481 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3482 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3483 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3484 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3485 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3486 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3487 PTA_MMX | PTA_SSE | PTA_FXSR},
3488 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3489 PTA_MMX | PTA_SSE | PTA_FXSR},
3490 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3491 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3492 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3493 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3494 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3495 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3496 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3497 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3498 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3499 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3500 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3501 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3502 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3503 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3504 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3505 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3506 PTA_SANDYBRIDGE},
3507 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3508 PTA_SANDYBRIDGE},
3509 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3510 PTA_IVYBRIDGE},
3511 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3512 PTA_IVYBRIDGE},
3513 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3514 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3515 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3516 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3517 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
3518 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3519 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3520 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3521 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3522 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3523 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3524 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3525 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3526 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3527 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3528 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3529 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3530 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3531 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3532 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3533 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3534 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3535 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3536 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3537 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3538 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3539 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3540 {"x86-64", PROCESSOR_K8, CPU_K8,
3541 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3542 {"eden-x2", PROCESSOR_K8, CPU_K8,
3543 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3544 {"nano", PROCESSOR_K8, CPU_K8,
3545 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3546 | PTA_SSSE3 | PTA_FXSR},
3547 {"nano-1000", PROCESSOR_K8, CPU_K8,
3548 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3549 | PTA_SSSE3 | PTA_FXSR},
3550 {"nano-2000", PROCESSOR_K8, CPU_K8,
3551 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3552 | PTA_SSSE3 | PTA_FXSR},
3553 {"nano-3000", PROCESSOR_K8, CPU_K8,
3554 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3555 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3556 {"nano-x2", PROCESSOR_K8, CPU_K8,
3557 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3558 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3559 {"eden-x4", PROCESSOR_K8, CPU_K8,
3560 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3561 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3562 {"nano-x4", PROCESSOR_K8, CPU_K8,
3563 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3564 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3565 {"k8", PROCESSOR_K8, CPU_K8,
3566 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3567 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3568 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3569 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3570 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3571 {"opteron", PROCESSOR_K8, CPU_K8,
3572 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3573 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3574 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3575 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3576 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3577 {"athlon64", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3579 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3580 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3581 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3582 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3583 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3584 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3585 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3586 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3587 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3588 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3589 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3590 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3591 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3592 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3593 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3594 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3595 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3596 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3597 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3598 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3599 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3600 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3601 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3602 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3603 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3604 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3605 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3606 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3607 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3608 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3609 | PTA_XSAVEOPT | PTA_FSGSBASE},
3610 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3611 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3612 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3613 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3614 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3615 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3616 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3617 | PTA_MOVBE | PTA_MWAITX},
3618 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3619 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3620 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3621 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3622 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3623 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3624 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3625 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3626 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3627 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3628 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3629 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3630 | PTA_FXSR | PTA_XSAVE},
3631 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3632 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3633 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3634 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3635 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3636 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3638 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3639 PTA_64BIT
3640 | PTA_HLE /* flags are only used for -march switch. */ },
3643 /* -mrecip options. */
3644 static struct
3646 const char *string; /* option name */
3647 unsigned int mask; /* mask bits to set */
3649 const recip_options[] =
3651 { "all", RECIP_MASK_ALL },
3652 { "none", RECIP_MASK_NONE },
3653 { "div", RECIP_MASK_DIV },
3654 { "sqrt", RECIP_MASK_SQRT },
3655 { "vec-div", RECIP_MASK_VEC_DIV },
3656 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3659 int const pta_size = ARRAY_SIZE (processor_alias_table);
3661 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3662 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3663 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3664 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3665 #ifdef TARGET_BI_ARCH
3666 else
3668 #if TARGET_BI_ARCH == 1
3669 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3670 is on and OPTION_MASK_ABI_X32 is off. We turn off
3671 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3672 -mx32. */
3673 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3674 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3675 #else
3676 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3677 on and OPTION_MASK_ABI_64 is off. We turn off
3678 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3679 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3680 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3681 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3682 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3683 #endif
3684 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3685 && TARGET_IAMCU_P (opts->x_target_flags))
3686 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3687 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3689 #endif
3691 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3693 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3694 OPTION_MASK_ABI_64 for TARGET_X32. */
3695 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3696 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3698 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3699 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3700 | OPTION_MASK_ABI_X32
3701 | OPTION_MASK_ABI_64);
3702 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3704 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3705 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3706 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3707 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3710 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3711 SUBTARGET_OVERRIDE_OPTIONS;
3712 #endif
3714 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3715 SUBSUBTARGET_OVERRIDE_OPTIONS;
3716 #endif
3718 /* -fPIC is the default for x86_64. */
3719 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3720 opts->x_flag_pic = 2;
3722 /* Need to check -mtune=generic first. */
3723 if (opts->x_ix86_tune_string)
3725 /* As special support for cross compilers we read -mtune=native
3726 as -mtune=generic. With native compilers we won't see the
3727 -mtune=native, as it was changed by the driver. */
3728 if (!strcmp (opts->x_ix86_tune_string, "native"))
3730 opts->x_ix86_tune_string = "generic";
3732 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3733 warning (OPT_Wdeprecated,
3734 main_args_p
3735 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3736 "or %<-mtune=generic%> instead as appropriate")
3737 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3738 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3739 " instead as appropriate"));
3741 else
3743 if (opts->x_ix86_arch_string)
3744 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3745 if (!opts->x_ix86_tune_string)
3747 opts->x_ix86_tune_string
3748 = processor_target_table[TARGET_CPU_DEFAULT].name;
3749 ix86_tune_defaulted = 1;
3752 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3753 or defaulted. We need to use a sensible tune option. */
3754 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3756 opts->x_ix86_tune_string = "generic";
3760 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3761 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3763 /* rep; movq isn't available in 32-bit code. */
3764 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3765 opts->x_ix86_stringop_alg = no_stringop;
3768 if (!opts->x_ix86_arch_string)
3769 opts->x_ix86_arch_string
3770 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3771 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3772 else
3773 ix86_arch_specified = 1;
3775 if (opts_set->x_ix86_pmode)
3777 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3778 && opts->x_ix86_pmode == PMODE_SI)
3779 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3780 && opts->x_ix86_pmode == PMODE_DI))
3781 error ("address mode %qs not supported in the %s bit mode",
3782 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3783 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3785 else
3786 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3787 ? PMODE_DI : PMODE_SI;
3789 if (!opts_set->x_ix86_abi)
3790 opts->x_ix86_abi = DEFAULT_ABI;
3792 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3793 error ("-mabi=ms not supported with X32 ABI");
3794 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3796 /* For targets using the MS ABI, enable ms-extensions if not
3797 explicitly turned off. For the non-MS ABI we turn off this
3798 option. */
3799 if (!opts_set->x_flag_ms_extensions)
3800 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3802 if (opts_set->x_ix86_cmodel)
3804 switch (opts->x_ix86_cmodel)
3806 case CM_SMALL:
3807 case CM_SMALL_PIC:
3808 if (opts->x_flag_pic)
3809 opts->x_ix86_cmodel = CM_SMALL_PIC;
3810 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3811 error ("code model %qs not supported in the %s bit mode",
3812 "small", "32");
3813 break;
3815 case CM_MEDIUM:
3816 case CM_MEDIUM_PIC:
3817 if (opts->x_flag_pic)
3818 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3819 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3820 error ("code model %qs not supported in the %s bit mode",
3821 "medium", "32");
3822 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3823 error ("code model %qs not supported in x32 mode",
3824 "medium");
3825 break;
3827 case CM_LARGE:
3828 case CM_LARGE_PIC:
3829 if (opts->x_flag_pic)
3830 opts->x_ix86_cmodel = CM_LARGE_PIC;
3831 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3832 error ("code model %qs not supported in the %s bit mode",
3833 "large", "32");
3834 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3835 error ("code model %qs not supported in x32 mode",
3836 "large");
3837 break;
3839 case CM_32:
3840 if (opts->x_flag_pic)
3841 error ("code model %s does not support PIC mode", "32");
3842 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3843 error ("code model %qs not supported in the %s bit mode",
3844 "32", "64");
3845 break;
3847 case CM_KERNEL:
3848 if (opts->x_flag_pic)
3850 error ("code model %s does not support PIC mode", "kernel");
3851 opts->x_ix86_cmodel = CM_32;
3853 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3854 error ("code model %qs not supported in the %s bit mode",
3855 "kernel", "32");
3856 break;
3858 default:
3859 gcc_unreachable ();
3862 else
3864 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3865 use of rip-relative addressing. This eliminates fixups that
3866 would otherwise be needed if this object is to be placed in a
3867 DLL, and is essentially just as efficient as direct addressing. */
3868 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3869 && (TARGET_RDOS || TARGET_PECOFF))
3870 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3871 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3872 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3873 else
3874 opts->x_ix86_cmodel = CM_32;
3876 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3878 error ("-masm=intel not supported in this configuration");
3879 opts->x_ix86_asm_dialect = ASM_ATT;
3881 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3882 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3883 sorry ("%i-bit mode not compiled in",
3884 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3886 for (i = 0; i < pta_size; i++)
3887 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3889 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3891 error (main_args_p
3892 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3893 "switch")
3894 : G_("%<generic%> CPU can be used only for "
3895 "%<target(\"tune=\")%> attribute"));
3896 return false;
3898 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3900 error (main_args_p
3901 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3902 "switch")
3903 : G_("%<intel%> CPU can be used only for "
3904 "%<target(\"tune=\")%> attribute"));
3905 return false;
3908 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3909 && !(processor_alias_table[i].flags & PTA_64BIT))
3911 error ("CPU you selected does not support x86-64 "
3912 "instruction set");
3913 return false;
3916 ix86_schedule = processor_alias_table[i].schedule;
3917 ix86_arch = processor_alias_table[i].processor;
3918 /* Default cpu tuning to the architecture. */
3919 ix86_tune = ix86_arch;
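/* Enable every ISA extension implied by the selected -march= CPU, unless the corresponding -m option was set or cleared explicitly on the command line.  */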
3921 if (processor_alias_table[i].flags & PTA_MMX
3922 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3923 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3924 if (processor_alias_table[i].flags & PTA_3DNOW
3925 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3926 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3927 if (processor_alias_table[i].flags & PTA_3DNOW_A
3928 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3929 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3930 if (processor_alias_table[i].flags & PTA_SSE
3931 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3932 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3933 if (processor_alias_table[i].flags & PTA_SSE2
3934 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3935 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3936 if (processor_alias_table[i].flags & PTA_SSE3
3937 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3938 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3939 if (processor_alias_table[i].flags & PTA_SSSE3
3940 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3941 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3942 if (processor_alias_table[i].flags & PTA_SSE4_1
3943 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3944 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3945 if (processor_alias_table[i].flags & PTA_SSE4_2
3946 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3947 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3948 if (processor_alias_table[i].flags & PTA_AVX
3949 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3950 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3951 if (processor_alias_table[i].flags & PTA_AVX2
3952 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3953 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3954 if (processor_alias_table[i].flags & PTA_FMA
3955 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3956 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3957 if (processor_alias_table[i].flags & PTA_SSE4A
3958 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3959 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3960 if (processor_alias_table[i].flags & PTA_FMA4
3961 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3963 if (processor_alias_table[i].flags & PTA_XOP
3964 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3965 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3966 if (processor_alias_table[i].flags & PTA_LWP
3967 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3968 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3969 if (processor_alias_table[i].flags & PTA_ABM
3970 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3971 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3972 if (processor_alias_table[i].flags & PTA_BMI
3973 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3974 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3975 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3976 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3977 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3978 if (processor_alias_table[i].flags & PTA_TBM
3979 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3980 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3981 if (processor_alias_table[i].flags & PTA_BMI2
3982 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3983 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3984 if (processor_alias_table[i].flags & PTA_CX16
3985 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3986 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3987 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3988 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3989 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3990 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3991 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3992 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3993 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3994 if (processor_alias_table[i].flags & PTA_MOVBE
3995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3997 if (processor_alias_table[i].flags & PTA_AES
3998 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3999 ix86_isa_flags |= OPTION_MASK_ISA_AES;
4000 if (processor_alias_table[i].flags & PTA_SHA
4001 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4002 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4003 if (processor_alias_table[i].flags & PTA_PCLMUL
4004 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4005 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4006 if (processor_alias_table[i].flags & PTA_FSGSBASE
4007 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4008 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4009 if (processor_alias_table[i].flags & PTA_RDRND
4010 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4011 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4012 if (processor_alias_table[i].flags & PTA_F16C
4013 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4014 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4015 if (processor_alias_table[i].flags & PTA_RTM
4016 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4017 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4018 if (processor_alias_table[i].flags & PTA_HLE
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
4021 if (processor_alias_table[i].flags & PTA_PRFCHW
4022 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4023 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4024 if (processor_alias_table[i].flags & PTA_RDSEED
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4027 if (processor_alias_table[i].flags & PTA_ADX
4028 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4029 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4030 if (processor_alias_table[i].flags & PTA_FXSR
4031 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4032 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4033 if (processor_alias_table[i].flags & PTA_XSAVE
4034 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4035 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4036 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4037 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4038 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4039 if (processor_alias_table[i].flags & PTA_AVX512F
4040 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4041 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4042 if (processor_alias_table[i].flags & PTA_AVX512ER
4043 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4044 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4045 if (processor_alias_table[i].flags & PTA_AVX512PF
4046 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4047 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4048 if (processor_alias_table[i].flags & PTA_AVX512CD
4049 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4050 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4051 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4052 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4053 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4054 if (processor_alias_table[i].flags & PTA_CLWB
4055 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4056 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4057 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4058 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4059 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4060 if (processor_alias_table[i].flags & PTA_CLZERO
4061 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
4062 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
4063 if (processor_alias_table[i].flags & PTA_XSAVEC
4064 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4065 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4066 if (processor_alias_table[i].flags & PTA_XSAVES
4067 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4068 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4069 if (processor_alias_table[i].flags & PTA_AVX512DQ
4070 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4071 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4072 if (processor_alias_table[i].flags & PTA_AVX512BW
4073 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4074 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4075 if (processor_alias_table[i].flags & PTA_AVX512VL
4076 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4078 if (processor_alias_table[i].flags & PTA_MPX
4079 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
4080 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
4081 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4082 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4083 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4084 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4085 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
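/* The following extensions are tracked in the second ISA flag word (x_ix86_isa_flags2).  */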
4088 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4089 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
4090 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4091 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4092 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
4093 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4094 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4095 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4096 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4097 if (processor_alias_table[i].flags & PTA_SGX
4098 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4099 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4101 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4102 x86_prefetch_sse = true;
4103 if (processor_alias_table[i].flags & PTA_MWAITX
4104 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
4105 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
4106 if (processor_alias_table[i].flags & PTA_PKU
4107 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4108 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4110 /* Don't enable x87 instructions if only
4111 general registers are allowed. */
4112 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4113 && !(opts_set->x_target_flags & MASK_80387))
4115 if (processor_alias_table[i].flags & PTA_NO_80387)
4116 opts->x_target_flags &= ~MASK_80387;
4117 else
4118 opts->x_target_flags |= MASK_80387;
4120 break;
4123 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
4124 error ("Intel MPX does not support x32");
4126 if (TARGET_X32 && (ix86_isa_flags & OPTION_MASK_ISA_MPX))
4127 error ("Intel MPX does not support x32");
4129 if (i == pta_size)
4131 error (main_args_p
4132 ? G_("bad value (%qs) for %<-march=%> switch")
4133 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4134 opts->x_ix86_arch_string);
4136 auto_vec <const char *> candidates;
4137 for (i = 0; i < pta_size; i++)
4138 if (strcmp (processor_alias_table[i].name, "generic")
4139 && strcmp (processor_alias_table[i].name, "intel")
4140 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4141 || (processor_alias_table[i].flags & PTA_64BIT)))
4142 candidates.safe_push (processor_alias_table[i].name);
4144 char *s;
4145 const char *hint
4146 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4147 if (hint)
4148 inform (input_location,
4149 main_args_p
4150 ? G_("valid arguments to %<-march=%> switch are: "
4151 "%s; did you mean %qs?")
4152 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4153 "%s; did you mean %qs?"), s, hint);
4154 else
4155 inform (input_location,
4156 main_args_p
4157 ? G_("valid arguments to %<-march=%> switch are: %s")
4158 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4159 "are: %s"), s);
4160 XDELETEVEC (s);
4163 ix86_arch_mask = 1u << ix86_arch;
4164 for (i = 0; i < X86_ARCH_LAST; ++i)
4165 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
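/* Resolve -mtune= against the same table to pick the scheduling model; a defaulted tune CPU without 64-bit support falls back to x86-64 tuning.  */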
4167 for (i = 0; i < pta_size; i++)
4168 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4170 ix86_schedule = processor_alias_table[i].schedule;
4171 ix86_tune = processor_alias_table[i].processor;
4172 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4174 if (!(processor_alias_table[i].flags & PTA_64BIT))
4176 if (ix86_tune_defaulted)
4178 opts->x_ix86_tune_string = "x86-64";
4179 for (i = 0; i < pta_size; i++)
4180 if (! strcmp (opts->x_ix86_tune_string,
4181 processor_alias_table[i].name))
4182 break;
4183 ix86_schedule = processor_alias_table[i].schedule;
4184 ix86_tune = processor_alias_table[i].processor;
4186 else
4187 error ("CPU you selected does not support x86-64 "
4188 "instruction set");
4191 /* Intel CPUs have always interpreted SSE prefetch instructions as
4192 NOPs; so, we can enable SSE prefetch instructions even when
4193 -mtune (rather than -march) points us to a processor that has them.
4194 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4195 higher processors. */
4196 if (TARGET_CMOV
4197 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4198 x86_prefetch_sse = true;
4199 break;
4202 if (ix86_tune_specified && i == pta_size)
4204 error (main_args_p
4205 ? G_("bad value (%qs) for %<-mtune=%> switch")
4206 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4207 opts->x_ix86_tune_string);
4209 auto_vec <const char *> candidates;
4210 for (i = 0; i < pta_size; i++)
4211 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4212 || (processor_alias_table[i].flags & PTA_64BIT))
4213 candidates.safe_push (processor_alias_table[i].name);
4215 char *s;
4216 const char *hint
4217 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4218 if (hint)
4219 inform (input_location,
4220 main_args_p
4221 ? G_("valid arguments to %<-mtune=%> switch are: "
4222 "%s; did you mean %qs?")
4223 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4224 "%s; did you mean %qs?"), s, hint);
4225 else
4226 inform (input_location,
4227 main_args_p
4228 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4229 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4230 "are: %s"), s);
4231 XDELETEVEC (s);
4234 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4236 #ifndef USE_IX86_FRAME_POINTER
4237 #define USE_IX86_FRAME_POINTER 0
4238 #endif
4240 #ifndef USE_X86_64_FRAME_POINTER
4241 #define USE_X86_64_FRAME_POINTER 0
4242 #endif
4244 /* Set the default values for switches whose default depends on TARGET_64BIT
4245 in case they weren't overwritten by command line options. */
4246 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4248 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4249 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4250 if (opts->x_flag_asynchronous_unwind_tables
4251 && !opts_set->x_flag_unwind_tables
4252 && TARGET_64BIT_MS_ABI)
4253 opts->x_flag_unwind_tables = 1;
4254 if (opts->x_flag_asynchronous_unwind_tables == 2)
4255 opts->x_flag_unwind_tables
4256 = opts->x_flag_asynchronous_unwind_tables = 1;
4257 if (opts->x_flag_pcc_struct_return == 2)
4258 opts->x_flag_pcc_struct_return = 0;
4260 else
4262 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4263 opts->x_flag_omit_frame_pointer
4264 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4265 if (opts->x_flag_asynchronous_unwind_tables == 2)
4266 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4267 if (opts->x_flag_pcc_struct_return == 2)
4269 /* Intel MCU psABI specifies that -freg-struct-return should
4270 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4271 we check -miamcu so that -freg-struct-return is always
4272 turned on if -miamcu is used. */
4273 if (TARGET_IAMCU_P (opts->x_target_flags))
4274 opts->x_flag_pcc_struct_return = 0;
4275 else
4276 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4280 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4281 /* TODO: ix86_cost should be chosen at instruction or function granularity
4282 so for cold code we use size_cost even in !optimize_size compilation. */
4283 if (opts->x_optimize_size)
4284 ix86_cost = &ix86_size_cost;
4285 else
4286 ix86_cost = ix86_tune_cost;
4288 /* Arrange to set up i386_stack_locals for all functions. */
4289 init_machine_status = ix86_init_machine_status;
4291 /* Validate -mregparm= value. */
4292 if (opts_set->x_ix86_regparm)
4294 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4295 warning (0, "-mregparm is ignored in 64-bit mode");
4296 else if (TARGET_IAMCU_P (opts->x_target_flags))
4297 warning (0, "-mregparm is ignored for Intel MCU psABI");
4298 if (opts->x_ix86_regparm > REGPARM_MAX)
4300 error ("-mregparm=%d is not between 0 and %d",
4301 opts->x_ix86_regparm, REGPARM_MAX);
4302 opts->x_ix86_regparm = 0;
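/* -mregparm is ignored for the 64-bit and Intel MCU ABIs (see the warnings above); parameter passing there is dictated by the ABI, so simply use REGPARM_MAX.  */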
4305 if (TARGET_IAMCU_P (opts->x_target_flags)
4306 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4307 opts->x_ix86_regparm = REGPARM_MAX;
4309 /* Default align_* from the processor table. */
4310 ix86_default_align (opts);
4312 /* Provide default for -mbranch-cost= value. */
4313 if (!opts_set->x_ix86_branch_cost)
4314 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4316 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4318 opts->x_target_flags
4319 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4321 /* Enable by default the SSE and MMX builtins. Do allow the user to
4322 explicitly disable any of these. In particular, disabling SSE and
4323 MMX for kernel code is extremely useful. */
4324 if (!ix86_arch_specified)
4325 opts->x_ix86_isa_flags
4326 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4327 | TARGET_SUBTARGET64_ISA_DEFAULT)
4328 & ~opts->x_ix86_isa_flags_explicit);
4330 if (TARGET_RTD_P (opts->x_target_flags))
4331 warning (0,
4332 main_args_p
4333 ? G_("%<-mrtd%> is ignored in 64bit mode")
4334 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4336 else
4338 opts->x_target_flags
4339 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4341 if (!ix86_arch_specified)
4342 opts->x_ix86_isa_flags
4343 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4345 /* The i386 ABI does not specify a red zone. It still makes sense to use it
4346 when the programmer takes care to keep the stack from being destroyed. */
4347 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4348 opts->x_target_flags |= MASK_NO_RED_ZONE;
4351 /* Keep nonleaf frame pointers. */
4352 if (opts->x_flag_omit_frame_pointer)
4353 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4354 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4355 opts->x_flag_omit_frame_pointer = 1;
4357 /* If we're doing fast math, we don't care about comparison order
4358 wrt NaNs. This lets us use a shorter comparison sequence. */
4359 if (opts->x_flag_finite_math_only)
4360 opts->x_target_flags &= ~MASK_IEEE_FP;
4362 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4363 since the insns won't need emulation. */
4364 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4365 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4367 /* Likewise, if the target doesn't have a 387, or we've specified
4368 software floating point, don't use 387 inline intrinsics. */
4369 if (!TARGET_80387_P (opts->x_target_flags))
4370 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4372 /* Turn on MMX builtins for -msse. */
4373 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4374 opts->x_ix86_isa_flags
4375 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4377 /* Enable SSE prefetch. */
4378 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4379 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4380 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4381 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4382 x86_prefetch_sse = true;
4384 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4385 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4386 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4387 opts->x_ix86_isa_flags
4388 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4390 /* Enable lzcnt instruction for -mabm. */
4391 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4392 opts->x_ix86_isa_flags
4393 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4395 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4396 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4397 opts->x_ix86_isa_flags
4398 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4399 & ~opts->x_ix86_isa_flags_explicit);
4401 /* Validate -mpreferred-stack-boundary= value or default it to
4402 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4403 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
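/* The option argument is the log2 of the boundary in bytes; it is converted to a boundary in bits below.  */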
4404 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4406 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4407 int max = TARGET_SEH ? 4 : 12;
4409 if (opts->x_ix86_preferred_stack_boundary_arg < min
4410 || opts->x_ix86_preferred_stack_boundary_arg > max)
4412 if (min == max)
4413 error ("-mpreferred-stack-boundary is not supported "
4414 "for this target");
4415 else
4416 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4417 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4419 else
4420 ix86_preferred_stack_boundary
4421 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
4424 /* Set the default value for -mstackrealign. */
4425 if (!opts_set->x_ix86_force_align_arg_pointer)
4426 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4428 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4430 /* Validate -mincoming-stack-boundary= value or default it to
4431 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4432 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4433 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4435 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4437 if (opts->x_ix86_incoming_stack_boundary_arg < min
4438 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4439 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4440 opts->x_ix86_incoming_stack_boundary_arg, min);
4441 else
4443 ix86_user_incoming_stack_boundary
4444 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4445 ix86_incoming_stack_boundary
4446 = ix86_user_incoming_stack_boundary;
4450 #ifndef NO_PROFILE_COUNTERS
4451 if (flag_nop_mcount)
4452 error ("-mnop-mcount is not compatible with this target");
4453 #endif
4454 if (flag_nop_mcount && flag_pic)
4455 error ("-mnop-mcount is not implemented for -fPIC");
4457 /* Accept -msseregparm only if at least SSE support is enabled. */
4458 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4459 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4460 error (main_args_p
4461 ? G_("%<-msseregparm%> used without SSE enabled")
4462 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
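/* When -mfpmath= is given explicitly, make sure the requested unit is actually available and fall back to the other one with a warning if it is not.  */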
4464 if (opts_set->x_ix86_fpmath)
4466 if (opts->x_ix86_fpmath & FPMATH_SSE)
4468 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4470 if (TARGET_80387_P (opts->x_target_flags))
4472 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4473 opts->x_ix86_fpmath = FPMATH_387;
4476 else if ((opts->x_ix86_fpmath & FPMATH_387)
4477 && !TARGET_80387_P (opts->x_target_flags))
4479 warning (0, "387 instruction set disabled, using SSE arithmetics");
4480 opts->x_ix86_fpmath = FPMATH_SSE;
4484 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4485 -mfpmath=387. The latter is nevertheless the default on many targets,
4486 since the extra 80-bit precision of temporaries is considered part of the ABI.
4487 Overwrite the default at least for -ffast-math.
4488 TODO: -mfpmath=both seems to produce equally performing code with
4489 slightly smaller binaries. It is however not clear if register allocation is
4490 ready for this setting.
4491 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4492 codegen. We may switch to 387 with -ffast-math for size-optimized
4493 functions. */
4494 else if (fast_math_flags_set_p (&global_options)
4495 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4496 opts->x_ix86_fpmath = FPMATH_SSE;
4497 else
4498 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4500 /* Use external vectorized library in vectorizing intrinsics. */
4501 if (opts_set->x_ix86_veclibabi_type)
4502 switch (opts->x_ix86_veclibabi_type)
4504 case ix86_veclibabi_type_svml:
4505 ix86_veclib_handler = ix86_veclibabi_svml;
4506 break;
4508 case ix86_veclibabi_type_acml:
4509 ix86_veclib_handler = ix86_veclibabi_acml;
4510 break;
4512 default:
4513 gcc_unreachable ();
4516 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4517 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4518 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4520 /* If stack probes are required, the space used for large function
4521 arguments on the stack must also be probed, so enable
4522 -maccumulate-outgoing-args so this happens in the prologue. */
4523 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4524 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4526 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4527 warning (0,
4528 main_args_p
4529 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4530 "for correctness")
4531 : G_("stack probing requires "
4532 "%<target(\"accumulate-outgoing-args\")%> for "
4533 "correctness"));
4534 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4537 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4538 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4539 if (fixed_regs[BP_REG]
4540 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4542 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4543 warning (0,
4544 main_args_p
4545 ? G_("fixed ebp register requires "
4546 "%<-maccumulate-outgoing-args%>")
4547 : G_("fixed ebp register requires "
4548 "%<target(\"accumulate-outgoing-args\")%>"));
4549 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4552 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4554 char *p;
4555 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4556 p = strchr (internal_label_prefix, 'X');
4557 internal_label_prefix_len = p - internal_label_prefix;
4558 *p = '\0';
4561 /* When scheduling description is not available, disable scheduler pass
4562 so it won't slow down the compilation and make x87 code slower. */
4563 if (!TARGET_SCHEDULE)
4564 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4566 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4567 ix86_tune_cost->simultaneous_prefetches,
4568 opts->x_param_values,
4569 opts_set->x_param_values);
4570 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4571 ix86_tune_cost->prefetch_block,
4572 opts->x_param_values,
4573 opts_set->x_param_values);
4574 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4575 ix86_tune_cost->l1_cache_size,
4576 opts->x_param_values,
4577 opts_set->x_param_values);
4578 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4579 ix86_tune_cost->l2_cache_size,
4580 opts->x_param_values,
4581 opts_set->x_param_values);
4583 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4584 if (opts->x_flag_prefetch_loop_arrays < 0
4585 && HAVE_prefetch
4586 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4587 && !opts->x_optimize_size
4588 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4589 opts->x_flag_prefetch_loop_arrays = 1;
4591 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4592 can be optimized to ap = __builtin_next_arg (0). */
4593 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4594 targetm.expand_builtin_va_start = NULL;
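/* Select the Pmode- and 64-bit-dependent insn generator functions used elsewhere in the backend.  */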
4596 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4598 ix86_gen_leave = gen_leave_rex64;
4599 if (Pmode == DImode)
4601 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4602 ix86_gen_tls_local_dynamic_base_64
4603 = gen_tls_local_dynamic_base_64_di;
4605 else
4607 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4608 ix86_gen_tls_local_dynamic_base_64
4609 = gen_tls_local_dynamic_base_64_si;
4612 else
4613 ix86_gen_leave = gen_leave;
4615 if (Pmode == DImode)
4617 ix86_gen_add3 = gen_adddi3;
4618 ix86_gen_sub3 = gen_subdi3;
4619 ix86_gen_sub3_carry = gen_subdi3_carry;
4620 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4621 ix86_gen_andsp = gen_anddi3;
4622 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4623 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4624 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4625 ix86_gen_monitor = gen_sse3_monitor_di;
4626 ix86_gen_monitorx = gen_monitorx_di;
4627 ix86_gen_clzero = gen_clzero_di;
4629 else
4631 ix86_gen_add3 = gen_addsi3;
4632 ix86_gen_sub3 = gen_subsi3;
4633 ix86_gen_sub3_carry = gen_subsi3_carry;
4634 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4635 ix86_gen_andsp = gen_andsi3;
4636 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4637 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4638 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4639 ix86_gen_monitor = gen_sse3_monitor_si;
4640 ix86_gen_monitorx = gen_monitorx_si;
4641 ix86_gen_clzero = gen_clzero_si;
4644 #ifdef USE_IX86_CLD
4645 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4646 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4647 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4648 #endif
4650 /* Set the default value for -mfentry. */
4651 if (!opts_set->x_flag_fentry)
4652 opts->x_flag_fentry = TARGET_SEH;
4653 else
4655 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4656 && opts->x_flag_fentry)
4657 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4658 "with -fpic");
4659 else if (TARGET_SEH && !opts->x_flag_fentry)
4660 sorry ("-mno-fentry isn%'t compatible with SEH");
4663 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4664 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4666 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
4667 opts->x_target_flags |= MASK_VZEROUPPER;
4668 if (!(opts_set->x_target_flags & MASK_STV))
4669 opts->x_target_flags |= MASK_STV;
4670 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4671 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4672 stack realignment would be an extra cost the pass doesn't take into
4673 account, and the pass can't realign the stack. */
4674 if (ix86_preferred_stack_boundary < 128
4675 || ix86_incoming_stack_boundary < 128
4676 || opts->x_ix86_force_align_arg_pointer)
4677 opts->x_target_flags &= ~MASK_STV;
4678 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4679 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4680 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4681 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4682 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4683 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4684 /* Enable 128-bit AVX instruction generation
4685 for the auto-vectorizer. */
4686 if (TARGET_AVX128_OPTIMAL
4687 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
4688 opts->x_target_flags |= MASK_PREFER_AVX128;
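/* Parse the comma-separated -mrecip= list: a leading '!' negates an entry, and "default" stands for all reciprocal approximations (RECIP_MASK_ALL).  */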
4690 if (opts->x_ix86_recip_name)
4692 char *p = ASTRDUP (opts->x_ix86_recip_name);
4693 char *q;
4694 unsigned int mask, i;
4695 bool invert;
4697 while ((q = strtok (p, ",")) != NULL)
4699 p = NULL;
4700 if (*q == '!')
4702 invert = true;
4703 q++;
4705 else
4706 invert = false;
4708 if (!strcmp (q, "default"))
4709 mask = RECIP_MASK_ALL;
4710 else
4712 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4713 if (!strcmp (q, recip_options[i].string))
4715 mask = recip_options[i].mask;
4716 break;
4719 if (i == ARRAY_SIZE (recip_options))
4721 error ("unknown option for -mrecip=%s", q);
4722 invert = false;
4723 mask = RECIP_MASK_NONE;
4727 opts->x_recip_mask_explicit |= mask;
4728 if (invert)
4729 opts->x_recip_mask &= ~mask;
4730 else
4731 opts->x_recip_mask |= mask;
4735 if (TARGET_RECIP_P (opts->x_target_flags))
4736 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4737 else if (opts_set->x_target_flags & MASK_RECIP)
4738 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4740 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4741 for 64-bit Bionic. Also default long double to 64-bit for Intel
4742 MCU psABI. */
4743 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4744 && !(opts_set->x_target_flags
4745 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4746 opts->x_target_flags |= (TARGET_64BIT
4747 ? MASK_LONG_DOUBLE_128
4748 : MASK_LONG_DOUBLE_64);
4750 /* Only one of them can be active. */
4751 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4752 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4754 /* Handle stack protector */
4755 if (!opts_set->x_ix86_stack_protector_guard)
4756 opts->x_ix86_stack_protector_guard
4757 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4759 #ifdef TARGET_THREAD_SSP_OFFSET
4760 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4761 #endif
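/* Parse -mstack-protector-guard-offset=; the value may be given in any base strtol accepts and must fit in a signed 32-bit offset.  */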
4763 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4765 char *endp;
4766 const char *str = ix86_stack_protector_guard_offset_str;
4768 errno = 0;
4769 int64_t offset;
4771 #if defined(INT64_T_IS_LONG)
4772 offset = strtol (str, &endp, 0);
4773 #else
4774 offset = strtoll (str, &endp, 0);
4775 #endif
4777 if (!*str || *endp || errno)
4778 error ("%qs is not a valid number "
4779 "in -mstack-protector-guard-offset=", str);
4781 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4782 HOST_WIDE_INT_C (0x7fffffff)))
4783 error ("%qs is not a valid offset "
4784 "in -mstack-protector-guard-offset=", str);
4786 ix86_stack_protector_guard_offset = offset;
4789 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4791 /* The kernel uses a different segment register for performance
4792 reasons; a system call would not have to trash the userspace
4793 segment register, which would be expensive. */
4794 if (ix86_cmodel == CM_KERNEL)
4795 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
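/* Parse -mstack-protector-guard-reg=; only the %fs and %gs segment registers are accepted.  */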
4797 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4799 const char *str = ix86_stack_protector_guard_reg_str;
4800 addr_space_t seg = ADDR_SPACE_GENERIC;
4802 /* Discard optional register prefix. */
4803 if (str[0] == '%')
4804 str++;
4806 if (strlen (str) == 2 && str[1] == 's')
4808 if (str[0] == 'f')
4809 seg = ADDR_SPACE_SEG_FS;
4810 else if (str[0] == 'g')
4811 seg = ADDR_SPACE_SEG_GS;
4814 if (seg == ADDR_SPACE_GENERIC)
4815 error ("%qs is not a valid base register "
4816 "in -mstack-protector-guard-reg=",
4817 ix86_stack_protector_guard_reg_str);
4819 ix86_stack_protector_guard_reg = seg;
4822 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4823 if (opts->x_ix86_tune_memcpy_strategy)
4825 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4826 ix86_parse_stringop_strategy_string (str, false);
4827 free (str);
4830 if (opts->x_ix86_tune_memset_strategy)
4832 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4833 ix86_parse_stringop_strategy_string (str, true);
4834 free (str);
4837 /* Save the initial options in case the user uses function-specific
4838 options later. */
4839 if (main_args_p)
4840 target_option_default_node = target_option_current_node
4841 = build_target_option_node (opts);
4843 /* Do not support control flow instrumentation if CET is not enabled. */
4844 if (opts->x_flag_cf_protection != CF_NONE)
4846 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4847 || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
4849 if (flag_cf_protection == CF_FULL)
4851 error ("%<-fcf-protection=full%> requires CET support "
4852 "on this target. Use -mcet or one of -mibt, "
4853 "-mshstk options to enable CET");
4855 else if (flag_cf_protection == CF_BRANCH)
4857 error ("%<-fcf-protection=branch%> requires CET support "
4858 "on this target. Use -mcet or one of -mibt, "
4859 "-mshstk options to enable CET");
4861 else if (flag_cf_protection == CF_RETURN)
4863 error ("%<-fcf-protection=return%> requires CET support "
4864 "on this target. Use -mcet or one of -mibt, "
4865 "-mshstk options to enable CET");
4867 flag_cf_protection = CF_NONE;
4868 return false;
4870 opts->x_flag_cf_protection =
4871 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4874 return true;
4877 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4879 static void
4880 ix86_option_override (void)
4882 ix86_option_override_internal (true, &global_options, &global_options_set);
4885 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4886 static char *
4887 ix86_offload_options (void)
4889 if (TARGET_LP64)
4890 return xstrdup ("-foffload-abi=lp64");
4891 return xstrdup ("-foffload-abi=ilp32");
4894 /* Update register usage after having seen the compiler flags. */
4896 static void
4897 ix86_conditional_register_usage (void)
4899 int i, c_mask;
4901 /* If there are no caller-saved registers, preserve all registers,
4902 except fixed_regs and registers used for the function return value,
4903 since aggregate_value_p checks call_used_regs[regno] on the return
4904 value. */
4905 if (cfun && cfun->machine->no_caller_saved_registers)
4906 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4907 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4908 call_used_regs[i] = 0;
4910 /* For 32-bit targets, squash the REX registers. */
4911 if (! TARGET_64BIT)
4913 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4914 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4915 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4916 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4917 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4918 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4921 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4922 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4924 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4926 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4928 /* Set/reset conditionally defined registers from
4929 CALL_USED_REGISTERS initializer. */
4930 if (call_used_regs[i] > 1)
4931 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4933 /* Compute the CLOBBERED_REGS register set as the call-used
4934 registers of the GENERAL_REGS register set. */
4935 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4936 && call_used_regs[i])
4937 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4940 /* If MMX is disabled, squash the registers. */
4941 if (! TARGET_MMX)
4942 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4943 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4944 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4946 /* If SSE is disabled, squash the registers. */
4947 if (! TARGET_SSE)
4948 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4949 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4950 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4952 /* If the FPU is disabled, squash the registers. */
4953 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4954 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4955 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4956 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4958 /* If AVX512F is disabled, squash the registers. */
4959 if (! TARGET_AVX512F)
4961 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4962 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4964 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4965 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4968 /* If MPX is disabled, squash the registers. */
4969 if (! TARGET_MPX)
4970 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4971 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4974 /* Canonicalize a comparison from one we don't have to one we do have. */
4976 static void
4977 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
4978 bool op0_preserve_value)
4980 /* The order of operands in the x87 ficom compare is forced by combine in
4981 the simplify_comparison () function. The float operator is treated as RTX_OBJ
4982 with precedence over other operators and is always put in the first
4983 place. Swap the condition and operands to match the ficom instruction. */
4984 if (!op0_preserve_value
4985 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
4987 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
4989 /* We are called only for compares that are split to SAHF instruction.
4990 Ensure that we have setcc/jcc insn for the swapped condition. */
4991 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
4993 std::swap (*op0, *op1);
4994 *code = (int) scode;
4999 /* Save the current options */
5001 static void
5002 ix86_function_specific_save (struct cl_target_option *ptr,
5003 struct gcc_options *opts)
5005 ptr->arch = ix86_arch;
5006 ptr->schedule = ix86_schedule;
5007 ptr->prefetch_sse = x86_prefetch_sse;
5008 ptr->tune = ix86_tune;
5009 ptr->branch_cost = ix86_branch_cost;
5010 ptr->tune_defaulted = ix86_tune_defaulted;
5011 ptr->arch_specified = ix86_arch_specified;
5012 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5013 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5014 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5015 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5016 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5017 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5018 ptr->x_ix86_abi = opts->x_ix86_abi;
5019 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5020 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5021 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5022 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5023 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5024 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5025 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5026 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5027 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5028 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5029 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5030 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5031 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5032 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5033 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5034 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5035 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5036 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5037 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5038 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5040 /* The fields are char but the variables are not; make sure the
5041 values fit in the fields. */
5042 gcc_assert (ptr->arch == ix86_arch);
5043 gcc_assert (ptr->schedule == ix86_schedule);
5044 gcc_assert (ptr->tune == ix86_tune);
5045 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5048 /* Restore the current options */
5050 static void
5051 ix86_function_specific_restore (struct gcc_options *opts,
5052 struct cl_target_option *ptr)
5054 enum processor_type old_tune = ix86_tune;
5055 enum processor_type old_arch = ix86_arch;
5056 unsigned int ix86_arch_mask;
5057 int i;
5059 /* We don't change -fPIC. */
5060 opts->x_flag_pic = flag_pic;
5062 ix86_arch = (enum processor_type) ptr->arch;
5063 ix86_schedule = (enum attr_cpu) ptr->schedule;
5064 ix86_tune = (enum processor_type) ptr->tune;
5065 x86_prefetch_sse = ptr->prefetch_sse;
5066 opts->x_ix86_branch_cost = ptr->branch_cost;
5067 ix86_tune_defaulted = ptr->tune_defaulted;
5068 ix86_arch_specified = ptr->arch_specified;
5069 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5070 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5071 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5072 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5073 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5074 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5075 opts->x_ix86_abi = ptr->x_ix86_abi;
5076 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5077 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5078 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5079 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5080 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5081 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5082 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5083 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5084 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5085 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5086 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5087 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5088 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5089 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5090 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5091 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5092 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5093 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5094 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5095 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5096 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5097 /* TODO: ix86_cost should be chosen at instruction or function granularity
5098 so for cold code we use size_cost even in !optimize_size compilation. */
5099 if (opts->x_optimize_size)
5100 ix86_cost = &ix86_size_cost;
5101 else
5102 ix86_cost = ix86_tune_cost;
5104 /* Recreate the arch feature tests if the arch changed */
5105 if (old_arch != ix86_arch)
5107 ix86_arch_mask = 1u << ix86_arch;
5108 for (i = 0; i < X86_ARCH_LAST; ++i)
5109 ix86_arch_features[i]
5110 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5113 /* Recreate the tune optimization tests */
5114 if (old_tune != ix86_tune)
5115 set_ix86_tune_features (ix86_tune, false);
5118 /* Adjust target options after streaming them in. This is mainly about
5119 reconciling them with global options. */
5121 static void
5122 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5124 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5125 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5126 for PIC, or error out. */
5127 if (flag_pic)
5128 switch (ptr->x_ix86_cmodel)
5130 case CM_SMALL:
5131 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5132 break;
5134 case CM_MEDIUM:
5135 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5136 break;
5138 case CM_LARGE:
5139 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5140 break;
5142 case CM_KERNEL:
5143 error ("code model %s does not support PIC mode", "kernel");
5144 break;
5146 default:
5147 break;
5149 else
5150 switch (ptr->x_ix86_cmodel)
5152 case CM_SMALL_PIC:
5153 ptr->x_ix86_cmodel = CM_SMALL;
5154 break;
5156 case CM_MEDIUM_PIC:
5157 ptr->x_ix86_cmodel = CM_MEDIUM;
5158 break;
5160 case CM_LARGE_PIC:
5161 ptr->x_ix86_cmodel = CM_LARGE;
5162 break;
5164 default:
5165 break;
5169 /* Print the current options */
5171 static void
5172 ix86_function_specific_print (FILE *file, int indent,
5173 struct cl_target_option *ptr)
5175 char *target_string
5176 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5177 ptr->x_target_flags, ptr->x_ix86_target_flags,
5178 NULL, NULL, ptr->x_ix86_fpmath, false);
5180 gcc_assert (ptr->arch < PROCESSOR_max);
5181 fprintf (file, "%*sarch = %d (%s)\n",
5182 indent, "",
5183 ptr->arch, processor_target_table[ptr->arch].name);
5185 gcc_assert (ptr->tune < PROCESSOR_max);
5186 fprintf (file, "%*stune = %d (%s)\n",
5187 indent, "",
5188 ptr->tune, processor_target_table[ptr->tune].name);
5190 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5192 if (target_string)
5194 fprintf (file, "%*s%s\n", indent, "", target_string);
5195 free (target_string);
5200 /* Inner function to process the attribute((target(...))), take an argument and
5201 set the current options from the argument. If we have a list, recursively go
5202 over the list. */
5204 static bool
5205 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5206 struct gcc_options *opts,
5207 struct gcc_options *opts_set,
5208 struct gcc_options *enum_opts_set)
5210 char *next_optstr;
5211 bool ret = true;
5213 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5214 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5215 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5216 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5217 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5219 enum ix86_opt_type
5221 ix86_opt_unknown,
5222 ix86_opt_yes,
5223 ix86_opt_no,
5224 ix86_opt_str,
5225 ix86_opt_enum,
5226 ix86_opt_isa
5229 static const struct
5231 const char *string;
5232 size_t len;
5233 enum ix86_opt_type type;
5234 int opt;
5235 int mask;
5236 } attrs[] = {
5237 /* isa options */
5238 IX86_ATTR_ISA ("sgx", OPT_msgx),
5239 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5240 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5241 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5243 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5244 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5245 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5246 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5247 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5248 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5249 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5250 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5251 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5252 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5253 IX86_ATTR_ISA ("fma", OPT_mfma),
5254 IX86_ATTR_ISA ("xop", OPT_mxop),
5255 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5256 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5257 IX86_ATTR_ISA ("avx", OPT_mavx),
5258 IX86_ATTR_ISA ("sse4", OPT_msse4),
5259 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5260 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5261 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5262 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5263 IX86_ATTR_ISA ("sse3", OPT_msse3),
5264 IX86_ATTR_ISA ("aes", OPT_maes),
5265 IX86_ATTR_ISA ("sha", OPT_msha),
5266 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5267 IX86_ATTR_ISA ("sse2", OPT_msse2),
5268 IX86_ATTR_ISA ("sse", OPT_msse),
5269 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5270 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5271 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5272 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5273 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5274 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5275 IX86_ATTR_ISA ("adx", OPT_madx),
5276 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5277 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5278 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5279 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5280 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5281 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5282 IX86_ATTR_ISA ("abm", OPT_mabm),
5283 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5284 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5285 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5286 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5287 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5288 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5289 IX86_ATTR_ISA ("sahf", OPT_msahf),
5290 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5291 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5292 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5293 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5294 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5295 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5296 IX86_ATTR_ISA ("pku", OPT_mpku),
5297 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5298 IX86_ATTR_ISA ("hle", OPT_mhle),
5299 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5300 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5301 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5302 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5303 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5304 IX86_ATTR_ISA ("ibt", OPT_mibt),
5305 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5307 /* enum options */
5308 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5310 /* string options */
5311 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5312 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5314 /* flag options */
5315 IX86_ATTR_YES ("cld",
5316 OPT_mcld,
5317 MASK_CLD),
5319 IX86_ATTR_NO ("fancy-math-387",
5320 OPT_mfancy_math_387,
5321 MASK_NO_FANCY_MATH_387),
5323 IX86_ATTR_YES ("ieee-fp",
5324 OPT_mieee_fp,
5325 MASK_IEEE_FP),
5327 IX86_ATTR_YES ("inline-all-stringops",
5328 OPT_minline_all_stringops,
5329 MASK_INLINE_ALL_STRINGOPS),
5331 IX86_ATTR_YES ("inline-stringops-dynamically",
5332 OPT_minline_stringops_dynamically,
5333 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5335 IX86_ATTR_NO ("align-stringops",
5336 OPT_mno_align_stringops,
5337 MASK_NO_ALIGN_STRINGOPS),
5339 IX86_ATTR_YES ("recip",
5340 OPT_mrecip,
5341 MASK_RECIP),
5345 /* If this is a list, recurse to get the options. */
5346 if (TREE_CODE (args) == TREE_LIST)
5348 bool ret = true;
5350 for (; args; args = TREE_CHAIN (args))
5351 if (TREE_VALUE (args)
5352 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5353 p_strings, opts, opts_set,
5354 enum_opts_set))
5355 ret = false;
5357 return ret;
5360 else if (TREE_CODE (args) != STRING_CST)
5362 error ("attribute %<target%> argument not a string");
5363 return false;
5366 /* Handle multiple arguments separated by commas. */
5367 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5369 while (next_optstr && *next_optstr != '\0')
5371 char *p = next_optstr;
5372 char *orig_p = p;
5373 char *comma = strchr (next_optstr, ',');
5374 const char *opt_string;
5375 size_t len, opt_len;
5376 int opt;
5377 bool opt_set_p;
5378 char ch;
5379 unsigned i;
5380 enum ix86_opt_type type = ix86_opt_unknown;
5381 int mask = 0;
5383 if (comma)
5385 *comma = '\0';
5386 len = comma - next_optstr;
5387 next_optstr = comma + 1;
5389 else
5391 len = strlen (p);
5392 next_optstr = NULL;
5395 /* Recognize no-xxx. */
5396 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5398 opt_set_p = false;
5399 p += 3;
5400 len -= 3;
5402 else
5403 opt_set_p = true;
5405 /* Find the option. */
5406 ch = *p;
5407 opt = N_OPTS;
5408 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5410 type = attrs[i].type;
5411 opt_len = attrs[i].len;
5412 if (ch == attrs[i].string[0]
5413 && ((type != ix86_opt_str && type != ix86_opt_enum)
5414 ? len == opt_len
5415 : len > opt_len)
5416 && memcmp (p, attrs[i].string, opt_len) == 0)
5418 opt = attrs[i].opt;
5419 mask = attrs[i].mask;
5420 opt_string = attrs[i].string;
5421 break;
5425 /* Process the option. */
5426 if (opt == N_OPTS)
5428 error ("attribute(target(\"%s\")) is unknown", orig_p);
5429 ret = false;
5432 else if (type == ix86_opt_isa)
5434 struct cl_decoded_option decoded;
5436 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5437 ix86_handle_option (opts, opts_set,
5438 &decoded, input_location);
5441 else if (type == ix86_opt_yes || type == ix86_opt_no)
5443 if (type == ix86_opt_no)
5444 opt_set_p = !opt_set_p;
5446 if (opt_set_p)
5447 opts->x_target_flags |= mask;
5448 else
5449 opts->x_target_flags &= ~mask;
5452 else if (type == ix86_opt_str)
5454 if (p_strings[opt])
5456 error ("option(\"%s\") was already specified", opt_string);
5457 ret = false;
5459 else
5460 p_strings[opt] = xstrdup (p + opt_len);
5463 else if (type == ix86_opt_enum)
5465 bool arg_ok;
5466 int value;
5468 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5469 if (arg_ok)
5470 set_option (opts, enum_opts_set, opt, value,
5471 p + opt_len, DK_UNSPECIFIED, input_location,
5472 global_dc);
5473 else
5475 error ("attribute(target(\"%s\")) is unknown", orig_p);
5476 ret = false;
5480 else
5481 gcc_unreachable ();
5484 return ret;
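/* Illustrative example (hypothetical user code, not from this file): the
   strings handled by ix86_valid_target_attribute_inner_p are the
   comma-separated tokens of the target attribute, e.g.

     __attribute__((target ("avx2,no-sse4a,arch=haswell,fpmath=sse")))
     int foo (int x) { return x + 1; }

   "avx2" and "no-sse4a" match IX86_ATTR_ISA entries (the "no-" prefix
   clears opt_set_p), "arch=haswell" matches the IX86_ATTR_STR entry and is
   saved in p_strings, and "fpmath=sse" matches the IX86_ATTR_ENUM entry.  */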
5487 /* Release allocated strings. */
5488 static void
5489 release_options_strings (char **option_strings)
5491 /* Free up memory allocated to hold the strings. */
5492 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5493 free (option_strings[i]);
5496 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5498 tree
5499 ix86_valid_target_attribute_tree (tree args,
5500 struct gcc_options *opts,
5501 struct gcc_options *opts_set)
5503 const char *orig_arch_string = opts->x_ix86_arch_string;
5504 const char *orig_tune_string = opts->x_ix86_tune_string;
5505 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5506 int orig_tune_defaulted = ix86_tune_defaulted;
5507 int orig_arch_specified = ix86_arch_specified;
5508 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5509 tree t = NULL_TREE;
5510 struct cl_target_option *def
5511 = TREE_TARGET_OPTION (target_option_default_node);
5512 struct gcc_options enum_opts_set;
5514 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5516 /* Process each of the options on the chain. */
5517 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5518 opts_set, &enum_opts_set))
5519 return error_mark_node;
5521 /* If the changed options are different from the default, rerun
5522 ix86_option_override_internal, and then save the options away.
5523 The string options are attribute options, and will be undone
5524 when we copy the save structure. */
5525 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5526 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5527 || opts->x_target_flags != def->x_target_flags
5528 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5529 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5530 || enum_opts_set.x_ix86_fpmath)
5532 /* If we are using the default tune= or arch=, undo the string assigned,
5533 and use the default. */
5534 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5536 opts->x_ix86_arch_string
5537 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5539 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5540 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5541 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5542 | OPTION_MASK_ABI_64
5543 | OPTION_MASK_ABI_X32
5544 | OPTION_MASK_CODE16);
5545 opts->x_ix86_isa_flags2 = 0;
5547 else if (!orig_arch_specified)
5548 opts->x_ix86_arch_string = NULL;
5550 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5551 opts->x_ix86_tune_string
5552 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5553 else if (orig_tune_defaulted)
5554 opts->x_ix86_tune_string = NULL;
5556 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5557 if (enum_opts_set.x_ix86_fpmath)
5558 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5560 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5561 bool r = ix86_option_override_internal (false, opts, opts_set);
5562 if (!r)
5564 release_options_strings (option_strings);
5565 return error_mark_node;
5568 /* Add any builtin functions with the new isa if any. */
5569 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5571 /* Save the current options unless we are validating options for
5572 #pragma. */
5573 t = build_target_option_node (opts);
5575 opts->x_ix86_arch_string = orig_arch_string;
5576 opts->x_ix86_tune_string = orig_tune_string;
5577 opts_set->x_ix86_fpmath = orig_fpmath_set;
5579 release_options_strings (option_strings);
5582 return t;
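/* Illustrative example (hypothetical user code, not from this file): the
   option tree built above also backs #pragma GCC target, so a region of a
   translation unit can change ISA options temporarily, e.g.

     #pragma GCC push_options
     #pragma GCC target ("avx2,fma")
     void scale (float *a, float s, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] *= s;
     }
     #pragma GCC pop_options

   Functions inside the region compile as if -mavx2 -mfma were given on the
   command line, without affecting the rest of the file.  */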
5585 /* Hook to validate attribute((target("string"))). */
5587 static bool
5588 ix86_valid_target_attribute_p (tree fndecl,
5589 tree ARG_UNUSED (name),
5590 tree args,
5591 int ARG_UNUSED (flags))
5593 struct gcc_options func_options;
5594 tree new_target, new_optimize;
5595 bool ret = true;
5597 /* attribute((target("default"))) does nothing, beyond
5598 affecting multi-versioning. */
5599 if (TREE_VALUE (args)
5600 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5601 && TREE_CHAIN (args) == NULL_TREE
5602 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5603 return true;
5605 tree old_optimize = build_optimization_node (&global_options);
5607 /* Get the optimization options of the current function. */
5608 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5610 if (!func_optimize)
5611 func_optimize = old_optimize;
5613 /* Init func_options. */
5614 memset (&func_options, 0, sizeof (func_options));
5615 init_options_struct (&func_options, NULL);
5616 lang_hooks.init_options_struct (&func_options);
5618 cl_optimization_restore (&func_options,
5619 TREE_OPTIMIZATION (func_optimize));
5621 /* Initialize func_options to the default before its target options can
5622 be set. */
5623 cl_target_option_restore (&func_options,
5624 TREE_TARGET_OPTION (target_option_default_node));
5626 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5627 &global_options_set);
5629 new_optimize = build_optimization_node (&func_options);
5631 if (new_target == error_mark_node)
5632 ret = false;
5634 else if (fndecl && new_target)
5636 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5638 if (old_optimize != new_optimize)
5639 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5642 finalize_options_struct (&func_options);
5644 return ret;
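/* Illustrative example (hypothetical user code, not from this file): because
   this hook runs when the attribute is parsed, a bad option string is
   diagnosed at the declaration itself, e.g.

     __attribute__((target ("avx9"))) int bad (void);

   is rejected via the "attribute(target(...)) is unknown" error above,
   before any code for the function is generated.  */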
5648 /* Hook to determine if one function can safely inline another. */
5650 static bool
5651 ix86_can_inline_p (tree caller, tree callee)
5653 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5654 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5655 if (!callee_tree)
5656 callee_tree = target_option_default_node;
5657 if (!caller_tree)
5658 caller_tree = target_option_default_node;
5659 if (callee_tree == caller_tree)
5660 return true;
5662 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5663 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5664 bool ret = false;
5666 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
5667 function can inline an SSE2 function but an SSE2 function can't inline
5668 an SSE4 function. */
5669 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5670 != callee_opts->x_ix86_isa_flags)
5671 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5672 != callee_opts->x_ix86_isa_flags2))
5673 ret = false;
5675 /* See if we have the same non-isa options. */
5676 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5677 ret = false;
5679 /* See if arch, tune, etc. are the same. */
5680 else if (caller_opts->arch != callee_opts->arch)
5681 ret = false;
5683 else if (caller_opts->tune != callee_opts->tune)
5684 ret = false;
5686 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5687 /* If the callee doesn't use FP expressions, differences in
5688 ix86_fpmath can be ignored. We are called from FEs
5689 for multi-versioning call optimization, so beware that
5690 ipa_fn_summaries may not be available. */
5691 && (! ipa_fn_summaries
5692 || ipa_fn_summaries->get
5693 (cgraph_node::get (callee))->fp_expressions))
5694 ret = false;
5696 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5697 ret = false;
5699 else
5700 ret = true;
5702 return ret;
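/* Illustrative example (hypothetical user code, not from this file): the
   subset rule above lets a more capable caller inline a less capable callee
   but not the reverse.  Assuming a 64-bit compile where SSE2 is the baseline,

     static inline __attribute__((target ("sse2"), always_inline))
     int lo (int x) { return x * 2; }

     __attribute__((target ("avx2")))
     int hi (int x) { return lo (x); }

   inlines, since the avx2 caller's ISA flags are a superset of the sse2
   callee's; swapping the two target strings makes ix86_can_inline_p return
   false and the always_inline request is reported as failed.  */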
5706 /* Remember the last target of ix86_set_current_function. */
5707 static GTY(()) tree ix86_previous_fndecl;
5709 /* Set targets globals to the default (or current #pragma GCC target
5710 if active). Invalidate ix86_previous_fndecl cache. */
5712 void
5713 ix86_reset_previous_fndecl (void)
5715 tree new_tree = target_option_current_node;
5716 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5717 if (TREE_TARGET_GLOBALS (new_tree))
5718 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5719 else if (new_tree == target_option_default_node)
5720 restore_target_globals (&default_target_globals);
5721 else
5722 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5723 ix86_previous_fndecl = NULL_TREE;
5726 /* Set the func_type field from the function FNDECL. */
5728 static void
5729 ix86_set_func_type (tree fndecl)
5731 if (cfun->machine->func_type == TYPE_UNKNOWN)
5733 if (lookup_attribute ("interrupt",
5734 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5736 if (ix86_function_naked (fndecl))
5737 error_at (DECL_SOURCE_LOCATION (fndecl),
5738 "interrupt and naked attributes are not compatible");
5740 int nargs = 0;
5741 for (tree arg = DECL_ARGUMENTS (fndecl);
5742 arg;
5743 arg = TREE_CHAIN (arg))
5744 nargs++;
5745 cfun->machine->no_caller_saved_registers = true;
5746 cfun->machine->func_type
5747 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5749 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5751 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5752 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5753 sorry ("Only DWARF debug format is supported for interrupt "
5754 "service routine.");
5756 else
5758 cfun->machine->func_type = TYPE_NORMAL;
5759 if (lookup_attribute ("no_caller_saved_registers",
5760 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5761 cfun->machine->no_caller_saved_registers = true;
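/* Illustrative example (hypothetical user code, not from this file): the
   argument count checked above distinguishes the two handler kinds the
   interrupt attribute supports, e.g. on an LP64 target

     struct interrupt_frame;
     void __attribute__((interrupt))
     irq_handler (struct interrupt_frame *frame)
     {
     }
     void __attribute__((interrupt))
     fault_handler (struct interrupt_frame *frame, unsigned long error_code)
     {
     }

   One parameter selects TYPE_INTERRUPT, two parameters (the frame plus the
   word-sized error code pushed by the CPU) select TYPE_EXCEPTION; both imply
   no_caller_saved_registers.  */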
5766 /* Establish appropriate back-end context for processing the function
5767 FNDECL. The argument might be NULL to indicate processing at top
5768 level, outside of any function scope. */
5769 static void
5770 ix86_set_current_function (tree fndecl)
5772 /* Only change the context if the function changes. This hook is called
5773 several times in the course of compiling a function, and we don't want to
5774 slow things down too much or call target_reinit when it isn't safe. */
5775 if (fndecl == ix86_previous_fndecl)
5777 /* There may be 2 function bodies for the same function FNDECL,
5778 one is extern inline and one isn't. Call ix86_set_func_type
5779 to set the func_type field. */
5780 if (fndecl != NULL_TREE)
5781 ix86_set_func_type (fndecl);
5782 return;
5785 tree old_tree;
5786 if (ix86_previous_fndecl == NULL_TREE)
5787 old_tree = target_option_current_node;
5788 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5789 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5790 else
5791 old_tree = target_option_default_node;
5793 if (fndecl == NULL_TREE)
5795 if (old_tree != target_option_current_node)
5796 ix86_reset_previous_fndecl ();
5797 return;
5800 ix86_set_func_type (fndecl);
5802 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5803 if (new_tree == NULL_TREE)
5804 new_tree = target_option_default_node;
5806 if (old_tree != new_tree)
5808 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5809 if (TREE_TARGET_GLOBALS (new_tree))
5810 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5811 else if (new_tree == target_option_default_node)
5812 restore_target_globals (&default_target_globals);
5813 else
5814 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5816 ix86_previous_fndecl = fndecl;
5818 static bool prev_no_caller_saved_registers;
5820 /* 64-bit MS and SYSV ABI have different set of call used registers.
5821 Avoid expensive re-initialization of init_regs each time we switch
5822 function context. */
5823 if (TARGET_64BIT
5824 && (call_used_regs[SI_REG]
5825 == (cfun->machine->call_abi == MS_ABI)))
5826 reinit_regs ();
5827 /* Need to re-initialize init_regs if caller-saved registers are
5828 changed. */
5829 else if (prev_no_caller_saved_registers
5830 != cfun->machine->no_caller_saved_registers)
5831 reinit_regs ();
5833 if (cfun->machine->func_type != TYPE_NORMAL
5834 || cfun->machine->no_caller_saved_registers)
5836 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5837 may change processor state. */
5838 const char *isa;
5839 if (TARGET_MPX)
5840 isa = "MPX";
5841 else if (TARGET_SSE)
5842 isa = "SSE";
5843 else if (TARGET_MMX)
5844 isa = "MMX/3Dnow";
5845 else if (TARGET_80387)
5846 isa = "80387";
5847 else
5848 isa = NULL;
5849 if (isa != NULL)
5851 if (cfun->machine->func_type != TYPE_NORMAL)
5852 sorry ("%s instructions aren't allowed in %s service routine",
5853 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5854 ? "exception" : "interrupt"));
5855 else
5856 sorry ("%s instructions aren't allowed in function with "
5857 "no_caller_saved_registers attribute", isa);
5858 /* Don't issue the same error twice. */
5859 cfun->machine->func_type = TYPE_NORMAL;
5860 cfun->machine->no_caller_saved_registers = false;
5864 prev_no_caller_saved_registers
5865 = cfun->machine->no_caller_saved_registers;
5869 /* Return true if this goes in large data/bss. */
5871 static bool
5872 ix86_in_large_data_p (tree exp)
5874 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5875 return false;
5877 if (exp == NULL_TREE)
5878 return false;
5880 /* Functions are never large data. */
5881 if (TREE_CODE (exp) == FUNCTION_DECL)
5882 return false;
5884 /* Automatic variables are never large data. */
5885 if (VAR_P (exp) && !is_global_var (exp))
5886 return false;
5888 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5890 const char *section = DECL_SECTION_NAME (exp);
5891 if (strcmp (section, ".ldata") == 0
5892 || strcmp (section, ".lbss") == 0)
5893 return true;
5894 return false;
5896 else
5898 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5900 /* If this is an incomplete type with size 0, then we can't put it
5901 in data because it might be too big when completed. Also,
5902 int_size_in_bytes returns -1 if the size can vary or is larger than
5903 an integer, in which case it is also safer to assume that it goes in
5904 large data. */
5905 if (size <= 0 || size > ix86_section_threshold)
5906 return true;
5909 return false;
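/* Illustrative example (hypothetical user code, not from this file): with
   the medium code model the threshold test above decides which objects are
   moved into the large sections, e.g.

     static char small_buf[1024];
     static char big_buf[1024 * 1024];

   compiled with -mcmodel=medium (default -mlarge-data-threshold=65536)
   keeps small_buf in .bss but places big_buf in .lbss, so only the large
   object needs full 64-bit addressing.  */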
5912 /* i386-specific section flag to mark large sections. */
5913 #define SECTION_LARGE SECTION_MACH_DEP
5915 /* Switch to the appropriate section for output of DECL.
5916 DECL is either a `VAR_DECL' node or a constant of some sort.
5917 RELOC indicates whether forming the initial value of DECL requires
5918 link-time relocations. */
5920 ATTRIBUTE_UNUSED static section *
5921 x86_64_elf_select_section (tree decl, int reloc,
5922 unsigned HOST_WIDE_INT align)
5924 if (ix86_in_large_data_p (decl))
5926 const char *sname = NULL;
5927 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5928 switch (categorize_decl_for_section (decl, reloc))
5930 case SECCAT_DATA:
5931 sname = ".ldata";
5932 break;
5933 case SECCAT_DATA_REL:
5934 sname = ".ldata.rel";
5935 break;
5936 case SECCAT_DATA_REL_LOCAL:
5937 sname = ".ldata.rel.local";
5938 break;
5939 case SECCAT_DATA_REL_RO:
5940 sname = ".ldata.rel.ro";
5941 break;
5942 case SECCAT_DATA_REL_RO_LOCAL:
5943 sname = ".ldata.rel.ro.local";
5944 break;
5945 case SECCAT_BSS:
5946 sname = ".lbss";
5947 flags |= SECTION_BSS;
5948 break;
5949 case SECCAT_RODATA:
5950 case SECCAT_RODATA_MERGE_STR:
5951 case SECCAT_RODATA_MERGE_STR_INIT:
5952 case SECCAT_RODATA_MERGE_CONST:
5953 sname = ".lrodata";
5954 flags &= ~SECTION_WRITE;
5955 break;
5956 case SECCAT_SRODATA:
5957 case SECCAT_SDATA:
5958 case SECCAT_SBSS:
5959 gcc_unreachable ();
5960 case SECCAT_TEXT:
5961 case SECCAT_TDATA:
5962 case SECCAT_TBSS:
5963 /* We don't split these for the medium model. Place them into
5964 default sections and hope for the best. */
5965 break;
5967 if (sname)
5969 /* We might get called with string constants, but get_named_section
5970 doesn't like them as they are not DECLs. Also, we need to set
5971 flags in that case. */
5972 if (!DECL_P (decl))
5973 return get_section (sname, flags, NULL);
5974 return get_named_section (decl, sname, reloc);
5977 return default_elf_select_section (decl, reloc, align);
5980 /* Select a set of attributes for section NAME based on the properties
5981 of DECL and whether or not RELOC indicates that DECL's initializer
5982 might contain runtime relocations. */
5984 static unsigned int ATTRIBUTE_UNUSED
5985 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
5987 unsigned int flags = default_section_type_flags (decl, name, reloc);
5989 if (ix86_in_large_data_p (decl))
5990 flags |= SECTION_LARGE;
5992 if (decl == NULL_TREE
5993 && (strcmp (name, ".ldata.rel.ro") == 0
5994 || strcmp (name, ".ldata.rel.ro.local") == 0))
5995 flags |= SECTION_RELRO;
5997 if (strcmp (name, ".lbss") == 0
5998 || strncmp (name, ".lbss.", 5) == 0
5999 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6000 flags |= SECTION_BSS;
6002 return flags;
6005 /* Build up a unique section name, expressed as a
6006 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6007 RELOC indicates whether the initial value of EXP requires
6008 link-time relocations. */
6010 static void ATTRIBUTE_UNUSED
6011 x86_64_elf_unique_section (tree decl, int reloc)
6013 if (ix86_in_large_data_p (decl))
6015 const char *prefix = NULL;
6016 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6017 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6019 switch (categorize_decl_for_section (decl, reloc))
6021 case SECCAT_DATA:
6022 case SECCAT_DATA_REL:
6023 case SECCAT_DATA_REL_LOCAL:
6024 case SECCAT_DATA_REL_RO:
6025 case SECCAT_DATA_REL_RO_LOCAL:
6026 prefix = one_only ? ".ld" : ".ldata";
6027 break;
6028 case SECCAT_BSS:
6029 prefix = one_only ? ".lb" : ".lbss";
6030 break;
6031 case SECCAT_RODATA:
6032 case SECCAT_RODATA_MERGE_STR:
6033 case SECCAT_RODATA_MERGE_STR_INIT:
6034 case SECCAT_RODATA_MERGE_CONST:
6035 prefix = one_only ? ".lr" : ".lrodata";
6036 break;
6037 case SECCAT_SRODATA:
6038 case SECCAT_SDATA:
6039 case SECCAT_SBSS:
6040 gcc_unreachable ();
6041 case SECCAT_TEXT:
6042 case SECCAT_TDATA:
6043 case SECCAT_TBSS:
6044 /* We don't split these for the medium model. Place them into
6045 default sections and hope for the best. */
6046 break;
6048 if (prefix)
6050 const char *name, *linkonce;
6051 char *string;
6053 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6054 name = targetm.strip_name_encoding (name);
6056 /* If we're using one_only, then there needs to be a .gnu.linkonce
6057 prefix to the section name. */
6058 linkonce = one_only ? ".gnu.linkonce" : "";
6060 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6062 set_decl_section_name (decl, string);
6063 return;
6066 default_unique_section (decl, reloc);
6069 #ifdef COMMON_ASM_OP
6071 #ifndef LARGECOMM_SECTION_ASM_OP
6072 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6073 #endif
6075 /* This says how to output assembler code to declare an
6076 uninitialized external linkage data object.
6078 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
6079 large objects. */
6080 void
6081 x86_elf_aligned_decl_common (FILE *file, tree decl,
6082 const char *name, unsigned HOST_WIDE_INT size,
6083 int align)
6085 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6086 && size > (unsigned int)ix86_section_threshold)
6088 switch_to_section (get_named_section (decl, ".lbss", 0));
6089 fputs (LARGECOMM_SECTION_ASM_OP, file);
6091 else
6092 fputs (COMMON_ASM_OP, file);
6093 assemble_name (file, name);
6094 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6095 size, align / BITS_PER_UNIT);
6097 #endif
6099 /* Utility function for targets to use in implementing
6100 ASM_OUTPUT_ALIGNED_BSS. */
6102 void
6103 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6104 unsigned HOST_WIDE_INT size, int align)
6106 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6107 && size > (unsigned int)ix86_section_threshold)
6108 switch_to_section (get_named_section (decl, ".lbss", 0));
6109 else
6110 switch_to_section (bss_section);
6111 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6112 #ifdef ASM_DECLARE_OBJECT_NAME
6113 last_assemble_variable_decl = decl;
6114 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6115 #else
6116 /* The standard thing is just to output a label for the object. */
6117 ASM_OUTPUT_LABEL (file, name);
6118 #endif /* ASM_DECLARE_OBJECT_NAME */
6119 ASM_OUTPUT_SKIP (file, size ? size : 1);
6122 /* Decide whether we must probe the stack before any space allocation
6123 on this target. It's essentially TARGET_STACK_PROBE except when
6124 -fstack-check causes the stack to be already probed differently. */
6126 bool
6127 ix86_target_stack_probe (void)
6129 /* Do not probe the stack twice if static stack checking is enabled. */
6130 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6131 return false;
6133 return TARGET_STACK_PROBE;
6136 /* Decide whether we can make a sibling call to a function. DECL is the
6137 declaration of the function being targeted by the call and EXP is the
6138 CALL_EXPR representing the call. */
6140 static bool
6141 ix86_function_ok_for_sibcall (tree decl, tree exp)
6143 tree type, decl_or_type;
6144 rtx a, b;
6145 bool bind_global = decl && !targetm.binds_local_p (decl);
6147 if (ix86_function_naked (current_function_decl))
6148 return false;
6150 /* Sibling call isn't OK if there are no caller-saved registers
6151 since all registers must be preserved before return. */
6152 if (cfun->machine->no_caller_saved_registers)
6153 return false;
6155 /* If we are generating position-independent code, we cannot sibcall
6156 optimize direct calls to global functions, as the PLT requires
6157 %ebx be live. (Darwin does not have a PLT.) */
6158 if (!TARGET_MACHO
6159 && !TARGET_64BIT
6160 && flag_pic
6161 && flag_plt
6162 && bind_global)
6163 return false;
6165 /* If we need to align the outgoing stack, then sibcalling would
6166 unalign the stack, which may break the called function. */
6167 if (ix86_minimum_incoming_stack_boundary (true)
6168 < PREFERRED_STACK_BOUNDARY)
6169 return false;
6171 if (decl)
6173 decl_or_type = decl;
6174 type = TREE_TYPE (decl);
6176 else
6178 /* We're looking at the CALL_EXPR, we need the type of the function. */
6179 type = CALL_EXPR_FN (exp); /* pointer expression */
6180 type = TREE_TYPE (type); /* pointer type */
6181 type = TREE_TYPE (type); /* function type */
6182 decl_or_type = type;
6185 /* Check that the return value locations are the same. Like
6186 if we are returning floats on the 80387 register stack, we cannot
6187 make a sibcall from a function that doesn't return a float to a
6188 function that does or, conversely, from a function that does return
6189 a float to a function that doesn't; the necessary stack adjustment
6190 would not be executed. This is also the place we notice
6191 differences in the return value ABI. Note that it is ok for one
6192 of the functions to have void return type as long as the return
6193 value of the other is passed in a register. */
6194 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6195 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6196 cfun->decl, false);
6197 if (STACK_REG_P (a) || STACK_REG_P (b))
6199 if (!rtx_equal_p (a, b))
6200 return false;
6202 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6204 else if (!rtx_equal_p (a, b))
6205 return false;
6207 if (TARGET_64BIT)
6209 /* The SYSV ABI has more call-clobbered registers;
6210 disallow sibcalls from MS to SYSV. */
6211 if (cfun->machine->call_abi == MS_ABI
6212 && ix86_function_type_abi (type) == SYSV_ABI)
6213 return false;
6215 else
6217 /* If this call is indirect, we'll need to be able to use a
6218 call-clobbered register for the address of the target function.
6219 Make sure that all such registers are not used for passing
6220 parameters. Note that DLLIMPORT functions and call to global
6221 function via GOT slot are indirect. */
6222 if (!decl
6223 || (bind_global && flag_pic && !flag_plt)
6224 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6226 /* Check if regparm >= 3 since arg_reg_available is set to
6227 false if regparm == 0. If regparm is 1 or 2, there is
6228 always a call-clobbered register available.
6230 ??? The symbol indirect call doesn't need a call-clobbered
6231 register. But we don't know if this is a symbol indirect
6232 call or not here. */
6233 if (ix86_function_regparm (type, NULL) >= 3
6234 && !cfun->machine->arg_reg_available)
6235 return false;
6239 /* Otherwise okay. That also includes certain types of indirect calls. */
6240 return true;
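/* Illustrative example (hypothetical user code, not from this file): the
   PIC/PLT check above is why an apparently obvious tail call is kept as a
   normal call in 32-bit PIC code, e.g. for

     extern int callee (int);
     int caller (int x) { return callee (x); }

   -m32 -fpic with the default -fplt keeps a real call, since the PLT entry
   needs %ebx to hold the GOT pointer, while -m64 (or -fno-plt) allows the
   jmp-style sibcall, assuming callee does not bind locally.  */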
6243 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6244 and "sseregparm" calling convention attributes;
6245 arguments as in struct attribute_spec.handler. */
6247 static tree
6248 ix86_handle_cconv_attribute (tree *node, tree name,
6249 tree args,
6250 int,
6251 bool *no_add_attrs)
6253 if (TREE_CODE (*node) != FUNCTION_TYPE
6254 && TREE_CODE (*node) != METHOD_TYPE
6255 && TREE_CODE (*node) != FIELD_DECL
6256 && TREE_CODE (*node) != TYPE_DECL)
6258 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6259 name);
6260 *no_add_attrs = true;
6261 return NULL_TREE;
6264 /* Can combine regparm with all attributes but fastcall and thiscall. */
6265 if (is_attribute_p ("regparm", name))
6267 tree cst;
6269 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6271 error ("fastcall and regparm attributes are not compatible");
6274 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6276 error ("regparam and thiscall attributes are not compatible");
6279 cst = TREE_VALUE (args);
6280 if (TREE_CODE (cst) != INTEGER_CST)
6282 warning (OPT_Wattributes,
6283 "%qE attribute requires an integer constant argument",
6284 name);
6285 *no_add_attrs = true;
6287 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6289 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6290 name, REGPARM_MAX);
6291 *no_add_attrs = true;
6294 return NULL_TREE;
6297 if (TARGET_64BIT)
6299 /* Do not warn when emulating the MS ABI. */
6300 if ((TREE_CODE (*node) != FUNCTION_TYPE
6301 && TREE_CODE (*node) != METHOD_TYPE)
6302 || ix86_function_type_abi (*node) != MS_ABI)
6303 warning (OPT_Wattributes, "%qE attribute ignored",
6304 name);
6305 *no_add_attrs = true;
6306 return NULL_TREE;
6309 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6310 if (is_attribute_p ("fastcall", name))
6312 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6314 error ("fastcall and cdecl attributes are not compatible");
6316 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6318 error ("fastcall and stdcall attributes are not compatible");
6320 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6322 error ("fastcall and regparm attributes are not compatible");
6324 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6326 error ("fastcall and thiscall attributes are not compatible");
6330 /* Can combine stdcall with fastcall (redundant), regparm and
6331 sseregparm. */
6332 else if (is_attribute_p ("stdcall", name))
6334 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6336 error ("stdcall and cdecl attributes are not compatible");
6338 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6340 error ("stdcall and fastcall attributes are not compatible");
6342 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6344 error ("stdcall and thiscall attributes are not compatible");
6348 /* Can combine cdecl with regparm and sseregparm. */
6349 else if (is_attribute_p ("cdecl", name))
6351 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6353 error ("stdcall and cdecl attributes are not compatible");
6355 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6357 error ("fastcall and cdecl attributes are not compatible");
6359 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6361 error ("cdecl and thiscall attributes are not compatible");
6364 else if (is_attribute_p ("thiscall", name))
6366 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6367 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6368 name);
6369 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6371 error ("stdcall and thiscall attributes are not compatible");
6373 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6375 error ("fastcall and thiscall attributes are not compatible");
6377 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6379 error ("cdecl and thiscall attributes are not compatible");
6383 /* Can combine sseregparm with all attributes. */
6385 return NULL_TREE;
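/* Illustrative example (hypothetical user code, not from this file): the
   pairwise checks above reject contradictory calling-convention requests
   on 32-bit targets, e.g. with -m32

     void __attribute__((fastcall, stdcall)) f (int);

   is rejected as an incompatible combination, whereas

     void __attribute__((stdcall, regparm (2))) g (int, int);

   is accepted, matching the "can combine" comments above.  */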
6388 /* The transactional memory builtins are implicitly regparm or fastcall
6389 depending on the ABI. Override the generic do-nothing attribute that
6390 these builtins were declared with, and replace it with one of the two
6391 attributes that we expect elsewhere. */
6393 static tree
6394 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6395 int flags, bool *no_add_attrs)
6397 tree alt;
6399 /* In no case do we want to add the placeholder attribute. */
6400 *no_add_attrs = true;
6402 /* The 64-bit ABI is unchanged for transactional memory. */
6403 if (TARGET_64BIT)
6404 return NULL_TREE;
6406 /* ??? Is there a better way to validate 32-bit windows? We have
6407 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6408 if (CHECK_STACK_LIMIT > 0)
6409 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6410 else
6412 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6413 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6415 decl_attributes (node, alt, flags);
6417 return NULL_TREE;
6420 /* This function determines from TYPE the calling-convention. */
6422 unsigned int
6423 ix86_get_callcvt (const_tree type)
6425 unsigned int ret = 0;
6426 bool is_stdarg;
6427 tree attrs;
6429 if (TARGET_64BIT)
6430 return IX86_CALLCVT_CDECL;
6432 attrs = TYPE_ATTRIBUTES (type);
6433 if (attrs != NULL_TREE)
6435 if (lookup_attribute ("cdecl", attrs))
6436 ret |= IX86_CALLCVT_CDECL;
6437 else if (lookup_attribute ("stdcall", attrs))
6438 ret |= IX86_CALLCVT_STDCALL;
6439 else if (lookup_attribute ("fastcall", attrs))
6440 ret |= IX86_CALLCVT_FASTCALL;
6441 else if (lookup_attribute ("thiscall", attrs))
6442 ret |= IX86_CALLCVT_THISCALL;
6444 /* Regparm isn't allowed for thiscall and fastcall. */
6445 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6447 if (lookup_attribute ("regparm", attrs))
6448 ret |= IX86_CALLCVT_REGPARM;
6449 if (lookup_attribute ("sseregparm", attrs))
6450 ret |= IX86_CALLCVT_SSEREGPARM;
6453 if (IX86_BASE_CALLCVT(ret) != 0)
6454 return ret;
6457 is_stdarg = stdarg_p (type);
6458 if (TARGET_RTD && !is_stdarg)
6459 return IX86_CALLCVT_STDCALL | ret;
6461 if (ret != 0
6462 || is_stdarg
6463 || TREE_CODE (type) != METHOD_TYPE
6464 || ix86_function_type_abi (type) != MS_ABI)
6465 return IX86_CALLCVT_CDECL | ret;
6467 return IX86_CALLCVT_THISCALL;
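/* Illustrative example (hypothetical user code, not from this file): the
   stdarg_p test above keeps variadic functions on the caller-pops
   convention even under -mrtd, e.g. with -m32 -mrtd

     int fixed_args (int a, int b);
     int var_args (const char *fmt, ...);

   fixed_args is treated as stdcall (the callee pops its arguments), while
   var_args stays cdecl, since the callee cannot know how many bytes were
   pushed.  */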
6470 /* Return 0 if the attributes for two types are incompatible, 1 if they
6471 are compatible, and 2 if they are nearly compatible (which causes a
6472 warning to be generated). */
6474 static int
6475 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6477 unsigned int ccvt1, ccvt2;
6479 if (TREE_CODE (type1) != FUNCTION_TYPE
6480 && TREE_CODE (type1) != METHOD_TYPE)
6481 return 1;
6483 ccvt1 = ix86_get_callcvt (type1);
6484 ccvt2 = ix86_get_callcvt (type2);
6485 if (ccvt1 != ccvt2)
6486 return 0;
6487 if (ix86_function_regparm (type1, NULL)
6488 != ix86_function_regparm (type2, NULL))
6489 return 0;
6491 return 1;
6494 /* Return the regparm value for a function with the indicated TYPE and DECL.
6495 DECL may be NULL when calling function indirectly
6496 or considering a libcall. */
6498 static int
6499 ix86_function_regparm (const_tree type, const_tree decl)
6501 tree attr;
6502 int regparm;
6503 unsigned int ccvt;
6505 if (TARGET_64BIT)
6506 return (ix86_function_type_abi (type) == SYSV_ABI
6507 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6508 ccvt = ix86_get_callcvt (type);
6509 regparm = ix86_regparm;
6511 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6513 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6514 if (attr)
6516 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6517 return regparm;
6520 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6521 return 2;
6522 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6523 return 1;
6525 /* Use register calling convention for local functions when possible. */
6526 if (decl
6527 && TREE_CODE (decl) == FUNCTION_DECL)
6529 cgraph_node *target = cgraph_node::get (decl);
6530 if (target)
6531 target = target->function_symbol ();
6533 /* Caller and callee must agree on the calling convention, so
6534 checking just optimize here would mean that with
6535 __attribute__((optimize (...))) the caller could use the regparm convention
6536 and the callee not, or vice versa. Instead look at whether the callee
6537 is optimized or not. */
6538 if (target && opt_for_fn (target->decl, optimize)
6539 && !(profile_flag && !flag_fentry))
6541 cgraph_local_info *i = &target->local;
6542 if (i && i->local && i->can_change_signature)
6544 int local_regparm, globals = 0, regno;
6546 /* Make sure no regparm register is taken by a
6547 fixed register variable. */
6548 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6549 local_regparm++)
6550 if (fixed_regs[local_regparm])
6551 break;
6553 /* We don't want to use regparm(3) for nested functions as
6554 these use a static chain pointer in the third argument. */
6555 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6556 local_regparm = 2;
6558 /* Save a register for the split stack. */
6559 if (flag_split_stack)
6561 if (local_regparm == 3)
6562 local_regparm = 2;
6563 else if (local_regparm == 2
6564 && DECL_STATIC_CHAIN (target->decl))
6565 local_regparm = 1;
6568 /* Each fixed register usage increases register pressure,
6569 so fewer registers should be used for argument passing.
6570 This functionality can be overridden by an explicit
6571 regparm value. */
6572 for (regno = AX_REG; regno <= DI_REG; regno++)
6573 if (fixed_regs[regno])
6574 globals++;
6576 local_regparm
6577 = globals < local_regparm ? local_regparm - globals : 0;
6579 if (local_regparm > regparm)
6580 regparm = local_regparm;
6585 return regparm;
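/* Illustrative example (hypothetical user code, not from this file): on
   32-bit targets the regparm value computed above controls how many
   integer arguments travel in registers, e.g. with -m32

     int __attribute__((regparm (3))) sum3 (int a, int b, int c);

   receives a, b and c in %eax, %edx and %ecx instead of on the stack;
   fastcall corresponds to the value 2 (%ecx, %edx) and thiscall to 1
   (%ecx).  */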
6588 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6589 DFmode (2) arguments in SSE registers for a function with the
6590 indicated TYPE and DECL. DECL may be NULL when calling function
6591 indirectly or considering a libcall. Return -1 if any FP parameter
6592 should be rejected by error. This is used in siutation we imply SSE
6593 calling convetion but the function is called from another function with
6594 SSE disabled. Otherwise return 0. */
6596 static int
6597 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6599 gcc_assert (!TARGET_64BIT);
6601 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6602 by the sseregparm attribute. */
6603 if (TARGET_SSEREGPARM
6604 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6606 if (!TARGET_SSE)
6608 if (warn)
6610 if (decl)
6611 error ("calling %qD with attribute sseregparm without "
6612 "SSE/SSE2 enabled", decl);
6613 else
6614 error ("calling %qT with attribute sseregparm without "
6615 "SSE/SSE2 enabled", type);
6617 return 0;
6620 return 2;
6623 if (!decl)
6624 return 0;
6626 cgraph_node *target = cgraph_node::get (decl);
6627 if (target)
6628 target = target->function_symbol ();
6630 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6631 (and DFmode for SSE2) arguments in SSE registers. */
6632 if (target
6633 /* TARGET_SSE_MATH */
6634 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6635 && opt_for_fn (target->decl, optimize)
6636 && !(profile_flag && !flag_fentry))
6638 cgraph_local_info *i = &target->local;
6639 if (i && i->local && i->can_change_signature)
6641 /* Refuse to produce wrong code when local function with SSE enabled
6642 is called from SSE disabled function.
6643 FIXME: We need a way to detect these cases cross-ltrans partition
6644 and avoid using SSE calling conventions on local functions called
6645 from function with SSE disabled. For now at least delay the
6646 warning until we know we are going to produce wrong code.
6647 See PR66047 */
6648 if (!TARGET_SSE && warn)
6649 return -1;
6650 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6651 ->x_ix86_isa_flags) ? 2 : 1;
6655 return 0;
6658 /* Return true if EAX is live at the start of the function. Used by
6659 ix86_expand_prologue to determine if we need special help before
6660 calling allocate_stack_worker. */
6662 static bool
6663 ix86_eax_live_at_start_p (void)
6665 /* Cheat. Don't bother working forward from ix86_function_regparm
6666 to the function type to whether an actual argument is located in
6667 eax. Instead just look at cfg info, which is still close enough
6668 to correct at this point. This gives false positives for broken
6669 functions that might use uninitialized data that happens to be
6670 allocated in eax, but who cares? */
6671 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6674 static bool
6675 ix86_keep_aggregate_return_pointer (tree fntype)
6677 tree attr;
6679 if (!TARGET_64BIT)
6681 attr = lookup_attribute ("callee_pop_aggregate_return",
6682 TYPE_ATTRIBUTES (fntype));
6683 if (attr)
6684 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6686 /* For 32-bit MS-ABI the default is to keep aggregate
6687 return pointer. */
6688 if (ix86_function_type_abi (fntype) == MS_ABI)
6689 return true;
6691 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6694 /* Value is the number of bytes of arguments automatically
6695 popped when returning from a subroutine call.
6696 FUNDECL is the declaration node of the function (as a tree),
6697 FUNTYPE is the data type of the function (as a tree),
6698 or for a library call it is an identifier node for the subroutine name.
6699 SIZE is the number of bytes of arguments passed on the stack.
6701 On the 80386, the RTD insn may be used to pop them if the number
6702 of args is fixed, but if the number is variable then the caller
6703 must pop them all. RTD can't be used for library calls now
6704 because the library is compiled with the Unix compiler.
6705 Use of RTD is a selectable option, since it is incompatible with
6706 standard Unix calling sequences. If the option is not selected,
6707 the caller must always pop the args.
6709 The attribute stdcall is equivalent to RTD on a per module basis. */
6711 static int
6712 ix86_return_pops_args (tree fundecl, tree funtype, int size)
6714 unsigned int ccvt;
6716 /* None of the 64-bit ABIs pop arguments. */
6717 if (TARGET_64BIT)
6718 return 0;
6720 ccvt = ix86_get_callcvt (funtype);
6722 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6723 | IX86_CALLCVT_THISCALL)) != 0
6724 && ! stdarg_p (funtype))
6725 return size;
6727 /* Lose any fake structure return argument if it is passed on the stack. */
6728 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6729 && !ix86_keep_aggregate_return_pointer (funtype))
6731 int nregs = ix86_function_regparm (funtype, fundecl);
6732 if (nregs == 0)
6733 return GET_MODE_SIZE (Pmode);
6736 return 0;
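/* Illustrative example (hypothetical user code, not from this file): for a
   32-bit stdcall function the size returned above ends up in the return
   instruction, e.g. with -m32

     int __attribute__((stdcall)) add2 (int a, int b) { return a + b; }

   returns with "ret $8", popping its two stack arguments, whereas the
   default cdecl convention uses a plain "ret" and leaves the cleanup to
   the caller.  */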
6739 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6741 static bool
6742 ix86_legitimate_combined_insn (rtx_insn *insn)
6744 int i;
6746 /* Check operand constraints in case hard registers were propagated
6747 into insn pattern. This check prevents combine pass from
6748 generating insn patterns with invalid hard register operands.
6749 These invalid insns can eventually confuse reload to error out
6750 with a spill failure. See also PRs 46829 and 46843. */
6752 gcc_assert (INSN_CODE (insn) >= 0);
6754 extract_insn (insn);
6755 preprocess_constraints (insn);
6757 int n_operands = recog_data.n_operands;
6758 int n_alternatives = recog_data.n_alternatives;
6759 for (i = 0; i < n_operands; i++)
6761 rtx op = recog_data.operand[i];
6762 machine_mode mode = GET_MODE (op);
6763 const operand_alternative *op_alt;
6764 int offset = 0;
6765 bool win;
6766 int j;
6768 /* A unary operator may be accepted by the predicate, but it
6769 is irrelevant for matching constraints. */
6770 if (UNARY_P (op))
6771 op = XEXP (op, 0);
6773 if (SUBREG_P (op))
6775 if (REG_P (SUBREG_REG (op))
6776 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6777 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6778 GET_MODE (SUBREG_REG (op)),
6779 SUBREG_BYTE (op),
6780 GET_MODE (op));
6781 op = SUBREG_REG (op);
6784 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6785 continue;
6787 op_alt = recog_op_alt;
6789 /* Operand has no constraints, anything is OK. */
6790 win = !n_alternatives;
6792 alternative_mask preferred = get_preferred_alternatives (insn);
6793 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6795 if (!TEST_BIT (preferred, j))
6796 continue;
6797 if (op_alt[i].anything_ok
6798 || (op_alt[i].matches != -1
6799 && operands_match_p
6800 (recog_data.operand[i],
6801 recog_data.operand[op_alt[i].matches]))
6802 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6804 win = true;
6805 break;
6809 if (!win)
6810 return false;
6813 return true;
6816 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6818 static unsigned HOST_WIDE_INT
6819 ix86_asan_shadow_offset (void)
6821 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6822 : HOST_WIDE_INT_C (0x7fff8000))
6823 : (HOST_WIDE_INT_1 << 29);
6826 /* Argument support functions. */
6828 /* Return true when register may be used to pass function parameters. */
6829 bool
6830 ix86_function_arg_regno_p (int regno)
6832 int i;
6833 enum calling_abi call_abi;
6834 const int *parm_regs;
6836 if (TARGET_MPX && BND_REGNO_P (regno))
6837 return true;
6839 if (!TARGET_64BIT)
6841 if (TARGET_MACHO)
6842 return (regno < REGPARM_MAX
6843 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6844 else
6845 return (regno < REGPARM_MAX
6846 || (TARGET_MMX && MMX_REGNO_P (regno)
6847 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6848 || (TARGET_SSE && SSE_REGNO_P (regno)
6849 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6852 if (TARGET_SSE && SSE_REGNO_P (regno)
6853 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6854 return true;
6856 /* TODO: The function should depend on current function ABI but
6857 builtins.c would need updating then. Therefore we use the
6858 default ABI. */
6859 call_abi = ix86_cfun_abi ();
6861 /* RAX is used as hidden argument to va_arg functions. */
6862 if (call_abi == SYSV_ABI && regno == AX_REG)
6863 return true;
6865 if (call_abi == MS_ABI)
6866 parm_regs = x86_64_ms_abi_int_parameter_registers;
6867 else
6868 parm_regs = x86_64_int_parameter_registers;
6870 for (i = 0; i < (call_abi == MS_ABI
6871 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6872 if (regno == parm_regs[i])
6873 return true;
6874 return false;
6877 /* Return if we do not know how to pass TYPE solely in registers. */
6879 static bool
6880 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6882 if (must_pass_in_stack_var_size_or_pad (mode, type))
6883 return true;
6885 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6886 The layout_type routine is crafty and tries to trick us into passing
6887 currently unsupported vector types on the stack by using TImode. */
6888 return (!TARGET_64BIT && mode == TImode
6889 && type && TREE_CODE (type) != VECTOR_TYPE);
6892 /* Return the size, in bytes, of the area reserved for arguments passed
6893 in registers for the function represented by FNDECL, depending on the
6894 ABI format used. */
6895 int
6896 ix86_reg_parm_stack_space (const_tree fndecl)
6898 enum calling_abi call_abi = SYSV_ABI;
6899 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6900 call_abi = ix86_function_abi (fndecl);
6901 else
6902 call_abi = ix86_function_type_abi (fndecl);
6903 if (TARGET_64BIT && call_abi == MS_ABI)
6904 return 32;
6905 return 0;
6908 /* We add this as a workaround in order to use libc_has_function
6909 hook in i386.md. */
6910 bool
6911 ix86_libc_has_function (enum function_class fn_class)
6913 return targetm.libc_has_function (fn_class);
6916 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
6917 specifying the call abi used. */
6918 enum calling_abi
6919 ix86_function_type_abi (const_tree fntype)
6921 enum calling_abi abi = ix86_abi;
6923 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6924 return abi;
6926 if (abi == SYSV_ABI
6927 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6929 static int warned;
6930 if (TARGET_X32 && !warned)
6932 error ("X32 does not support ms_abi attribute");
6933 warned = 1;
6936 abi = MS_ABI;
6938 else if (abi == MS_ABI
6939 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6940 abi = SYSV_ABI;
6942 return abi;
6945 static enum calling_abi
6946 ix86_function_abi (const_tree fndecl)
6948 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6951 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
6952 specifying the call abi used. */
6953 enum calling_abi
6954 ix86_cfun_abi (void)
6956 return cfun ? cfun->machine->call_abi : ix86_abi;
6959 static bool
6960 ix86_function_ms_hook_prologue (const_tree fn)
6962 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6964 if (decl_function_context (fn) != NULL_TREE)
6965 error_at (DECL_SOURCE_LOCATION (fn),
6966 "ms_hook_prologue is not compatible with nested function");
6967 else
6968 return true;
6970 return false;
6973 static bool
6974 ix86_function_naked (const_tree fn)
6976 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
6977 return true;
6979 return false;
6982 /* Write the extra assembler code needed to declare a function properly. */
6984 void
6985 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
6986 tree decl)
6988 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
6990 if (is_ms_hook)
6992 int i, filler_count = (TARGET_64BIT ? 32 : 16);
6993 unsigned int filler_cc = 0xcccccccc;
6995 for (i = 0; i < filler_count; i += 4)
6996 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
6999 #ifdef SUBTARGET_ASM_UNWIND_INIT
7000 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7001 #endif
7003 ASM_OUTPUT_LABEL (asm_out_file, fname);
7005 /* Output magic byte marker, if hot-patch attribute is set. */
7006 if (is_ms_hook)
7008 if (TARGET_64BIT)
7010 /* leaq [%rsp + 0], %rsp */
7011 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7012 asm_out_file);
7014 else
7016 /* movl.s %edi, %edi
7017 push %ebp
7018 movl.s %esp, %ebp */
7019 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
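/* Illustrative example (hypothetical user code, not from this file): the
   hot-patch support above is triggered by

     void __attribute__((ms_hook_prologue)) hook_me (void) { }

   which emits 16 bytes (32 in 64-bit mode) of 0xCC filler before the label
   and the patchable "movl.s %edi, %edi; push %ebp; movl.s %esp, %ebp"
   (or the 64-bit leaq form) byte sequence right after it.  */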
7024 /* Implementation of call abi switching target hook. Specific to FNDECL
7025 the specific call register sets are set. See also
7026 ix86_conditional_register_usage for more details. */
7027 void
7028 ix86_call_abi_override (const_tree fndecl)
7030 cfun->machine->call_abi = ix86_function_abi (fndecl);
7033 /* Return 1 if pseudo register should be created and used to hold
7034 GOT address for PIC code. */
7035 bool
7036 ix86_use_pseudo_pic_reg (void)
7038 if ((TARGET_64BIT
7039 && (ix86_cmodel == CM_SMALL_PIC
7040 || TARGET_PECOFF))
7041 || !flag_pic)
7042 return false;
7043 return true;
7046 /* Initialize large model PIC register. */
7048 static void
7049 ix86_init_large_pic_reg (unsigned int tmp_regno)
7051 rtx_code_label *label;
7052 rtx tmp_reg;
7054 gcc_assert (Pmode == DImode);
7055 label = gen_label_rtx ();
7056 emit_label (label);
7057 LABEL_PRESERVE_P (label) = 1;
7058 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7059 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7060 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7061 label));
7062 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7063 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7064 pic_offset_table_rtx, tmp_reg));
7065 const char *name = LABEL_NAME (label);
7066 PUT_CODE (label, NOTE);
7067 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7068 NOTE_DELETED_LABEL_NAME (label) = name;
7071 /* Create and initialize PIC register if required. */
7072 static void
7073 ix86_init_pic_reg (void)
7075 edge entry_edge;
7076 rtx_insn *seq;
7078 if (!ix86_use_pseudo_pic_reg ())
7079 return;
7081 start_sequence ();
7083 if (TARGET_64BIT)
7085 if (ix86_cmodel == CM_LARGE_PIC)
7086 ix86_init_large_pic_reg (R11_REG);
7087 else
7088 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7090 else
7092 /* If there is future mcount call in the function it is more profitable
7093 to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7094 rtx reg = crtl->profile
7095 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7096 : pic_offset_table_rtx;
7097 rtx_insn *insn = emit_insn (gen_set_got (reg));
7098 RTX_FRAME_RELATED_P (insn) = 1;
7099 if (crtl->profile)
7100 emit_move_insn (pic_offset_table_rtx, reg);
7101 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7104 seq = get_insns ();
7105 end_sequence ();
7107 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7108 insert_insn_on_edge (seq, entry_edge);
7109 commit_one_edge_insertion (entry_edge);
7112 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7113 for a call to a function whose data type is FNTYPE.
7114 For a library call, FNTYPE is 0. */
7116 void
7117 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7118 tree fntype, /* tree ptr for function decl */
7119 rtx libname, /* SYMBOL_REF of library name or 0 */
7120 tree fndecl,
7121 int caller)
7123 struct cgraph_local_info *i = NULL;
7124 struct cgraph_node *target = NULL;
7126 memset (cum, 0, sizeof (*cum));
7128 if (fndecl)
7130 target = cgraph_node::get (fndecl);
7131 if (target)
7133 target = target->function_symbol ();
7134 i = cgraph_node::local_info (target->decl);
7135 cum->call_abi = ix86_function_abi (target->decl);
7137 else
7138 cum->call_abi = ix86_function_abi (fndecl);
7140 else
7141 cum->call_abi = ix86_function_type_abi (fntype);
7143 cum->caller = caller;
7145 /* Set up the number of registers to use for passing arguments. */
7146 cum->nregs = ix86_regparm;
7147 if (TARGET_64BIT)
7149 cum->nregs = (cum->call_abi == SYSV_ABI
7150 ? X86_64_REGPARM_MAX
7151 : X86_64_MS_REGPARM_MAX);
7153 if (TARGET_SSE)
7155 cum->sse_nregs = SSE_REGPARM_MAX;
7156 if (TARGET_64BIT)
7158 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7159 ? X86_64_SSE_REGPARM_MAX
7160 : X86_64_MS_SSE_REGPARM_MAX);
7163 if (TARGET_MMX)
7164 cum->mmx_nregs = MMX_REGPARM_MAX;
7165 cum->warn_avx512f = true;
7166 cum->warn_avx = true;
7167 cum->warn_sse = true;
7168 cum->warn_mmx = true;
7170 /* Because the type might mismatch between caller and callee, we need to
7171 use the actual type of the function for local calls.
7172 FIXME: cgraph_analyze can be told to actually record if the function uses
7173 va_start, so for local functions maybe_vaarg can be made more aggressive,
7174 helping K&R code.
7175 FIXME: once the typesystem is fixed, we won't need this code anymore. */
7176 if (i && i->local && i->can_change_signature)
7177 fntype = TREE_TYPE (target->decl);
7178 cum->stdarg = stdarg_p (fntype);
7179 cum->maybe_vaarg = (fntype
7180 ? (!prototype_p (fntype) || stdarg_p (fntype))
7181 : !libname);
7183 cum->bnd_regno = FIRST_BND_REG;
7184 cum->bnds_in_bt = 0;
7185 cum->force_bnd_pass = 0;
7186 cum->decl = fndecl;
7188 if (!TARGET_64BIT)
7190 /* If there are variable arguments, then we won't pass anything
7191 in registers in 32-bit mode. */
7192 if (stdarg_p (fntype))
7194 cum->nregs = 0;
7195 /* Since in 32-bit mode variable arguments are always passed on
7196 the stack, there is a scratch register available for an indirect
7197 sibcall. */
7198 cfun->machine->arg_reg_available = true;
7199 cum->sse_nregs = 0;
7200 cum->mmx_nregs = 0;
7201 cum->warn_avx512f = false;
7202 cum->warn_avx = false;
7203 cum->warn_sse = false;
7204 cum->warn_mmx = false;
7205 return;
7208 /* Use ecx and edx registers if function has fastcall attribute,
7209 else look for regparm information. */
7210 if (fntype)
7212 unsigned int ccvt = ix86_get_callcvt (fntype);
7213 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7215 cum->nregs = 1;
7216 cum->fastcall = 1; /* Same first register as in fastcall. */
7218 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7220 cum->nregs = 2;
7221 cum->fastcall = 1;
7223 else
7224 cum->nregs = ix86_function_regparm (fntype, fndecl);
7227 /* Set up the number of SSE registers used for passing SFmode
7228 and DFmode arguments. Warn for mismatching ABI. */
7229 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7232 cfun->machine->arg_reg_available = (cum->nregs > 0);
7235 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7236 But in the case of vector types, it is some vector mode.
7238 When we have only some of our vector isa extensions enabled, then there
7239 are some modes for which vector_mode_supported_p is false. For these
7240 modes, the generic vector support in gcc will choose some non-vector mode
7241 in order to implement the type. By computing the natural mode, we'll
7242 select the proper ABI location for the operand and not depend on whatever
7243 the middle-end decides to do with these vector types.
7245 The middle-end can't deal with vector types > 16 bytes. In this
7246 case, we return the original mode and warn about the ABI change if CUM
7247 isn't NULL.
7249 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7250 available for the function return value. */
7252 static machine_mode
7253 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7254 bool in_return)
7256 machine_mode mode = TYPE_MODE (type);
7258 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7260 HOST_WIDE_INT size = int_size_in_bytes (type);
7261 if ((size == 8 || size == 16 || size == 32 || size == 64)
7262 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7263 && TYPE_VECTOR_SUBPARTS (type) > 1)
7265 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7267 /* There are no XFmode vector modes. */
7268 if (innermode == XFmode)
7269 return mode;
7271 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7272 mode = MIN_MODE_VECTOR_FLOAT;
7273 else
7274 mode = MIN_MODE_VECTOR_INT;
7276 /* Get the mode which has this inner mode and number of units. */
7277 FOR_EACH_MODE_FROM (mode, mode)
7278 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7279 && GET_MODE_INNER (mode) == innermode)
7281 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7283 static bool warnedavx512f;
7284 static bool warnedavx512f_ret;
7286 if (cum && cum->warn_avx512f && !warnedavx512f)
7288 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7289 "without AVX512F enabled changes the ABI"))
7290 warnedavx512f = true;
7292 else if (in_return && !warnedavx512f_ret)
7294 if (warning (OPT_Wpsabi, "AVX512F vector return "
7295 "without AVX512F enabled changes the ABI"))
7296 warnedavx512f_ret = true;
7299 return TYPE_MODE (type);
7301 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7303 static bool warnedavx;
7304 static bool warnedavx_ret;
7306 if (cum && cum->warn_avx && !warnedavx)
7308 if (warning (OPT_Wpsabi, "AVX vector argument "
7309 "without AVX enabled changes the ABI"))
7310 warnedavx = true;
7312 else if (in_return && !warnedavx_ret)
7314 if (warning (OPT_Wpsabi, "AVX vector return "
7315 "without AVX enabled changes the ABI"))
7316 warnedavx_ret = true;
7319 return TYPE_MODE (type);
7321 else if (((size == 8 && TARGET_64BIT) || size == 16)
7322 && !TARGET_SSE
7323 && !TARGET_IAMCU)
7325 static bool warnedsse;
7326 static bool warnedsse_ret;
7328 if (cum && cum->warn_sse && !warnedsse)
7330 if (warning (OPT_Wpsabi, "SSE vector argument "
7331 "without SSE enabled changes the ABI"))
7332 warnedsse = true;
7334 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7336 if (warning (OPT_Wpsabi, "SSE vector return "
7337 "without SSE enabled changes the ABI"))
7338 warnedsse_ret = true;
7341 else if ((size == 8 && !TARGET_64BIT)
7342 && (!cfun
7343 || cfun->machine->func_type == TYPE_NORMAL)
7344 && !TARGET_MMX
7345 && !TARGET_IAMCU)
7347 static bool warnedmmx;
7348 static bool warnedmmx_ret;
7350 if (cum && cum->warn_mmx && !warnedmmx)
7352 if (warning (OPT_Wpsabi, "MMX vector argument "
7353 "without MMX enabled changes the ABI"))
7354 warnedmmx = true;
7356 else if (in_return && !warnedmmx_ret)
7358 if (warning (OPT_Wpsabi, "MMX vector return "
7359 "without MMX enabled changes the ABI"))
7360 warnedmmx_ret = true;
7363 return mode;
7366 gcc_unreachable ();
7370 return mode;
7373 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7374 this may not agree with the mode that the type system has chosen for the
7375 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7376 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7378 static rtx
7379 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7380 unsigned int regno)
7382 rtx tmp;
7384 if (orig_mode != BLKmode)
7385 tmp = gen_rtx_REG (orig_mode, regno);
7386 else
7388 tmp = gen_rtx_REG (mode, regno);
7389 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7390 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7393 return tmp;
7396 /* x86-64 register passing implementation. See the x86-64 ABI for details.
7397 The goal of this code is to classify each eightbyte of an incoming argument
7398 by register class and assign registers accordingly. */
7400 /* Return the union class of CLASS1 and CLASS2.
7401 See the x86-64 PS ABI for details. */
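/* For instance, classifying struct { int i; float f; } puts both fields
   into the same eightbyte; merge_classes (X86_64_SSE_CLASS,
   X86_64_INTEGERSI_CLASS) yields X86_64_INTEGER_CLASS by rule #4, so the
   whole structure is passed in a single integer register. */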
7403 static enum x86_64_reg_class
7404 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7406 /* Rule #1: If both classes are equal, this is the resulting class. */
7407 if (class1 == class2)
7408 return class1;
7410 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7411 the other class. */
7412 if (class1 == X86_64_NO_CLASS)
7413 return class2;
7414 if (class2 == X86_64_NO_CLASS)
7415 return class1;
7417 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7418 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7419 return X86_64_MEMORY_CLASS;
7421 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7422 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7423 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7424 return X86_64_INTEGERSI_CLASS;
7425 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7426 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7427 return X86_64_INTEGER_CLASS;
7429 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7430 MEMORY is used. */
7431 if (class1 == X86_64_X87_CLASS
7432 || class1 == X86_64_X87UP_CLASS
7433 || class1 == X86_64_COMPLEX_X87_CLASS
7434 || class2 == X86_64_X87_CLASS
7435 || class2 == X86_64_X87UP_CLASS
7436 || class2 == X86_64_COMPLEX_X87_CLASS)
7437 return X86_64_MEMORY_CLASS;
7439 /* Rule #6: Otherwise class SSE is used. */
7440 return X86_64_SSE_CLASS;
7443 /* Classify the argument of type TYPE and mode MODE.
7444 CLASSES will be filled by the register class used to pass each word
7445 of the operand. The number of words is returned. In case the parameter
7446 should be passed in memory, 0 is returned. As a special case for
7447 zero-sized containers, classes[0] will be NO_CLASS and 1 is returned.
7449 BIT_OFFSET is used internally for handling records; it specifies the bit
7450 offset of the value being classified, modulo 512, to avoid overflow cases.
7452 See the x86-64 PS ABI for details.
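/* As a concrete example, struct { double d; int i; } occupies two
   eightbytes and is classified as { X86_64_SSEDF_CLASS,
   X86_64_INTEGER_CLASS }, so it is passed in one SSE register and one
   integer register. */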
7455 static int
7456 classify_argument (machine_mode mode, const_tree type,
7457 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7459 HOST_WIDE_INT bytes =
7460 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7461 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7463 /* Variable sized entities are always passed/returned in memory. */
7464 if (bytes < 0)
7465 return 0;
7467 if (mode != VOIDmode
7468 && targetm.calls.must_pass_in_stack (mode, type))
7469 return 0;
7471 if (type && AGGREGATE_TYPE_P (type))
7473 int i;
7474 tree field;
7475 enum x86_64_reg_class subclasses[MAX_CLASSES];
7477 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7478 if (bytes > 64)
7479 return 0;
7481 for (i = 0; i < words; i++)
7482 classes[i] = X86_64_NO_CLASS;
7484 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
7485 signal the memory class, so handle this as a special case. */
7486 if (!words)
7488 classes[0] = X86_64_NO_CLASS;
7489 return 1;
7492 /* Classify each field of record and merge classes. */
7493 switch (TREE_CODE (type))
7495 case RECORD_TYPE:
7496 /* And now merge the fields of structure. */
7497 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7499 if (TREE_CODE (field) == FIELD_DECL)
7501 int num;
7503 if (TREE_TYPE (field) == error_mark_node)
7504 continue;
7506 /* Bitfields are always classified as integer. Handle them
7507 early, since later code would consider them to be
7508 misaligned integers. */
7509 if (DECL_BIT_FIELD (field))
7511 for (i = (int_bit_position (field)
7512 + (bit_offset % 64)) / 8 / 8;
7513 i < ((int_bit_position (field) + (bit_offset % 64))
7514 + tree_to_shwi (DECL_SIZE (field))
7515 + 63) / 8 / 8; i++)
7516 classes[i] =
7517 merge_classes (X86_64_INTEGER_CLASS,
7518 classes[i]);
7520 else
7522 int pos;
7524 type = TREE_TYPE (field);
7526 /* Flexible array member is ignored. */
7527 if (TYPE_MODE (type) == BLKmode
7528 && TREE_CODE (type) == ARRAY_TYPE
7529 && TYPE_SIZE (type) == NULL_TREE
7530 && TYPE_DOMAIN (type) != NULL_TREE
7531 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7532 == NULL_TREE))
7534 static bool warned;
7536 if (!warned && warn_psabi)
7538 warned = true;
7539 inform (input_location,
7540 "the ABI of passing struct with"
7541 " a flexible array member has"
7542 " changed in GCC 4.4");
7544 continue;
7546 num = classify_argument (TYPE_MODE (type), type,
7547 subclasses,
7548 (int_bit_position (field)
7549 + bit_offset) % 512);
7550 if (!num)
7551 return 0;
7552 pos = (int_bit_position (field)
7553 + (bit_offset % 64)) / 8 / 8;
7554 for (i = 0; i < num && (i + pos) < words; i++)
7555 classes[i + pos] =
7556 merge_classes (subclasses[i], classes[i + pos]);
7560 break;
7562 case ARRAY_TYPE:
7563 /* Arrays are handled as small records. */
7565 int num;
7566 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7567 TREE_TYPE (type), subclasses, bit_offset);
7568 if (!num)
7569 return 0;
7571 /* The partial classes are now full classes. */
7572 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7573 subclasses[0] = X86_64_SSE_CLASS;
7574 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7575 && !((bit_offset % 64) == 0 && bytes == 4))
7576 subclasses[0] = X86_64_INTEGER_CLASS;
7578 for (i = 0; i < words; i++)
7579 classes[i] = subclasses[i % num];
7581 break;
7583 case UNION_TYPE:
7584 case QUAL_UNION_TYPE:
7585 /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
7587 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7589 if (TREE_CODE (field) == FIELD_DECL)
7591 int num;
7593 if (TREE_TYPE (field) == error_mark_node)
7594 continue;
7596 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7597 TREE_TYPE (field), subclasses,
7598 bit_offset);
7599 if (!num)
7600 return 0;
7601 for (i = 0; i < num && i < words; i++)
7602 classes[i] = merge_classes (subclasses[i], classes[i]);
7605 break;
7607 default:
7608 gcc_unreachable ();
7611 if (words > 2)
7613 /* When the size is larger than 16 bytes, if the first eightbyte
7614 isn't X86_64_SSE_CLASS or any of the remaining eightbytes isn't
7615 X86_64_SSEUP_CLASS, everything should be passed in
7616 memory. */
7617 if (classes[0] != X86_64_SSE_CLASS)
7618 return 0;
7620 for (i = 1; i < words; i++)
7621 if (classes[i] != X86_64_SSEUP_CLASS)
7622 return 0;
7625 /* Final merger cleanup. */
7626 for (i = 0; i < words; i++)
7628 /* If one class is MEMORY, everything should be passed in
7629 memory. */
7630 if (classes[i] == X86_64_MEMORY_CLASS)
7631 return 0;
7633 /* X86_64_SSEUP_CLASS should always be preceded by
7634 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7635 if (classes[i] == X86_64_SSEUP_CLASS
7636 && classes[i - 1] != X86_64_SSE_CLASS
7637 && classes[i - 1] != X86_64_SSEUP_CLASS)
7639 /* The first one should never be X86_64_SSEUP_CLASS. */
7640 gcc_assert (i != 0);
7641 classes[i] = X86_64_SSE_CLASS;
7644 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7645 everything should be passed in memory. */
7646 if (classes[i] == X86_64_X87UP_CLASS
7647 && (classes[i - 1] != X86_64_X87_CLASS))
7649 static bool warned;
7651 /* The first one should never be X86_64_X87UP_CLASS. */
7652 gcc_assert (i != 0);
7653 if (!warned && warn_psabi)
7655 warned = true;
7656 inform (input_location,
7657 "the ABI of passing union with long double"
7658 " has changed in GCC 4.4");
7660 return 0;
7663 return words;
7666 /* Compute the alignment needed. We align all types to their natural
7667 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7668 if (mode != VOIDmode && mode != BLKmode)
7670 int mode_alignment = GET_MODE_BITSIZE (mode);
7672 if (mode == XFmode)
7673 mode_alignment = 128;
7674 else if (mode == XCmode)
7675 mode_alignment = 256;
7676 if (COMPLEX_MODE_P (mode))
7677 mode_alignment /= 2;
7678 /* Misaligned fields are always returned in memory. */
7679 if (bit_offset % mode_alignment)
7680 return 0;
7683 /* for V1xx modes, just use the base mode */
7684 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7685 && GET_MODE_UNIT_SIZE (mode) == bytes)
7686 mode = GET_MODE_INNER (mode);
7688 /* Classification of atomic types. */
7689 switch (mode)
7691 case E_SDmode:
7692 case E_DDmode:
7693 classes[0] = X86_64_SSE_CLASS;
7694 return 1;
7695 case E_TDmode:
7696 classes[0] = X86_64_SSE_CLASS;
7697 classes[1] = X86_64_SSEUP_CLASS;
7698 return 2;
7699 case E_DImode:
7700 case E_SImode:
7701 case E_HImode:
7702 case E_QImode:
7703 case E_CSImode:
7704 case E_CHImode:
7705 case E_CQImode:
7707 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7709 /* Analyze last 128 bits only. */
7710 size = (size - 1) & 0x7f;
7712 if (size < 32)
7714 classes[0] = X86_64_INTEGERSI_CLASS;
7715 return 1;
7717 else if (size < 64)
7719 classes[0] = X86_64_INTEGER_CLASS;
7720 return 1;
7722 else if (size < 64+32)
7724 classes[0] = X86_64_INTEGER_CLASS;
7725 classes[1] = X86_64_INTEGERSI_CLASS;
7726 return 2;
7728 else if (size < 64+64)
7730 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7731 return 2;
7733 else
7734 gcc_unreachable ();
7736 case E_CDImode:
7737 case E_TImode:
7738 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7739 return 2;
7740 case E_COImode:
7741 case E_OImode:
7742 /* OImode shouldn't be used directly. */
7743 gcc_unreachable ();
7744 case E_CTImode:
7745 return 0;
7746 case E_SFmode:
7747 if (!(bit_offset % 64))
7748 classes[0] = X86_64_SSESF_CLASS;
7749 else
7750 classes[0] = X86_64_SSE_CLASS;
7751 return 1;
7752 case E_DFmode:
7753 classes[0] = X86_64_SSEDF_CLASS;
7754 return 1;
7755 case E_XFmode:
7756 classes[0] = X86_64_X87_CLASS;
7757 classes[1] = X86_64_X87UP_CLASS;
7758 return 2;
7759 case E_TFmode:
7760 classes[0] = X86_64_SSE_CLASS;
7761 classes[1] = X86_64_SSEUP_CLASS;
7762 return 2;
7763 case E_SCmode:
7764 classes[0] = X86_64_SSE_CLASS;
7765 if (!(bit_offset % 64))
7766 return 1;
7767 else
7769 static bool warned;
7771 if (!warned && warn_psabi)
7773 warned = true;
7774 inform (input_location,
7775 "the ABI of passing structure with complex float"
7776 " member has changed in GCC 4.4");
7778 classes[1] = X86_64_SSESF_CLASS;
7779 return 2;
7781 case E_DCmode:
7782 classes[0] = X86_64_SSEDF_CLASS;
7783 classes[1] = X86_64_SSEDF_CLASS;
7784 return 2;
7785 case E_XCmode:
7786 classes[0] = X86_64_COMPLEX_X87_CLASS;
7787 return 1;
7788 case E_TCmode:
7789 /* This mode is larger than 16 bytes. */
7790 return 0;
7791 case E_V8SFmode:
7792 case E_V8SImode:
7793 case E_V32QImode:
7794 case E_V16HImode:
7795 case E_V4DFmode:
7796 case E_V4DImode:
7797 classes[0] = X86_64_SSE_CLASS;
7798 classes[1] = X86_64_SSEUP_CLASS;
7799 classes[2] = X86_64_SSEUP_CLASS;
7800 classes[3] = X86_64_SSEUP_CLASS;
7801 return 4;
7802 case E_V8DFmode:
7803 case E_V16SFmode:
7804 case E_V8DImode:
7805 case E_V16SImode:
7806 case E_V32HImode:
7807 case E_V64QImode:
7808 classes[0] = X86_64_SSE_CLASS;
7809 classes[1] = X86_64_SSEUP_CLASS;
7810 classes[2] = X86_64_SSEUP_CLASS;
7811 classes[3] = X86_64_SSEUP_CLASS;
7812 classes[4] = X86_64_SSEUP_CLASS;
7813 classes[5] = X86_64_SSEUP_CLASS;
7814 classes[6] = X86_64_SSEUP_CLASS;
7815 classes[7] = X86_64_SSEUP_CLASS;
7816 return 8;
7817 case E_V4SFmode:
7818 case E_V4SImode:
7819 case E_V16QImode:
7820 case E_V8HImode:
7821 case E_V2DFmode:
7822 case E_V2DImode:
7823 classes[0] = X86_64_SSE_CLASS;
7824 classes[1] = X86_64_SSEUP_CLASS;
7825 return 2;
7826 case E_V1TImode:
7827 case E_V1DImode:
7828 case E_V2SFmode:
7829 case E_V2SImode:
7830 case E_V4HImode:
7831 case E_V8QImode:
7832 classes[0] = X86_64_SSE_CLASS;
7833 return 1;
7834 case E_BLKmode:
7835 case E_VOIDmode:
7836 return 0;
7837 default:
7838 gcc_assert (VECTOR_MODE_P (mode));
7840 if (bytes > 16)
7841 return 0;
7843 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7845 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7846 classes[0] = X86_64_INTEGERSI_CLASS;
7847 else
7848 classes[0] = X86_64_INTEGER_CLASS;
7849 classes[1] = X86_64_INTEGER_CLASS;
7850 return 1 + (bytes > 8);
7854 /* Examine the argument and set the number of registers required in each
7855 class. Return true iff the parameter should be passed in memory. */
7857 static bool
7858 examine_argument (machine_mode mode, const_tree type, int in_return,
7859 int *int_nregs, int *sse_nregs)
7861 enum x86_64_reg_class regclass[MAX_CLASSES];
7862 int n = classify_argument (mode, type, regclass, 0);
7864 *int_nregs = 0;
7865 *sse_nregs = 0;
7867 if (!n)
7868 return true;
7869 for (n--; n >= 0; n--)
7870 switch (regclass[n])
7872 case X86_64_INTEGER_CLASS:
7873 case X86_64_INTEGERSI_CLASS:
7874 (*int_nregs)++;
7875 break;
7876 case X86_64_SSE_CLASS:
7877 case X86_64_SSESF_CLASS:
7878 case X86_64_SSEDF_CLASS:
7879 (*sse_nregs)++;
7880 break;
7881 case X86_64_NO_CLASS:
7882 case X86_64_SSEUP_CLASS:
7883 break;
7884 case X86_64_X87_CLASS:
7885 case X86_64_X87UP_CLASS:
7886 case X86_64_COMPLEX_X87_CLASS:
7887 if (!in_return)
7888 return true;
7889 break;
7890 case X86_64_MEMORY_CLASS:
7891 gcc_unreachable ();
7894 return false;
7897 /* Construct container for the argument used by GCC interface. See
7898 FUNCTION_ARG for the detailed description. */
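/* For the struct { double d; int i; } example above, and assuming no
   earlier arguments have consumed any registers, the container built here
   is roughly
     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di) (const_int 8))])
   i.e. the first eightbyte goes in an SSE register and the second in the
   next free integer register. */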
7900 static rtx
7901 construct_container (machine_mode mode, machine_mode orig_mode,
7902 const_tree type, int in_return, int nintregs, int nsseregs,
7903 const int *intreg, int sse_regno)
7905 /* The following variables hold the static issued_error state. */
7906 static bool issued_sse_arg_error;
7907 static bool issued_sse_ret_error;
7908 static bool issued_x87_ret_error;
7910 machine_mode tmpmode;
7911 int bytes =
7912 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7913 enum x86_64_reg_class regclass[MAX_CLASSES];
7914 int n;
7915 int i;
7916 int nexps = 0;
7917 int needed_sseregs, needed_intregs;
7918 rtx exp[MAX_CLASSES];
7919 rtx ret;
7921 n = classify_argument (mode, type, regclass, 0);
7922 if (!n)
7923 return NULL;
7924 if (examine_argument (mode, type, in_return, &needed_intregs,
7925 &needed_sseregs))
7926 return NULL;
7927 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7928 return NULL;
7930 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7931 some less clueful developer tries to use floating-point anyway. */
7932 if (needed_sseregs && !TARGET_SSE)
7934 if (in_return)
7936 if (!issued_sse_ret_error)
7938 error ("SSE register return with SSE disabled");
7939 issued_sse_ret_error = true;
7942 else if (!issued_sse_arg_error)
7944 error ("SSE register argument with SSE disabled");
7945 issued_sse_arg_error = true;
7947 return NULL;
7950 /* Likewise, error if the ABI requires us to return values in the
7951 x87 registers and the user specified -mno-80387. */
7952 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
7953 for (i = 0; i < n; i++)
7954 if (regclass[i] == X86_64_X87_CLASS
7955 || regclass[i] == X86_64_X87UP_CLASS
7956 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
7958 if (!issued_x87_ret_error)
7960 error ("x87 register return with x87 disabled");
7961 issued_x87_ret_error = true;
7963 return NULL;
7966 /* First construct simple cases. Avoid SCmode, since we want to use
7967 single register to pass this type. */
7968 if (n == 1 && mode != SCmode)
7969 switch (regclass[0])
7971 case X86_64_INTEGER_CLASS:
7972 case X86_64_INTEGERSI_CLASS:
7973 return gen_rtx_REG (mode, intreg[0]);
7974 case X86_64_SSE_CLASS:
7975 case X86_64_SSESF_CLASS:
7976 case X86_64_SSEDF_CLASS:
7977 if (mode != BLKmode)
7978 return gen_reg_or_parallel (mode, orig_mode,
7979 SSE_REGNO (sse_regno));
7980 break;
7981 case X86_64_X87_CLASS:
7982 case X86_64_COMPLEX_X87_CLASS:
7983 return gen_rtx_REG (mode, FIRST_STACK_REG);
7984 case X86_64_NO_CLASS:
7985 /* Zero sized array, struct or class. */
7986 return NULL;
7987 default:
7988 gcc_unreachable ();
7990 if (n == 2
7991 && regclass[0] == X86_64_SSE_CLASS
7992 && regclass[1] == X86_64_SSEUP_CLASS
7993 && mode != BLKmode)
7994 return gen_reg_or_parallel (mode, orig_mode,
7995 SSE_REGNO (sse_regno));
7996 if (n == 4
7997 && regclass[0] == X86_64_SSE_CLASS
7998 && regclass[1] == X86_64_SSEUP_CLASS
7999 && regclass[2] == X86_64_SSEUP_CLASS
8000 && regclass[3] == X86_64_SSEUP_CLASS
8001 && mode != BLKmode)
8002 return gen_reg_or_parallel (mode, orig_mode,
8003 SSE_REGNO (sse_regno));
8004 if (n == 8
8005 && regclass[0] == X86_64_SSE_CLASS
8006 && regclass[1] == X86_64_SSEUP_CLASS
8007 && regclass[2] == X86_64_SSEUP_CLASS
8008 && regclass[3] == X86_64_SSEUP_CLASS
8009 && regclass[4] == X86_64_SSEUP_CLASS
8010 && regclass[5] == X86_64_SSEUP_CLASS
8011 && regclass[6] == X86_64_SSEUP_CLASS
8012 && regclass[7] == X86_64_SSEUP_CLASS
8013 && mode != BLKmode)
8014 return gen_reg_or_parallel (mode, orig_mode,
8015 SSE_REGNO (sse_regno));
8016 if (n == 2
8017 && regclass[0] == X86_64_X87_CLASS
8018 && regclass[1] == X86_64_X87UP_CLASS)
8019 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8021 if (n == 2
8022 && regclass[0] == X86_64_INTEGER_CLASS
8023 && regclass[1] == X86_64_INTEGER_CLASS
8024 && (mode == CDImode || mode == TImode)
8025 && intreg[0] + 1 == intreg[1])
8026 return gen_rtx_REG (mode, intreg[0]);
8028 /* Otherwise figure out the entries of the PARALLEL. */
8029 for (i = 0; i < n; i++)
8031 int pos;
8033 switch (regclass[i])
8035 case X86_64_NO_CLASS:
8036 break;
8037 case X86_64_INTEGER_CLASS:
8038 case X86_64_INTEGERSI_CLASS:
8039 /* Merge TImodes on aligned occasions here too. */
8040 if (i * 8 + 8 > bytes)
8042 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8043 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8044 /* We've requested a size (such as 24 bits) for which
8045 there is no integer mode. Use DImode. */
8046 tmpmode = DImode;
8048 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8049 tmpmode = SImode;
8050 else
8051 tmpmode = DImode;
8052 exp [nexps++]
8053 = gen_rtx_EXPR_LIST (VOIDmode,
8054 gen_rtx_REG (tmpmode, *intreg),
8055 GEN_INT (i*8));
8056 intreg++;
8057 break;
8058 case X86_64_SSESF_CLASS:
8059 exp [nexps++]
8060 = gen_rtx_EXPR_LIST (VOIDmode,
8061 gen_rtx_REG (SFmode,
8062 SSE_REGNO (sse_regno)),
8063 GEN_INT (i*8));
8064 sse_regno++;
8065 break;
8066 case X86_64_SSEDF_CLASS:
8067 exp [nexps++]
8068 = gen_rtx_EXPR_LIST (VOIDmode,
8069 gen_rtx_REG (DFmode,
8070 SSE_REGNO (sse_regno)),
8071 GEN_INT (i*8));
8072 sse_regno++;
8073 break;
8074 case X86_64_SSE_CLASS:
8075 pos = i;
8076 switch (n)
8078 case 1:
8079 tmpmode = DImode;
8080 break;
8081 case 2:
8082 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8084 tmpmode = TImode;
8085 i++;
8087 else
8088 tmpmode = DImode;
8089 break;
8090 case 4:
8091 gcc_assert (i == 0
8092 && regclass[1] == X86_64_SSEUP_CLASS
8093 && regclass[2] == X86_64_SSEUP_CLASS
8094 && regclass[3] == X86_64_SSEUP_CLASS);
8095 tmpmode = OImode;
8096 i += 3;
8097 break;
8098 case 8:
8099 gcc_assert (i == 0
8100 && regclass[1] == X86_64_SSEUP_CLASS
8101 && regclass[2] == X86_64_SSEUP_CLASS
8102 && regclass[3] == X86_64_SSEUP_CLASS
8103 && regclass[4] == X86_64_SSEUP_CLASS
8104 && regclass[5] == X86_64_SSEUP_CLASS
8105 && regclass[6] == X86_64_SSEUP_CLASS
8106 && regclass[7] == X86_64_SSEUP_CLASS);
8107 tmpmode = XImode;
8108 i += 7;
8109 break;
8110 default:
8111 gcc_unreachable ();
8113 exp [nexps++]
8114 = gen_rtx_EXPR_LIST (VOIDmode,
8115 gen_rtx_REG (tmpmode,
8116 SSE_REGNO (sse_regno)),
8117 GEN_INT (pos*8));
8118 sse_regno++;
8119 break;
8120 default:
8121 gcc_unreachable ();
8125 /* Empty aligned struct, union or class. */
8126 if (nexps == 0)
8127 return NULL;
8129 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8130 for (i = 0; i < nexps; i++)
8131 XVECEXP (ret, 0, i) = exp [i];
8132 return ret;
8135 /* Update the data in CUM to advance over an argument of mode MODE
8136 and data type TYPE. (TYPE is null for libcalls where that information
8137 may not be available.)
8139 Return the number of integer registers advanced over. */
8141 static int
8142 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8143 const_tree type, HOST_WIDE_INT bytes,
8144 HOST_WIDE_INT words)
8146 int res = 0;
8147 bool error_p = false;
8149 if (TARGET_IAMCU)
8151 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8152 bytes in registers. */
8153 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8154 goto pass_in_reg;
8155 return res;
8158 switch (mode)
8160 default:
8161 break;
8163 case E_BLKmode:
8164 if (bytes < 0)
8165 break;
8166 /* FALLTHRU */
8168 case E_DImode:
8169 case E_SImode:
8170 case E_HImode:
8171 case E_QImode:
8172 pass_in_reg:
8173 cum->words += words;
8174 cum->nregs -= words;
8175 cum->regno += words;
8176 if (cum->nregs >= 0)
8177 res = words;
8178 if (cum->nregs <= 0)
8180 cum->nregs = 0;
8181 cfun->machine->arg_reg_available = false;
8182 cum->regno = 0;
8184 break;
8186 case E_OImode:
8187 /* OImode shouldn't be used directly. */
8188 gcc_unreachable ();
8190 case E_DFmode:
8191 if (cum->float_in_sse == -1)
8192 error_p = true;
8193 if (cum->float_in_sse < 2)
8194 break;
8195 /* FALLTHRU */
8196 case E_SFmode:
8197 if (cum->float_in_sse == -1)
8198 error_p = true;
8199 if (cum->float_in_sse < 1)
8200 break;
8201 /* FALLTHRU */
8203 case E_V8SFmode:
8204 case E_V8SImode:
8205 case E_V64QImode:
8206 case E_V32HImode:
8207 case E_V16SImode:
8208 case E_V8DImode:
8209 case E_V16SFmode:
8210 case E_V8DFmode:
8211 case E_V32QImode:
8212 case E_V16HImode:
8213 case E_V4DFmode:
8214 case E_V4DImode:
8215 case E_TImode:
8216 case E_V16QImode:
8217 case E_V8HImode:
8218 case E_V4SImode:
8219 case E_V2DImode:
8220 case E_V4SFmode:
8221 case E_V2DFmode:
8222 if (!type || !AGGREGATE_TYPE_P (type))
8224 cum->sse_words += words;
8225 cum->sse_nregs -= 1;
8226 cum->sse_regno += 1;
8227 if (cum->sse_nregs <= 0)
8229 cum->sse_nregs = 0;
8230 cum->sse_regno = 0;
8233 break;
8235 case E_V8QImode:
8236 case E_V4HImode:
8237 case E_V2SImode:
8238 case E_V2SFmode:
8239 case E_V1TImode:
8240 case E_V1DImode:
8241 if (!type || !AGGREGATE_TYPE_P (type))
8243 cum->mmx_words += words;
8244 cum->mmx_nregs -= 1;
8245 cum->mmx_regno += 1;
8246 if (cum->mmx_nregs <= 0)
8248 cum->mmx_nregs = 0;
8249 cum->mmx_regno = 0;
8252 break;
8254 if (error_p)
8256 cum->float_in_sse = 0;
8257 error ("calling %qD with SSE calling convention without "
8258 "SSE/SSE2 enabled", cum->decl);
8259 sorry ("this is a GCC bug that can be worked around by adding "
8260 "attribute used to function called");
8263 return res;
8266 static int
8267 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8268 const_tree type, HOST_WIDE_INT words, bool named)
8270 int int_nregs, sse_nregs;
8272 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8273 if (!named && (VALID_AVX512F_REG_MODE (mode)
8274 || VALID_AVX256_REG_MODE (mode)))
8275 return 0;
8277 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8278 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8280 cum->nregs -= int_nregs;
8281 cum->sse_nregs -= sse_nregs;
8282 cum->regno += int_nregs;
8283 cum->sse_regno += sse_nregs;
8284 return int_nregs;
8286 else
8288 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8289 cum->words = ROUND_UP (cum->words, align);
8290 cum->words += words;
8291 return 0;
8295 static int
8296 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8297 HOST_WIDE_INT words)
8299 /* Otherwise, this should be passed indirect. */
8300 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8302 cum->words += words;
8303 if (cum->nregs > 0)
8305 cum->nregs -= 1;
8306 cum->regno += 1;
8307 return 1;
8309 return 0;
8312 /* Update the data in CUM to advance over an argument of mode MODE and
8313 data type TYPE. (TYPE is null for libcalls where that information
8314 may not be available.) */
8316 static void
8317 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8318 const_tree type, bool named)
8320 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8321 HOST_WIDE_INT bytes, words;
8322 int nregs;
8324 /* The argument of interrupt handler is a special case and is
8325 handled in ix86_function_arg. */
8326 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8327 return;
8329 if (mode == BLKmode)
8330 bytes = int_size_in_bytes (type);
8331 else
8332 bytes = GET_MODE_SIZE (mode);
8333 words = CEIL (bytes, UNITS_PER_WORD);
8335 if (type)
8336 mode = type_natural_mode (type, NULL, false);
8338 if ((type && POINTER_BOUNDS_TYPE_P (type))
8339 || POINTER_BOUNDS_MODE_P (mode))
8341 /* If we pass bounds in the BT then just update the remaining bounds count. */
8342 if (cum->bnds_in_bt)
8344 cum->bnds_in_bt--;
8345 return;
8349 /* Update the remaining number of bounds to force. */
8349 if (cum->force_bnd_pass)
8350 cum->force_bnd_pass--;
8352 cum->bnd_regno++;
8354 return;
8357 /* The first arg not going to Bounds Tables resets this counter. */
8358 cum->bnds_in_bt = 0;
8359 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8360 the passed and received types do not match. If bounds do not follow an
8361 unnamed arg, still pretend the required number of bounds were passed. */
8362 if (cum->force_bnd_pass)
8364 cum->bnd_regno += cum->force_bnd_pass;
8365 cum->force_bnd_pass = 0;
8368 if (TARGET_64BIT)
8370 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8372 if (call_abi == MS_ABI)
8373 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8374 else
8375 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8377 else
8378 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8380 /* For stdarg we expect bounds to be passed for each value passed
8381 in register. */
8382 if (cum->stdarg)
8383 cum->force_bnd_pass = nregs;
8384 /* For pointers passed in memory we expect bounds passed in Bounds
8385 Table. */
8386 if (!nregs)
8388 /* Track if there are outgoing arguments on stack. */
8389 if (cum->caller)
8390 cfun->machine->outgoing_args_on_stack = true;
8392 cum->bnds_in_bt = chkp_type_bounds_count (type);
8396 /* Define where to put the arguments to a function.
8397 Value is zero to push the argument on the stack,
8398 or a hard register in which to store the argument.
8400 MODE is the argument's machine mode.
8401 TYPE is the data type of the argument (as a tree).
8402 This is null for libcalls where that information may
8403 not be available.
8404 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8405 the preceding args and about the function being called.
8406 NAMED is nonzero if this argument is a named parameter
8407 (otherwise it is an extra parameter matching an ellipsis). */
8409 static rtx
8410 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8411 machine_mode orig_mode, const_tree type,
8412 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8414 bool error_p = false;
8416 /* Avoid the AL settings for the Unix64 ABI. */
8417 if (mode == VOIDmode)
8418 return constm1_rtx;
8420 if (TARGET_IAMCU)
8422 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8423 bytes in registers. */
8424 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8425 goto pass_in_reg;
8426 return NULL_RTX;
8429 switch (mode)
8431 default:
8432 break;
8434 case E_BLKmode:
8435 if (bytes < 0)
8436 break;
8437 /* FALLTHRU */
8438 case E_DImode:
8439 case E_SImode:
8440 case E_HImode:
8441 case E_QImode:
8442 pass_in_reg:
8443 if (words <= cum->nregs)
8445 int regno = cum->regno;
8447 /* Fastcall allocates the first two DWORD (SImode) or
8448 smaller arguments to ECX and EDX if it isn't an
8449 aggregate type. */
8450 if (cum->fastcall)
8452 if (mode == BLKmode
8453 || mode == DImode
8454 || (type && AGGREGATE_TYPE_P (type)))
8455 break;
8457 /* ECX not EAX is the first allocated register. */
8458 if (regno == AX_REG)
8459 regno = CX_REG;
8461 return gen_rtx_REG (mode, regno);
8463 break;
8465 case E_DFmode:
8466 if (cum->float_in_sse == -1)
8467 error_p = true;
8468 if (cum->float_in_sse < 2)
8469 break;
8470 /* FALLTHRU */
8471 case E_SFmode:
8472 if (cum->float_in_sse == -1)
8473 error_p = true;
8474 if (cum->float_in_sse < 1)
8475 break;
8476 /* FALLTHRU */
8477 case E_TImode:
8478 /* In 32bit, we pass TImode in xmm registers. */
8479 case E_V16QImode:
8480 case E_V8HImode:
8481 case E_V4SImode:
8482 case E_V2DImode:
8483 case E_V4SFmode:
8484 case E_V2DFmode:
8485 if (!type || !AGGREGATE_TYPE_P (type))
8487 if (cum->sse_nregs)
8488 return gen_reg_or_parallel (mode, orig_mode,
8489 cum->sse_regno + FIRST_SSE_REG);
8491 break;
8493 case E_OImode:
8494 case E_XImode:
8495 /* OImode and XImode shouldn't be used directly. */
8496 gcc_unreachable ();
8498 case E_V64QImode:
8499 case E_V32HImode:
8500 case E_V16SImode:
8501 case E_V8DImode:
8502 case E_V16SFmode:
8503 case E_V8DFmode:
8504 case E_V8SFmode:
8505 case E_V8SImode:
8506 case E_V32QImode:
8507 case E_V16HImode:
8508 case E_V4DFmode:
8509 case E_V4DImode:
8510 if (!type || !AGGREGATE_TYPE_P (type))
8512 if (cum->sse_nregs)
8513 return gen_reg_or_parallel (mode, orig_mode,
8514 cum->sse_regno + FIRST_SSE_REG);
8516 break;
8518 case E_V8QImode:
8519 case E_V4HImode:
8520 case E_V2SImode:
8521 case E_V2SFmode:
8522 case E_V1TImode:
8523 case E_V1DImode:
8524 if (!type || !AGGREGATE_TYPE_P (type))
8526 if (cum->mmx_nregs)
8527 return gen_reg_or_parallel (mode, orig_mode,
8528 cum->mmx_regno + FIRST_MMX_REG);
8530 break;
8532 if (error_p)
8534 cum->float_in_sse = 0;
8535 error ("calling %qD with SSE calling convention without "
8536 "SSE/SSE2 enabled", cum->decl);
8537 sorry ("this is a GCC bug that can be worked around by adding "
8538 "attribute used to function called");
8541 return NULL_RTX;
8544 static rtx
8545 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8546 machine_mode orig_mode, const_tree type, bool named)
8548 /* Handle the hidden AL argument containing the number of SSE registers
8549 used for varargs x86-64 functions. */
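/* E.g. for a call such as printf ("%g", x) the caller loads %al with 1,
   the number of SSE registers used by the variable arguments, so the
   callee's prologue knows whether it must spill the XMM argument
   registers. */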
8550 if (mode == VOIDmode)
8551 return GEN_INT (cum->maybe_vaarg
8552 ? (cum->sse_nregs < 0
8553 ? X86_64_SSE_REGPARM_MAX
8554 : cum->sse_regno)
8555 : -1);
8557 switch (mode)
8559 default:
8560 break;
8562 case E_V8SFmode:
8563 case E_V8SImode:
8564 case E_V32QImode:
8565 case E_V16HImode:
8566 case E_V4DFmode:
8567 case E_V4DImode:
8568 case E_V16SFmode:
8569 case E_V16SImode:
8570 case E_V64QImode:
8571 case E_V32HImode:
8572 case E_V8DFmode:
8573 case E_V8DImode:
8574 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8575 if (!named)
8576 return NULL;
8577 break;
8580 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8581 cum->sse_nregs,
8582 &x86_64_int_parameter_registers [cum->regno],
8583 cum->sse_regno);
8586 static rtx
8587 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8588 machine_mode orig_mode, bool named,
8589 HOST_WIDE_INT bytes)
8591 unsigned int regno;
8593 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8594 We use the value -2 to specify that the current function call is MS ABI. */
8595 if (mode == VOIDmode)
8596 return GEN_INT (-2);
8598 /* If we've run out of registers, it goes on the stack. */
8599 if (cum->nregs == 0)
8600 return NULL_RTX;
8602 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8604 /* Only floating point modes are passed in anything but integer regs. */
8605 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8607 if (named)
8608 regno = cum->regno + FIRST_SSE_REG;
8609 else
8611 rtx t1, t2;
8613 /* Unnamed floating parameters are passed in both the
8614 SSE and integer registers. */
8615 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8616 t2 = gen_rtx_REG (mode, regno);
8617 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8618 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8619 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8622 /* Handle aggregate types passed in registers. */
8623 if (orig_mode == BLKmode)
8625 if (bytes > 0 && bytes <= 8)
8626 mode = (bytes > 4 ? DImode : SImode);
8627 if (mode == BLKmode)
8628 mode = DImode;
8631 return gen_reg_or_parallel (mode, orig_mode, regno);
8634 /* Return where to put the arguments to a function.
8635 Return zero to push the argument on the stack, or a hard register in
8636 which to store the argument.
8637 MODE is the argument's machine mode. TYPE is the data type of the
8638 argument. It is null for libcalls where that information may not be
8639 available. CUM gives information about the preceding args and about
8640 the function being called. NAMED is nonzero if this argument is a
8641 named parameter (otherwise it is an extra parameter matching an
8642 ellipsis). */
8644 static rtx
8645 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8646 const_tree type, bool named)
8648 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8649 machine_mode mode = omode;
8650 HOST_WIDE_INT bytes, words;
8651 rtx arg;
8653 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8655 gcc_assert (type != NULL_TREE);
8656 if (POINTER_TYPE_P (type))
8658 /* This is the pointer argument. */
8659 gcc_assert (TYPE_MODE (type) == Pmode);
8660 /* It is at -WORD(AP) in the current frame in interrupt and
8661 exception handlers. */
8662 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8664 else
8666 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8667 && TREE_CODE (type) == INTEGER_TYPE
8668 && TYPE_MODE (type) == word_mode);
8669 /* The error code is the word-mode integer argument at
8670 -2 * WORD(AP) in the current frame of the exception
8671 handler. */
8672 arg = gen_rtx_MEM (word_mode,
8673 plus_constant (Pmode,
8674 arg_pointer_rtx,
8675 -2 * UNITS_PER_WORD));
8677 return arg;
8680 /* All pointer bounds arguments are handled separately here. */
8681 if ((type && POINTER_BOUNDS_TYPE_P (type))
8682 || POINTER_BOUNDS_MODE_P (mode))
8684 /* Return NULL if bounds are forced to go in Bounds Table. */
8685 if (cum->bnds_in_bt)
8686 arg = NULL;
8687 /* Return the next available bound reg if any. */
8688 else if (cum->bnd_regno <= LAST_BND_REG)
8689 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8690 /* Return the next special slot number otherwise. */
8691 else
8692 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8694 return arg;
8697 if (mode == BLKmode)
8698 bytes = int_size_in_bytes (type);
8699 else
8700 bytes = GET_MODE_SIZE (mode);
8701 words = CEIL (bytes, UNITS_PER_WORD);
8703 /* To simplify the code below, represent vector types with a vector mode
8704 even if MMX/SSE are not active. */
8705 if (type && TREE_CODE (type) == VECTOR_TYPE)
8706 mode = type_natural_mode (type, cum, false);
8708 if (TARGET_64BIT)
8710 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8712 if (call_abi == MS_ABI)
8713 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8714 else
8715 arg = function_arg_64 (cum, mode, omode, type, named);
8717 else
8718 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8720 /* Track if there are outgoing arguments on stack. */
8721 if (arg == NULL_RTX && cum->caller)
8722 cfun->machine->outgoing_args_on_stack = true;
8724 return arg;
8727 /* A C expression that indicates when an argument must be passed by
8728 reference. If nonzero for an argument, a copy of that argument is
8729 made in memory and a pointer to the argument is passed instead of
8730 the argument itself. The pointer is passed in whatever way is
8731 appropriate for passing a pointer to that type. */
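/* For instance, under the Windows 64-bit ABI handled below, __m128 and a
   3-byte struct are both passed by reference (their sizes are not 1, 2,
   4 or 8 bytes), whereas the SysV ABI never forces these by reference
   and this hook returns false for them. */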
8733 static bool
8734 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8735 const_tree type, bool)
8737 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8739 /* Bounds are never passed by reference. */
8740 if ((type && POINTER_BOUNDS_TYPE_P (type))
8741 || POINTER_BOUNDS_MODE_P (mode))
8742 return false;
8744 if (TARGET_64BIT)
8746 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8748 /* See Windows x64 Software Convention. */
8749 if (call_abi == MS_ABI)
8751 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8753 if (type)
8755 /* Arrays are passed by reference. */
8756 if (TREE_CODE (type) == ARRAY_TYPE)
8757 return true;
8759 if (RECORD_OR_UNION_TYPE_P (type))
8761 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8762 are passed by reference. */
8763 msize = int_size_in_bytes (type);
8767 /* __m128 is passed by reference. */
8768 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8770 else if (type && int_size_in_bytes (type) == -1)
8771 return true;
8774 return false;
8777 /* Return true when TYPE should be 128bit aligned for 32bit argument
8778 passing ABI. XXX: This function is obsolete and is only used for
8779 checking psABI compatibility with previous versions of GCC. */
8781 static bool
8782 ix86_compat_aligned_value_p (const_tree type)
8784 machine_mode mode = TYPE_MODE (type);
8785 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8786 || mode == TDmode
8787 || mode == TFmode
8788 || mode == TCmode)
8789 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8790 return true;
8791 if (TYPE_ALIGN (type) < 128)
8792 return false;
8794 if (AGGREGATE_TYPE_P (type))
8796 /* Walk the aggregates recursively. */
8797 switch (TREE_CODE (type))
8799 case RECORD_TYPE:
8800 case UNION_TYPE:
8801 case QUAL_UNION_TYPE:
8803 tree field;
8805 /* Walk all the structure fields. */
8806 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8808 if (TREE_CODE (field) == FIELD_DECL
8809 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8810 return true;
8812 break;
8815 case ARRAY_TYPE:
8816 /* Just for use if some language passes arrays by value. */
8817 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8818 return true;
8819 break;
8821 default:
8822 gcc_unreachable ();
8825 return false;
8828 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8829 XXX: This function is obsolete and is only used for checking psABI
8830 compatibility with previous versions of GCC. */
8832 static unsigned int
8833 ix86_compat_function_arg_boundary (machine_mode mode,
8834 const_tree type, unsigned int align)
8836 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8837 natural boundaries. */
8838 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8840 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8841 make an exception for SSE modes since these require 128bit
8842 alignment.
8844 The handling here differs from field_alignment. ICC aligns MMX
8845 arguments to 4 byte boundaries, while structure fields are aligned
8846 to 8 byte boundaries. */
8847 if (!type)
8849 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8850 align = PARM_BOUNDARY;
8852 else
8854 if (!ix86_compat_aligned_value_p (type))
8855 align = PARM_BOUNDARY;
8858 if (align > BIGGEST_ALIGNMENT)
8859 align = BIGGEST_ALIGNMENT;
8860 return align;
8863 /* Return true when TYPE should be 128bit aligned for 32bit argument
8864 passing ABI. */
8866 static bool
8867 ix86_contains_aligned_value_p (const_tree type)
8869 machine_mode mode = TYPE_MODE (type);
8871 if (mode == XFmode || mode == XCmode)
8872 return false;
8874 if (TYPE_ALIGN (type) < 128)
8875 return false;
8877 if (AGGREGATE_TYPE_P (type))
8879 /* Walk the aggregates recursively. */
8880 switch (TREE_CODE (type))
8882 case RECORD_TYPE:
8883 case UNION_TYPE:
8884 case QUAL_UNION_TYPE:
8886 tree field;
8888 /* Walk all the structure fields. */
8889 for (field = TYPE_FIELDS (type);
8890 field;
8891 field = DECL_CHAIN (field))
8893 if (TREE_CODE (field) == FIELD_DECL
8894 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8895 return true;
8897 break;
8900 case ARRAY_TYPE:
8901 /* Just for use if some language passes arrays by value. */
8902 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8903 return true;
8904 break;
8906 default:
8907 gcc_unreachable ();
8910 else
8911 return TYPE_ALIGN (type) >= 128;
8913 return false;
8916 /* Gives the alignment boundary, in bits, of an argument with the
8917 specified mode and type. */
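/* E.g. on i386 a long double (XFmode) argument is only aligned to
   PARM_BOUNDARY (32 bits), while an __m128 argument keeps its full
   128-bit alignment; mismatches against the pre-GCC 4.6 rules are what
   trigger the psABI note below. */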
8919 static unsigned int
8920 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8922 unsigned int align;
8923 if (type)
8925 /* Since the main variant type is used for the call, convert the type
8926 to its main variant. */
8927 type = TYPE_MAIN_VARIANT (type);
8928 align = TYPE_ALIGN (type);
8930 else
8931 align = GET_MODE_ALIGNMENT (mode);
8932 if (align < PARM_BOUNDARY)
8933 align = PARM_BOUNDARY;
8934 else
8936 static bool warned;
8937 unsigned int saved_align = align;
8939 if (!TARGET_64BIT)
8941 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8942 if (!type)
8944 if (mode == XFmode || mode == XCmode)
8945 align = PARM_BOUNDARY;
8947 else if (!ix86_contains_aligned_value_p (type))
8948 align = PARM_BOUNDARY;
8950 if (align < 128)
8951 align = PARM_BOUNDARY;
8954 if (warn_psabi
8955 && !warned
8956 && align != ix86_compat_function_arg_boundary (mode, type,
8957 saved_align))
8959 warned = true;
8960 inform (input_location,
8961 "The ABI for passing parameters with %d-byte"
8962 " alignment has changed in GCC 4.6",
8963 align / BITS_PER_UNIT);
8967 return align;
8970 /* Return true if N is a possible register number of function value. */
8972 static bool
8973 ix86_function_value_regno_p (const unsigned int regno)
8975 switch (regno)
8977 case AX_REG:
8978 return true;
8979 case DX_REG:
8980 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
8981 case DI_REG:
8982 case SI_REG:
8983 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
8985 case BND0_REG:
8986 case BND1_REG:
8987 return chkp_function_instrumented_p (current_function_decl);
8989 /* Complex values are returned in %st(0)/%st(1) pair. */
8990 case ST0_REG:
8991 case ST1_REG:
8992 /* TODO: The function should depend on current function ABI but
8993 builtins.c would need updating then. Therefore we use the
8994 default ABI. */
8995 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
8996 return false;
8997 return TARGET_FLOAT_RETURNS_IN_80387;
8999 /* Complex values are returned in %xmm0/%xmm1 pair. */
9000 case XMM0_REG:
9001 case XMM1_REG:
9002 return TARGET_SSE;
9004 case MM0_REG:
9005 if (TARGET_MACHO || TARGET_64BIT)
9006 return false;
9007 return TARGET_MMX;
9010 return false;
9013 /* Define how to find the value returned by a function.
9014 VALTYPE is the data type of the value (as a tree).
9015 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9016 otherwise, FUNC is 0. */
9018 static rtx
9019 function_value_32 (machine_mode orig_mode, machine_mode mode,
9020 const_tree fntype, const_tree fn)
9022 unsigned int regno;
9024 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9025 we normally prevent this case when mmx is not available. However
9026 some ABIs may require the result to be returned like DImode. */
9027 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9028 regno = FIRST_MMX_REG;
9030 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9031 we prevent this case when sse is not available. However some ABIs
9032 may require the result to be returned like integer TImode. */
9033 else if (mode == TImode
9034 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9035 regno = FIRST_SSE_REG;
9037 /* 32-byte vector modes in %ymm0. */
9038 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9039 regno = FIRST_SSE_REG;
9041 /* 64-byte vector modes in %zmm0. */
9042 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9043 regno = FIRST_SSE_REG;
9045 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9046 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9047 regno = FIRST_FLOAT_REG;
9048 else
9049 /* Most things go in %eax. */
9050 regno = AX_REG;
9052 /* Override FP return register with %xmm0 for local functions when
9053 SSE math is enabled or for functions with sseregparm attribute. */
9054 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9056 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9057 if (sse_level == -1)
9059 error ("calling %qD with SSE calling convention without "
9060 "SSE/SSE2 enabled", fn);
9061 sorry ("this is a GCC bug that can be worked around by adding "
9062 "attribute used to function called");
9064 else if ((sse_level >= 1 && mode == SFmode)
9065 || (sse_level == 2 && mode == DFmode))
9066 regno = FIRST_SSE_REG;
9069 /* OImode shouldn't be used directly. */
9070 gcc_assert (mode != OImode);
9072 return gen_rtx_REG (orig_mode, regno);
9075 static rtx
9076 function_value_64 (machine_mode orig_mode, machine_mode mode,
9077 const_tree valtype)
9079 rtx ret;
9081 /* Handle libcalls, which don't provide a type node. */
9082 if (valtype == NULL)
9084 unsigned int regno;
9086 switch (mode)
9088 case E_SFmode:
9089 case E_SCmode:
9090 case E_DFmode:
9091 case E_DCmode:
9092 case E_TFmode:
9093 case E_SDmode:
9094 case E_DDmode:
9095 case E_TDmode:
9096 regno = FIRST_SSE_REG;
9097 break;
9098 case E_XFmode:
9099 case E_XCmode:
9100 regno = FIRST_FLOAT_REG;
9101 break;
9102 case E_TCmode:
9103 return NULL;
9104 default:
9105 regno = AX_REG;
9108 return gen_rtx_REG (mode, regno);
9110 else if (POINTER_TYPE_P (valtype))
9112 /* Pointers are always returned in word_mode. */
9113 mode = word_mode;
9116 ret = construct_container (mode, orig_mode, valtype, 1,
9117 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9118 x86_64_int_return_registers, 0);
9120 /* For zero-sized structures, construct_container returns NULL, but we
9121 need to keep the rest of the compiler happy by returning a meaningful value. */
9122 if (!ret)
9123 ret = gen_rtx_REG (orig_mode, AX_REG);
9125 return ret;
9128 static rtx
9129 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9130 const_tree valtype)
9132 unsigned int regno = AX_REG;
9134 if (TARGET_SSE)
9136 switch (GET_MODE_SIZE (mode))
9138 case 16:
9139 if (valtype != NULL_TREE
9140 && !VECTOR_INTEGER_TYPE_P (valtype)
9142 && !INTEGRAL_TYPE_P (valtype)
9143 && !VECTOR_FLOAT_TYPE_P (valtype))
9144 break;
9145 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9146 && !COMPLEX_MODE_P (mode))
9147 regno = FIRST_SSE_REG;
9148 break;
9149 case 8:
9150 case 4:
9151 if (mode == SFmode || mode == DFmode)
9152 regno = FIRST_SSE_REG;
9153 break;
9154 default:
9155 break;
9158 return gen_rtx_REG (orig_mode, regno);
9161 static rtx
9162 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9163 machine_mode orig_mode, machine_mode mode)
9165 const_tree fn, fntype;
9167 fn = NULL_TREE;
9168 if (fntype_or_decl && DECL_P (fntype_or_decl))
9169 fn = fntype_or_decl;
9170 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9172 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9173 || POINTER_BOUNDS_MODE_P (mode))
9174 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9175 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9176 return function_value_ms_64 (orig_mode, mode, valtype);
9177 else if (TARGET_64BIT)
9178 return function_value_64 (orig_mode, mode, valtype);
9179 else
9180 return function_value_32 (orig_mode, mode, fntype, fn);
9183 static rtx
9184 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9186 machine_mode mode, orig_mode;
9188 orig_mode = TYPE_MODE (valtype);
9189 mode = type_natural_mode (valtype, NULL, true);
9190 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9193 /* Return an RTX representing a place where a function returns
9194 or receives pointer bounds, or NULL if no bounds are returned.
9196 VALTYPE is a data type of a value returned by the function.
9198 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9199 or FUNCTION_TYPE of the function.
9201 If OUTGOING is false, return a place in which the caller will
9202 see the return value. Otherwise, return a place where a
9203 function returns a value. */
9205 static rtx
9206 ix86_function_value_bounds (const_tree valtype,
9207 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9208 bool outgoing ATTRIBUTE_UNUSED)
9210 rtx res = NULL_RTX;
9212 if (BOUNDED_TYPE_P (valtype))
9213 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9214 else if (chkp_type_has_pointer (valtype))
9216 bitmap slots;
9217 rtx bounds[2];
9218 bitmap_iterator bi;
9219 unsigned i, bnd_no = 0;
9221 bitmap_obstack_initialize (NULL);
9222 slots = BITMAP_ALLOC (NULL);
9223 chkp_find_bound_slots (valtype, slots);
9225 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9227 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9228 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9229 gcc_assert (bnd_no < 2);
9230 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9233 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9235 BITMAP_FREE (slots);
9236 bitmap_obstack_release (NULL);
9238 else
9239 res = NULL_RTX;
9241 return res;
9244 /* Pointer function arguments and return values are promoted to
9245 word_mode for normal functions. */
9247 static machine_mode
9248 ix86_promote_function_mode (const_tree type, machine_mode mode,
9249 int *punsignedp, const_tree fntype,
9250 int for_return)
9252 if (cfun->machine->func_type == TYPE_NORMAL
9253 && type != NULL_TREE
9254 && POINTER_TYPE_P (type))
9256 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9257 return word_mode;
9259 return default_promote_function_mode (type, mode, punsignedp, fntype,
9260 for_return);
9263 /* Return true if a structure, union or array with MODE containing FIELD
9264 should be accessed using BLKmode. */
9266 static bool
9267 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9269 /* Union with XFmode must be in BLKmode. */
9270 return (mode == XFmode
9271 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9272 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9276 ix86_libcall_value (machine_mode mode)
9278 return ix86_function_value_1 (NULL, NULL, mode, mode);
9281 /* Return true iff type is returned in memory. */
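/* For example, on 64-bit SysV struct { double a, b; } is returned in
   %xmm0/%xmm1 and this returns false, while struct { double a, b, c; }
   does not fit the register classes and is returned in memory via a
   hidden pointer, so this returns true. */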
9283 static bool
9284 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9286 #ifdef SUBTARGET_RETURN_IN_MEMORY
9287 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9288 #else
9289 const machine_mode mode = type_natural_mode (type, NULL, true);
9290 HOST_WIDE_INT size;
9292 if (POINTER_BOUNDS_TYPE_P (type))
9293 return false;
9295 if (TARGET_64BIT)
9297 if (ix86_function_type_abi (fntype) == MS_ABI)
9299 size = int_size_in_bytes (type);
9301 /* __m128 is returned in xmm0. */
9302 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9303 || INTEGRAL_TYPE_P (type)
9304 || VECTOR_FLOAT_TYPE_P (type))
9305 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9306 && !COMPLEX_MODE_P (mode)
9307 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9308 return false;
9310 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9311 return size != 1 && size != 2 && size != 4 && size != 8;
9313 else
9315 int needed_intregs, needed_sseregs;
9317 return examine_argument (mode, type, 1,
9318 &needed_intregs, &needed_sseregs);
9321 else
9323 size = int_size_in_bytes (type);
9325 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9326 bytes in registers. */
9327 if (TARGET_IAMCU)
9328 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9330 if (mode == BLKmode)
9331 return true;
9333 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9334 return false;
9336 if (VECTOR_MODE_P (mode) || mode == TImode)
9338 /* User-created vectors small enough to fit in EAX. */
9339 if (size < 8)
9340 return false;
9342 /* Unless the ABI prescribes otherwise,
9343 MMX/3dNow values are returned in MM0 if available. */
9345 if (size == 8)
9346 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9348 /* SSE values are returned in XMM0 if available. */
9349 if (size == 16)
9350 return !TARGET_SSE;
9352 /* AVX values are returned in YMM0 if available. */
9353 if (size == 32)
9354 return !TARGET_AVX;
9356 /* AVX512F values are returned in ZMM0 if available. */
9357 if (size == 64)
9358 return !TARGET_AVX512F;
9361 if (mode == XFmode)
9362 return false;
9364 if (size > 12)
9365 return true;
9367 /* OImode shouldn't be used directly. */
9368 gcc_assert (mode != OImode);
9370 return false;
9372 #endif
9376 /* Create the va_list data type. */
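/* The record built below corresponds to the psABI definition
     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];
   i.e. an array of one structure, as required for the SysV
   __builtin_va_list. */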
9378 static tree
9379 ix86_build_builtin_va_list_64 (void)
9381 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9383 record = lang_hooks.types.make_type (RECORD_TYPE);
9384 type_decl = build_decl (BUILTINS_LOCATION,
9385 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9387 f_gpr = build_decl (BUILTINS_LOCATION,
9388 FIELD_DECL, get_identifier ("gp_offset"),
9389 unsigned_type_node);
9390 f_fpr = build_decl (BUILTINS_LOCATION,
9391 FIELD_DECL, get_identifier ("fp_offset"),
9392 unsigned_type_node);
9393 f_ovf = build_decl (BUILTINS_LOCATION,
9394 FIELD_DECL, get_identifier ("overflow_arg_area"),
9395 ptr_type_node);
9396 f_sav = build_decl (BUILTINS_LOCATION,
9397 FIELD_DECL, get_identifier ("reg_save_area"),
9398 ptr_type_node);
9400 va_list_gpr_counter_field = f_gpr;
9401 va_list_fpr_counter_field = f_fpr;
9403 DECL_FIELD_CONTEXT (f_gpr) = record;
9404 DECL_FIELD_CONTEXT (f_fpr) = record;
9405 DECL_FIELD_CONTEXT (f_ovf) = record;
9406 DECL_FIELD_CONTEXT (f_sav) = record;
9408 TYPE_STUB_DECL (record) = type_decl;
9409 TYPE_NAME (record) = type_decl;
9410 TYPE_FIELDS (record) = f_gpr;
9411 DECL_CHAIN (f_gpr) = f_fpr;
9412 DECL_CHAIN (f_fpr) = f_ovf;
9413 DECL_CHAIN (f_ovf) = f_sav;
9415 layout_type (record);
9417 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9418 NULL_TREE, TYPE_ATTRIBUTES (record));
9420 /* The correct type is an array type of one element. */
9421 return build_array_type (record, build_index_type (size_zero_node));
9424 /* Set up the builtin va_list data type and, for 64-bit, the additional
9425    calling-convention-specific va_list data types.  */
9427 static tree
9428 ix86_build_builtin_va_list (void)
9430 if (TARGET_64BIT)
9432 /* Initialize ABI specific va_list builtin types.
9434 In lto1, we can encounter two va_list types:
9435 - one as a result of the type-merge across TUs, and
9436 - the one constructed here.
9437 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9438 a type identity check in canonical_va_list_type based on
9439 TYPE_MAIN_VARIANT (which we used to have) will not work.
9440 Instead, we tag each va_list_type_node with its unique attribute, and
9441 look for the attribute in the type identity check in
9442 canonical_va_list_type.
9444 Tagging sysv_va_list_type_node directly with the attribute is
9445 	 problematic since it's an array of one record, which will degrade into a
9446 pointer to record when used as parameter (see build_va_arg comments for
9447 an example), dropping the attribute in the process. So we tag the
9448 record instead. */
9450 /* For SYSV_ABI we use an array of one record. */
9451 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9453 /* For MS_ABI we use plain pointer to argument area. */
9454 tree char_ptr_type = build_pointer_type (char_type_node);
9455 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9456 TYPE_ATTRIBUTES (char_ptr_type));
9457 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9459 return ((ix86_abi == MS_ABI)
9460 ? ms_va_list_type_node
9461 : sysv_va_list_type_node);
9463 else
9465 /* For i386 we use plain pointer to argument area. */
9466 return build_pointer_type (char_type_node);
9470 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9472 static void
9473 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9475 rtx save_area, mem;
9476 alias_set_type set;
9477 int i, max;
9479 /* GPR size of varargs save area. */
9480 if (cfun->va_list_gpr_size)
9481 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9482 else
9483 ix86_varargs_gpr_size = 0;
9485 /* FPR size of varargs save area. We don't need it if we don't pass
9486 anything in SSE registers. */
9487 if (TARGET_SSE && cfun->va_list_fpr_size)
9488 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9489 else
9490 ix86_varargs_fpr_size = 0;
9492 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9493 return;
9495 save_area = frame_pointer_rtx;
9496 set = get_varargs_alias_set ();
9498 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9499 if (max > X86_64_REGPARM_MAX)
9500 max = X86_64_REGPARM_MAX;
9502 for (i = cum->regno; i < max; i++)
9504 mem = gen_rtx_MEM (word_mode,
9505 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9506 MEM_NOTRAP_P (mem) = 1;
9507 set_mem_alias_set (mem, set);
9508 emit_move_insn (mem,
9509 gen_rtx_REG (word_mode,
9510 x86_64_int_parameter_registers[i]));
9513 if (ix86_varargs_fpr_size)
9515 machine_mode smode;
9516 rtx_code_label *label;
9517 rtx test;
9519 /* Now emit code to save SSE registers. The AX parameter contains number
9520 of SSE parameter registers used to call this function, though all we
9521 actually check here is the zero/non-zero status. */
9523 label = gen_label_rtx ();
9524 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9525 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9526 label));
9528 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9529 we used movdqa (i.e. TImode) instead? Perhaps even better would
9530 be if we could determine the real mode of the data, via a hook
9531 into pass_stdarg. Ignore all that for now. */
9532 smode = V4SFmode;
9533 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9534 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9536 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9537 if (max > X86_64_SSE_REGPARM_MAX)
9538 max = X86_64_SSE_REGPARM_MAX;
9540 for (i = cum->sse_regno; i < max; ++i)
9542 mem = plus_constant (Pmode, save_area,
9543 i * 16 + ix86_varargs_gpr_size);
9544 mem = gen_rtx_MEM (smode, mem);
9545 MEM_NOTRAP_P (mem) = 1;
9546 set_mem_alias_set (mem, set);
9547 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9549 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9552 emit_label (label);
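/* Illustrative layout of the register save area set up above, assuming the
   full set of save slots is needed (X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8):

       reg_save_area +   0 .. +  47   six GP argument registers, 8 bytes each
       reg_save_area +  48 .. + 175   eight SSE argument registers, 16 bytes each

   The SSE slots actually start at ix86_varargs_gpr_size, so this picture
   shifts when cfun->va_list_gpr_size allows fewer GP slots.  */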
9556 static void
9557 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9559 alias_set_type set = get_varargs_alias_set ();
9560 int i;
9562   /* Reset to zero, as there might be a SysV va_arg used
9563 before. */
9564 ix86_varargs_gpr_size = 0;
9565 ix86_varargs_fpr_size = 0;
9567 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9569 rtx reg, mem;
9571 mem = gen_rtx_MEM (Pmode,
9572 plus_constant (Pmode, virtual_incoming_args_rtx,
9573 i * UNITS_PER_WORD));
9574 MEM_NOTRAP_P (mem) = 1;
9575 set_mem_alias_set (mem, set);
9577 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9578 emit_move_insn (mem, reg);
9582 static void
9583 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9584 tree type, int *, int no_rtl)
9586 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9587 CUMULATIVE_ARGS next_cum;
9588 tree fntype;
9590 /* This argument doesn't appear to be used anymore. Which is good,
9591 because the old code here didn't suppress rtl generation. */
9592 gcc_assert (!no_rtl);
9594 if (!TARGET_64BIT)
9595 return;
9597 fntype = TREE_TYPE (current_function_decl);
9599 /* For varargs, we do not want to skip the dummy va_dcl argument.
9600 For stdargs, we do want to skip the last named argument. */
9601 next_cum = *cum;
9602 if (stdarg_p (fntype))
9603 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9604 true);
9606 if (cum->call_abi == MS_ABI)
9607 setup_incoming_varargs_ms_64 (&next_cum);
9608 else
9609 setup_incoming_varargs_64 (&next_cum);
9612 static void
9613 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9614 machine_mode mode,
9615 tree type,
9616 int *pretend_size ATTRIBUTE_UNUSED,
9617 int no_rtl)
9619 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9620 CUMULATIVE_ARGS next_cum;
9621 tree fntype;
9622 rtx save_area;
9623 int bnd_reg, i, max;
9625 gcc_assert (!no_rtl);
9627 /* Do nothing if we use plain pointer to argument area. */
9628 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9629 return;
9631 fntype = TREE_TYPE (current_function_decl);
9633 /* For varargs, we do not want to skip the dummy va_dcl argument.
9634 For stdargs, we do want to skip the last named argument. */
9635 next_cum = *cum;
9636 if (stdarg_p (fntype))
9637 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9638 true);
9639 save_area = frame_pointer_rtx;
9641 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9642 if (max > X86_64_REGPARM_MAX)
9643 max = X86_64_REGPARM_MAX;
9645 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9646 if (chkp_function_instrumented_p (current_function_decl))
9647 for (i = cum->regno; i < max; i++)
9649 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9650 rtx ptr = gen_rtx_REG (Pmode,
9651 x86_64_int_parameter_registers[i]);
9652 rtx bounds;
9654 if (bnd_reg <= LAST_BND_REG)
9655 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9656 else
9658 rtx ldx_addr =
9659 plus_constant (Pmode, arg_pointer_rtx,
9660 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9661 bounds = gen_reg_rtx (BNDmode);
9662 emit_insn (BNDmode == BND64mode
9663 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9664 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9667 emit_insn (BNDmode == BND64mode
9668 ? gen_bnd64_stx (addr, ptr, bounds)
9669 : gen_bnd32_stx (addr, ptr, bounds));
9671 bnd_reg++;
9676 /* Checks if TYPE is of kind va_list char *. */
9678 static bool
9679 is_va_list_char_pointer (tree type)
9681 tree canonic;
9683 /* For 32-bit it is always true. */
9684 if (!TARGET_64BIT)
9685 return true;
9686 canonic = ix86_canonical_va_list_type (type);
9687 return (canonic == ms_va_list_type_node
9688 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9691 /* Implement va_start. */
9693 static void
9694 ix86_va_start (tree valist, rtx nextarg)
9696 HOST_WIDE_INT words, n_gpr, n_fpr;
9697 tree f_gpr, f_fpr, f_ovf, f_sav;
9698 tree gpr, fpr, ovf, sav, t;
9699 tree type;
9700 rtx ovf_rtx;
9702 if (flag_split_stack
9703 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9705 unsigned int scratch_regno;
9707 /* When we are splitting the stack, we can't refer to the stack
9708 arguments using internal_arg_pointer, because they may be on
9709 the old stack. The split stack prologue will arrange to
9710 leave a pointer to the old stack arguments in a scratch
9711 register, which we here copy to a pseudo-register. The split
9712 stack prologue can't set the pseudo-register directly because
9713 it (the prologue) runs before any registers have been saved. */
9715 scratch_regno = split_stack_prologue_scratch_regno ();
9716 if (scratch_regno != INVALID_REGNUM)
9718 rtx reg;
9719 rtx_insn *seq;
9721 reg = gen_reg_rtx (Pmode);
9722 cfun->machine->split_stack_varargs_pointer = reg;
9724 start_sequence ();
9725 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9726 seq = get_insns ();
9727 end_sequence ();
9729 push_topmost_sequence ();
9730 emit_insn_after (seq, entry_of_function ());
9731 pop_topmost_sequence ();
9735 /* Only 64bit target needs something special. */
9736 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9738 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9739 std_expand_builtin_va_start (valist, nextarg);
9740 else
9742 rtx va_r, next;
9744 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9745 next = expand_binop (ptr_mode, add_optab,
9746 cfun->machine->split_stack_varargs_pointer,
9747 crtl->args.arg_offset_rtx,
9748 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9749 convert_move (va_r, next, 0);
9751 /* Store zero bounds for va_list. */
9752 if (chkp_function_instrumented_p (current_function_decl))
9753 chkp_expand_bounds_reset_for_mem (valist,
9754 make_tree (TREE_TYPE (valist),
9755 next));
9758 return;
9761 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9762 f_fpr = DECL_CHAIN (f_gpr);
9763 f_ovf = DECL_CHAIN (f_fpr);
9764 f_sav = DECL_CHAIN (f_ovf);
9766 valist = build_simple_mem_ref (valist);
9767 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9768 /* The following should be folded into the MEM_REF offset. */
9769 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9770 f_gpr, NULL_TREE);
9771 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9772 f_fpr, NULL_TREE);
9773 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9774 f_ovf, NULL_TREE);
9775 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9776 f_sav, NULL_TREE);
9778 /* Count number of gp and fp argument registers used. */
9779 words = crtl->args.info.words;
9780 n_gpr = crtl->args.info.regno;
9781 n_fpr = crtl->args.info.sse_regno;
9783 if (cfun->va_list_gpr_size)
9785 type = TREE_TYPE (gpr);
9786 t = build2 (MODIFY_EXPR, type,
9787 gpr, build_int_cst (type, n_gpr * 8));
9788 TREE_SIDE_EFFECTS (t) = 1;
9789 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9792 if (TARGET_SSE && cfun->va_list_fpr_size)
9794 type = TREE_TYPE (fpr);
9795 t = build2 (MODIFY_EXPR, type, fpr,
9796 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9797 TREE_SIDE_EFFECTS (t) = 1;
9798 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9801 /* Find the overflow area. */
9802 type = TREE_TYPE (ovf);
9803 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9804 ovf_rtx = crtl->args.internal_arg_pointer;
9805 else
9806 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9807 t = make_tree (type, ovf_rtx);
9808 if (words != 0)
9809 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9811 /* Store zero bounds for overflow area pointer. */
9812 if (chkp_function_instrumented_p (current_function_decl))
9813 chkp_expand_bounds_reset_for_mem (ovf, t);
9815 t = build2 (MODIFY_EXPR, type, ovf, t);
9816 TREE_SIDE_EFFECTS (t) = 1;
9817 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9819 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9821 /* Find the register save area.
9822 	 The function prologue saves it right above the stack frame.  */
9823 type = TREE_TYPE (sav);
9824 t = make_tree (type, frame_pointer_rtx);
9825 if (!ix86_varargs_gpr_size)
9826 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9828 /* Store zero bounds for save area pointer. */
9829 if (chkp_function_instrumented_p (current_function_decl))
9830 chkp_expand_bounds_reset_for_mem (sav, t);
9832 t = build2 (MODIFY_EXPR, type, sav, t);
9833 TREE_SIDE_EFFECTS (t) = 1;
9834 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
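/* Rough sketch of what the va_start expansion above amounts to for a SysV
   function that has consumed N_GPR integer and N_FPR SSE argument registers
   before the ellipsis (48 being 8 * X86_64_REGPARM_MAX):

       ap->gp_offset = N_GPR * 8;
       ap->fp_offset = 48 + N_FPR * 16;
       ap->overflow_arg_area = incoming stack arguments (+ words * 8);
       ap->reg_save_area = the register save area from the prologue;

   The split-stack and bounds-checking handling above adjust this basic
   picture.  */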
9838 /* Implement va_arg. */
9840 static tree
9841 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9842 gimple_seq *post_p)
9844 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9845 tree f_gpr, f_fpr, f_ovf, f_sav;
9846 tree gpr, fpr, ovf, sav, t;
9847 int size, rsize;
9848 tree lab_false, lab_over = NULL_TREE;
9849 tree addr, t2;
9850 rtx container;
9851 int indirect_p = 0;
9852 tree ptrtype;
9853 machine_mode nat_mode;
9854 unsigned int arg_boundary;
9856 /* Only 64bit target needs something special. */
9857 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9858 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9860 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9861 f_fpr = DECL_CHAIN (f_gpr);
9862 f_ovf = DECL_CHAIN (f_fpr);
9863 f_sav = DECL_CHAIN (f_ovf);
9865 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9866 valist, f_gpr, NULL_TREE);
9868 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9869 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9870 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9872 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9873 if (indirect_p)
9874 type = build_pointer_type (type);
9875 size = int_size_in_bytes (type);
9876 rsize = CEIL (size, UNITS_PER_WORD);
9878 nat_mode = type_natural_mode (type, NULL, false);
9879 switch (nat_mode)
9881 case E_V8SFmode:
9882 case E_V8SImode:
9883 case E_V32QImode:
9884 case E_V16HImode:
9885 case E_V4DFmode:
9886 case E_V4DImode:
9887 case E_V16SFmode:
9888 case E_V16SImode:
9889 case E_V64QImode:
9890 case E_V32HImode:
9891 case E_V8DFmode:
9892 case E_V8DImode:
9893       /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack.  */
9894 if (!TARGET_64BIT_MS_ABI)
9896 container = NULL;
9897 break;
9899 /* FALLTHRU */
9901 default:
9902 container = construct_container (nat_mode, TYPE_MODE (type),
9903 type, 0, X86_64_REGPARM_MAX,
9904 				       X86_64_SSE_REGPARM_MAX, intreg,
9905 				       0);
9906       break;
9909 /* Pull the value out of the saved registers. */
9911 addr = create_tmp_var (ptr_type_node, "addr");
9913 if (container)
9915 int needed_intregs, needed_sseregs;
9916 bool need_temp;
9917 tree int_addr, sse_addr;
9919 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9920 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9922 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9924 need_temp = (!REG_P (container)
9925 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9926 || TYPE_ALIGN (type) > 128));
9928       /* If we are passing a structure, verify that it occupies a consecutive
9929 	 block of the register save area.  If not, we need to do moves.  */
9930 if (!need_temp && !REG_P (container))
9932 /* Verify that all registers are strictly consecutive */
9933 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9935 int i;
9937 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9939 rtx slot = XVECEXP (container, 0, i);
9940 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9941 || INTVAL (XEXP (slot, 1)) != i * 16)
9942 need_temp = true;
9945 else
9947 int i;
9949 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9951 rtx slot = XVECEXP (container, 0, i);
9952 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
9953 || INTVAL (XEXP (slot, 1)) != i * 8)
9954 need_temp = true;
9958 if (!need_temp)
9960 int_addr = addr;
9961 sse_addr = addr;
9963 else
9965 int_addr = create_tmp_var (ptr_type_node, "int_addr");
9966 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
9969 /* First ensure that we fit completely in registers. */
9970 if (needed_intregs)
9972 t = build_int_cst (TREE_TYPE (gpr),
9973 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
9974 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
9975 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
9976 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
9977 gimplify_and_add (t, pre_p);
9979 if (needed_sseregs)
9981 t = build_int_cst (TREE_TYPE (fpr),
9982 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
9983 + X86_64_REGPARM_MAX * 8);
9984 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
9985 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
9986 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
9987 gimplify_and_add (t, pre_p);
9990 /* Compute index to start of area used for integer regs. */
9991 if (needed_intregs)
9993 /* int_addr = gpr + sav; */
9994 t = fold_build_pointer_plus (sav, gpr);
9995 gimplify_assign (int_addr, t, pre_p);
9997 if (needed_sseregs)
9999 /* sse_addr = fpr + sav; */
10000 t = fold_build_pointer_plus (sav, fpr);
10001 gimplify_assign (sse_addr, t, pre_p);
10003 if (need_temp)
10005 int i, prev_size = 0;
10006 tree temp = create_tmp_var (type, "va_arg_tmp");
10008 /* addr = &temp; */
10009 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10010 gimplify_assign (addr, t, pre_p);
10012 for (i = 0; i < XVECLEN (container, 0); i++)
10014 rtx slot = XVECEXP (container, 0, i);
10015 rtx reg = XEXP (slot, 0);
10016 machine_mode mode = GET_MODE (reg);
10017 tree piece_type;
10018 tree addr_type;
10019 tree daddr_type;
10020 tree src_addr, src;
10021 int src_offset;
10022 tree dest_addr, dest;
10023 int cur_size = GET_MODE_SIZE (mode);
10025 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10026 prev_size = INTVAL (XEXP (slot, 1));
10027 if (prev_size + cur_size > size)
10029 cur_size = size - prev_size;
10030 unsigned int nbits = cur_size * BITS_PER_UNIT;
10031 if (!int_mode_for_size (nbits, 1).exists (&mode))
10032 mode = QImode;
10034 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10035 if (mode == GET_MODE (reg))
10036 addr_type = build_pointer_type (piece_type);
10037 else
10038 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10039 true);
10040 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10041 true);
10043 if (SSE_REGNO_P (REGNO (reg)))
10045 src_addr = sse_addr;
10046 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10048 else
10050 src_addr = int_addr;
10051 src_offset = REGNO (reg) * 8;
10053 src_addr = fold_convert (addr_type, src_addr);
10054 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10056 dest_addr = fold_convert (daddr_type, addr);
10057 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10058 if (cur_size == GET_MODE_SIZE (mode))
10060 src = build_va_arg_indirect_ref (src_addr);
10061 dest = build_va_arg_indirect_ref (dest_addr);
10063 gimplify_assign (dest, src, pre_p);
10065 else
10067 tree copy
10068 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10069 3, dest_addr, src_addr,
10070 size_int (cur_size));
10071 gimplify_and_add (copy, pre_p);
10073 prev_size += cur_size;
10077 if (needed_intregs)
10079 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10080 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10081 gimplify_assign (gpr, t, pre_p);
10084 if (needed_sseregs)
10086 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10087 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10088 gimplify_assign (unshare_expr (fpr), t, pre_p);
10091 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10093 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10096 /* ... otherwise out of the overflow area. */
10098 /* When we align parameter on stack for caller, if the parameter
10099 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10100 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee
10101 here with caller. */
10102 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10103 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10104 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10106 /* Care for on-stack alignment if needed. */
10107 if (arg_boundary <= 64 || size == 0)
10108 t = ovf;
10109 else
10111 HOST_WIDE_INT align = arg_boundary / 8;
10112 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10113 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10114 build_int_cst (TREE_TYPE (t), -align));
10117 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10118 gimplify_assign (addr, t, pre_p);
10120 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10121 gimplify_assign (unshare_expr (ovf), t, pre_p);
10123 if (container)
10124 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10126 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10127 addr = fold_convert (ptrtype, addr);
10129 if (indirect_p)
10130 addr = build_va_arg_indirect_ref (addr);
10131 return build_va_arg_indirect_ref (addr);
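/* For reference, the gimple emitted above implements (roughly) the psABI
   va_arg algorithm; for a type needing a single GP register it behaves
   like:

       if (ap->gp_offset <= 48 - 8)
	 {
	   addr = ap->reg_save_area + ap->gp_offset;
	   ap->gp_offset += 8;
	 }
       else
	 {
	   addr = ap->overflow_arg_area;   (suitably aligned, as above)
	   ap->overflow_arg_area = addr + 8;
	 }
       result = *(TYPE *) addr;

   with the analogous fp_offset checks for SSE-class arguments; this is
   only a sketch of the common case.  */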
10134 /* Return true if OPNUM's MEM should be matched
10135 in movabs* patterns. */
10137 bool
10138 ix86_check_movabs (rtx insn, int opnum)
10140 rtx set, mem;
10142 set = PATTERN (insn);
10143 if (GET_CODE (set) == PARALLEL)
10144 set = XVECEXP (set, 0, 0);
10145 gcc_assert (GET_CODE (set) == SET);
10146 mem = XEXP (set, opnum);
10147 while (SUBREG_P (mem))
10148 mem = SUBREG_REG (mem);
10149 gcc_assert (MEM_P (mem));
10150 return volatile_ok || !MEM_VOLATILE_P (mem);
10153 /* Return false if INSN contains a MEM with a non-default address space. */
10154 bool
10155 ix86_check_no_addr_space (rtx insn)
10157 subrtx_var_iterator::array_type array;
10158 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10160 rtx x = *iter;
10161 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10162 return false;
10164 return true;
10167 /* Initialize the table of extra 80387 mathematical constants. */
10169 static void
10170 init_ext_80387_constants (void)
10172 static const char * cst[5] =
10174 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10175 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10176 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10177 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10178 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10180 int i;
10182 for (i = 0; i < 5; i++)
10184 real_from_string (&ext_80387_constants_table[i], cst[i]);
10185 /* Ensure each constant is rounded to XFmode precision. */
10186 real_convert (&ext_80387_constants_table[i],
10187 XFmode, &ext_80387_constants_table[i]);
10190 ext_80387_constants_init = 1;
10193 /* Return non-zero if the constant is something that
10194 can be loaded with a special instruction. */
10196 int
10197 standard_80387_constant_p (rtx x)
10199 machine_mode mode = GET_MODE (x);
10201 const REAL_VALUE_TYPE *r;
10203 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10204 return -1;
10206 if (x == CONST0_RTX (mode))
10207 return 1;
10208 if (x == CONST1_RTX (mode))
10209 return 2;
10211 r = CONST_DOUBLE_REAL_VALUE (x);
10213 /* For XFmode constants, try to find a special 80387 instruction when
10214 optimizing for size or on those CPUs that benefit from them. */
10215 if (mode == XFmode
10216 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10218 int i;
10220 if (! ext_80387_constants_init)
10221 init_ext_80387_constants ();
10223 for (i = 0; i < 5; i++)
10224 if (real_identical (r, &ext_80387_constants_table[i]))
10225 return i + 3;
10228 /* Load of the constant -0.0 or -1.0 will be split as
10229 fldz;fchs or fld1;fchs sequence. */
10230 if (real_isnegzero (r))
10231 return 8;
10232 if (real_identical (r, &dconstm1))
10233 return 9;
10235 return 0;
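/* For example (illustrative only), with -Os an XFmode constant equal to pi
   yields 7 here (ext_80387_constants_table index 4 plus 3), which
   standard_80387_constant_opcode below maps to "fldpi"; plain 0.0 and 1.0
   yield 1 and 2 ("fldz" / "fld1").  */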
10238 /* Return the opcode of the special instruction to be used to load
10239 the constant X. */
10241 const char *
10242 standard_80387_constant_opcode (rtx x)
10244 switch (standard_80387_constant_p (x))
10246 case 1:
10247 return "fldz";
10248 case 2:
10249 return "fld1";
10250 case 3:
10251 return "fldlg2";
10252 case 4:
10253 return "fldln2";
10254 case 5:
10255 return "fldl2e";
10256 case 6:
10257 return "fldl2t";
10258 case 7:
10259 return "fldpi";
10260 case 8:
10261 case 9:
10262 return "#";
10263 default:
10264 gcc_unreachable ();
10268 /* Return the CONST_DOUBLE representing the 80387 constant that is
10269 loaded by the specified special instruction. The argument IDX
10270 matches the return value from standard_80387_constant_p. */
10272 rtx
10273 standard_80387_constant_rtx (int idx)
10275 int i;
10277 if (! ext_80387_constants_init)
10278 init_ext_80387_constants ();
10280 switch (idx)
10282 case 3:
10283 case 4:
10284 case 5:
10285 case 6:
10286 case 7:
10287 i = idx - 3;
10288 break;
10290 default:
10291 gcc_unreachable ();
10294 return const_double_from_real_value (ext_80387_constants_table[i],
10295 XFmode);
10298 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
10299 in supported SSE/AVX vector mode. */
10301 int
10302 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10304 machine_mode mode;
10306 if (!TARGET_SSE)
10307 return 0;
10309 mode = GET_MODE (x);
10311 if (x == const0_rtx || const0_operand (x, mode))
10312 return 1;
10314 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10316 /* VOIDmode integer constant, get mode from the predicate. */
10317 if (mode == VOIDmode)
10318 mode = pred_mode;
10320 switch (GET_MODE_SIZE (mode))
10322 case 64:
10323 if (TARGET_AVX512F)
10324 return 2;
10325 break;
10326 case 32:
10327 if (TARGET_AVX2)
10328 return 2;
10329 break;
10330 case 16:
10331 if (TARGET_SSE2)
10332 return 2;
10333 break;
10334 case 0:
10335 /* VOIDmode */
10336 gcc_unreachable ();
10337 default:
10338 break;
10342 return 0;
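/* Illustrative uses: standard_sse_constant_p (CONST0_RTX (V4SFmode),
   V4SFmode) returns 1, while an all-ones V8SImode constant returns 2 when
   TARGET_AVX2 is enabled (its 32-byte size passes the check above);
   anything else returns 0.  */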
10345 /* Return the opcode of the special instruction to be used to load
10346 the constant X. */
10348 const char *
10349 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
10351 machine_mode mode;
10353 gcc_assert (TARGET_SSE);
10355 mode = GET_MODE (x);
10357 if (x == const0_rtx || const0_operand (x, mode))
10359 switch (get_attr_mode (insn))
10361 case MODE_XI:
10362 return "vpxord\t%g0, %g0, %g0";
10363 case MODE_OI:
10364 return (TARGET_AVX512VL
10365 ? "vpxord\t%x0, %x0, %x0"
10366 : "vpxor\t%x0, %x0, %x0");
10367 case MODE_TI:
10368 return (TARGET_AVX512VL
10369 ? "vpxord\t%t0, %t0, %t0"
10370 : "%vpxor\t%0, %d0");
10372 case MODE_V8DF:
10373 return (TARGET_AVX512DQ
10374 ? "vxorpd\t%g0, %g0, %g0"
10375 : "vpxorq\t%g0, %g0, %g0");
10376 case MODE_V4DF:
10377 return "vxorpd\t%x0, %x0, %x0";
10378 case MODE_V2DF:
10379 return "%vxorpd\t%0, %d0";
10381 case MODE_V16SF:
10382 return (TARGET_AVX512DQ
10383 ? "vxorps\t%g0, %g0, %g0"
10384 : "vpxord\t%g0, %g0, %g0");
10385 case MODE_V8SF:
10386 return "vxorps\t%x0, %x0, %x0";
10387 case MODE_V4SF:
10388 return "%vxorps\t%0, %d0";
10390 default:
10391 gcc_unreachable ();
10394 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10396 enum attr_mode insn_mode = get_attr_mode (insn);
10398 switch (insn_mode)
10400 case MODE_XI:
10401 case MODE_V8DF:
10402 case MODE_V16SF:
10403 gcc_assert (TARGET_AVX512F);
10404 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10406 case MODE_OI:
10407 case MODE_V4DF:
10408 case MODE_V8SF:
10409 gcc_assert (TARGET_AVX2);
10410 /* FALLTHRU */
10411 case MODE_TI:
10412 case MODE_V2DF:
10413 case MODE_V4SF:
10414 gcc_assert (TARGET_SSE2);
10415 return (TARGET_AVX
10416 ? "vpcmpeqd\t%0, %0, %0"
10417 : "pcmpeqd\t%0, %0");
10419 default:
10420 gcc_unreachable ();
10424 gcc_unreachable ();
10427 /* Returns true if INSN can be transformed from a memory load
10428 to a supported FP constant load. */
10430 bool
10431 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10433 rtx src = find_constant_src (insn);
10435 gcc_assert (REG_P (dst));
10437 if (src == NULL
10438 || (SSE_REGNO_P (REGNO (dst))
10439 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10440 || (STACK_REGNO_P (REGNO (dst))
10441 && standard_80387_constant_p (src) < 1))
10442 return false;
10444 return true;
10447 /* Returns true if OP contains a symbol reference */
10449 bool
10450 symbolic_reference_mentioned_p (rtx op)
10452 const char *fmt;
10453 int i;
10455 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10456 return true;
10458 fmt = GET_RTX_FORMAT (GET_CODE (op));
10459 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10461 if (fmt[i] == 'E')
10463 int j;
10465 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10466 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10467 return true;
10470 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10471 return true;
10474 return false;
10477 /* Return true if it is appropriate to emit `ret' instructions in the
10478 body of a function. Do this only if the epilogue is simple, needing a
10479 couple of insns. Prior to reloading, we can't tell how many registers
10480 must be saved, so return false then. Return false if there is no frame
10481 marker to de-allocate. */
10483 bool
10484 ix86_can_use_return_insn_p (void)
10486 struct ix86_frame frame;
10488 if (ix86_function_naked (current_function_decl))
10489 return false;
10491 /* Don't use `ret' instruction in interrupt handler. */
10492 if (! reload_completed
10493 || frame_pointer_needed
10494 || cfun->machine->func_type != TYPE_NORMAL)
10495 return 0;
10497 /* Don't allow more than 32k pop, since that's all we can do
10498 with one instruction. */
10499 if (crtl->args.pops_args && crtl->args.size >= 32768)
10500 return 0;
10502 frame = cfun->machine->frame;
10503 return (frame.stack_pointer_offset == UNITS_PER_WORD
10504 && (frame.nregs + frame.nsseregs) == 0);
10507 /* Value should be nonzero if functions must have frame pointers.
10508 Zero means the frame pointer need not be set up (and parms may
10509 be accessed via the stack pointer) in functions that seem suitable. */
10511 static bool
10512 ix86_frame_pointer_required (void)
10514 /* If we accessed previous frames, then the generated code expects
10515 to be able to access the saved ebp value in our frame. */
10516 if (cfun->machine->accesses_prev_frame)
10517 return true;
10519   /* Several x86 OSes need a frame pointer for other reasons,
10520 usually pertaining to setjmp. */
10521 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10522 return true;
10524 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10525 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10526 return true;
10528 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
10529 allocation is 4GB. */
10530 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10531 return true;
10533 /* SSE saves require frame-pointer when stack is misaligned. */
10534 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10535 return true;
10537 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10538 turns off the frame pointer by default. Turn it back on now if
10539 we've not got a leaf function. */
10540 if (TARGET_OMIT_LEAF_FRAME_POINTER
10541 && (!crtl->is_leaf
10542 || ix86_current_function_calls_tls_descriptor))
10543 return true;
10545 if (crtl->profile && !flag_fentry)
10546 return true;
10548 return false;
10551 /* Record that the current function accesses previous call frames. */
10553 void
10554 ix86_setup_frame_addresses (void)
10556 cfun->machine->accesses_prev_frame = 1;
10559 #ifndef USE_HIDDEN_LINKONCE
10560 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10561 # define USE_HIDDEN_LINKONCE 1
10562 # else
10563 # define USE_HIDDEN_LINKONCE 0
10564 # endif
10565 #endif
10567 static int pic_labels_used;
10569 /* Fills in the label name that should be used for a pc thunk for
10570 the given register. */
10572 static void
10573 get_pc_thunk_name (char name[32], unsigned int regno)
10575 gcc_assert (!TARGET_64BIT);
10577 if (USE_HIDDEN_LINKONCE)
10578 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10579 else
10580 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10584 /* This function generates code for -fpic that loads %ebx with
10585 the return address of the caller and then returns. */
10587 static void
10588 ix86_code_end (void)
10590 rtx xops[2];
10591 int regno;
10593 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10595 char name[32];
10596 tree decl;
10598 if (!(pic_labels_used & (1 << regno)))
10599 continue;
10601 get_pc_thunk_name (name, regno);
10603 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10604 get_identifier (name),
10605 build_function_type_list (void_type_node, NULL_TREE));
10606 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10607 NULL_TREE, void_type_node);
10608 TREE_PUBLIC (decl) = 1;
10609 TREE_STATIC (decl) = 1;
10610 DECL_IGNORED_P (decl) = 1;
10612 #if TARGET_MACHO
10613 if (TARGET_MACHO)
10615 switch_to_section (darwin_sections[picbase_thunk_section]);
10616 fputs ("\t.weak_definition\t", asm_out_file);
10617 assemble_name (asm_out_file, name);
10618 fputs ("\n\t.private_extern\t", asm_out_file);
10619 assemble_name (asm_out_file, name);
10620 putc ('\n', asm_out_file);
10621 ASM_OUTPUT_LABEL (asm_out_file, name);
10622 DECL_WEAK (decl) = 1;
10624 else
10625 #endif
10626 if (USE_HIDDEN_LINKONCE)
10628 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10630 targetm.asm_out.unique_section (decl, 0);
10631 switch_to_section (get_named_section (decl, NULL, 0));
10633 targetm.asm_out.globalize_label (asm_out_file, name);
10634 fputs ("\t.hidden\t", asm_out_file);
10635 assemble_name (asm_out_file, name);
10636 putc ('\n', asm_out_file);
10637 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10639 else
10641 switch_to_section (text_section);
10642 ASM_OUTPUT_LABEL (asm_out_file, name);
10645 DECL_INITIAL (decl) = make_node (BLOCK);
10646 current_function_decl = decl;
10647 allocate_struct_function (decl, false);
10648 init_function_start (decl);
10649 /* We're about to hide the function body from callees of final_* by
10650 emitting it directly; tell them we're a thunk, if they care. */
10651 cfun->is_thunk = true;
10652 first_function_block_is_cold = false;
10653 /* Make sure unwind info is emitted for the thunk if needed. */
10654 final_start_function (emit_barrier (), asm_out_file, 1);
10656 /* Pad stack IP move with 4 instructions (two NOPs count
10657 as one instruction). */
10658 if (TARGET_PAD_SHORT_FUNCTION)
10660 int i = 8;
10662 while (i--)
10663 fputs ("\tnop\n", asm_out_file);
10666 xops[0] = gen_rtx_REG (Pmode, regno);
10667 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10668 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10669 output_asm_insn ("%!ret", NULL);
10670 final_end_function ();
10671 init_insn_lengths ();
10672 free_after_compilation (cfun);
10673 set_cfun (NULL);
10674 current_function_decl = NULL;
10677 if (flag_split_stack)
10678 file_end_indicate_split_stack ();
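/* For instance, the pc thunk emitted above for %ebx amounts to

       __x86.get_pc_thunk.bx:
	       movl    (%esp), %ebx
	       ret

   i.e. it returns with the caller's return address (the address of the
   instruction following the call) in the requested register.  */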
10681 /* Emit code for the SET_GOT patterns. */
10683 const char *
10684 output_set_got (rtx dest, rtx label)
10686 rtx xops[3];
10688 xops[0] = dest;
10690 if (TARGET_VXWORKS_RTP && flag_pic)
10692 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10693 xops[2] = gen_rtx_MEM (Pmode,
10694 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10695 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10697 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10698 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10699 an unadorned address. */
10700 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10701 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10702 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10703 return "";
10706 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10708 if (flag_pic)
10710 char name[32];
10711 get_pc_thunk_name (name, REGNO (dest));
10712 pic_labels_used |= 1 << REGNO (dest);
10714 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10715 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10716 output_asm_insn ("%!call\t%X2", xops);
10718 #if TARGET_MACHO
10719 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10720 This is what will be referenced by the Mach-O PIC subsystem. */
10721 if (machopic_should_output_picbase_label () || !label)
10722 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10724 /* When we are restoring the pic base at the site of a nonlocal label,
10725 and we decided to emit the pic base above, we will still output a
10726 local label used for calculating the correction offset (even though
10727 the offset will be 0 in that case). */
10728 if (label)
10729 targetm.asm_out.internal_label (asm_out_file, "L",
10730 CODE_LABEL_NUMBER (label));
10731 #endif
10733 else
10735 if (TARGET_MACHO)
10736 /* We don't need a pic base, we're not producing pic. */
10737 gcc_unreachable ();
10739 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10740 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10741 targetm.asm_out.internal_label (asm_out_file, "L",
10742 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10745 if (!TARGET_MACHO)
10746 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10748 return "";
10751 /* Generate a "push" pattern for input ARG.  */
10753 static rtx
10754 gen_push (rtx arg)
10756 struct machine_function *m = cfun->machine;
10758 if (m->fs.cfa_reg == stack_pointer_rtx)
10759 m->fs.cfa_offset += UNITS_PER_WORD;
10760 m->fs.sp_offset += UNITS_PER_WORD;
10762 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10763 arg = gen_rtx_REG (word_mode, REGNO (arg));
10765 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10766 gen_rtx_PRE_DEC (Pmode,
10767 stack_pointer_rtx)),
10768 arg);
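/* As an illustration, in 64-bit mode gen_push applied to the %rdi register
   builds the pattern

       (set (mem:DI (pre_dec:DI (reg:DI sp)))
	    (reg:DI di))

   which is emitted as a plain "pushq %rdi".  */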
10771 /* Generate a "pop" pattern for input ARG.  */
10773 static rtx
10774 gen_pop (rtx arg)
10776 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10777 arg = gen_rtx_REG (word_mode, REGNO (arg));
10779 return gen_rtx_SET (arg,
10780 gen_rtx_MEM (word_mode,
10781 gen_rtx_POST_INC (Pmode,
10782 stack_pointer_rtx)));
10785 /* Return >= 0 if there is an unused call-clobbered register available
10786 for the entire function. */
10788 static unsigned int
10789 ix86_select_alt_pic_regnum (void)
10791 if (ix86_use_pseudo_pic_reg ())
10792 return INVALID_REGNUM;
10794 if (crtl->is_leaf
10795 && !crtl->profile
10796 && !ix86_current_function_calls_tls_descriptor)
10798 int i, drap;
10799 /* Can't use the same register for both PIC and DRAP. */
10800 if (crtl->drap_reg)
10801 drap = REGNO (crtl->drap_reg);
10802 else
10803 drap = -1;
10804 for (i = 2; i >= 0; --i)
10805 if (i != drap && !df_regs_ever_live_p (i))
10806 return i;
10809 return INVALID_REGNUM;
10812 /* Return true if REGNO is used by the epilogue. */
10814 bool
10815 ix86_epilogue_uses (int regno)
10817 /* If there are no caller-saved registers, we preserve all registers,
10818 except for MMX and x87 registers which aren't supported when saving
10819 and restoring registers. Don't explicitly save SP register since
10820 it is always preserved. */
10821 return (epilogue_completed
10822 && cfun->machine->no_caller_saved_registers
10823 && !fixed_regs[regno]
10824 && !STACK_REGNO_P (regno)
10825 && !MMX_REGNO_P (regno));
10828 /* Return nonzero if register REGNO can be used as a scratch register
10829 in peephole2. */
10831 static bool
10832 ix86_hard_regno_scratch_ok (unsigned int regno)
10834 /* If there are no caller-saved registers, we can't use any register
10835 as a scratch register after epilogue and use REGNO as scratch
10836 register only if it has been used before to avoid saving and
10837 restoring it. */
10838 return (!cfun->machine->no_caller_saved_registers
10839 || (!epilogue_completed
10840 && df_regs_ever_live_p (regno)));
10843 /* Return true if register class CL should be an additional allocno
10844 class. */
10846 static bool
10847 ix86_additional_allocno_class_p (reg_class_t cl)
10849 return cl == MOD4_SSE_REGS;
10852 /* Return TRUE if we need to save REGNO. */
10854 static bool
10855 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10857 /* If there are no caller-saved registers, we preserve all registers,
10858 except for MMX and x87 registers which aren't supported when saving
10859 and restoring registers. Don't explicitly save SP register since
10860 it is always preserved. */
10861 if (cfun->machine->no_caller_saved_registers)
10863 /* Don't preserve registers used for function return value. */
10864 rtx reg = crtl->return_rtx;
10865 if (reg)
10867 unsigned int i = REGNO (reg);
10868 unsigned int nregs = REG_NREGS (reg);
10869 while (nregs-- > 0)
10870 if ((i + nregs) == regno)
10871 return false;
10873 reg = crtl->return_bnd;
10874 if (reg)
10876 i = REGNO (reg);
10877 nregs = REG_NREGS (reg);
10878 while (nregs-- > 0)
10879 if ((i + nregs) == regno)
10880 return false;
10884 return (df_regs_ever_live_p (regno)
10885 && !fixed_regs[regno]
10886 && !STACK_REGNO_P (regno)
10887 && !MMX_REGNO_P (regno)
10888 && (regno != HARD_FRAME_POINTER_REGNUM
10889 || !frame_pointer_needed));
10892 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10893 && pic_offset_table_rtx)
10895 if (ix86_use_pseudo_pic_reg ())
10897 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10898 _mcount in prologue. */
10899 if (!TARGET_64BIT && flag_pic && crtl->profile)
10900 return true;
10902 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10903 || crtl->profile
10904 || crtl->calls_eh_return
10905 || crtl->uses_const_pool
10906 || cfun->has_nonlocal_label)
10907 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10910 if (crtl->calls_eh_return && maybe_eh_return)
10912 unsigned i;
10913 for (i = 0; ; i++)
10915 unsigned test = EH_RETURN_DATA_REGNO (i);
10916 if (test == INVALID_REGNUM)
10917 break;
10918 if (test == regno)
10919 return true;
10923 if (ignore_outlined && cfun->machine->call_ms2sysv)
10925 unsigned count = cfun->machine->call_ms2sysv_extra_regs
10926 + xlogue_layout::MIN_REGS;
10927 if (xlogue_layout::is_stub_managed_reg (regno, count))
10928 return false;
10931 if (crtl->drap_reg
10932 && regno == REGNO (crtl->drap_reg)
10933 && !cfun->machine->no_drap_save_restore)
10934 return true;
10936 return (df_regs_ever_live_p (regno)
10937 && !call_used_regs[regno]
10938 && !fixed_regs[regno]
10939 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
10942 /* Return number of saved general purpose registers.  */
10944 static int
10945 ix86_nsaved_regs (void)
10947 int nregs = 0;
10948 int regno;
10950 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10951 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
10952 nregs ++;
10953 return nregs;
10956 /* Return number of saved SSE registers. */
10958 static int
10959 ix86_nsaved_sseregs (void)
10961 int nregs = 0;
10962 int regno;
10964 if (!TARGET_64BIT_MS_ABI)
10965 return 0;
10966 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10967 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
10968 nregs ++;
10969 return nregs;
10972 /* Given FROM and TO register numbers, say whether this elimination is
10973 allowed. If stack alignment is needed, we can only replace argument
10974 pointer with hard frame pointer, or replace frame pointer with stack
10975 pointer. Otherwise, frame pointer elimination is automatically
10976 handled and all other eliminations are valid. */
10978 static bool
10979 ix86_can_eliminate (const int from, const int to)
10981 if (stack_realign_fp)
10982 return ((from == ARG_POINTER_REGNUM
10983 && to == HARD_FRAME_POINTER_REGNUM)
10984 || (from == FRAME_POINTER_REGNUM
10985 && to == STACK_POINTER_REGNUM));
10986 else
10987 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
10990 /* Return the offset between two registers, one to be eliminated, and the other
10991 its replacement, at the start of a routine. */
10993 HOST_WIDE_INT
10994 ix86_initial_elimination_offset (int from, int to)
10996 struct ix86_frame frame = cfun->machine->frame;
10998 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
10999 return frame.hard_frame_pointer_offset;
11000 else if (from == FRAME_POINTER_REGNUM
11001 && to == HARD_FRAME_POINTER_REGNUM)
11002 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11003 else
11005 gcc_assert (to == STACK_POINTER_REGNUM);
11007 if (from == ARG_POINTER_REGNUM)
11008 return frame.stack_pointer_offset;
11010 gcc_assert (from == FRAME_POINTER_REGNUM);
11011 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11015 /* In a dynamically-aligned function, we can't know the offset from
11016 stack pointer to frame pointer, so we must ensure that setjmp
11017 eliminates fp against the hard fp (%ebp) rather than trying to
11018 index from %esp up to the top of the frame across a gap that is
11019 of unknown (at compile-time) size. */
11020 static rtx
11021 ix86_builtin_setjmp_frame_value (void)
11023 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11026 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11027 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11029 static bool warned_once = false;
11030 if (!warned_once)
11032 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11033 feature);
11034 warned_once = true;
11038 /* When using -fsplit-stack, the allocation routines set a field in
11039 the TCB to the bottom of the stack plus this much space, measured
11040 in bytes. */
11042 #define SPLIT_STACK_AVAILABLE 256
11044 /* Fill structure ix86_frame about frame of currently computed function. */
11046 static void
11047 ix86_compute_frame_layout (void)
11049 struct ix86_frame *frame = &cfun->machine->frame;
11050 struct machine_function *m = cfun->machine;
11051 unsigned HOST_WIDE_INT stack_alignment_needed;
11052 HOST_WIDE_INT offset;
11053 unsigned HOST_WIDE_INT preferred_alignment;
11054 HOST_WIDE_INT size = get_frame_size ();
11055 HOST_WIDE_INT to_allocate;
11057 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11058 * ms_abi functions that call a sysv function. We now need to prune away
11059 * cases where it should be disabled. */
11060 if (TARGET_64BIT && m->call_ms2sysv)
11062 gcc_assert (TARGET_64BIT_MS_ABI);
11063 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11064 gcc_assert (!TARGET_SEH);
11065 gcc_assert (TARGET_SSE);
11066 gcc_assert (!ix86_using_red_zone ());
11068 if (crtl->calls_eh_return)
11070 gcc_assert (!reload_completed);
11071 m->call_ms2sysv = false;
11072 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11075 else if (ix86_static_chain_on_stack)
11077 gcc_assert (!reload_completed);
11078 m->call_ms2sysv = false;
11079 warn_once_call_ms2sysv_xlogues ("static call chains");
11082 /* Finally, compute which registers the stub will manage. */
11083 else
11085 unsigned count = xlogue_layout::count_stub_managed_regs ();
11086 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11087 m->call_ms2sysv_pad_in = 0;
11091 frame->nregs = ix86_nsaved_regs ();
11092 frame->nsseregs = ix86_nsaved_sseregs ();
11094   /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11095      except for function prologues, leaf functions and when the default
11096      incoming stack boundary is overridden at the command line or via the
11097      force_align_arg_pointer attribute.  */
11098 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11099 && (!crtl->is_leaf || cfun->calls_alloca != 0
11100 || ix86_current_function_calls_tls_descriptor
11101 || ix86_incoming_stack_boundary < 128))
11103 crtl->preferred_stack_boundary = 128;
11104 crtl->stack_alignment_needed = 128;
11107 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11108 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11110 gcc_assert (!size || stack_alignment_needed);
11111 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11112 gcc_assert (preferred_alignment <= stack_alignment_needed);
11114 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11115 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11116 if (TARGET_64BIT && m->call_ms2sysv)
11118 gcc_assert (stack_alignment_needed >= 16);
11119 gcc_assert (!frame->nsseregs);
11122 /* For SEH we have to limit the amount of code movement into the prologue.
11123 At present we do this via a BLOCKAGE, at which point there's very little
11124 scheduling that can be done, which means that there's very little point
11125 in doing anything except PUSHs. */
11126 if (TARGET_SEH)
11127 m->use_fast_prologue_epilogue = false;
11128 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11130 int count = frame->nregs;
11131 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11133 /* The fast prologue uses move instead of push to save registers. This
11134 is significantly longer, but also executes faster as modern hardware
11135 can execute the moves in parallel, but can't do that for push/pop.
11137 	 Be careful about choosing which prologue to emit: when the function takes
11138 	 many instructions to execute we may use the slow version, as well as when
11139 	 the function is known to be outside a hot spot (known only with profile
11140 	 feedback).  Weight the size of the function by the number of registers
11141 	 to save, as it is cheap to use one or two push instructions but very
11142 	 slow to use many of them.  */
11143 if (count)
11144 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11145 if (node->frequency < NODE_FREQUENCY_NORMAL
11146 || (flag_branch_probabilities
11147 && node->frequency < NODE_FREQUENCY_HOT))
11148 m->use_fast_prologue_epilogue = false;
11149 else
11150 m->use_fast_prologue_epilogue
11151 = !expensive_function_p (count);
11154 frame->save_regs_using_mov
11155 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11156 /* If static stack checking is enabled and done with probes,
11157 the registers need to be saved before allocating the frame. */
11158 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11160 /* Skip return address and error code in exception handler. */
11161 offset = INCOMING_FRAME_SP_OFFSET;
11163 /* Skip pushed static chain. */
11164 if (ix86_static_chain_on_stack)
11165 offset += UNITS_PER_WORD;
11167 /* Skip saved base pointer. */
11168 if (frame_pointer_needed)
11169 offset += UNITS_PER_WORD;
11170 frame->hfp_save_offset = offset;
11172 /* The traditional frame pointer location is at the top of the frame. */
11173 frame->hard_frame_pointer_offset = offset;
11175 /* Register save area */
11176 offset += frame->nregs * UNITS_PER_WORD;
11177 frame->reg_save_offset = offset;
11179 /* On SEH target, registers are pushed just before the frame pointer
11180 location. */
11181 if (TARGET_SEH)
11182 frame->hard_frame_pointer_offset = offset;
11184 /* Calculate the size of the va-arg area (not including padding, if any). */
11185 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11187 if (stack_realign_fp)
11189 /* We may need a 16-byte aligned stack for the remainder of the
11190 register save area, but the stack frame for the local function
11191 	 may require a greater alignment if using AVX, AVX2 or AVX-512.  In order
11192 to avoid wasting space, we first calculate the space needed for
11193 the rest of the register saves, add that to the stack pointer,
11194 and then realign the stack to the boundary of the start of the
11195 frame for the local function. */
11196 HOST_WIDE_INT space_needed = 0;
11197 HOST_WIDE_INT sse_reg_space_needed = 0;
11199 if (TARGET_64BIT)
11201 if (m->call_ms2sysv)
11203 m->call_ms2sysv_pad_in = 0;
11204 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11207 else if (frame->nsseregs)
11208 /* The only ABI that has saved SSE registers (Win64) also has a
11209 16-byte aligned default stack. However, many programs violate
11210 the ABI, and Wine64 forces stack realignment to compensate. */
11211 space_needed = frame->nsseregs * 16;
11213 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11215 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11216 rounding to be pedantic. */
11217 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11219 else
11220 space_needed = frame->va_arg_size;
11222 /* Record the allocation size required prior to the realignment AND. */
11223 frame->stack_realign_allocate = space_needed;
11225 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11226 before this point are not directly comparable with values below
11227 this point. Use sp_valid_at to determine if the stack pointer is
11228 valid for a given offset, fp_valid_at for the frame pointer, or
11229 choose_baseaddr to have a base register chosen for you.
11231 Note that the result of (frame->stack_realign_offset
11232 & (stack_alignment_needed - 1)) may not equal zero. */
11233 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11234 frame->stack_realign_offset = offset - space_needed;
11235 frame->sse_reg_save_offset = frame->stack_realign_offset
11236 + sse_reg_space_needed;
11238 else
11240 frame->stack_realign_offset = offset;
11242 if (TARGET_64BIT && m->call_ms2sysv)
11244 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11245 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11248 /* Align and set SSE register save area. */
11249 else if (frame->nsseregs)
11251 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11252 required and the DRAP re-alignment boundary is at least 16 bytes,
11253 then we want the SSE register save area properly aligned. */
11254 if (ix86_incoming_stack_boundary >= 128
11255 || (stack_realign_drap && stack_alignment_needed >= 16))
11256 offset = ROUND_UP (offset, 16);
11257 offset += frame->nsseregs * 16;
11259 frame->sse_reg_save_offset = offset;
11260 offset += frame->va_arg_size;
11263 /* Align start of frame for local function. */
11264 if (m->call_ms2sysv
11265 || frame->va_arg_size != 0
11266 || size != 0
11267 || !crtl->is_leaf
11268 || cfun->calls_alloca
11269 || ix86_current_function_calls_tls_descriptor)
11270 offset = ROUND_UP (offset, stack_alignment_needed);
11272 /* Frame pointer points here. */
11273 frame->frame_pointer_offset = offset;
11275 offset += size;
11277 /* Add outgoing arguments area. Can be skipped if we eliminated
11278 all the function calls as dead code.
11279 Skipping is however impossible when function calls alloca. Alloca
11280 expander assumes that last crtl->outgoing_args_size
11281 of stack frame are unused. */
11282 if (ACCUMULATE_OUTGOING_ARGS
11283 && (!crtl->is_leaf || cfun->calls_alloca
11284 || ix86_current_function_calls_tls_descriptor))
11286 offset += crtl->outgoing_args_size;
11287 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11289 else
11290 frame->outgoing_arguments_size = 0;
11292 /* Align stack boundary. Only needed if we're calling another function
11293 or using alloca. */
11294 if (!crtl->is_leaf || cfun->calls_alloca
11295 || ix86_current_function_calls_tls_descriptor)
11296 offset = ROUND_UP (offset, preferred_alignment);
11298 /* We've reached end of stack frame. */
11299 frame->stack_pointer_offset = offset;
11301 /* Size prologue needs to allocate. */
11302 to_allocate = offset - frame->sse_reg_save_offset;
11304 if ((!to_allocate && frame->nregs <= 1)
11305 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11306 frame->save_regs_using_mov = false;
11308 if (ix86_using_red_zone ()
11309 && crtl->sp_is_unchanging
11310 && crtl->is_leaf
11311 && !ix86_pc_thunk_call_expanded
11312 && !ix86_current_function_calls_tls_descriptor)
11314 frame->red_zone_size = to_allocate;
11315 if (frame->save_regs_using_mov)
11316 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11317 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11318 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11320 else
11321 frame->red_zone_size = 0;
11322 frame->stack_pointer_offset -= frame->red_zone_size;
11324 /* The SEH frame pointer location is near the bottom of the frame.
11325 This is enforced by the fact that the difference between the
11326 stack pointer and the frame pointer is limited to 240 bytes in
11327 the unwind data structure. */
11328 if (TARGET_SEH)
11330 HOST_WIDE_INT diff;
11332 /* If we can leave the frame pointer where it is, do so. Also, returns
11333 the establisher frame for __builtin_frame_address (0). */
11334 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11335 if (diff <= SEH_MAX_FRAME_SIZE
11336 && (diff > 240 || (diff & 15) != 0)
11337 && !crtl->accesses_prior_frames)
11339 /* Ideally we'd determine what portion of the local stack frame
11340 (within the constraint of the lowest 240) is most heavily used.
11341 But without that complication, simply bias the frame pointer
11342 by 128 bytes so as to maximize the amount of the local stack
11343 frame that is addressable with 8-bit offsets. */
11344 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11349 /* This is semi-inlined memory_address_length, but simplified
11350 since we know that we're always dealing with reg+offset, and
11351 to avoid having to create and discard all that rtl. */
11353 static inline int
11354 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11356 int len = 4;
11358 if (offset == 0)
11360 /* EBP and R13 cannot be encoded without an offset. */
11361 len = (regno == BP_REG || regno == R13_REG);
11363 else if (IN_RANGE (offset, -128, 127))
11364 len = 1;
11366 /* ESP and R12 must be encoded with a SIB byte. */
11367 if (regno == SP_REG || regno == R12_REG)
11368 len++;
11370 return len;
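/* For illustration only (hypothetical operands, not values computed for
   any particular frame): the length returned above counts displacement
   and SIB bytes on top of the opcode and ModRM byte, so roughly
	0(%ebx)     -> 0
	0(%ebp)     -> 1	(ebp always needs a disp8)
	16(%esp)    -> 2	(disp8 plus the mandatory SIB byte)
	4096(%ebx)  -> 4	(disp32)
	4096(%esp)  -> 5	(disp32 plus SIB).  */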
11373 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11374 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11376 static bool
11377 sp_valid_at (HOST_WIDE_INT cfa_offset)
11379 const struct machine_frame_state &fs = cfun->machine->fs;
11380 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11382 /* Validate that the cfa_offset isn't in a "no-man's land". */
11383 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11384 return false;
11386 return fs.sp_valid;
11389 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11390 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11392 static inline bool
11393 fp_valid_at (HOST_WIDE_INT cfa_offset)
11395 const struct machine_frame_state &fs = cfun->machine->fs;
11396 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11398 /* Validate that the cfa_offset isn't in a "no-man's land". */
11399 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11400 return false;
11402 return fs.fp_valid;
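/* A small worked example with hypothetical values: suppose the stack was
   realigned with sp_realigned_fp_last == 16 and sp_realigned_offset == 32.
   Then CFA offsets <= 16 are reachable only through the frame pointer,
   offsets > 32 only through the realigned stack pointer, and offsets in
   (16, 32] fall into the padding introduced by the realignment and are
   never valid through either pointer (the asserts above would trip).  */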
11405 /* Choose a base register based upon the requested alignment, speed, and/or
11406 size. */
11408 static void
11409 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11410 HOST_WIDE_INT &base_offset,
11411 unsigned int align_reqested, unsigned int *align)
11413 const struct machine_function *m = cfun->machine;
11414 unsigned int hfp_align;
11415 unsigned int drap_align;
11416 unsigned int sp_align;
11417 bool hfp_ok = fp_valid_at (cfa_offset);
11418 bool drap_ok = m->fs.drap_valid;
11419 bool sp_ok = sp_valid_at (cfa_offset);
11421 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11423 /* Filter out any registers that don't meet the requested alignment
11424 criteria. */
11425 if (align_reqested)
11427 if (m->fs.realigned)
11428 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11429 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11430 notes (which we would need in order to use a realigned stack pointer),
11431 so disable this on SEH targets. */
11432 else if (m->fs.sp_realigned)
11433 sp_align = crtl->stack_alignment_needed;
11435 hfp_ok = hfp_ok && hfp_align >= align_reqested;
11436 drap_ok = drap_ok && drap_align >= align_reqested;
11437 sp_ok = sp_ok && sp_align >= align_reqested;
11440 if (m->use_fast_prologue_epilogue)
11442 /* Choose the base register most likely to allow the most scheduling
11443 opportunities. Generally FP is valid throughout the function,
11444 while DRAP must be reloaded within the epilogue. But choose either
11445 over the SP due to increased encoding size. */
11447 if (hfp_ok)
11449 base_reg = hard_frame_pointer_rtx;
11450 base_offset = m->fs.fp_offset - cfa_offset;
11452 else if (drap_ok)
11454 base_reg = crtl->drap_reg;
11455 base_offset = 0 - cfa_offset;
11457 else if (sp_ok)
11459 base_reg = stack_pointer_rtx;
11460 base_offset = m->fs.sp_offset - cfa_offset;
11463 else
11465 HOST_WIDE_INT toffset;
11466 int len = 16, tlen;
11468 /* Choose the base register with the smallest address encoding.
11469 With a tie, choose FP > DRAP > SP. */
11470 if (sp_ok)
11472 base_reg = stack_pointer_rtx;
11473 base_offset = m->fs.sp_offset - cfa_offset;
11474 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11476 if (drap_ok)
11478 toffset = 0 - cfa_offset;
11479 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11480 if (tlen <= len)
11482 base_reg = crtl->drap_reg;
11483 base_offset = toffset;
11484 len = tlen;
11487 if (hfp_ok)
11489 toffset = m->fs.fp_offset - cfa_offset;
11490 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11491 if (tlen <= len)
11493 base_reg = hard_frame_pointer_rtx;
11494 base_offset = toffset;
11495 len = tlen;
11500 /* Set the align return value. */
11501 if (align)
11503 if (base_reg == stack_pointer_rtx)
11504 *align = sp_align;
11505 else if (base_reg == crtl->drap_reg)
11506 *align = drap_align;
11507 else if (base_reg == hard_frame_pointer_rtx)
11508 *align = hfp_align;
11512 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11513 the alignment of address. If ALIGN is non-null, it should point to
11514 an alignment value (in bits) that is preferred or zero and will
11515 receive the alignment of the base register that was selected,
11516 irrespective of whether or not CFA_OFFSET is a multiple of that
11517 alignment value.
11519 The valid base registers are taken from CFUN->MACHINE->FS. */
11521 static rtx
11522 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
11524 rtx base_reg = NULL;
11525 HOST_WIDE_INT base_offset = 0;
11527 /* If a specific alignment is requested, try to get a base register
11528 with that alignment first. */
11529 if (align && *align)
11530 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11532 if (!base_reg)
11533 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11535 gcc_assert (base_reg != NULL);
11536 return plus_constant (Pmode, base_reg, base_offset);
11539 /* Emit code to save registers in the prologue. */
11541 static void
11542 ix86_emit_save_regs (void)
11544 unsigned int regno;
11545 rtx_insn *insn;
11547 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11548 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11550 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11551 RTX_FRAME_RELATED_P (insn) = 1;
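/* As a rough example (the exact set depends on ix86_save_reg): a 64-bit
   SysV function compiled without a hard frame pointer that must preserve
   every callee-saved GPR would get, in descending register-number order,
	push %r15
	push %r14
	push %r13
	push %r12
	push %rbp
	push %rbx  */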
11555 /* Emit a single register save at CFA - CFA_OFFSET. */
11557 static void
11558 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11559 HOST_WIDE_INT cfa_offset)
11561 struct machine_function *m = cfun->machine;
11562 rtx reg = gen_rtx_REG (mode, regno);
11563 rtx mem, addr, base, insn;
11564 unsigned int align = GET_MODE_ALIGNMENT (mode);
11566 addr = choose_baseaddr (cfa_offset, &align);
11567 mem = gen_frame_mem (mode, addr);
11569 /* The location alignment depends upon the base register. */
11570 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11571 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11572 set_mem_align (mem, align);
11574 insn = emit_insn (gen_rtx_SET (mem, reg));
11575 RTX_FRAME_RELATED_P (insn) = 1;
11577 base = addr;
11578 if (GET_CODE (base) == PLUS)
11579 base = XEXP (base, 0);
11580 gcc_checking_assert (REG_P (base));
11582 /* When saving registers into a re-aligned local stack frame, avoid
11583 any tricky guessing by dwarf2out. */
11584 if (m->fs.realigned)
11586 gcc_checking_assert (stack_realign_drap);
11588 if (regno == REGNO (crtl->drap_reg))
11590 /* A bit of a hack. We force the DRAP register to be saved in
11591 the re-aligned stack frame, which provides us with a copy
11592 of the CFA that will last past the prologue. Install it. */
11593 gcc_checking_assert (cfun->machine->fs.fp_valid);
11594 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11595 cfun->machine->fs.fp_offset - cfa_offset);
11596 mem = gen_rtx_MEM (mode, addr);
11597 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11599 else
11601 /* The frame pointer is a stable reference within the
11602 aligned frame. Use it. */
11603 gcc_checking_assert (cfun->machine->fs.fp_valid);
11604 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11605 cfun->machine->fs.fp_offset - cfa_offset);
11606 mem = gen_rtx_MEM (mode, addr);
11607 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11611 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11612 && cfa_offset >= m->fs.sp_realigned_offset)
11614 gcc_checking_assert (stack_realign_fp);
11615 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11618 /* The memory may not be relative to the current CFA register,
11619 which means that we may need to generate a new pattern for
11620 use by the unwind info. */
11621 else if (base != m->fs.cfa_reg)
11623 addr = plus_constant (Pmode, m->fs.cfa_reg,
11624 m->fs.cfa_offset - cfa_offset);
11625 mem = gen_rtx_MEM (mode, addr);
11626 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
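/* Illustrative numbers only: if the CFA is currently %rsp + 40 and
   CFA_OFFSET is 24, with the stack pointer chosen as the base this emits
   roughly "movq %rbx, 16(%rsp)"; when that address is not expressible
   relative to the current CFA register (or the frame was re-aligned), one
   of the REG_CFA_* notes above still tells the unwinder that the value
   lives at CFA - 24.  */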
11630 /* Emit code to save registers using MOV insns.
11631 First register is stored at CFA - CFA_OFFSET. */
11632 static void
11633 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11635 unsigned int regno;
11637 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11638 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11640 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11641 cfa_offset -= UNITS_PER_WORD;
11645 /* Emit code to save SSE registers using MOV insns.
11646 First register is stored at CFA - CFA_OFFSET. */
11647 static void
11648 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11650 unsigned int regno;
11652 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11653 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11655 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11656 cfa_offset -= GET_MODE_SIZE (V4SFmode);
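/* For illustration (hypothetical offsets): on 64-bit Windows targets,
   where xmm6-xmm15 are callee-saved, this typically produces 16-byte
   aligned V4SF stores such as
	movaps %xmm6, 32(%rsp)
	movaps %xmm7, 48(%rsp)
   with successive registers saved in consecutive 16-byte slots.  */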
11660 static GTY(()) rtx queued_cfa_restores;
11662 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
11663 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11664 Don't add the note if the previously saved value will be left untouched
11665 within the stack red zone until return, as unwinders can find the same value
11666 in the register and on the stack. */
11668 static void
11669 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11671 if (!crtl->shrink_wrapped
11672 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11673 return;
11675 if (insn)
11677 add_reg_note (insn, REG_CFA_RESTORE, reg);
11678 RTX_FRAME_RELATED_P (insn) = 1;
11680 else
11681 queued_cfa_restores
11682 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11685 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11687 static void
11688 ix86_add_queued_cfa_restore_notes (rtx insn)
11690 rtx last;
11691 if (!queued_cfa_restores)
11692 return;
11693 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11695 XEXP (last, 1) = REG_NOTES (insn);
11696 REG_NOTES (insn) = queued_cfa_restores;
11697 queued_cfa_restores = NULL_RTX;
11698 RTX_FRAME_RELATED_P (insn) = 1;
11701 /* Expand prologue or epilogue stack adjustment.
11702 The pattern exists to put a dependency on all ebp-based memory accesses.
11703 STYLE should be negative if instructions should be marked as frame related,
11704 zero if the %r11 register is live and cannot be freely used, and positive
11705 otherwise. */
11707 static rtx
11708 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11709 int style, bool set_cfa)
11711 struct machine_function *m = cfun->machine;
11712 rtx insn;
11713 bool add_frame_related_expr = false;
11715 if (Pmode == SImode)
11716 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11717 else if (x86_64_immediate_operand (offset, DImode))
11718 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11719 else
11721 rtx tmp;
11722 /* r11 is used by indirect sibcall return as well, set before the
11723 epilogue and used after the epilogue. */
11724 if (style)
11725 tmp = gen_rtx_REG (DImode, R11_REG);
11726 else
11728 gcc_assert (src != hard_frame_pointer_rtx
11729 && dest != hard_frame_pointer_rtx);
11730 tmp = hard_frame_pointer_rtx;
11732 insn = emit_insn (gen_rtx_SET (tmp, offset));
11733 if (style < 0)
11734 add_frame_related_expr = true;
11736 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11739 insn = emit_insn (insn);
11740 if (style >= 0)
11741 ix86_add_queued_cfa_restore_notes (insn);
11743 if (set_cfa)
11745 rtx r;
11747 gcc_assert (m->fs.cfa_reg == src);
11748 m->fs.cfa_offset += INTVAL (offset);
11749 m->fs.cfa_reg = dest;
11751 r = gen_rtx_PLUS (Pmode, src, offset);
11752 r = gen_rtx_SET (dest, r);
11753 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11754 RTX_FRAME_RELATED_P (insn) = 1;
11756 else if (style < 0)
11758 RTX_FRAME_RELATED_P (insn) = 1;
11759 if (add_frame_related_expr)
11761 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11762 r = gen_rtx_SET (dest, r);
11763 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11767 if (dest == stack_pointer_rtx)
11769 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11770 bool valid = m->fs.sp_valid;
11771 bool realigned = m->fs.sp_realigned;
11773 if (src == hard_frame_pointer_rtx)
11775 valid = m->fs.fp_valid;
11776 realigned = false;
11777 ooffset = m->fs.fp_offset;
11779 else if (src == crtl->drap_reg)
11781 valid = m->fs.drap_valid;
11782 realigned = false;
11783 ooffset = 0;
11785 else
11787 /* Else there are two possibilities: SP itself, which we set
11788 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
11789 taken care of by hand along the eh_return path. */
11790 gcc_checking_assert (src == stack_pointer_rtx
11791 || offset == const0_rtx);
11794 m->fs.sp_offset = ooffset - INTVAL (offset);
11795 m->fs.sp_valid = valid;
11796 m->fs.sp_realigned = realigned;
11798 return insn;
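/* For illustration (hypothetical sizes): a 64-bit allocation of 136 bytes
   typically appears as a single "subq $136, %rsp" (the add pattern with a
   negative offset), while an adjustment too large for a signed 32-bit
   immediate is first loaded into %r11 (or into the hard frame pointer when
   STYLE says %r11 is live) and then added, exactly as the code above
   arranges.  */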
11801 /* Find an available register to be used as the dynamic realign argument
11802 pointer register. Such a register will be written in the prologue and
11803 used at the beginning of the body, so it must not be
11804 1. a parameter passing register.
11805 2. the GOT pointer.
11806 We reuse the static-chain register if it is available. Otherwise, we
11807 use DI for i386 and R13 for x86-64. We chose R13 since it has
11808 shorter encoding.
11810 Return: the regno of chosen register. */
11812 static unsigned int
11813 find_drap_reg (void)
11815 tree decl = cfun->decl;
11817 /* Always use callee-saved register if there are no caller-saved
11818 registers. */
11819 if (TARGET_64BIT)
11821 /* Use R13 for a nested function or a function that needs a static
11822 chain. Since a function with a tail call may use any caller-saved
11823 register in the epilogue, DRAP must not use a caller-saved
11824 register in such a case. */
11825 if (DECL_STATIC_CHAIN (decl)
11826 || cfun->machine->no_caller_saved_registers
11827 || crtl->tail_call_emit)
11828 return R13_REG;
11830 return R10_REG;
11832 else
11834 /* Use DI for a nested function or a function that needs a static
11835 chain. Since a function with a tail call may use any caller-saved
11836 register in the epilogue, DRAP must not use a caller-saved
11837 register in such a case. */
11838 if (DECL_STATIC_CHAIN (decl)
11839 || cfun->machine->no_caller_saved_registers
11840 || crtl->tail_call_emit)
11841 return DI_REG;
11843 /* Reuse static chain register if it isn't used for parameter
11844 passing. */
11845 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11847 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11848 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11849 return CX_REG;
11851 return DI_REG;
11855 /* Handle a "force_align_arg_pointer" attribute. */
11857 static tree
11858 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11859 tree, int, bool *no_add_attrs)
11861 if (TREE_CODE (*node) != FUNCTION_TYPE
11862 && TREE_CODE (*node) != METHOD_TYPE
11863 && TREE_CODE (*node) != FIELD_DECL
11864 && TREE_CODE (*node) != TYPE_DECL)
11866 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11867 name);
11868 *no_add_attrs = true;
11871 return NULL_TREE;
11874 /* Return minimum incoming stack alignment. */
11876 static unsigned int
11877 ix86_minimum_incoming_stack_boundary (bool sibcall)
11879 unsigned int incoming_stack_boundary;
11881 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
11882 if (cfun->machine->func_type != TYPE_NORMAL)
11883 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11884 /* Prefer the one specified at command line. */
11885 else if (ix86_user_incoming_stack_boundary)
11886 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11887 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11888 when -mstackrealign is used, this isn't the sibcall check, and the
11889 estimated stack alignment is 128 bits. */
11890 else if (!sibcall
11891 && ix86_force_align_arg_pointer
11892 && crtl->stack_alignment_estimated == 128)
11893 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11894 else
11895 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11897 /* Incoming stack alignment can be changed on individual functions
11898 via force_align_arg_pointer attribute. We use the smallest
11899 incoming stack boundary. */
11900 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11901 && lookup_attribute (ix86_force_align_arg_pointer_string,
11902 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11903 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11905 /* The incoming stack frame has to be aligned at least at
11906 parm_stack_boundary. */
11907 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11908 incoming_stack_boundary = crtl->parm_stack_boundary;
11910 /* The stack at the entry to main is aligned by the runtime. We use the
11911 smallest incoming stack boundary. */
11912 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
11913 && DECL_NAME (current_function_decl)
11914 && MAIN_NAME_P (DECL_NAME (current_function_decl))
11915 && DECL_FILE_SCOPE_P (current_function_decl))
11916 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
11918 return incoming_stack_boundary;
11921 /* Update incoming stack boundary and estimated stack alignment. */
11923 static void
11924 ix86_update_stack_boundary (void)
11926 ix86_incoming_stack_boundary
11927 = ix86_minimum_incoming_stack_boundary (false);
11929 /* An x86_64 varargs function needs 16-byte stack alignment for the register save
11930 area. */
11931 if (TARGET_64BIT
11932 && cfun->stdarg
11933 && crtl->stack_alignment_estimated < 128)
11934 crtl->stack_alignment_estimated = 128;
11936 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
11937 if (ix86_tls_descriptor_calls_expanded_in_cfun
11938 && crtl->preferred_stack_boundary < 128)
11939 crtl->preferred_stack_boundary = 128;
11942 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
11943 needed or an rtx for DRAP otherwise. */
11945 static rtx
11946 ix86_get_drap_rtx (void)
11948 /* We must use DRAP if there are outgoing arguments on stack and
11949 ACCUMULATE_OUTGOING_ARGS is false. */
11950 if (ix86_force_drap
11951 || (cfun->machine->outgoing_args_on_stack
11952 && !ACCUMULATE_OUTGOING_ARGS))
11953 crtl->need_drap = true;
11955 if (stack_realign_drap)
11957 /* Assign DRAP to vDRAP and return vDRAP. */
11958 unsigned int regno = find_drap_reg ();
11959 rtx drap_vreg;
11960 rtx arg_ptr;
11961 rtx_insn *seq, *insn;
11963 arg_ptr = gen_rtx_REG (Pmode, regno);
11964 crtl->drap_reg = arg_ptr;
11966 start_sequence ();
11967 drap_vreg = copy_to_reg (arg_ptr);
11968 seq = get_insns ();
11969 end_sequence ();
11971 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
11972 if (!optimize)
11974 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
11975 RTX_FRAME_RELATED_P (insn) = 1;
11977 return drap_vreg;
11979 else
11980 return NULL;
11983 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
11985 static rtx
11986 ix86_internal_arg_pointer (void)
11988 return virtual_incoming_args_rtx;
11991 struct scratch_reg {
11992 rtx reg;
11993 bool saved;
11996 /* Return a short-lived scratch register for use on function entry.
11997 In 32-bit mode, it is valid only after the registers are saved
11998 in the prologue. This register must be released by means of
11999 release_scratch_register_on_entry once it is dead. */
12001 static void
12002 get_scratch_register_on_entry (struct scratch_reg *sr)
12004 int regno;
12006 sr->saved = false;
12008 if (TARGET_64BIT)
12010 /* We always use R11 in 64-bit mode. */
12011 regno = R11_REG;
12013 else
12015 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12016 bool fastcall_p
12017 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12018 bool thiscall_p
12019 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12020 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12021 int regparm = ix86_function_regparm (fntype, decl);
12022 int drap_regno
12023 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12025 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12026 for the static chain register. */
12027 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12028 && drap_regno != AX_REG)
12029 regno = AX_REG;
12030 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12031 for the static chain register. */
12032 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12033 regno = AX_REG;
12034 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12035 regno = DX_REG;
12036 /* ecx is the static chain register. */
12037 else if (regparm < 3 && !fastcall_p && !thiscall_p
12038 && !static_chain_p
12039 && drap_regno != CX_REG)
12040 regno = CX_REG;
12041 else if (ix86_save_reg (BX_REG, true, false))
12042 regno = BX_REG;
12043 /* esi is the static chain register. */
12044 else if (!(regparm == 3 && static_chain_p)
12045 && ix86_save_reg (SI_REG, true, false))
12046 regno = SI_REG;
12047 else if (ix86_save_reg (DI_REG, true, false))
12048 regno = DI_REG;
12049 else
12051 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12052 sr->saved = true;
12056 sr->reg = gen_rtx_REG (Pmode, regno);
12057 if (sr->saved)
12059 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12060 RTX_FRAME_RELATED_P (insn) = 1;
12064 /* Release a scratch register obtained from the preceding function. */
12066 static void
12067 release_scratch_register_on_entry (struct scratch_reg *sr)
12069 if (sr->saved)
12071 struct machine_function *m = cfun->machine;
12072 rtx x, insn = emit_insn (gen_pop (sr->reg));
12074 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12075 RTX_FRAME_RELATED_P (insn) = 1;
12076 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12077 x = gen_rtx_SET (stack_pointer_rtx, x);
12078 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12079 m->fs.sp_offset -= UNITS_PER_WORD;
12083 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
12085 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12087 This differs from the next routine in that it tries hard to prevent
12088 attacks that jump the stack guard. Thus it is never allowed to allocate
12089 more than PROBE_INTERVAL bytes of stack space without a suitable
12090 probe. */
12092 static void
12093 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12095 struct machine_function *m = cfun->machine;
12097 /* If this function does not statically allocate stack space, then
12098 no probes are needed. */
12099 if (!size)
12101 /* However, the allocation of space via pushes for register
12102 saves could be viewed as allocating space, but without the
12103 need to probe. */
12104 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12105 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12106 else
12107 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12108 return;
12111 /* If we are a noreturn function, then we have to consider the
12112 possibility that we're called via a jump rather than a call.
12114 Thus we don't have the implicit probe generated by saving the
12115 return address into the stack at the call. Thus, the stack
12116 pointer could be anywhere in the guard page. The safe thing
12117 to do is emit a probe now.
12119 ?!? This should be revamped to work like aarch64 and s390 where
12120 we track the offset from the most recent probe. Normally that
12121 offset would be zero. For a noreturn function we would reset
12122 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12123 we just probe when we cross PROBE_INTERVAL. */
12124 if (TREE_THIS_VOLATILE (cfun->decl))
12126 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12127 -GET_MODE_SIZE (word_mode)));
12128 emit_insn (gen_blockage ());
12131 /* If we allocate less than the size of the guard statically,
12132 then no probing is necessary, but we do need to allocate
12133 the stack. */
12134 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12136 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12137 GEN_INT (-size), -1,
12138 m->fs.cfa_reg == stack_pointer_rtx);
12139 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12140 return;
12143 /* We're allocating a large enough stack frame that we need to
12144 emit probes. Either emit them inline or in a loop depending
12145 on the size. */
12146 HOST_WIDE_INT probe_interval
12147 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12148 if (size <= 4 * probe_interval)
12150 HOST_WIDE_INT i;
12151 for (i = probe_interval; i <= size; i += probe_interval)
12153 /* Allocate PROBE_INTERVAL bytes. */
12154 rtx insn
12155 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12156 GEN_INT (-PROBE_INTERVAL), -1,
12157 m->fs.cfa_reg == stack_pointer_rtx);
12158 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12160 /* And probe at *sp. */
12161 emit_stack_probe (stack_pointer_rtx);
12162 emit_insn (gen_blockage ());
12165 /* We need to allocate space for the residual, but we do not need
12166 to probe the residual. */
12167 HOST_WIDE_INT residual = (i - probe_interval - size);
12168 if (residual)
12169 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12170 GEN_INT (residual), -1,
12171 m->fs.cfa_reg == stack_pointer_rtx);
12172 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12174 else
12176 struct scratch_reg sr;
12177 get_scratch_register_on_entry (&sr);
12179 /* Step 1: round SIZE down to a multiple of the interval. */
12180 HOST_WIDE_INT rounded_size = size & -probe_interval;
12182 /* Step 2: compute final value of the loop counter. Use lea if
12183 possible. */
12184 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12185 rtx insn;
12186 if (address_no_seg_operand (addr, Pmode))
12187 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12188 else
12190 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12191 insn = emit_insn (gen_rtx_SET (sr.reg,
12192 gen_rtx_PLUS (Pmode, sr.reg,
12193 stack_pointer_rtx)));
12195 if (m->fs.cfa_reg == stack_pointer_rtx)
12197 add_reg_note (insn, REG_CFA_DEF_CFA,
12198 plus_constant (Pmode, sr.reg,
12199 m->fs.cfa_offset + rounded_size));
12200 RTX_FRAME_RELATED_P (insn) = 1;
12203 /* Step 3: the loop. */
12204 rtx size_rtx = GEN_INT (rounded_size);
12205 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12206 size_rtx));
12207 if (m->fs.cfa_reg == stack_pointer_rtx)
12209 m->fs.cfa_offset += rounded_size;
12210 add_reg_note (insn, REG_CFA_DEF_CFA,
12211 plus_constant (Pmode, stack_pointer_rtx,
12212 m->fs.cfa_offset));
12213 RTX_FRAME_RELATED_P (insn) = 1;
12215 m->fs.sp_offset += rounded_size;
12216 emit_insn (gen_blockage ());
12218 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12219 is equal to ROUNDED_SIZE. */
12221 if (size != rounded_size)
12222 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12223 GEN_INT (rounded_size - size), -1,
12224 m->fs.cfa_reg == stack_pointer_rtx);
12225 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12227 release_scratch_register_on_entry (&sr);
12230 /* Make sure nothing is scheduled before we are done. */
12231 emit_insn (gen_blockage ());
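/* A rough sketch, assuming the default 4096-byte probe interval and a
   hypothetical static allocation of 8192 bytes: the inline path above
   emits approximately
	subq $4096, %rsp
	orq  $0, (%rsp)
	subq $4096, %rsp
	orq  $0, (%rsp)
   while allocations larger than four probe intervals instead set up a
   counter in a scratch register and probe in a loop, allocating (but not
   probing) any residual afterwards.  */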
12234 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12236 static void
12237 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12239 /* We skip the probe for the first interval + a small dope of 4 words and
12240 probe that many bytes past the specified size to maintain a protection
12241 area at the bottom of the stack. */
12242 const int dope = 4 * UNITS_PER_WORD;
12243 rtx size_rtx = GEN_INT (size), last;
12245 /* See if we have a constant small number of probes to generate. If so,
12246 that's the easy case. The run-time loop is made up of 9 insns in the
12247 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12248 for n # of intervals. */
12249 if (size <= 4 * PROBE_INTERVAL)
12251 HOST_WIDE_INT i, adjust;
12252 bool first_probe = true;
12254 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12255 values of N from 1 until it exceeds SIZE. If only one probe is
12256 needed, this will not generate any code. Then adjust and probe
12257 to PROBE_INTERVAL + SIZE. */
12258 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
12260 if (first_probe)
12262 adjust = 2 * PROBE_INTERVAL + dope;
12263 first_probe = false;
12265 else
12266 adjust = PROBE_INTERVAL;
12268 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12269 plus_constant (Pmode, stack_pointer_rtx,
12270 -adjust)));
12271 emit_stack_probe (stack_pointer_rtx);
12274 if (first_probe)
12275 adjust = size + PROBE_INTERVAL + dope;
12276 else
12277 adjust = size + PROBE_INTERVAL - i;
12279 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12280 plus_constant (Pmode, stack_pointer_rtx,
12281 -adjust)));
12282 emit_stack_probe (stack_pointer_rtx);
12284 /* Adjust back to account for the additional first interval. */
12285 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12286 plus_constant (Pmode, stack_pointer_rtx,
12287 PROBE_INTERVAL + dope)));
12290 /* Otherwise, do the same as above, but in a loop. Note that we must be
12291 extra careful with variables wrapping around because we might be at
12292 the very top (or the very bottom) of the address space and we have
12293 to be able to handle this case properly; in particular, we use an
12294 equality test for the loop condition. */
12295 else
12297 HOST_WIDE_INT rounded_size;
12298 struct scratch_reg sr;
12300 get_scratch_register_on_entry (&sr);
12303 /* Step 1: round SIZE to the previous multiple of the interval. */
12305 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
12308 /* Step 2: compute initial and final value of the loop counter. */
12310 /* SP = SP_0 + PROBE_INTERVAL. */
12311 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12312 plus_constant (Pmode, stack_pointer_rtx,
12313 - (PROBE_INTERVAL + dope))));
12315 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12316 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12317 emit_insn (gen_rtx_SET (sr.reg,
12318 plus_constant (Pmode, stack_pointer_rtx,
12319 -rounded_size)));
12320 else
12322 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12323 emit_insn (gen_rtx_SET (sr.reg,
12324 gen_rtx_PLUS (Pmode, sr.reg,
12325 stack_pointer_rtx)));
12329 /* Step 3: the loop
12333 SP = SP + PROBE_INTERVAL
12334 probe at SP
12336 while (SP != LAST_ADDR)
12338 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12339 values of N from 1 until it is equal to ROUNDED_SIZE. */
12341 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12344 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12345 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12347 if (size != rounded_size)
12349 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12350 plus_constant (Pmode, stack_pointer_rtx,
12351 rounded_size - size)));
12352 emit_stack_probe (stack_pointer_rtx);
12355 /* Adjust back to account for the additional first interval. */
12356 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12357 plus_constant (Pmode, stack_pointer_rtx,
12358 PROBE_INTERVAL + dope)));
12360 release_scratch_register_on_entry (&sr);
12363 /* Even if the stack pointer isn't the CFA register, we need to correctly
12364 describe the adjustments made to it, in particular differentiate the
12365 frame-related ones from the frame-unrelated ones. */
12366 if (size > 0)
12368 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12369 XVECEXP (expr, 0, 0)
12370 = gen_rtx_SET (stack_pointer_rtx,
12371 plus_constant (Pmode, stack_pointer_rtx, -size));
12372 XVECEXP (expr, 0, 1)
12373 = gen_rtx_SET (stack_pointer_rtx,
12374 plus_constant (Pmode, stack_pointer_rtx,
12375 PROBE_INTERVAL + dope + size));
12376 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12377 RTX_FRAME_RELATED_P (last) = 1;
12379 cfun->machine->fs.sp_offset += size;
12382 /* Make sure nothing is scheduled before we are done. */
12383 emit_insn (gen_blockage ());
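/* Worked example with hypothetical numbers, assuming a 64-bit target
   (dope = 32) and a 4096-byte PROBE_INTERVAL, for size = 10000: the code
   above adjusts by 2*4096 + 32 = 8224 and probes, adjusts by 4096 and
   probes, adjusts by 10000 + 4096 - 12288 = 1808 and probes, and finally
   adjusts back by 4096 + 32 = 4128, for a net change of exactly -10000
   bytes of stack.  */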
12386 /* Adjust the stack pointer up to REG while probing it. */
12388 const char *
12389 output_adjust_stack_and_probe (rtx reg)
12391 static int labelno = 0;
12392 char loop_lab[32];
12393 rtx xops[2];
12395 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12397 /* Loop. */
12398 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12400 /* SP = SP + PROBE_INTERVAL. */
12401 xops[0] = stack_pointer_rtx;
12402 xops[1] = GEN_INT (PROBE_INTERVAL);
12403 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12405 /* Probe at SP. */
12406 xops[1] = const0_rtx;
12407 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12409 /* Test if SP == LAST_ADDR. */
12410 xops[0] = stack_pointer_rtx;
12411 xops[1] = reg;
12412 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12414 /* Branch. */
12415 fputs ("\tjne\t", asm_out_file);
12416 assemble_name_raw (asm_out_file, loop_lab);
12417 fputc ('\n', asm_out_file);
12419 return "";
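/* The loop emitted above looks roughly like this in 64-bit AT&T syntax
   (label name and scratch register are illustrative):
   .LPSRL0:
	subq $4096, %rsp
	orq  $0, (%rsp)
	cmpq %rax, %rsp
	jne  .LPSRL0
   where the register compared against holds LAST_ADDR as computed by the
   caller of the corresponding pattern.  */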
12422 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12423 inclusive. These are offsets from the current stack pointer. */
12425 static void
12426 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12428 /* See if we have a constant small number of probes to generate. If so,
12429 that's the easy case. The run-time loop is made up of 6 insns in the
12430 generic case while the compile-time loop is made up of n insns for n #
12431 of intervals. */
12432 if (size <= 6 * PROBE_INTERVAL)
12434 HOST_WIDE_INT i;
12436 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12437 it exceeds SIZE. If only one probe is needed, this will not
12438 generate any code. Then probe at FIRST + SIZE. */
12439 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
12440 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12441 -(first + i)));
12443 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12444 -(first + size)));
12447 /* Otherwise, do the same as above, but in a loop. Note that we must be
12448 extra careful with variables wrapping around because we might be at
12449 the very top (or the very bottom) of the address space and we have
12450 to be able to handle this case properly; in particular, we use an
12451 equality test for the loop condition. */
12452 else
12454 HOST_WIDE_INT rounded_size, last;
12455 struct scratch_reg sr;
12457 get_scratch_register_on_entry (&sr);
12460 /* Step 1: round SIZE to the previous multiple of the interval. */
12462 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
12465 /* Step 2: compute initial and final value of the loop counter. */
12467 /* TEST_OFFSET = FIRST. */
12468 emit_move_insn (sr.reg, GEN_INT (-first));
12470 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12471 last = first + rounded_size;
12474 /* Step 3: the loop
12478 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12479 probe at TEST_ADDR
12481 while (TEST_ADDR != LAST_ADDR)
12483 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12484 until it is equal to ROUNDED_SIZE. */
12486 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12489 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12490 that SIZE is equal to ROUNDED_SIZE. */
12492 if (size != rounded_size)
12493 emit_stack_probe (plus_constant (Pmode,
12494 gen_rtx_PLUS (Pmode,
12495 stack_pointer_rtx,
12496 sr.reg),
12497 rounded_size - size));
12499 release_scratch_register_on_entry (&sr);
12502 /* Make sure nothing is scheduled before we are done. */
12503 emit_insn (gen_blockage ());
12506 /* Probe a range of stack addresses from REG to END, inclusive. These are
12507 offsets from the current stack pointer. */
12509 const char *
12510 output_probe_stack_range (rtx reg, rtx end)
12512 static int labelno = 0;
12513 char loop_lab[32];
12514 rtx xops[3];
12516 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12518 /* Loop. */
12519 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12521 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12522 xops[0] = reg;
12523 xops[1] = GEN_INT (PROBE_INTERVAL);
12524 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12526 /* Probe at TEST_ADDR. */
12527 xops[0] = stack_pointer_rtx;
12528 xops[1] = reg;
12529 xops[2] = const0_rtx;
12530 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12532 /* Test if TEST_ADDR == LAST_ADDR. */
12533 xops[0] = reg;
12534 xops[1] = end;
12535 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12537 /* Branch. */
12538 fputs ("\tjne\t", asm_out_file);
12539 assemble_name_raw (asm_out_file, loop_lab);
12540 fputc ('\n', asm_out_file);
12542 return "";
12545 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12546 will guide prologue/epilogue to be generated in correct form. */
12548 static void
12549 ix86_finalize_stack_frame_flags (void)
12551 /* Check whether stack realignment is really needed after reload, and
12552 store the result in cfun. */
12553 unsigned int incoming_stack_boundary
12554 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12555 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12556 unsigned int stack_alignment
12557 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12558 ? crtl->max_used_stack_slot_alignment
12559 : crtl->stack_alignment_needed);
12560 unsigned int stack_realign
12561 = (incoming_stack_boundary < stack_alignment);
12562 bool recompute_frame_layout_p = false;
12564 if (crtl->stack_realign_finalized)
12566 /* After stack_realign_needed is finalized, we can no longer
12567 change it. */
12568 gcc_assert (crtl->stack_realign_needed == stack_realign);
12569 return;
12572 /* If the only reason for frame_pointer_needed is that we conservatively
12573 assumed stack realignment might be needed or -fno-omit-frame-pointer
12574 is used, but in the end nothing that needed the stack alignment had
12575 been spilled and there was no stack access, clear frame_pointer_needed
12576 and say we don't need stack realignment. */
12577 if ((stack_realign || !flag_omit_frame_pointer)
12578 && frame_pointer_needed
12579 && crtl->is_leaf
12580 && crtl->sp_is_unchanging
12581 && !ix86_current_function_calls_tls_descriptor
12582 && !crtl->accesses_prior_frames
12583 && !cfun->calls_alloca
12584 && !crtl->calls_eh_return
12585 /* See ira_setup_eliminable_regset for the rationale. */
12586 && !(STACK_CHECK_MOVING_SP
12587 && flag_stack_check
12588 && flag_exceptions
12589 && cfun->can_throw_non_call_exceptions)
12590 && !ix86_frame_pointer_required ()
12591 && get_frame_size () == 0
12592 && ix86_nsaved_sseregs () == 0
12593 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12595 HARD_REG_SET set_up_by_prologue, prologue_used;
12596 basic_block bb;
12598 CLEAR_HARD_REG_SET (prologue_used);
12599 CLEAR_HARD_REG_SET (set_up_by_prologue);
12600 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12601 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12602 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12603 HARD_FRAME_POINTER_REGNUM);
12605 /* The preferred stack alignment is the minimum stack alignment. */
12606 if (stack_alignment > crtl->preferred_stack_boundary)
12607 stack_alignment = crtl->preferred_stack_boundary;
12609 bool require_stack_frame = false;
12611 FOR_EACH_BB_FN (bb, cfun)
12613 rtx_insn *insn;
12614 FOR_BB_INSNS (bb, insn)
12615 if (NONDEBUG_INSN_P (insn)
12616 && requires_stack_frame_p (insn, prologue_used,
12617 set_up_by_prologue))
12619 require_stack_frame = true;
12621 if (stack_realign)
12623 /* Find the maximum stack alignment. */
12624 subrtx_iterator::array_type array;
12625 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12626 if (MEM_P (*iter)
12627 && (reg_mentioned_p (stack_pointer_rtx,
12628 *iter)
12629 || reg_mentioned_p (frame_pointer_rtx,
12630 *iter)))
12632 unsigned int alignment = MEM_ALIGN (*iter);
12633 if (alignment > stack_alignment)
12634 stack_alignment = alignment;
12640 if (require_stack_frame)
12642 /* Stack frame is required. If stack alignment needed is less
12643 than incoming stack boundary, don't realign stack. */
12644 stack_realign = incoming_stack_boundary < stack_alignment;
12645 if (!stack_realign)
12647 crtl->max_used_stack_slot_alignment
12648 = incoming_stack_boundary;
12649 crtl->stack_alignment_needed
12650 = incoming_stack_boundary;
12651 /* Also update preferred_stack_boundary for leaf
12652 functions. */
12653 crtl->preferred_stack_boundary
12654 = incoming_stack_boundary;
12657 else
12659 /* If drap has been set, but it actually isn't live at the
12660 start of the function, there is no reason to set it up. */
12661 if (crtl->drap_reg)
12663 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12664 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12665 REGNO (crtl->drap_reg)))
12667 crtl->drap_reg = NULL_RTX;
12668 crtl->need_drap = false;
12671 else
12672 cfun->machine->no_drap_save_restore = true;
12674 frame_pointer_needed = false;
12675 stack_realign = false;
12676 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12677 crtl->stack_alignment_needed = incoming_stack_boundary;
12678 crtl->stack_alignment_estimated = incoming_stack_boundary;
12679 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12680 crtl->preferred_stack_boundary = incoming_stack_boundary;
12681 df_finish_pass (true);
12682 df_scan_alloc (NULL);
12683 df_scan_blocks ();
12684 df_compute_regs_ever_live (true);
12685 df_analyze ();
12687 if (flag_var_tracking)
12689 /* Since frame pointer is no longer available, replace it with
12690 stack pointer - UNITS_PER_WORD in debug insns. */
12691 df_ref ref, next;
12692 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12693 ref; ref = next)
12695 next = DF_REF_NEXT_REG (ref);
12696 if (!DF_REF_INSN_INFO (ref))
12697 continue;
12699 /* Make sure the next ref is for a different instruction,
12700 so that we're not affected by the rescan. */
12701 rtx_insn *insn = DF_REF_INSN (ref);
12702 while (next && DF_REF_INSN (next) == insn)
12703 next = DF_REF_NEXT_REG (next);
12705 if (DEBUG_INSN_P (insn))
12707 bool changed = false;
12708 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12710 rtx *loc = DF_REF_LOC (ref);
12711 if (*loc == hard_frame_pointer_rtx)
12713 *loc = plus_constant (Pmode,
12714 stack_pointer_rtx,
12715 -UNITS_PER_WORD);
12716 changed = true;
12719 if (changed)
12720 df_insn_rescan (insn);
12725 recompute_frame_layout_p = true;
12729 if (crtl->stack_realign_needed != stack_realign)
12730 recompute_frame_layout_p = true;
12731 crtl->stack_realign_needed = stack_realign;
12732 crtl->stack_realign_finalized = true;
12733 if (recompute_frame_layout_p)
12734 ix86_compute_frame_layout ();
12737 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12739 static void
12740 ix86_elim_entry_set_got (rtx reg)
12742 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12743 rtx_insn *c_insn = BB_HEAD (bb);
12744 if (!NONDEBUG_INSN_P (c_insn))
12745 c_insn = next_nonnote_nondebug_insn (c_insn);
12746 if (c_insn && NONJUMP_INSN_P (c_insn))
12748 rtx pat = PATTERN (c_insn);
12749 if (GET_CODE (pat) == PARALLEL)
12751 rtx vec = XVECEXP (pat, 0, 0);
12752 if (GET_CODE (vec) == SET
12753 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12754 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12755 delete_insn (c_insn);
12760 static rtx
12761 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12763 rtx addr, mem;
12765 if (offset)
12766 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12767 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12768 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12771 static inline rtx
12772 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12774 return gen_frame_set (reg, frame_reg, offset, false);
12777 static inline rtx
12778 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12780 return gen_frame_set (reg, frame_reg, offset, true);
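/* Usage sketch (register and offset are hypothetical): given an rtx RAX
   for the stub pointer, gen_frame_store (gen_rtx_REG (DImode, BX_REG),
   rax, -8) builds roughly
	(set (mem:DI (plus:DI (reg:DI ax) (const_int -8))) (reg:DI bx))
   i.e. a store of %rbx eight bytes below the address in %rax, and
   gen_frame_load builds the mirror-image restore.  The real offsets used
   below come from the xlogue layout.  */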
12783 static void
12784 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12786 struct machine_function *m = cfun->machine;
12787 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12788 + m->call_ms2sysv_extra_regs;
12789 rtvec v = rtvec_alloc (ncregs + 1);
12790 unsigned int align, i, vi = 0;
12791 rtx_insn *insn;
12792 rtx sym, addr;
12793 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12794 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12795 HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset;
12797 /* AL should only be live with sysv_abi. */
12798 gcc_assert (!ix86_eax_live_at_start_p ());
12800 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
12801 regardless of whether we've actually realigned the stack or not. */
12802 align = GET_MODE_ALIGNMENT (V4SFmode);
12803 addr = choose_baseaddr (frame.stack_realign_offset
12804 + xlogue.get_stub_ptr_offset (), &align);
12805 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12806 emit_insn (gen_rtx_SET (rax, addr));
12808 /* Allocate stack if not already done. */
12809 if (allocate > 0)
12810 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12811 GEN_INT (-allocate), -1, false);
12813 /* Get the stub symbol. */
12814 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12815 : XLOGUE_STUB_SAVE);
12816 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12818 for (i = 0; i < ncregs; ++i)
12820 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12821 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12822 r.regno);
12823 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12826 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12828 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12829 RTX_FRAME_RELATED_P (insn) = true;
12832 /* Expand the prologue into a bunch of separate insns. */
12834 void
12835 ix86_expand_prologue (void)
12837 struct machine_function *m = cfun->machine;
12838 rtx insn, t;
12839 struct ix86_frame frame;
12840 HOST_WIDE_INT allocate;
12841 bool int_registers_saved;
12842 bool sse_registers_saved;
12843 rtx static_chain = NULL_RTX;
12845 if (ix86_function_naked (current_function_decl))
12846 return;
12848 ix86_finalize_stack_frame_flags ();
12850 /* DRAP should not coexist with stack_realign_fp */
12851 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12853 memset (&m->fs, 0, sizeof (m->fs));
12855 /* Initialize CFA state for before the prologue. */
12856 m->fs.cfa_reg = stack_pointer_rtx;
12857 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12859 /* Track SP offset to the CFA. We continue tracking this after we've
12860 swapped the CFA register away from SP. In the case of re-alignment
12861 this is fudged; we're interested in offsets within the local frame. */
12862 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12863 m->fs.sp_valid = true;
12864 m->fs.sp_realigned = false;
12866 frame = m->frame;
12868 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12870 /* We should have already generated an error for any use of
12871 ms_hook on a nested function. */
12872 gcc_checking_assert (!ix86_static_chain_on_stack);
12874 /* Check if profiling is active and we shall use the profiling-before-prologue
12875 variant. If so, sorry. */
12876 if (crtl->profile && flag_fentry != 0)
12877 sorry ("ms_hook_prologue attribute isn%'t compatible "
12878 "with -mfentry for 32-bit");
12880 /* In ix86_asm_output_function_label we emitted:
12881 8b ff movl.s %edi,%edi
12882 55 push %ebp
12883 8b ec movl.s %esp,%ebp
12885 This matches the hookable function prologue in Win32 API
12886 functions in Microsoft Windows XP Service Pack 2 and newer.
12887 Wine uses this to enable Windows apps to hook the Win32 API
12888 functions provided by Wine.
12890 What that means is that we've already set up the frame pointer. */
12892 if (frame_pointer_needed
12893 && !(crtl->drap_reg && crtl->stack_realign_needed))
12895 rtx push, mov;
12897 /* We've decided to use the frame pointer already set up.
12898 Describe this to the unwinder by pretending that both
12899 push and mov insns happen right here.
12901 Putting the unwind info here at the end of the ms_hook
12902 is done so that we can make absolutely certain we get
12903 the required byte sequence at the start of the function,
12904 rather than relying on an assembler that can produce
12905 the exact encoding required.
12907 However it does mean (in the unpatched case) that we have
12908 a 1 insn window where the asynchronous unwind info is
12909 incorrect. However, if we placed the unwind info at
12910 its correct location we would have incorrect unwind info
12911 in the patched case. Which is probably all moot since
12912 I don't expect Wine generates dwarf2 unwind info for the
12913 system libraries that use this feature. */
12915 insn = emit_insn (gen_blockage ());
12917 push = gen_push (hard_frame_pointer_rtx);
12918 mov = gen_rtx_SET (hard_frame_pointer_rtx,
12919 stack_pointer_rtx);
12920 RTX_FRAME_RELATED_P (push) = 1;
12921 RTX_FRAME_RELATED_P (mov) = 1;
12923 RTX_FRAME_RELATED_P (insn) = 1;
12924 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
12925 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
12927 /* Note that gen_push incremented m->fs.cfa_offset, even
12928 though we didn't emit the push insn here. */
12929 m->fs.cfa_reg = hard_frame_pointer_rtx;
12930 m->fs.fp_offset = m->fs.cfa_offset;
12931 m->fs.fp_valid = true;
12933 else
12935 /* The frame pointer is not needed so pop %ebp again.
12936 This leaves us with a pristine state. */
12937 emit_insn (gen_pop (hard_frame_pointer_rtx));
12941 /* The first insn of a function that accepts its static chain on the
12942 stack is to push the register that would be filled in by a direct
12943 call. This insn will be skipped by the trampoline. */
12944 else if (ix86_static_chain_on_stack)
12946 static_chain = ix86_static_chain (cfun->decl, false);
12947 insn = emit_insn (gen_push (static_chain));
12948 emit_insn (gen_blockage ());
12950 /* We don't want to interpret this push insn as a register save,
12951 only as a stack adjustment. The real copy of the register as
12952 a save will be done later, if needed. */
12953 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12954 t = gen_rtx_SET (stack_pointer_rtx, t);
12955 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
12956 RTX_FRAME_RELATED_P (insn) = 1;
12959 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
12960 DRAP is needed and stack realignment is really needed after reload. */
12961 if (stack_realign_drap)
12963 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
12965 /* Can't use DRAP in interrupt function. */
12966 if (cfun->machine->func_type != TYPE_NORMAL)
12967 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
12968 "in interrupt service routine. This may be worked "
12969 "around by avoiding functions with aggregate return.");
12971 /* Only need to push parameter pointer reg if it is caller saved. */
12972 if (!call_used_regs[REGNO (crtl->drap_reg)])
12974 /* Push arg pointer reg */
12975 insn = emit_insn (gen_push (crtl->drap_reg));
12976 RTX_FRAME_RELATED_P (insn) = 1;
12979 /* Grab the argument pointer. */
12980 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
12981 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
12982 RTX_FRAME_RELATED_P (insn) = 1;
12983 m->fs.cfa_reg = crtl->drap_reg;
12984 m->fs.cfa_offset = 0;
12986 /* Align the stack. */
12987 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
12988 stack_pointer_rtx,
12989 GEN_INT (-align_bytes)));
12990 RTX_FRAME_RELATED_P (insn) = 1;
12992 /* Replicate the return address on the stack so that return
12993 address can be reached via (argp - 1) slot. This is needed
12994 to implement macro RETURN_ADDR_RTX and intrinsic function
12995 expand_builtin_return_addr etc. */
12996 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
12997 t = gen_frame_mem (word_mode, t);
12998 insn = emit_insn (gen_push (t));
12999 RTX_FRAME_RELATED_P (insn) = 1;
13001 /* For the purposes of frame and register save area addressing,
13002 we've started over with a new frame. */
13003 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13004 m->fs.realigned = true;
13006 if (static_chain)
13008 /* Replicate static chain on the stack so that static chain
13009 can be reached via (argp - 2) slot. This is needed for
13010 nested function with stack realignment. */
13011 insn = emit_insn (gen_push (static_chain));
13012 RTX_FRAME_RELATED_P (insn) = 1;
13016 int_registers_saved = (frame.nregs == 0);
13017 sse_registers_saved = (frame.nsseregs == 0);
13019 if (frame_pointer_needed && !m->fs.fp_valid)
13021 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13022 slower on all targets. Also sdb doesn't like it. */
13023 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13024 RTX_FRAME_RELATED_P (insn) = 1;
13026 /* Push registers now, before setting the frame pointer
13027 on SEH target. */
13028 if (!int_registers_saved
13029 && TARGET_SEH
13030 && !frame.save_regs_using_mov)
13032 ix86_emit_save_regs ();
13033 int_registers_saved = true;
13034 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13037 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13039 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13040 RTX_FRAME_RELATED_P (insn) = 1;
13042 if (m->fs.cfa_reg == stack_pointer_rtx)
13043 m->fs.cfa_reg = hard_frame_pointer_rtx;
13044 m->fs.fp_offset = m->fs.sp_offset;
13045 m->fs.fp_valid = true;
13049 if (!int_registers_saved)
13051 /* If saving registers via PUSH, do so now. */
13052 if (!frame.save_regs_using_mov)
13054 ix86_emit_save_regs ();
13055 int_registers_saved = true;
13056 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13059 /* When using the red zone we may start register saving before allocating
13060 the stack frame, saving one cycle of the prologue. However, avoid
13061 doing this if we have to probe the stack; at least on x86_64 the
13062 stack probe can turn into a call that clobbers a red zone location. */
13063 else if (ix86_using_red_zone ()
13064 && (! TARGET_STACK_PROBE
13065 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13067 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13068 int_registers_saved = true;
13072 if (stack_realign_fp)
13074 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13075 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13077 /* Record last valid frame pointer offset. */
13078 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13080 /* The computation of the size of the re-aligned stack frame means
13081 that we must allocate the size of the register save area before
13082 performing the actual alignment. Otherwise we cannot guarantee
13083 that there's enough storage above the realignment point. */
13084 allocate = frame.reg_save_offset - m->fs.sp_offset
13085 + frame.stack_realign_allocate;
13086 if (allocate)
13087 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13088 GEN_INT (-allocate), -1, false);
13090 /* Align the stack. */
13091 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13092 stack_pointer_rtx,
13093 GEN_INT (-align_bytes)));
13094 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13095 m->fs.sp_realigned_offset = m->fs.sp_offset
13096 - frame.stack_realign_allocate;
13097 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13098 Beyond this point, stack access should be done via choose_baseaddr or
13099 by using sp_valid_at and fp_valid_at to determine the correct base
13100 register. Henceforth, any CFA offset should be thought of as logical
13101 and not physical. */
13102 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13103 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13104 m->fs.sp_realigned = true;
13106 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13107 is needed to describe where a register is saved using a realigned
13108 stack pointer, so we need to invalidate the stack pointer for that
13109 target. */
13110 if (TARGET_SEH)
13111 m->fs.sp_valid = false;
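      /* In outline, the sequence emitted above is "sub $N, %rsp" for the
	 register save area followed by "and $-ALIGN, %rsp".  After the AND
	 the distance from the CFA to %rsp is only known modulo ALIGN, which
	 is why sp_realigned_offset rather than sp_offset must be used for
	 addressing from here on.  */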
13114 if (m->call_ms2sysv)
13115 ix86_emit_outlined_ms2sysv_save (frame);
13117 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13119 if (flag_stack_usage_info)
13121 /* We start to count from ARG_POINTER. */
13122 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13124 /* If it was realigned, take into account the fake frame. */
13125 if (stack_realign_drap)
13127 if (ix86_static_chain_on_stack)
13128 stack_size += UNITS_PER_WORD;
13130 if (!call_used_regs[REGNO (crtl->drap_reg)])
13131 stack_size += UNITS_PER_WORD;
13133 /* This over-estimates by 1 minimal-stack-alignment-unit but
13134 mitigates that by counting in the new return address slot. */
13135 current_function_dynamic_stack_size
13136 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13139 current_function_static_stack_size = stack_size;
13142 /* On SEH target with very large frame size, allocate an area to save
13143 SSE registers (as the very large allocation won't be described). */
13144 if (TARGET_SEH
13145 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13146 && !sse_registers_saved)
13148 HOST_WIDE_INT sse_size =
13149 frame.sse_reg_save_offset - frame.reg_save_offset;
13151 gcc_assert (int_registers_saved);
13153 /* No need to do stack checking as the area will be immediately
13154 written. */
13155 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13156 GEN_INT (-sse_size), -1,
13157 m->fs.cfa_reg == stack_pointer_rtx);
13158 allocate -= sse_size;
13159 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13160 sse_registers_saved = true;
13163 /* The stack has already been decremented by the instruction calling us
13164 so probe if the size is non-negative to preserve the protection area. */
13165 if (allocate >= 0
13166 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13167 || flag_stack_clash_protection))
13169 /* We expect the GP registers to be saved when probes are used. */
13170 gcc_assert (int_registers_saved);
13172 if (flag_stack_clash_protection)
13174 ix86_adjust_stack_and_probe_stack_clash (allocate);
13175 allocate = 0;
13177 else if (STACK_CHECK_MOVING_SP)
13179 if (!(crtl->is_leaf && !cfun->calls_alloca
13180 && allocate <= PROBE_INTERVAL))
13182 ix86_adjust_stack_and_probe (allocate);
13183 allocate = 0;
13186 else
13188 HOST_WIDE_INT size = allocate;
13190 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13191 size = 0x80000000 - get_stack_check_protect () - 1;
13193 if (TARGET_STACK_PROBE)
13195 if (crtl->is_leaf && !cfun->calls_alloca)
13197 if (size > PROBE_INTERVAL)
13198 ix86_emit_probe_stack_range (0, size);
13200 else
13201 ix86_emit_probe_stack_range (0,
13202 size + get_stack_check_protect ());
13204 else
13206 if (crtl->is_leaf && !cfun->calls_alloca)
13208 if (size > PROBE_INTERVAL
13209 && size > get_stack_check_protect ())
13210 ix86_emit_probe_stack_range (get_stack_check_protect (),
13211 size - get_stack_check_protect ());
13213 else
13214 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
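      /* The probes touch the frame at PROBE_INTERVAL steps (a page-sized
	 interval by default) so that a large allocation cannot jump over the
	 guard page; with -fstack-clash-protection the allocation itself is
	 interleaved with the probes rather than done in one step.  */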
13219 if (allocate == 0)
13221 else if (!ix86_target_stack_probe ()
13222 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13224 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13225 GEN_INT (-allocate), -1,
13226 m->fs.cfa_reg == stack_pointer_rtx);
13228 else
13230 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13231 rtx r10 = NULL;
13232 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13233 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13234 bool eax_live = ix86_eax_live_at_start_p ();
13235 bool r10_live = false;
13237 if (TARGET_64BIT)
13238 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13240 if (eax_live)
13242 insn = emit_insn (gen_push (eax));
13243 allocate -= UNITS_PER_WORD;
13244 /* Note that SEH directives need to continue tracking the stack
13245 pointer even after the frame pointer has been set up. */
13246 if (sp_is_cfa_reg || TARGET_SEH)
13248 if (sp_is_cfa_reg)
13249 m->fs.cfa_offset += UNITS_PER_WORD;
13250 RTX_FRAME_RELATED_P (insn) = 1;
13251 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13252 gen_rtx_SET (stack_pointer_rtx,
13253 plus_constant (Pmode, stack_pointer_rtx,
13254 -UNITS_PER_WORD)));
13258 if (r10_live)
13260 r10 = gen_rtx_REG (Pmode, R10_REG);
13261 insn = emit_insn (gen_push (r10));
13262 allocate -= UNITS_PER_WORD;
13263 if (sp_is_cfa_reg || TARGET_SEH)
13265 if (sp_is_cfa_reg)
13266 m->fs.cfa_offset += UNITS_PER_WORD;
13267 RTX_FRAME_RELATED_P (insn) = 1;
13268 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13269 gen_rtx_SET (stack_pointer_rtx,
13270 plus_constant (Pmode, stack_pointer_rtx,
13271 -UNITS_PER_WORD)));
13275 emit_move_insn (eax, GEN_INT (allocate));
13276 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13278 /* Use the fact that AX still contains ALLOCATE. */
13279 adjust_stack_insn = (Pmode == DImode
13280 ? gen_pro_epilogue_adjust_stack_di_sub
13281 : gen_pro_epilogue_adjust_stack_si_sub);
13283 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13284 stack_pointer_rtx, eax));
13286 if (sp_is_cfa_reg || TARGET_SEH)
13288 if (sp_is_cfa_reg)
13289 m->fs.cfa_offset += allocate;
13290 RTX_FRAME_RELATED_P (insn) = 1;
13291 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13292 gen_rtx_SET (stack_pointer_rtx,
13293 plus_constant (Pmode, stack_pointer_rtx,
13294 -allocate)));
13296 m->fs.sp_offset += allocate;
13298 /* Use stack_pointer_rtx for relative addressing so that code
13299 works for realigned stack, too. */
13300 if (r10_live && eax_live)
13302 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13303 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13304 gen_frame_mem (word_mode, t));
13305 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13306 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13307 gen_frame_mem (word_mode, t));
13309 else if (eax_live || r10_live)
13311 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13312 emit_move_insn (gen_rtx_REG (word_mode,
13313 (eax_live ? AX_REG : R10_REG)),
13314 gen_frame_mem (word_mode, t));
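      /* In this path the allocation size is materialized in %rax/%eax and
	 handed to a probing worker (e.g. a chkstk-style routine on Windows
	 targets); the same register still holds the size for the final
	 "sub %rax, %rsp", and any live %eax/%r10 values pushed above are
	 reloaded from just above the new stack pointer.  */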
13317 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13319 /* If we haven't already set up the frame pointer, do so now. */
13320 if (frame_pointer_needed && !m->fs.fp_valid)
13322 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13323 GEN_INT (frame.stack_pointer_offset
13324 - frame.hard_frame_pointer_offset));
13325 insn = emit_insn (insn);
13326 RTX_FRAME_RELATED_P (insn) = 1;
13327 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13329 if (m->fs.cfa_reg == stack_pointer_rtx)
13330 m->fs.cfa_reg = hard_frame_pointer_rtx;
13331 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13332 m->fs.fp_valid = true;
13335 if (!int_registers_saved)
13336 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13337 if (!sse_registers_saved)
13338 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13340 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
13341 in PROLOGUE. */
13342 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13344 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13345 insn = emit_insn (gen_set_got (pic));
13346 RTX_FRAME_RELATED_P (insn) = 1;
13347 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13348 emit_insn (gen_prologue_use (pic));
13349 /* Delete an already-emitted SET_GOT if it exists and is allocated to
13350 REAL_PIC_OFFSET_TABLE_REGNUM. */
13351 ix86_elim_entry_set_got (pic);
13354 if (crtl->drap_reg && !crtl->stack_realign_needed)
13356 /* vDRAP is set up, but after reload it turns out that stack
13357 realignment isn't necessary; emit the prologue to set up DRAP
13358 without the stack realignment adjustment. */
13359 t = choose_baseaddr (0, NULL);
13360 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13363 /* Prevent instructions from being scheduled into register save push
13364 sequence when access to the redzone area is done through frame pointer.
13365 The offset between the frame pointer and the stack pointer is calculated
13366 relative to the value of the stack pointer at the end of the function
13367 prologue, and moving instructions that access redzone area via frame
13368 pointer inside push sequence violates this assumption. */
13369 if (frame_pointer_needed && frame.red_zone_size)
13370 emit_insn (gen_memory_blockage ());
13372 /* SEH requires that the prologue end within 256 bytes of the start of
13373 the function. Prevent instruction schedules that would extend that.
13374 Further, prevent alloca modifications to the stack pointer from being
13375 combined with prologue modifications. */
13376 if (TARGET_SEH)
13377 emit_insn (gen_prologue_use (stack_pointer_rtx));
13380 /* Emit code to restore REG using a POP insn. */
13382 static void
13383 ix86_emit_restore_reg_using_pop (rtx reg)
13385 struct machine_function *m = cfun->machine;
13386 rtx_insn *insn = emit_insn (gen_pop (reg));
13388 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13389 m->fs.sp_offset -= UNITS_PER_WORD;
13391 if (m->fs.cfa_reg == crtl->drap_reg
13392 && REGNO (reg) == REGNO (crtl->drap_reg))
13394 /* Previously we'd represented the CFA as an expression
13395 like *(%ebp - 8). We've just popped that value from
13396 the stack, which means we need to reset the CFA to
13397 the drap register. This will remain until we restore
13398 the stack pointer. */
13399 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13400 RTX_FRAME_RELATED_P (insn) = 1;
13402 /* This means that the DRAP register is valid for addressing too. */
13403 m->fs.drap_valid = true;
13404 return;
13407 if (m->fs.cfa_reg == stack_pointer_rtx)
13409 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13410 x = gen_rtx_SET (stack_pointer_rtx, x);
13411 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13412 RTX_FRAME_RELATED_P (insn) = 1;
13414 m->fs.cfa_offset -= UNITS_PER_WORD;
13417 /* When the frame pointer is the CFA, and we pop it, we are
13418 swapping back to the stack pointer as the CFA. This happens
13419 for stack frames that don't allocate other data, so we assume
13420 the stack pointer is now pointing at the return address, i.e.
13421 the function entry state, which makes the offset be 1 word. */
13422 if (reg == hard_frame_pointer_rtx)
13424 m->fs.fp_valid = false;
13425 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13427 m->fs.cfa_reg = stack_pointer_rtx;
13428 m->fs.cfa_offset -= UNITS_PER_WORD;
13430 add_reg_note (insn, REG_CFA_DEF_CFA,
13431 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13432 GEN_INT (m->fs.cfa_offset)));
13433 RTX_FRAME_RELATED_P (insn) = 1;
13438 /* Emit code to restore saved registers using POP insns. */
13440 static void
13441 ix86_emit_restore_regs_using_pop (void)
13443 unsigned int regno;
13445 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13446 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13447 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13450 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
13451 the emit is omitted and only the notes are attached. */
13453 static void
13454 ix86_emit_leave (rtx_insn *insn)
13456 struct machine_function *m = cfun->machine;
13457 if (!insn)
13458 insn = emit_insn (ix86_gen_leave ());
13460 ix86_add_queued_cfa_restore_notes (insn);
13462 gcc_assert (m->fs.fp_valid);
13463 m->fs.sp_valid = true;
13464 m->fs.sp_realigned = false;
13465 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13466 m->fs.fp_valid = false;
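  /* LEAVE is equivalent to "mov %rbp, %rsp; pop %rbp", which is why the
     stack pointer becomes valid again at fp_offset - UNITS_PER_WORD and
     the frame pointer ceases to be valid here.  */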
13468 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13470 m->fs.cfa_reg = stack_pointer_rtx;
13471 m->fs.cfa_offset = m->fs.sp_offset;
13473 add_reg_note (insn, REG_CFA_DEF_CFA,
13474 plus_constant (Pmode, stack_pointer_rtx,
13475 m->fs.sp_offset));
13476 RTX_FRAME_RELATED_P (insn) = 1;
13478 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13479 m->fs.fp_offset);
13482 /* Emit code to restore saved registers using MOV insns.
13483 First register is restored from CFA - CFA_OFFSET. */
13484 static void
13485 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13486 bool maybe_eh_return)
13488 struct machine_function *m = cfun->machine;
13489 unsigned int regno;
13491 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13492 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13494 rtx reg = gen_rtx_REG (word_mode, regno);
13495 rtx mem;
13496 rtx_insn *insn;
13498 mem = choose_baseaddr (cfa_offset, NULL);
13499 mem = gen_frame_mem (word_mode, mem);
13500 insn = emit_move_insn (reg, mem);
13502 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13504 /* Previously we'd represented the CFA as an expression
13505 like *(%ebp - 8). We've just popped that value from
13506 the stack, which means we need to reset the CFA to
13507 the drap register. This will remain until we restore
13508 the stack pointer. */
13509 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13510 RTX_FRAME_RELATED_P (insn) = 1;
13512 /* This means that the DRAP register is valid for addressing. */
13513 m->fs.drap_valid = true;
13515 else
13516 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13518 cfa_offset -= UNITS_PER_WORD;
13522 /* Emit code to restore saved SSE registers using MOV insns.
13523 First register is restored from CFA - CFA_OFFSET. */
13524 static void
13525 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13526 bool maybe_eh_return)
13528 unsigned int regno;
13530 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13531 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13533 rtx reg = gen_rtx_REG (V4SFmode, regno);
13534 rtx mem;
13535 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13537 mem = choose_baseaddr (cfa_offset, &align);
13538 mem = gen_rtx_MEM (V4SFmode, mem);
13540 /* The location alignment depends upon the base register. */
13541 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13542 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13543 set_mem_align (mem, align);
13544 emit_insn (gen_rtx_SET (reg, mem));
13546 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13548 cfa_offset -= GET_MODE_SIZE (V4SFmode);
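      /* Each SSE register is reloaded as a 16-byte V4SFmode move; the
	 alignment recorded on the MEM above lets later passes choose an
	 aligned or unaligned form (e.g. movaps vs. movups).  */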
13552 static void
13553 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13554 bool use_call, int style)
13556 struct machine_function *m = cfun->machine;
13557 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13558 + m->call_ms2sysv_extra_regs;
13559 rtvec v;
13560 unsigned int elems_needed, align, i, vi = 0;
13561 rtx_insn *insn;
13562 rtx sym, tmp;
13563 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13564 rtx r10 = NULL_RTX;
13565 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13566 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13567 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13568 rtx rsi_frame_load = NULL_RTX;
13569 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13570 enum xlogue_stub stub;
13572 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13574 /* If using a realigned stack, we should never start with padding. */
13575 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13577 /* Setup RSI as the stub's base pointer. */
13578 align = GET_MODE_ALIGNMENT (V4SFmode);
13579 tmp = choose_baseaddr (rsi_offset, &align);
13580 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13581 emit_insn (gen_rtx_SET (rsi, tmp));
13583 /* Get a symbol for the stub. */
13584 if (frame_pointer_needed)
13585 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13586 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13587 else
13588 stub = use_call ? XLOGUE_STUB_RESTORE
13589 : XLOGUE_STUB_RESTORE_TAIL;
13590 sym = xlogue.get_stub_rtx (stub);
13592 elems_needed = ncregs;
13593 if (use_call)
13594 elems_needed += 1;
13595 else
13596 elems_needed += frame_pointer_needed ? 5 : 3;
13597 v = rtvec_alloc (elems_needed);
13599 /* We call the epilogue stub when we need to pop incoming args or when a
13600 sibling call will be the tail call. Otherwise, we emit a jmp to the
13601 epilogue stub and the stub itself is the tail call. */
13602 if (use_call)
13603 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13604 else
13606 RTVEC_ELT (v, vi++) = ret_rtx;
13607 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13608 if (frame_pointer_needed)
13610 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13611 gcc_assert (m->fs.fp_valid);
13612 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13614 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13615 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13616 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13617 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13618 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13620 else
13622 /* If no hard frame pointer, we set R10 to the SP restore value. */
13623 gcc_assert (!m->fs.fp_valid);
13624 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13625 gcc_assert (m->fs.sp_valid);
13627 r10 = gen_rtx_REG (DImode, R10_REG);
13628 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13629 emit_insn (gen_rtx_SET (r10, tmp));
13631 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13635 /* Generate frame load insns and restore notes. */
13636 for (i = 0; i < ncregs; ++i)
13638 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13639 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13640 rtx reg, frame_load;
13642 reg = gen_rtx_REG (mode, r.regno);
13643 frame_load = gen_frame_load (reg, rsi, r.offset);
13645 /* Save RSI frame load insn & note to add last. */
13646 if (r.regno == SI_REG)
13648 gcc_assert (!rsi_frame_load);
13649 rsi_frame_load = frame_load;
13650 rsi_restore_offset = r.offset;
13652 else
13654 RTVEC_ELT (v, vi++) = frame_load;
13655 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13659 /* Add RSI frame load & restore note at the end. */
13660 gcc_assert (rsi_frame_load);
13661 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13662 RTVEC_ELT (v, vi++) = rsi_frame_load;
13663 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13664 rsi_restore_offset);
13666 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13667 if (!use_call && !frame_pointer_needed)
13669 gcc_assert (m->fs.sp_valid);
13670 gcc_assert (!m->fs.sp_realigned);
13672 /* At this point, R10 should point to frame.stack_realign_offset. */
13673 if (m->fs.cfa_reg == stack_pointer_rtx)
13674 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13675 m->fs.sp_offset = frame.stack_realign_offset;
13678 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13679 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13680 if (use_call)
13681 insn = emit_insn (tmp);
13682 else
13684 insn = emit_jump_insn (tmp);
13685 JUMP_LABEL (insn) = ret_rtx;
13687 if (frame_pointer_needed)
13688 ix86_emit_leave (insn);
13689 else
13691 /* Need CFA adjust note. */
13692 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13693 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13697 RTX_FRAME_RELATED_P (insn) = true;
13698 ix86_add_queued_cfa_restore_notes (insn);
13700 /* If we're not doing a tail-call, we need to adjust the stack. */
13701 if (use_call && m->fs.sp_valid)
13703 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13704 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13705 GEN_INT (dealloc), style,
13706 m->fs.cfa_reg == stack_pointer_rtx);
13710 /* Restore function stack, frame, and registers. */
13712 void
13713 ix86_expand_epilogue (int style)
13715 struct machine_function *m = cfun->machine;
13716 struct machine_frame_state frame_state_save = m->fs;
13717 struct ix86_frame frame;
13718 bool restore_regs_via_mov;
13719 bool using_drap;
13720 bool restore_stub_is_tail = false;
13722 if (ix86_function_naked (current_function_decl))
13724 /* The program should not reach this point. */
13725 emit_insn (gen_ud2 ());
13726 return;
13729 ix86_finalize_stack_frame_flags ();
13730 frame = m->frame;
13732 m->fs.sp_realigned = stack_realign_fp;
13733 m->fs.sp_valid = stack_realign_fp
13734 || !frame_pointer_needed
13735 || crtl->sp_is_unchanging;
13736 gcc_assert (!m->fs.sp_valid
13737 || m->fs.sp_offset == frame.stack_pointer_offset);
13739 /* The FP must be valid if the frame pointer is present. */
13740 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13741 gcc_assert (!m->fs.fp_valid
13742 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13744 /* We must have *some* valid pointer to the stack frame. */
13745 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13747 /* The DRAP is never valid at this point. */
13748 gcc_assert (!m->fs.drap_valid);
13750 /* See the comment about red zone and frame
13751 pointer usage in ix86_expand_prologue. */
13752 if (frame_pointer_needed && frame.red_zone_size)
13753 emit_insn (gen_memory_blockage ());
13755 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13756 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13758 /* Determine the CFA offset of the end of the red-zone. */
13759 m->fs.red_zone_offset = 0;
13760 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13762 /* The red zone begins below the return address and, in an exception
13763 handler, below the error code as well. */
13764 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13766 /* When the register save area is in the aligned portion of
13767 the stack, determine the maximum runtime displacement that
13768 matches up with the aligned frame. */
13769 if (stack_realign_drap)
13770 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13771 + UNITS_PER_WORD);
13774 /* Special care must be taken for the normal return case of a function
13775 using eh_return: the eax and edx registers are marked as saved, but
13776 not restored along this path. Adjust the save location to match. */
13777 if (crtl->calls_eh_return && style != 2)
13778 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13780 /* EH_RETURN requires the use of moves to function properly. */
13781 if (crtl->calls_eh_return)
13782 restore_regs_via_mov = true;
13783 /* SEH requires the use of pops to identify the epilogue. */
13784 else if (TARGET_SEH)
13785 restore_regs_via_mov = false;
13786 /* If we're only restoring one register and sp cannot be used, then
13787 use a move instruction to restore the register, since it's
13788 less work than reloading sp and popping the register. */
13789 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13790 restore_regs_via_mov = true;
13791 else if (TARGET_EPILOGUE_USING_MOVE
13792 && cfun->machine->use_fast_prologue_epilogue
13793 && (frame.nregs > 1
13794 || m->fs.sp_offset != frame.reg_save_offset))
13795 restore_regs_via_mov = true;
13796 else if (frame_pointer_needed
13797 && !frame.nregs
13798 && m->fs.sp_offset != frame.reg_save_offset)
13799 restore_regs_via_mov = true;
13800 else if (frame_pointer_needed
13801 && TARGET_USE_LEAVE
13802 && cfun->machine->use_fast_prologue_epilogue
13803 && frame.nregs == 1)
13804 restore_regs_via_mov = true;
13805 else
13806 restore_regs_via_mov = false;
13808 if (restore_regs_via_mov || frame.nsseregs)
13810 /* Ensure that the entire register save area is addressable via
13811 the stack pointer, if we will restore SSE regs via sp. */
13812 if (TARGET_64BIT
13813 && m->fs.sp_offset > 0x7fffffff
13814 && sp_valid_at (frame.stack_realign_offset)
13815 && (frame.nsseregs + frame.nregs) != 0)
13817 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13818 GEN_INT (m->fs.sp_offset
13819 - frame.sse_reg_save_offset),
13820 style,
13821 m->fs.cfa_reg == stack_pointer_rtx);
13825 /* If there are any SSE registers to restore, then we have to do it
13826 via moves, since there's obviously no pop for SSE regs. */
13827 if (frame.nsseregs)
13828 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13829 style == 2);
13831 if (m->call_ms2sysv)
13833 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13835 /* We cannot use a tail-call for the stub if:
13836 1. We have to pop incoming args,
13837 2. We have additional int regs to restore, or
13838 3. A sibling call will be the tail-call, or
13839 4. We are emitting an eh_return_internal epilogue.
13841 TODO: Item 4 has not yet been tested!
13843 If any of the above are true, we will call the stub rather than
13844 jump to it. */
13845 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13846 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13849 /* If using an out-of-line stub that is a tail-call, then... */
13850 if (m->call_ms2sysv && restore_stub_is_tail)
13852 /* TODO: paranoid tests. (remove eventually) */
13853 gcc_assert (m->fs.sp_valid);
13854 gcc_assert (!m->fs.sp_realigned);
13855 gcc_assert (!m->fs.fp_valid);
13856 gcc_assert (!m->fs.realigned);
13857 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13858 gcc_assert (!crtl->drap_reg);
13859 gcc_assert (!frame.nregs);
13861 else if (restore_regs_via_mov)
13863 rtx t;
13865 if (frame.nregs)
13866 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
13868 /* eh_return epilogues need %ecx added to the stack pointer. */
13869 if (style == 2)
13871 rtx sa = EH_RETURN_STACKADJ_RTX;
13872 rtx_insn *insn;
13874 /* %ecx can't be used for both DRAP register and eh_return. */
13875 if (crtl->drap_reg)
13876 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
13878 /* regparm nested functions don't work with eh_return. */
13879 gcc_assert (!ix86_static_chain_on_stack);
13881 if (frame_pointer_needed)
13883 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
13884 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
13885 emit_insn (gen_rtx_SET (sa, t));
13887 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
13888 insn = emit_move_insn (hard_frame_pointer_rtx, t);
13890 /* Note that we use SA as a temporary CFA, as the return
13891 address is at the proper place relative to it. We
13892 pretend this happens at the FP restore insn because
13893 prior to this insn the FP would be stored at the wrong
13894 offset relative to SA, and after this insn we have no
13895 other reasonable register to use for the CFA. We don't
13896 bother resetting the CFA to the SP for the duration of
13897 the return insn. */
13898 add_reg_note (insn, REG_CFA_DEF_CFA,
13899 plus_constant (Pmode, sa, UNITS_PER_WORD));
13900 ix86_add_queued_cfa_restore_notes (insn);
13901 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
13902 RTX_FRAME_RELATED_P (insn) = 1;
13904 m->fs.cfa_reg = sa;
13905 m->fs.cfa_offset = UNITS_PER_WORD;
13906 m->fs.fp_valid = false;
13908 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
13909 const0_rtx, style, false);
13911 else
13913 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
13914 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
13915 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
13916 ix86_add_queued_cfa_restore_notes (insn);
13918 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13919 if (m->fs.cfa_offset != UNITS_PER_WORD)
13921 m->fs.cfa_offset = UNITS_PER_WORD;
13922 add_reg_note (insn, REG_CFA_DEF_CFA,
13923 plus_constant (Pmode, stack_pointer_rtx,
13924 UNITS_PER_WORD));
13925 RTX_FRAME_RELATED_P (insn) = 1;
13928 m->fs.sp_offset = UNITS_PER_WORD;
13929 m->fs.sp_valid = true;
13930 m->fs.sp_realigned = false;
13933 else
13935 /* SEH requires that the function end with (1) a stack adjustment
13936 if necessary, (2) a sequence of pops, and (3) a return or
13937 jump instruction. Prevent insns from the function body from
13938 being scheduled into this sequence. */
13939 if (TARGET_SEH)
13941 /* Prevent a catch region from being adjacent to the standard
13942 epilogue sequence. Unfortunately, crtl->uses_eh_lsda and
13943 several other flags that would be interesting to test are
13944 not yet set up. */
13945 if (flag_non_call_exceptions)
13946 emit_insn (gen_nops (const1_rtx));
13947 else
13948 emit_insn (gen_blockage ());
13951 /* First step is to deallocate the stack frame so that we can
13952 pop the registers. If the stack pointer was realigned, it needs
13953 to be restored now. Also do it on SEH target for very large
13954 frame as the emitted instructions aren't allowed by the ABI
13955 in epilogues. */
13956 if (!m->fs.sp_valid || m->fs.sp_realigned
13957 || (TARGET_SEH
13958 && (m->fs.sp_offset - frame.reg_save_offset
13959 >= SEH_MAX_FRAME_SIZE)))
13961 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
13962 GEN_INT (m->fs.fp_offset
13963 - frame.reg_save_offset),
13964 style, false);
13966 else if (m->fs.sp_offset != frame.reg_save_offset)
13968 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13969 GEN_INT (m->fs.sp_offset
13970 - frame.reg_save_offset),
13971 style,
13972 m->fs.cfa_reg == stack_pointer_rtx);
13975 ix86_emit_restore_regs_using_pop ();
13978 /* If we used a frame pointer and haven't already got rid of it,
13979 then do so now. */
13980 if (m->fs.fp_valid)
13982 /* If the stack pointer is valid and pointing at the frame
13983 pointer store address, then we only need a pop. */
13984 if (sp_valid_at (frame.hfp_save_offset)
13985 && m->fs.sp_offset == frame.hfp_save_offset)
13986 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
13987 /* Leave results in shorter dependency chains on CPUs that are
13988 able to grok it fast. */
13989 else if (TARGET_USE_LEAVE
13990 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
13991 || !cfun->machine->use_fast_prologue_epilogue)
13992 ix86_emit_leave (NULL);
13993 else
13995 pro_epilogue_adjust_stack (stack_pointer_rtx,
13996 hard_frame_pointer_rtx,
13997 const0_rtx, style, !using_drap);
13998 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14002 if (using_drap)
14004 int param_ptr_offset = UNITS_PER_WORD;
14005 rtx_insn *insn;
14007 gcc_assert (stack_realign_drap);
14009 if (ix86_static_chain_on_stack)
14010 param_ptr_offset += UNITS_PER_WORD;
14011 if (!call_used_regs[REGNO (crtl->drap_reg)])
14012 param_ptr_offset += UNITS_PER_WORD;
14014 insn = emit_insn (gen_rtx_SET
14015 (stack_pointer_rtx,
14016 gen_rtx_PLUS (Pmode,
14017 crtl->drap_reg,
14018 GEN_INT (-param_ptr_offset))));
14019 m->fs.cfa_reg = stack_pointer_rtx;
14020 m->fs.cfa_offset = param_ptr_offset;
14021 m->fs.sp_offset = param_ptr_offset;
14022 m->fs.realigned = false;
14024 add_reg_note (insn, REG_CFA_DEF_CFA,
14025 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14026 GEN_INT (param_ptr_offset)));
14027 RTX_FRAME_RELATED_P (insn) = 1;
14029 if (!call_used_regs[REGNO (crtl->drap_reg)])
14030 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14033 /* At this point the stack pointer must be valid, and we must have
14034 restored all of the registers. We may not have deallocated the
14035 entire stack frame. We've delayed this until now because it may
14036 be possible to merge the local stack deallocation with the
14037 deallocation forced by ix86_static_chain_on_stack. */
14038 gcc_assert (m->fs.sp_valid);
14039 gcc_assert (!m->fs.sp_realigned);
14040 gcc_assert (!m->fs.fp_valid);
14041 gcc_assert (!m->fs.realigned);
14042 if (m->fs.sp_offset != UNITS_PER_WORD)
14044 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14045 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14046 style, true);
14048 else
14049 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14051 /* Sibcall epilogues don't want a return instruction. */
14052 if (style == 0)
14054 m->fs = frame_state_save;
14055 return;
14058 if (cfun->machine->func_type != TYPE_NORMAL)
14059 emit_jump_insn (gen_interrupt_return ());
14060 else if (crtl->args.pops_args && crtl->args.size)
14062 rtx popc = GEN_INT (crtl->args.pops_args);
14064 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14065 address, do an explicit add, and jump indirectly to the caller. */
14067 if (crtl->args.pops_args >= 65536)
14069 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14070 rtx_insn *insn;
14072 /* There is no "pascal" calling convention in any 64bit ABI. */
14073 gcc_assert (!TARGET_64BIT);
14075 insn = emit_insn (gen_pop (ecx));
14076 m->fs.cfa_offset -= UNITS_PER_WORD;
14077 m->fs.sp_offset -= UNITS_PER_WORD;
14079 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14080 x = gen_rtx_SET (stack_pointer_rtx, x);
14081 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14082 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14083 RTX_FRAME_RELATED_P (insn) = 1;
14085 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14086 popc, -1, true);
14087 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
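      /* Illustration: "ret $N" only encodes a 16-bit immediate, so for
	 pops_args >= 64K the code built above amounts to roughly

	    pop  %ecx		(fetch the return address)
	    add  $N, %esp	(drop the incoming arguments)
	    jmp  *%ecx		(return to the caller)

	 hence the CFA-adjust and REG_CFA_REGISTER notes on the pop.  */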
14089 else
14090 emit_jump_insn (gen_simple_return_pop_internal (popc));
14092 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14093 emit_jump_insn (gen_simple_return_internal ());
14095 /* Restore the state back to the state from the prologue,
14096 so that it's correct for the next epilogue. */
14097 m->fs = frame_state_save;
14100 /* Reset from the function's potential modifications. */
14102 static void
14103 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14105 if (pic_offset_table_rtx
14106 && !ix86_use_pseudo_pic_reg ())
14107 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14109 if (TARGET_MACHO)
14111 rtx_insn *insn = get_last_insn ();
14112 rtx_insn *deleted_debug_label = NULL;
14114 /* Mach-O doesn't support labels at the end of objects, so if
14115 it looks like we might want one, take special action.
14116 First, collect any sequence of deleted debug labels. */
14117 while (insn
14118 && NOTE_P (insn)
14119 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14121 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14122 notes only, instead set their CODE_LABEL_NUMBER to -1,
14123 otherwise there would be code generation differences
14124 in between -g and -g0. */
14125 if (NOTE_P (insn) && NOTE_KIND (insn)
14126 == NOTE_INSN_DELETED_DEBUG_LABEL)
14127 deleted_debug_label = insn;
14128 insn = PREV_INSN (insn);
14131 /* If we have:
14132 label:
14133 barrier
14134 then this needs to be detected, so skip past the barrier. */
14136 if (insn && BARRIER_P (insn))
14137 insn = PREV_INSN (insn);
14139 /* Up to now we've only seen notes or barriers. */
14140 if (insn)
14142 if (LABEL_P (insn)
14143 || (NOTE_P (insn)
14144 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14145 /* Trailing label. */
14146 fputs ("\tnop\n", file);
14147 else if (cfun && ! cfun->is_thunk)
14149 /* See if we have a completely empty function body, skipping
14150 the special case of the picbase thunk emitted as asm. */
14151 while (insn && ! INSN_P (insn))
14152 insn = PREV_INSN (insn);
14153 /* If we don't find any insns, we've got an empty function body;
14154 i.e. completely empty - without a return or branch. This is
14155 taken as the case where a function body has been removed
14156 because it contains an inline __builtin_unreachable(). GCC
14157 declares that reaching __builtin_unreachable() means UB so
14158 we're not obliged to do anything special; however, we want
14159 non-zero-sized function bodies. To meet this, and help the
14160 user out, let's trap the case. */
14161 if (insn == NULL)
14162 fputs ("\tud2\n", file);
14165 else if (deleted_debug_label)
14166 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14167 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14168 CODE_LABEL_NUMBER (insn) = -1;
14172 /* Return a scratch register to use in the split stack prologue. The
14173 split stack prologue is used for -fsplit-stack. It is the first
14174 instructions in the function, even before the regular prologue.
14175 The scratch register can be any caller-saved register which is not
14176 used for parameters or for the static chain. */
14178 static unsigned int
14179 split_stack_prologue_scratch_regno (void)
14181 if (TARGET_64BIT)
14182 return R11_REG;
14183 else
14185 bool is_fastcall, is_thiscall;
14186 int regparm;
14188 is_fastcall = (lookup_attribute ("fastcall",
14189 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14190 != NULL);
14191 is_thiscall = (lookup_attribute ("thiscall",
14192 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14193 != NULL);
14194 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14196 if (is_fastcall)
14198 if (DECL_STATIC_CHAIN (cfun->decl))
14200 sorry ("-fsplit-stack does not support fastcall with "
14201 "nested function");
14202 return INVALID_REGNUM;
14204 return AX_REG;
14206 else if (is_thiscall)
14208 if (!DECL_STATIC_CHAIN (cfun->decl))
14209 return DX_REG;
14210 return AX_REG;
14212 else if (regparm < 3)
14214 if (!DECL_STATIC_CHAIN (cfun->decl))
14215 return CX_REG;
14216 else
14218 if (regparm >= 2)
14220 sorry ("-fsplit-stack does not support 2 register "
14221 "parameters for a nested function");
14222 return INVALID_REGNUM;
14224 return DX_REG;
14227 else
14229 /* FIXME: We could make this work by pushing a register
14230 around the addition and comparison. */
14231 sorry ("-fsplit-stack does not support 3 register parameters");
14232 return INVALID_REGNUM;
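/* In short: 64-bit code can always use %r11, while 32-bit code falls back
   through %eax, %ecx and %edx depending on the fastcall, thiscall and
   regparm conventions; combinations that would leave no free caller-saved
   register are rejected above with sorry ().  */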
14237 /* A SYMBOL_REF for the function which allocates new stack space for
14238 -fsplit-stack. */
14240 static GTY(()) rtx split_stack_fn;
14242 /* A SYMBOL_REF for the more-stack function (__morestack_large_model)
14243 used with the large code model. */
14245 static GTY(()) rtx split_stack_fn_large;
14247 /* Return location of the stack guard value in the TLS block. */
14250 ix86_split_stack_guard (void)
14252 int offset;
14253 addr_space_t as = DEFAULT_TLS_SEG_REG;
14254 rtx r;
14256 gcc_assert (flag_split_stack);
14258 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14259 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14260 #else
14261 gcc_unreachable ();
14262 #endif
14264 r = GEN_INT (offset);
14265 r = gen_const_mem (Pmode, r);
14266 set_mem_addr_space (r, as);
14268 return r;
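/* The guard is read through the TLS segment register, i.e. roughly a
   "%fs:OFFSET" (64-bit) or "%gs:OFFSET" (32-bit) memory operand with
   TARGET_THREAD_SPLIT_STACK_OFFSET as the displacement.  */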
14271 /* Handle -fsplit-stack. These are the first instructions in the
14272 function, even before the regular prologue. */
14274 void
14275 ix86_expand_split_stack_prologue (void)
14277 struct ix86_frame frame;
14278 HOST_WIDE_INT allocate;
14279 unsigned HOST_WIDE_INT args_size;
14280 rtx_code_label *label;
14281 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14282 rtx scratch_reg = NULL_RTX;
14283 rtx_code_label *varargs_label = NULL;
14284 rtx fn;
14286 gcc_assert (flag_split_stack && reload_completed);
14288 ix86_finalize_stack_frame_flags ();
14289 frame = cfun->machine->frame;
14290 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14292 /* This is the label we will branch to if we have enough stack
14293 space. We expect the basic block reordering pass to reverse this
14294 branch if optimizing, so that we branch in the unlikely case. */
14295 label = gen_label_rtx ();
14297 /* We need to compare the stack pointer minus the frame size with
14298 the stack boundary in the TCB. The stack boundary always gives
14299 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14300 can compare directly. Otherwise we need to do an addition. */
14302 limit = ix86_split_stack_guard ();
14304 if (allocate < SPLIT_STACK_AVAILABLE)
14305 current = stack_pointer_rtx;
14306 else
14308 unsigned int scratch_regno;
14309 rtx offset;
14311 /* We need a scratch register to hold the stack pointer minus
14312 the required frame size. Since this is the very start of the
14313 function, the scratch register can be any caller-saved
14314 register which is not used for parameters. */
14315 offset = GEN_INT (- allocate);
14316 scratch_regno = split_stack_prologue_scratch_regno ();
14317 if (scratch_regno == INVALID_REGNUM)
14318 return;
14319 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14320 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14322 /* We don't use ix86_gen_add3 in this case because it will
14323 want to split to lea, but when not optimizing the insn
14324 will not be split after this point. */
14325 emit_insn (gen_rtx_SET (scratch_reg,
14326 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14327 offset)));
14329 else
14331 emit_move_insn (scratch_reg, offset);
14332 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14333 stack_pointer_rtx));
14335 current = scratch_reg;
14338 ix86_expand_branch (GEU, current, limit, label);
14339 rtx_insn *jump_insn = get_last_insn ();
14340 JUMP_LABEL (jump_insn) = label;
14342 /* Mark the jump as very likely to be taken. */
14343 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14345 if (split_stack_fn == NULL_RTX)
14347 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14348 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14350 fn = split_stack_fn;
14352 /* Get more stack space. We pass in the desired stack space and the
14353 size of the arguments to copy to the new stack. In 32-bit mode
14354 we push the parameters; __morestack will return on a new stack
14355 anyhow. In 64-bit mode we pass the parameters in r10 and
14356 r11. */
14357 allocate_rtx = GEN_INT (allocate);
14358 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14359 call_fusage = NULL_RTX;
14360 rtx pop = NULL_RTX;
14361 if (TARGET_64BIT)
14363 rtx reg10, reg11;
14365 reg10 = gen_rtx_REG (Pmode, R10_REG);
14366 reg11 = gen_rtx_REG (Pmode, R11_REG);
14368 /* If this function uses a static chain, it will be in %r10.
14369 Preserve it across the call to __morestack. */
14370 if (DECL_STATIC_CHAIN (cfun->decl))
14372 rtx rax;
14374 rax = gen_rtx_REG (word_mode, AX_REG);
14375 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14376 use_reg (&call_fusage, rax);
14379 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14380 && !TARGET_PECOFF)
14382 HOST_WIDE_INT argval;
14384 gcc_assert (Pmode == DImode);
14385 /* When using the large model we need to load the address
14386 into a register, and we've run out of registers. So we
14387 switch to a different calling convention, and we call a
14388 different function: __morestack_large. We pass the
14389 argument size in the upper 32 bits of r10 and pass the
14390 frame size in the lower 32 bits. */
14391 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14392 gcc_assert ((args_size & 0xffffffff) == args_size);
14394 if (split_stack_fn_large == NULL_RTX)
14396 split_stack_fn_large =
14397 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14398 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14400 if (ix86_cmodel == CM_LARGE_PIC)
14402 rtx_code_label *label;
14403 rtx x;
14405 label = gen_label_rtx ();
14406 emit_label (label);
14407 LABEL_PRESERVE_P (label) = 1;
14408 emit_insn (gen_set_rip_rex64 (reg10, label));
14409 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14410 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14411 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14412 UNSPEC_GOT);
14413 x = gen_rtx_CONST (Pmode, x);
14414 emit_move_insn (reg11, x);
14415 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14416 x = gen_const_mem (Pmode, x);
14417 emit_move_insn (reg11, x);
14419 else
14420 emit_move_insn (reg11, split_stack_fn_large);
14422 fn = reg11;
14424 argval = ((args_size << 16) << 16) + allocate;
14425 emit_move_insn (reg10, GEN_INT (argval));
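	  /* For example (illustrative values only): args_size == 0x20 and
	     allocate == 0x1000 give argval == 0x0000002000001000, i.e. the
	     argument size in the upper half of %r10 and the frame size in
	     the lower half.  */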
14427 else
14429 emit_move_insn (reg10, allocate_rtx);
14430 emit_move_insn (reg11, GEN_INT (args_size));
14431 use_reg (&call_fusage, reg11);
14434 use_reg (&call_fusage, reg10);
14436 else
14438 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14439 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14440 insn = emit_insn (gen_push (allocate_rtx));
14441 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14442 pop = GEN_INT (2 * UNITS_PER_WORD);
14444 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14445 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14446 pop, false);
14447 add_function_usage_to (call_insn, call_fusage);
14448 if (!TARGET_64BIT)
14449 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14450 /* Indicate that this function can't jump to non-local gotos. */
14451 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14453 /* In order to make call/return prediction work right, we now need
14454 to execute a return instruction. See
14455 libgcc/config/i386/morestack.S for the details on how this works.
14457 For flow purposes gcc must not see this as a return
14458 instruction--we need control flow to continue at the subsequent
14459 label. Therefore, we use an unspec. */
14460 gcc_assert (crtl->args.pops_args < 65536);
14461 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14463 /* If we are in 64-bit mode and this function uses a static chain,
14464 we saved %r10 in %rax before calling __morestack. */
14465 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14466 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14467 gen_rtx_REG (word_mode, AX_REG));
14469 /* If this function calls va_start, we need to store a pointer to
14470 the arguments on the old stack, because they may not have been
14471 all copied to the new stack. At this point the old stack can be
14472 found at the frame pointer value used by __morestack, because
14473 __morestack has set that up before calling back to us. Here we
14474 store that pointer in a scratch register, and in
14475 ix86_expand_prologue we store the scratch register in a stack
14476 slot. */
14477 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14479 unsigned int scratch_regno;
14480 rtx frame_reg;
14481 int words;
14483 scratch_regno = split_stack_prologue_scratch_regno ();
14484 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14485 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14487 /* 64-bit:
14488 fp -> old fp value
14489 return address within this function
14490 return address of caller of this function
14491 stack arguments
14492 So we add three words to get to the stack arguments.
14494 32-bit:
14495 fp -> old fp value
14496 return address within this function
14497 first argument to __morestack
14498 second argument to __morestack
14499 return address of caller of this function
14500 stack arguments
14501 So we add five words to get to the stack arguments.
14503 words = TARGET_64BIT ? 3 : 5;
14504 emit_insn (gen_rtx_SET (scratch_reg,
14505 gen_rtx_PLUS (Pmode, frame_reg,
14506 GEN_INT (words * UNITS_PER_WORD))));
14508 varargs_label = gen_label_rtx ();
14509 emit_jump_insn (gen_jump (varargs_label));
14510 JUMP_LABEL (get_last_insn ()) = varargs_label;
14512 emit_barrier ();
14515 emit_label (label);
14516 LABEL_NUSES (label) = 1;
14518 /* If this function calls va_start, we now have to set the scratch
14519 register for the case where we do not call __morestack. In this
14520 case we need to set it based on the stack pointer. */
14521 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14523 emit_insn (gen_rtx_SET (scratch_reg,
14524 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14525 GEN_INT (UNITS_PER_WORD))));
14527 emit_label (varargs_label);
14528 LABEL_NUSES (varargs_label) = 1;
14532 /* We may have to tell the dataflow pass that the split stack prologue
14533 is initializing a scratch register. */
14535 static void
14536 ix86_live_on_entry (bitmap regs)
14538 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14540 gcc_assert (flag_split_stack);
14541 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14545 /* Extract the parts of an RTL expression that is a valid memory address
14546 for an instruction. Return 0 if the structure of the address is
14547 grossly off. Return -1 if the address contains ASHIFT, so it is not
14548 strictly valid, but still used for computing the length of the lea instruction. */
14551 ix86_decompose_address (rtx addr, struct ix86_address *out)
14553 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14554 rtx base_reg, index_reg;
14555 HOST_WIDE_INT scale = 1;
14556 rtx scale_rtx = NULL_RTX;
14557 rtx tmp;
14558 int retval = 1;
14559 addr_space_t seg = ADDR_SPACE_GENERIC;
14561 /* Allow zero-extended SImode addresses,
14562 they will be emitted with addr32 prefix. */
14563 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14565 if (GET_CODE (addr) == ZERO_EXTEND
14566 && GET_MODE (XEXP (addr, 0)) == SImode)
14568 addr = XEXP (addr, 0);
14569 if (CONST_INT_P (addr))
14570 return 0;
14572 else if (GET_CODE (addr) == AND
14573 && const_32bit_mask (XEXP (addr, 1), DImode))
14575 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14576 if (addr == NULL_RTX)
14577 return 0;
14579 if (CONST_INT_P (addr))
14580 return 0;
14584 /* Allow SImode subregs of DImode addresses,
14585 they will be emitted with addr32 prefix. */
14586 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14588 if (SUBREG_P (addr)
14589 && GET_MODE (SUBREG_REG (addr)) == DImode)
14591 addr = SUBREG_REG (addr);
14592 if (CONST_INT_P (addr))
14593 return 0;
14597 if (REG_P (addr))
14598 base = addr;
14599 else if (SUBREG_P (addr))
14601 if (REG_P (SUBREG_REG (addr)))
14602 base = addr;
14603 else
14604 return 0;
14606 else if (GET_CODE (addr) == PLUS)
14608 rtx addends[4], op;
14609 int n = 0, i;
14611 op = addr;
14614 if (n >= 4)
14615 return 0;
14616 addends[n++] = XEXP (op, 1);
14617 op = XEXP (op, 0);
14619 while (GET_CODE (op) == PLUS);
14620 if (n >= 4)
14621 return 0;
14622 addends[n] = op;
14624 for (i = n; i >= 0; --i)
14626 op = addends[i];
14627 switch (GET_CODE (op))
14629 case MULT:
14630 if (index)
14631 return 0;
14632 index = XEXP (op, 0);
14633 scale_rtx = XEXP (op, 1);
14634 break;
14636 case ASHIFT:
14637 if (index)
14638 return 0;
14639 index = XEXP (op, 0);
14640 tmp = XEXP (op, 1);
14641 if (!CONST_INT_P (tmp))
14642 return 0;
14643 scale = INTVAL (tmp);
14644 if ((unsigned HOST_WIDE_INT) scale > 3)
14645 return 0;
14646 scale = 1 << scale;
14647 break;
14649 case ZERO_EXTEND:
14650 op = XEXP (op, 0);
14651 if (GET_CODE (op) != UNSPEC)
14652 return 0;
14653 /* FALLTHRU */
14655 case UNSPEC:
14656 if (XINT (op, 1) == UNSPEC_TP
14657 && TARGET_TLS_DIRECT_SEG_REFS
14658 && seg == ADDR_SPACE_GENERIC)
14659 seg = DEFAULT_TLS_SEG_REG;
14660 else
14661 return 0;
14662 break;
14664 case SUBREG:
14665 if (!REG_P (SUBREG_REG (op)))
14666 return 0;
14667 /* FALLTHRU */
14669 case REG:
14670 if (!base)
14671 base = op;
14672 else if (!index)
14673 index = op;
14674 else
14675 return 0;
14676 break;
14678 case CONST:
14679 case CONST_INT:
14680 case SYMBOL_REF:
14681 case LABEL_REF:
14682 if (disp)
14683 return 0;
14684 disp = op;
14685 break;
14687 default:
14688 return 0;
14692 else if (GET_CODE (addr) == MULT)
14694 index = XEXP (addr, 0); /* index*scale */
14695 scale_rtx = XEXP (addr, 1);
14697 else if (GET_CODE (addr) == ASHIFT)
14699 /* We're called for lea too, which implements ashift on occasion. */
14700 index = XEXP (addr, 0);
14701 tmp = XEXP (addr, 1);
14702 if (!CONST_INT_P (tmp))
14703 return 0;
14704 scale = INTVAL (tmp);
14705 if ((unsigned HOST_WIDE_INT) scale > 3)
14706 return 0;
14707 scale = 1 << scale;
14708 retval = -1;
14710 else
14711 disp = addr; /* displacement */
14713 if (index)
14715 if (REG_P (index))
14717 else if (SUBREG_P (index)
14718 && REG_P (SUBREG_REG (index)))
14720 else
14721 return 0;
14724 /* Extract the integral value of scale. */
14725 if (scale_rtx)
14727 if (!CONST_INT_P (scale_rtx))
14728 return 0;
14729 scale = INTVAL (scale_rtx);
14732 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14733 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14735 /* Avoid useless 0 displacement. */
14736 if (disp == const0_rtx && (base || index))
14737 disp = NULL_RTX;
14739 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
14740 if (base_reg && index_reg && scale == 1
14741 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14742 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14743 || REGNO (index_reg) == SP_REG))
14745 std::swap (base, index);
14746 std::swap (base_reg, index_reg);
14749 /* Special case: %ebp cannot be encoded as a base without a displacement.
14750 Similarly %r13. */
14751 if (!disp && base_reg
14752 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14753 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14754 || REGNO (base_reg) == BP_REG
14755 || REGNO (base_reg) == R13_REG))
14756 disp = const0_rtx;
14758 /* Special case: on K6, [%esi] makes the instruction vector decoded.
14759 Avoid this by transforming to [%esi+0].
14760 Reload calls address legitimization without cfun defined, so we need
14761 to test cfun for being non-NULL. */
14762 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14763 && base_reg && !index_reg && !disp
14764 && REGNO (base_reg) == SI_REG)
14765 disp = const0_rtx;
14767 /* Special case: encode reg+reg instead of reg*2. */
14768 if (!base && index && scale == 2)
14769 base = index, base_reg = index_reg, scale = 1;
14771 /* Special case: scaling cannot be encoded without base or displacement. */
14772 if (!base && !disp && index && scale != 1)
14773 disp = const0_rtx;
14775 out->base = base;
14776 out->index = index;
14777 out->disp = disp;
14778 out->scale = scale;
14779 out->seg = seg;
14781 return retval;
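/* For example, decomposing
     (plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 12))
   yields base = A, index = B, scale = 4 and disp = 12, i.e. the operand
   that would be printed as 12(%eax,%ebx,4) if A and B end up in %eax and
   %ebx.  */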
14784 /* Return cost of the memory address x.
14785 For i386, it is better to use a complex address than let gcc copy
14786 the address into a reg and make a new pseudo. But not if the address
14787 requires two regs - that would mean more pseudos with longer
14788 lifetimes. */
14789 static int
14790 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14792 struct ix86_address parts;
14793 int cost = 1;
14794 int ok = ix86_decompose_address (x, &parts);
14796 gcc_assert (ok);
14798 if (parts.base && SUBREG_P (parts.base))
14799 parts.base = SUBREG_REG (parts.base);
14800 if (parts.index && SUBREG_P (parts.index))
14801 parts.index = SUBREG_REG (parts.index);
14803 /* Attempt to minimize number of registers in the address by increasing
14804 address cost for each used register. We don't increase address cost
14805 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
14806 is not invariant itself it most likely means that base or index is not
14807 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
14808 which is not profitable for x86. */
14809 if (parts.base
14810 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14811 && (current_pass->type == GIMPLE_PASS
14812 || !pic_offset_table_rtx
14813 || !REG_P (parts.base)
14814 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14815 cost++;
14817 if (parts.index
14818 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14819 && (current_pass->type == GIMPLE_PASS
14820 || !pic_offset_table_rtx
14821 || !REG_P (parts.index)
14822 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14823 cost++;
14825 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14826 since its predecode logic can't detect the length of instructions
14827 and it degenerates to vector decoding. Increase the cost of such
14828 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
14829 to split such addresses or even refuse such addresses at all.
14831 Following addressing modes are affected:
14832 [base+scale*index]
14833 [scale*index+disp]
14834 [base+index]
14836 The first and last case may be avoidable by explicitly coding the zero in
14837 the memory address, but I don't have an AMD-K6 machine handy to check this
14838 theory. */
14840 if (TARGET_K6
14841 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
14842 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
14843 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
14844 cost += 10;
14846 return cost;
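/* The net effect: an address costs 1, plus one unit for each base or index
   register that is still a pseudo (with the PIC register exempted), plus
   the K6 penalty above; hard-register addresses such as 4(%esp) therefore
   keep the minimum cost of 1.  */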
14849 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
14850 this is used to form addresses to local data when -fPIC is in
14851 use. */
14853 static bool
14854 darwin_local_data_pic (rtx disp)
14856 return (GET_CODE (disp) == UNSPEC
14857 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
14860 /* True if operand X should be loaded from GOT. */
14862 bool
14863 ix86_force_load_from_GOT_p (rtx x)
14865 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
14866 && !TARGET_PECOFF && !TARGET_MACHO
14867 && !flag_plt && !flag_pic
14868 && ix86_cmodel != CM_LARGE
14869 && GET_CODE (x) == SYMBOL_REF
14870 && SYMBOL_REF_FUNCTION_P (x)
14871 && !SYMBOL_REF_LOCAL_P (x));
14874 /* Determine if a given RTX is a valid constant. We already know this
14875 satisfies CONSTANT_P. */
14877 static bool
14878 ix86_legitimate_constant_p (machine_mode mode, rtx x)
14880 /* Pointer bounds constants are not valid. */
14881 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
14882 return false;
14884 switch (GET_CODE (x))
14886 case CONST:
14887 x = XEXP (x, 0);
14889 if (GET_CODE (x) == PLUS)
14891 if (!CONST_INT_P (XEXP (x, 1)))
14892 return false;
14893 x = XEXP (x, 0);
14896 if (TARGET_MACHO && darwin_local_data_pic (x))
14897 return true;
14899 /* Only some unspecs are valid as "constants". */
14900 if (GET_CODE (x) == UNSPEC)
14901 switch (XINT (x, 1))
14903 case UNSPEC_GOT:
14904 case UNSPEC_GOTOFF:
14905 case UNSPEC_PLTOFF:
14906 return TARGET_64BIT;
14907 case UNSPEC_TPOFF:
14908 case UNSPEC_NTPOFF:
14909 x = XVECEXP (x, 0, 0);
14910 return (GET_CODE (x) == SYMBOL_REF
14911 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
14912 case UNSPEC_DTPOFF:
14913 x = XVECEXP (x, 0, 0);
14914 return (GET_CODE (x) == SYMBOL_REF
14915 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
14916 default:
14917 return false;
14920 /* We must have drilled down to a symbol. */
14921 if (GET_CODE (x) == LABEL_REF)
14922 return true;
14923 if (GET_CODE (x) != SYMBOL_REF)
14924 return false;
14925 /* FALLTHRU */
14927 case SYMBOL_REF:
14928 /* TLS symbols are never valid. */
14929 if (SYMBOL_REF_TLS_MODEL (x))
14930 return false;
14932 /* DLLIMPORT symbols are never valid. */
14933 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
14934 && SYMBOL_REF_DLLIMPORT_P (x))
14935 return false;
14937 #if TARGET_MACHO
14938 /* mdynamic-no-pic */
14939 if (MACHO_DYNAMIC_NO_PIC_P)
14940 return machopic_symbol_defined_p (x);
14941 #endif
14943 /* An external function address should be loaded
14944 via the GOT slot to avoid the PLT. */
14945 if (ix86_force_load_from_GOT_p (x))
14946 return false;
14948 break;
14950 CASE_CONST_SCALAR_INT:
14951 switch (mode)
14953 case E_TImode:
14954 if (TARGET_64BIT)
14955 return true;
14956 /* FALLTHRU */
14957 case E_OImode:
14958 case E_XImode:
14959 if (!standard_sse_constant_p (x, mode))
14960 return false;
14961 default:
14962 break;
14964 break;
14966 case CONST_VECTOR:
14967 if (!standard_sse_constant_p (x, mode))
14968 return false;
14970 default:
14971 break;
14974 /* Otherwise we handle everything else in the move patterns. */
14975 return true;
14978 /* Determine if it's legal to put X into the constant pool. This
14979 is not possible for the address of thread-local symbols, which
14980 is checked above. */
14982 static bool
14983 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
14985 /* We can put any immediate constant in memory. */
14986 switch (GET_CODE (x))
14988 CASE_CONST_ANY:
14989 return false;
14991 default:
14992 break;
14995 return !ix86_legitimate_constant_p (mode, x);
14998 /* Return true if the symbol is marked as dllimport or as a stub-variable,
14999 false otherwise. */
15001 static bool
15002 is_imported_p (rtx x)
15004 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15005 || GET_CODE (x) != SYMBOL_REF)
15006 return false;
15008 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15012 /* Nonzero if the constant value X is a legitimate general operand
15013 when generating PIC code. It is given that flag_pic is on and
15014 that X satisfies CONSTANT_P. */
15016 bool
15017 legitimate_pic_operand_p (rtx x)
15019 rtx inner;
15021 switch (GET_CODE (x))
15023 case CONST:
15024 inner = XEXP (x, 0);
15025 if (GET_CODE (inner) == PLUS
15026 && CONST_INT_P (XEXP (inner, 1)))
15027 inner = XEXP (inner, 0);
15029 /* Only some unspecs are valid as "constants". */
15030 if (GET_CODE (inner) == UNSPEC)
15031 switch (XINT (inner, 1))
15033 case UNSPEC_GOT:
15034 case UNSPEC_GOTOFF:
15035 case UNSPEC_PLTOFF:
15036 return TARGET_64BIT;
15037 case UNSPEC_TPOFF:
15038 x = XVECEXP (inner, 0, 0);
15039 return (GET_CODE (x) == SYMBOL_REF
15040 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15041 case UNSPEC_MACHOPIC_OFFSET:
15042 return legitimate_pic_address_disp_p (x);
15043 default:
15044 return false;
15046 /* FALLTHRU */
15048 case SYMBOL_REF:
15049 case LABEL_REF:
15050 return legitimate_pic_address_disp_p (x);
15052 default:
15053 return true;
15057 /* Determine if a given CONST RTX is a valid memory displacement
15058 in PIC mode. */
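/* For example (illustrative), a typical GOT-relative displacement has the
   form (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)), possibly wrapped
   in a PLUS with a CONST_INT offset.  */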
15060 bool
15061 legitimate_pic_address_disp_p (rtx disp)
15063 bool saw_plus;
15065 /* In 64bit mode we can allow direct addresses of symbols and labels
15066 when they are not dynamic symbols. */
15067 if (TARGET_64BIT)
15069 rtx op0 = disp, op1;
15071 switch (GET_CODE (disp))
15073 case LABEL_REF:
15074 return true;
15076 case CONST:
15077 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15078 break;
15079 op0 = XEXP (XEXP (disp, 0), 0);
15080 op1 = XEXP (XEXP (disp, 0), 1);
15081 if (!CONST_INT_P (op1)
15082 || INTVAL (op1) >= 16*1024*1024
15083 || INTVAL (op1) < -16*1024*1024)
15084 break;
15085 if (GET_CODE (op0) == LABEL_REF)
15086 return true;
15087 if (GET_CODE (op0) == CONST
15088 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15089 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15090 return true;
15091 if (GET_CODE (op0) == UNSPEC
15092 && XINT (op0, 1) == UNSPEC_PCREL)
15093 return true;
15094 if (GET_CODE (op0) != SYMBOL_REF)
15095 break;
15096 /* FALLTHRU */
15098 case SYMBOL_REF:
15099 /* TLS references should always be enclosed in an UNSPEC.
15100 A dllimported symbol always needs to be resolved. */
15101 if (SYMBOL_REF_TLS_MODEL (op0)
15102 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15103 return false;
15105 if (TARGET_PECOFF)
15107 if (is_imported_p (op0))
15108 return true;
15110 if (SYMBOL_REF_FAR_ADDR_P (op0)
15111 || !SYMBOL_REF_LOCAL_P (op0))
15112 break;
15114 /* Function symbols need to be resolved only for
15115 the large model.
15116 For the small model we don't need to resolve anything
15117 here. */
15118 if ((ix86_cmodel != CM_LARGE_PIC
15119 && SYMBOL_REF_FUNCTION_P (op0))
15120 || ix86_cmodel == CM_SMALL_PIC)
15121 return true;
15122 /* Non-external symbols don't need to be resolved for
15123 the large and medium models. */
15124 if ((ix86_cmodel == CM_LARGE_PIC
15125 || ix86_cmodel == CM_MEDIUM_PIC)
15126 && !SYMBOL_REF_EXTERNAL_P (op0))
15127 return true;
15129 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15130 && (SYMBOL_REF_LOCAL_P (op0)
15131 || (HAVE_LD_PIE_COPYRELOC
15132 && flag_pie
15133 && !SYMBOL_REF_WEAK (op0)
15134 && !SYMBOL_REF_FUNCTION_P (op0)))
15135 && ix86_cmodel != CM_LARGE_PIC)
15136 return true;
15137 break;
15139 default:
15140 break;
15143 if (GET_CODE (disp) != CONST)
15144 return false;
15145 disp = XEXP (disp, 0);
15147 if (TARGET_64BIT)
15149 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
15150 of GOT table references. We should not need these anyway. */
15151 if (GET_CODE (disp) != UNSPEC
15152 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15153 && XINT (disp, 1) != UNSPEC_GOTOFF
15154 && XINT (disp, 1) != UNSPEC_PCREL
15155 && XINT (disp, 1) != UNSPEC_PLTOFF))
15156 return false;
15158 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15159 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15160 return false;
15161 return true;
15164 saw_plus = false;
15165 if (GET_CODE (disp) == PLUS)
15167 if (!CONST_INT_P (XEXP (disp, 1)))
15168 return false;
15169 disp = XEXP (disp, 0);
15170 saw_plus = true;
15173 if (TARGET_MACHO && darwin_local_data_pic (disp))
15174 return true;
15176 if (GET_CODE (disp) != UNSPEC)
15177 return false;
15179 switch (XINT (disp, 1))
15181 case UNSPEC_GOT:
15182 if (saw_plus)
15183 return false;
15184 /* We need to check for both symbols and labels because VxWorks loads
15185 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15186 details. */
15187 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15188 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15189 case UNSPEC_GOTOFF:
15190 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15191 While the ABI also specifies a 32bit relocation, we don't produce it in
15192 the small PIC model at all. */
15193 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15194 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15195 && !TARGET_64BIT)
15196 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15197 return false;
15198 case UNSPEC_GOTTPOFF:
15199 case UNSPEC_GOTNTPOFF:
15200 case UNSPEC_INDNTPOFF:
15201 if (saw_plus)
15202 return false;
15203 disp = XVECEXP (disp, 0, 0);
15204 return (GET_CODE (disp) == SYMBOL_REF
15205 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15206 case UNSPEC_NTPOFF:
15207 disp = XVECEXP (disp, 0, 0);
15208 return (GET_CODE (disp) == SYMBOL_REF
15209 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15210 case UNSPEC_DTPOFF:
15211 disp = XVECEXP (disp, 0, 0);
15212 return (GET_CODE (disp) == SYMBOL_REF
15213 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15216 return false;
15219 /* Determine if OP is a suitable RTX for an address register.
15220 Return the naked register if a register or a register subreg is
15221 found, otherwise return NULL_RTX. */
15223 static rtx
15224 ix86_validate_address_register (rtx op)
15226 machine_mode mode = GET_MODE (op);
15228 /* Only SImode or DImode registers can form the address. */
15229 if (mode != SImode && mode != DImode)
15230 return NULL_RTX;
15232 if (REG_P (op))
15233 return op;
15234 else if (SUBREG_P (op))
15236 rtx reg = SUBREG_REG (op);
15238 if (!REG_P (reg))
15239 return NULL_RTX;
15241 mode = GET_MODE (reg);
15243 /* Don't allow SUBREGs that span more than a word. It can
15244 lead to spill failures when the register is one word out
15245 of a two word structure. */
15246 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15247 return NULL_RTX;
15249 /* Allow only SUBREGs of non-eliminable hard registers. */
15250 if (register_no_elim_operand (reg, mode))
15251 return reg;
15254 /* Op is not a register. */
15255 return NULL_RTX;
15258 /* Recognizes RTL expressions that are valid memory addresses for an
15259 instruction. The MODE argument is the machine mode for the MEM
15260 expression that wants to use this address.
15262 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15263 convert common non-canonical forms to canonical form so that they will
15264 be recognized. */
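/* For example (illustrative), a base + scaled-index + displacement address
   such as 8(%eax,%ebx,4) is recognized in the canonical form
   (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8)).  */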
15266 static bool
15267 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15269 struct ix86_address parts;
15270 rtx base, index, disp;
15271 HOST_WIDE_INT scale;
15272 addr_space_t seg;
15274 if (ix86_decompose_address (addr, &parts) <= 0)
15275 /* Decomposition failed. */
15276 return false;
15278 base = parts.base;
15279 index = parts.index;
15280 disp = parts.disp;
15281 scale = parts.scale;
15282 seg = parts.seg;
15284 /* Validate base register. */
15285 if (base)
15287 rtx reg = ix86_validate_address_register (base);
15289 if (reg == NULL_RTX)
15290 return false;
15292 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15293 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15294 /* Base is not valid. */
15295 return false;
15298 /* Validate index register. */
15299 if (index)
15301 rtx reg = ix86_validate_address_register (index);
15303 if (reg == NULL_RTX)
15304 return false;
15306 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15307 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15308 /* Index is not valid. */
15309 return false;
15312 /* Index and base should have the same mode. */
15313 if (base && index
15314 && GET_MODE (base) != GET_MODE (index))
15315 return false;
15317 /* Address override works only on the (%reg) part of %fs:(%reg). */
15318 if (seg != ADDR_SPACE_GENERIC
15319 && ((base && GET_MODE (base) != word_mode)
15320 || (index && GET_MODE (index) != word_mode)))
15321 return false;
15323 /* Validate scale factor. */
15324 if (scale != 1)
15326 if (!index)
15327 /* Scale without index. */
15328 return false;
15330 if (scale != 2 && scale != 4 && scale != 8)
15331 /* Scale is not a valid multiplier. */
15332 return false;
15335 /* Validate displacement. */
15336 if (disp)
15338 if (GET_CODE (disp) == CONST
15339 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15340 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15341 switch (XINT (XEXP (disp, 0), 1))
15343 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15344 when used. While the ABI also specifies 32bit relocations, we
15345 don't produce them at all and use IP-relative addressing instead.
15346 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15347 should be loaded via the GOT. */
15348 case UNSPEC_GOT:
15349 if (!TARGET_64BIT
15350 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15351 goto is_legitimate_pic;
15352 /* FALLTHRU */
15353 case UNSPEC_GOTOFF:
15354 gcc_assert (flag_pic);
15355 if (!TARGET_64BIT)
15356 goto is_legitimate_pic;
15358 /* 64bit address unspec. */
15359 return false;
15361 case UNSPEC_GOTPCREL:
15362 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15363 goto is_legitimate_pic;
15364 /* FALLTHRU */
15365 case UNSPEC_PCREL:
15366 gcc_assert (flag_pic);
15367 goto is_legitimate_pic;
15369 case UNSPEC_GOTTPOFF:
15370 case UNSPEC_GOTNTPOFF:
15371 case UNSPEC_INDNTPOFF:
15372 case UNSPEC_NTPOFF:
15373 case UNSPEC_DTPOFF:
15374 break;
15376 default:
15377 /* Invalid address unspec. */
15378 return false;
15381 else if (SYMBOLIC_CONST (disp)
15382 && (flag_pic
15383 || (TARGET_MACHO
15384 #if TARGET_MACHO
15385 && MACHOPIC_INDIRECT
15386 && !machopic_operand_p (disp)
15387 #endif
15391 is_legitimate_pic:
15392 if (TARGET_64BIT && (index || base))
15394 /* foo@dtpoff(%rX) is ok. */
15395 if (GET_CODE (disp) != CONST
15396 || GET_CODE (XEXP (disp, 0)) != PLUS
15397 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15398 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15399 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15400 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15401 /* Non-constant pic memory reference. */
15402 return false;
15404 else if ((!TARGET_MACHO || flag_pic)
15405 && ! legitimate_pic_address_disp_p (disp))
15406 /* Displacement is an invalid pic construct. */
15407 return false;
15408 #if TARGET_MACHO
15409 else if (MACHO_DYNAMIC_NO_PIC_P
15410 && !ix86_legitimate_constant_p (Pmode, disp))
15411 /* displacement must be referenced via a non_lazy_pointer */
15412 return false;
15413 #endif
15415 /* This code used to verify that a symbolic pic displacement
15416 includes the pic_offset_table_rtx register.
15418 While this is a good idea, unfortunately these constructs may
15419 be created by the "adds using lea" optimization for incorrect
15420 code like:
15422 int a;
15423 int foo (int i)
15425 { return *(&a + i); }
15428 This code is nonsensical, but results in addressing the
15429 GOT table with a pic_offset_table_rtx base. We can't
15430 just refuse it easily, since it gets matched by the
15431 "addsi3" pattern, which later gets split to an lea when the
15432 output register differs from the input. While this
15433 could be handled by a separate addsi pattern for this case
15434 that never results in an lea, disabling this test seems to be
15435 the easier and correct fix for the crash. */
15437 else if (GET_CODE (disp) != LABEL_REF
15438 && !CONST_INT_P (disp)
15439 && (GET_CODE (disp) != CONST
15440 || !ix86_legitimate_constant_p (Pmode, disp))
15441 && (GET_CODE (disp) != SYMBOL_REF
15442 || !ix86_legitimate_constant_p (Pmode, disp)))
15443 /* Displacement is not constant. */
15444 return false;
15445 else if (TARGET_64BIT
15446 && !x86_64_immediate_operand (disp, VOIDmode))
15447 /* Displacement is out of range. */
15448 return false;
15449 /* In x32 mode, constant addresses are sign extended to 64bit, so
15450 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15451 else if (TARGET_X32 && !(index || base)
15452 && CONST_INT_P (disp)
15453 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15454 return false;
15457 /* Everything looks valid. */
15458 return true;
15461 /* Determine if a given RTX is a valid constant address. */
15463 bool
15464 constant_address_p (rtx x)
15466 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15469 /* Return a unique alias set for the GOT. */
15471 static alias_set_type
15472 ix86_GOT_alias_set (void)
15474 static alias_set_type set = -1;
15475 if (set == -1)
15476 set = new_alias_set ();
15477 return set;
15480 /* Return a legitimate reference for ORIG (an address) using the
15481 register REG. If REG is 0, a new pseudo is generated.
15483 There are two types of references that must be handled:
15485 1. Global data references must load the address from the GOT, via
15486 the PIC reg. An insn is emitted to do this load, and the reg is
15487 returned.
15489 2. Static data references, constant pool addresses, and code labels
15490 compute the address as an offset from the GOT, whose base is in
15491 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15492 differentiate them from global data objects. The returned
15493 address is the PIC reg + an unspec constant.
15495 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15496 reg also appears in the address. */
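/* Illustrative 32-bit examples (actual output depends on target options):
       movl	foo@GOT(%ebx), %eax	# global data: address loaded from the GOT
       leal	bar@GOTOFF(%ebx), %eax	# static/local data: GOT-relative offset  */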
15498 static rtx
15499 legitimize_pic_address (rtx orig, rtx reg)
15501 rtx addr = orig;
15502 rtx new_rtx = orig;
15504 #if TARGET_MACHO
15505 if (TARGET_MACHO && !TARGET_64BIT)
15507 if (reg == 0)
15508 reg = gen_reg_rtx (Pmode);
15509 /* Use the generic Mach-O PIC machinery. */
15510 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15512 #endif
15514 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15516 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15517 if (tmp)
15518 return tmp;
15521 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15522 new_rtx = addr;
15523 else if ((!TARGET_64BIT
15524 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15525 && !TARGET_PECOFF
15526 && gotoff_operand (addr, Pmode))
15528 /* This symbol may be referenced via a displacement
15529 from the PIC base address (@GOTOFF). */
15530 if (GET_CODE (addr) == CONST)
15531 addr = XEXP (addr, 0);
15533 if (GET_CODE (addr) == PLUS)
15535 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15536 UNSPEC_GOTOFF);
15537 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15539 else
15540 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15542 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15544 if (TARGET_64BIT)
15545 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15547 if (reg != 0)
15549 gcc_assert (REG_P (reg));
15550 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15551 new_rtx, reg, 1, OPTAB_DIRECT);
15553 else
15554 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15556 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15557 /* We can't use @GOTOFF for text labels
15558 on VxWorks, see gotoff_operand. */
15559 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15561 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15562 if (tmp)
15563 return tmp;
15565 /* For x64 PE-COFF there is no GOT table,
15566 so we use the address directly. */
15567 if (TARGET_64BIT && TARGET_PECOFF)
15569 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15570 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15572 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15574 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15575 UNSPEC_GOTPCREL);
15576 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15577 new_rtx = gen_const_mem (Pmode, new_rtx);
15578 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15580 else
15582 /* This symbol must be referenced via a load
15583 from the Global Offset Table (@GOT). */
15584 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15585 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15586 if (TARGET_64BIT)
15587 new_rtx = force_reg (Pmode, new_rtx);
15588 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15589 new_rtx = gen_const_mem (Pmode, new_rtx);
15590 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15593 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15595 else
15597 if (CONST_INT_P (addr)
15598 && !x86_64_immediate_operand (addr, VOIDmode))
15599 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15600 else if (GET_CODE (addr) == CONST)
15602 addr = XEXP (addr, 0);
15604 /* We must match stuff we generate before. Assume the only
15605 unspecs that can get here are ours. Not that we could do
15606 anything with them anyway.... */
15607 if (GET_CODE (addr) == UNSPEC
15608 || (GET_CODE (addr) == PLUS
15609 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15610 return orig;
15611 gcc_assert (GET_CODE (addr) == PLUS);
15614 if (GET_CODE (addr) == PLUS)
15616 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15618 /* Check first to see if this is a constant
15619 offset from a @GOTOFF symbol reference. */
15620 if (!TARGET_PECOFF
15621 && gotoff_operand (op0, Pmode)
15622 && CONST_INT_P (op1))
15624 if (!TARGET_64BIT)
15626 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15627 UNSPEC_GOTOFF);
15628 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15629 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15631 if (reg != 0)
15633 gcc_assert (REG_P (reg));
15634 new_rtx = expand_simple_binop (Pmode, PLUS,
15635 pic_offset_table_rtx,
15636 new_rtx, reg, 1,
15637 OPTAB_DIRECT);
15639 else
15640 new_rtx
15641 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15643 else
15645 if (INTVAL (op1) < -16*1024*1024
15646 || INTVAL (op1) >= 16*1024*1024)
15648 if (!x86_64_immediate_operand (op1, Pmode))
15649 op1 = force_reg (Pmode, op1);
15651 new_rtx
15652 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15656 else
15658 rtx base = legitimize_pic_address (op0, reg);
15659 machine_mode mode = GET_MODE (base);
15660 new_rtx
15661 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15663 if (CONST_INT_P (new_rtx))
15665 if (INTVAL (new_rtx) < -16*1024*1024
15666 || INTVAL (new_rtx) >= 16*1024*1024)
15668 if (!x86_64_immediate_operand (new_rtx, mode))
15669 new_rtx = force_reg (mode, new_rtx);
15671 new_rtx
15672 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15674 else
15675 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15677 else
15679 /* For %rip addressing, we have to use
15680 just disp32, with neither base nor index. */
15681 if (TARGET_64BIT
15682 && (GET_CODE (base) == SYMBOL_REF
15683 || GET_CODE (base) == LABEL_REF))
15684 base = force_reg (mode, base);
15685 if (GET_CODE (new_rtx) == PLUS
15686 && CONSTANT_P (XEXP (new_rtx, 1)))
15688 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15689 new_rtx = XEXP (new_rtx, 1);
15691 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15696 return new_rtx;
15699 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15701 static rtx
15702 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15704 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15706 if (GET_MODE (tp) != tp_mode)
15708 gcc_assert (GET_MODE (tp) == SImode);
15709 gcc_assert (tp_mode == DImode);
15711 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15714 if (to_reg)
15715 tp = copy_to_mode_reg (tp_mode, tp);
15717 return tp;
15720 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15722 static GTY(()) rtx ix86_tls_symbol;
15724 static rtx
15725 ix86_tls_get_addr (void)
15727 if (!ix86_tls_symbol)
15729 const char *sym
15730 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15731 ? "___tls_get_addr" : "__tls_get_addr");
15733 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15736 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15738 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15739 UNSPEC_PLTOFF);
15740 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15741 gen_rtx_CONST (Pmode, unspec));
15744 return ix86_tls_symbol;
15747 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15749 static GTY(()) rtx ix86_tls_module_base_symbol;
15751 rtx
15752 ix86_tls_module_base (void)
15754 if (!ix86_tls_module_base_symbol)
15756 ix86_tls_module_base_symbol
15757 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15759 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15760 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15763 return ix86_tls_module_base_symbol;
15766 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15767 false if we expect this to be used for a memory address and true if
15768 we expect to load the address into a register. */
15770 static rtx
15771 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15773 rtx dest, base, off;
15774 rtx pic = NULL_RTX, tp = NULL_RTX;
15775 machine_mode tp_mode = Pmode;
15776 int type;
15778 /* Fall back to the global dynamic model if the toolchain cannot support local
15779 dynamic. */
15780 if (TARGET_SUN_TLS && !TARGET_64BIT
15781 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15782 && model == TLS_MODEL_LOCAL_DYNAMIC)
15783 model = TLS_MODEL_GLOBAL_DYNAMIC;
15785 switch (model)
15787 case TLS_MODEL_GLOBAL_DYNAMIC:
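/* Illustrative 32-bit GNU-TLS global-dynamic sequence (the exact code
   emitted below depends on -mtls-dialect, -m64 and PIC settings):
       leal	x@tlsgd(,%ebx,1), %eax
       call	___tls_get_addr@PLT  */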
15788 dest = gen_reg_rtx (Pmode);
15790 if (!TARGET_64BIT)
15792 if (flag_pic && !TARGET_PECOFF)
15793 pic = pic_offset_table_rtx;
15794 else
15796 pic = gen_reg_rtx (Pmode);
15797 emit_insn (gen_set_got (pic));
15801 if (TARGET_GNU2_TLS)
15803 if (TARGET_64BIT)
15804 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15805 else
15806 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15808 tp = get_thread_pointer (Pmode, true);
15809 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15811 if (GET_MODE (x) != Pmode)
15812 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15814 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15816 else
15818 rtx caddr = ix86_tls_get_addr ();
15820 if (TARGET_64BIT)
15822 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15823 rtx_insn *insns;
15825 start_sequence ();
15826 emit_call_insn
15827 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15828 insns = get_insns ();
15829 end_sequence ();
15831 if (GET_MODE (x) != Pmode)
15832 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15834 RTL_CONST_CALL_P (insns) = 1;
15835 emit_libcall_block (insns, dest, rax, x);
15837 else
15838 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
15840 break;
15842 case TLS_MODEL_LOCAL_DYNAMIC:
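/* Illustrative 32-bit local-dynamic sequence; the single __tls_get_addr
   call can be shared by all local-dynamic accesses in the function:
       leal	x@tlsldm(%ebx), %eax
       call	___tls_get_addr@PLT
       leal	x@dtpoff(%eax), %edx  */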
15843 base = gen_reg_rtx (Pmode);
15845 if (!TARGET_64BIT)
15847 if (flag_pic)
15848 pic = pic_offset_table_rtx;
15849 else
15851 pic = gen_reg_rtx (Pmode);
15852 emit_insn (gen_set_got (pic));
15856 if (TARGET_GNU2_TLS)
15858 rtx tmp = ix86_tls_module_base ();
15860 if (TARGET_64BIT)
15861 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
15862 else
15863 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
15865 tp = get_thread_pointer (Pmode, true);
15866 set_unique_reg_note (get_last_insn (), REG_EQUAL,
15867 gen_rtx_MINUS (Pmode, tmp, tp));
15869 else
15871 rtx caddr = ix86_tls_get_addr ();
15873 if (TARGET_64BIT)
15875 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15876 rtx_insn *insns;
15877 rtx eqv;
15879 start_sequence ();
15880 emit_call_insn
15881 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
15882 insns = get_insns ();
15883 end_sequence ();
15885 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
15886 share the LD_BASE result with other LD model accesses. */
15887 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
15888 UNSPEC_TLS_LD_BASE);
15890 RTL_CONST_CALL_P (insns) = 1;
15891 emit_libcall_block (insns, base, rax, eqv);
15893 else
15894 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
15897 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
15898 off = gen_rtx_CONST (Pmode, off);
15900 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
15902 if (TARGET_GNU2_TLS)
15904 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
15906 if (GET_MODE (x) != Pmode)
15907 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15909 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15911 break;
15913 case TLS_MODEL_INITIAL_EXEC:
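/* Initial-exec: the variable's offset from the thread pointer is loaded
   from the GOT (via @gottpoff / @gotntpoff / @indntpoff below) and combined
   with the %fs/%gs thread pointer; the exact sequence depends on PIC and
   the GNU-TLS variant.  */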
15914 if (TARGET_64BIT)
15916 if (TARGET_SUN_TLS && !TARGET_X32)
15918 /* The Sun linker took the AMD64 TLS spec literally
15919 and can only handle %rax as destination of the
15920 initial executable code sequence. */
15922 dest = gen_reg_rtx (DImode);
15923 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
15924 return dest;
15927 /* Generate DImode references to avoid %fs:(%reg32)
15928 problems and linker IE->LE relaxation bug. */
15929 tp_mode = DImode;
15930 pic = NULL;
15931 type = UNSPEC_GOTNTPOFF;
15933 else if (flag_pic)
15935 pic = pic_offset_table_rtx;
15936 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
15938 else if (!TARGET_ANY_GNU_TLS)
15940 pic = gen_reg_rtx (Pmode);
15941 emit_insn (gen_set_got (pic));
15942 type = UNSPEC_GOTTPOFF;
15944 else
15946 pic = NULL;
15947 type = UNSPEC_INDNTPOFF;
15950 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
15951 off = gen_rtx_CONST (tp_mode, off);
15952 if (pic)
15953 off = gen_rtx_PLUS (tp_mode, pic, off);
15954 off = gen_const_mem (tp_mode, off);
15955 set_mem_alias_set (off, ix86_GOT_alias_set ());
15957 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
15959 base = get_thread_pointer (tp_mode,
15960 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
15961 off = force_reg (tp_mode, off);
15962 dest = gen_rtx_PLUS (tp_mode, base, off);
15963 if (tp_mode != Pmode)
15964 dest = convert_to_mode (Pmode, dest, 1);
15966 else
15968 base = get_thread_pointer (Pmode, true);
15969 dest = gen_reg_rtx (Pmode);
15970 emit_insn (ix86_gen_sub3 (dest, base, off));
15972 break;
15974 case TLS_MODEL_LOCAL_EXEC:
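/* Local-exec: the offset is a link-time constant, so an access can be as
   simple as (illustrative, GNU-TLS with direct segment references):
       movl	%gs:x@ntpoff, %eax
   or a @tpoff value subtracted from the thread pointer otherwise.  */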
15975 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
15976 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
15977 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
15978 off = gen_rtx_CONST (Pmode, off);
15980 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
15982 base = get_thread_pointer (Pmode,
15983 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
15984 return gen_rtx_PLUS (Pmode, base, off);
15986 else
15988 base = get_thread_pointer (Pmode, true);
15989 dest = gen_reg_rtx (Pmode);
15990 emit_insn (ix86_gen_sub3 (dest, base, off));
15992 break;
15994 default:
15995 gcc_unreachable ();
15998 return dest;
16001 /* Return true if OP refers to a TLS address. */
16002 bool
16003 ix86_tls_address_pattern_p (rtx op)
16005 subrtx_var_iterator::array_type array;
16006 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16008 rtx op = *iter;
16009 if (MEM_P (op))
16011 rtx *x = &XEXP (op, 0);
16012 while (GET_CODE (*x) == PLUS)
16014 int i;
16015 for (i = 0; i < 2; i++)
16017 rtx u = XEXP (*x, i);
16018 if (GET_CODE (u) == ZERO_EXTEND)
16019 u = XEXP (u, 0);
16020 if (GET_CODE (u) == UNSPEC
16021 && XINT (u, 1) == UNSPEC_TP)
16022 return true;
16024 x = &XEXP (*x, 0);
16027 iter.skip_subrtxes ();
16031 return false;
16034 /* Rewrite *LOC so that it refers to a default TLS address space. */
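/* E.g. (illustrative) a memory address of the form
   (plus (unspec [(const_int 0)] UNSPEC_TP) (reg)) is rewritten to just
   (reg), with the MEM moved into the default TLS segment (%fs / %gs)
   address space.  */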
16035 void
16036 ix86_rewrite_tls_address_1 (rtx *loc)
16038 subrtx_ptr_iterator::array_type array;
16039 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16041 rtx *loc = *iter;
16042 if (MEM_P (*loc))
16044 rtx addr = XEXP (*loc, 0);
16045 rtx *x = &addr;
16046 while (GET_CODE (*x) == PLUS)
16048 int i;
16049 for (i = 0; i < 2; i++)
16051 rtx u = XEXP (*x, i);
16052 if (GET_CODE (u) == ZERO_EXTEND)
16053 u = XEXP (u, 0);
16054 if (GET_CODE (u) == UNSPEC
16055 && XINT (u, 1) == UNSPEC_TP)
16057 addr_space_t as = DEFAULT_TLS_SEG_REG;
16059 *x = XEXP (*x, 1 - i);
16061 *loc = replace_equiv_address_nv (*loc, addr, true);
16062 set_mem_addr_space (*loc, as);
16063 return;
16066 x = &XEXP (*x, 0);
16069 iter.skip_subrtxes ();
16074 /* Rewrite an instruction pattern involving a TLS address
16075 so that it refers to the default TLS address space. */
16076 rtx
16077 ix86_rewrite_tls_address (rtx pattern)
16079 pattern = copy_insn (pattern);
16080 ix86_rewrite_tls_address_1 (&pattern);
16081 return pattern;
16084 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16085 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16086 unique refptr-DECL symbol corresponding to symbol DECL. */
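/* For example (illustrative): a reference to a dllimported function foo is
   routed through the import-library pointer __imp_foo, i.e. the address is
   loaded from that pointer variable instead of being referenced directly.  */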
16088 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16090 static inline hashval_t hash (tree_map *m) { return m->hash; }
16091 static inline bool
16092 equal (tree_map *a, tree_map *b)
16094 return a->base.from == b->base.from;
16097 static int
16098 keep_cache_entry (tree_map *&m)
16100 return ggc_marked_p (m->base.from);
16104 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16106 static tree
16107 get_dllimport_decl (tree decl, bool beimport)
16109 struct tree_map *h, in;
16110 const char *name;
16111 const char *prefix;
16112 size_t namelen, prefixlen;
16113 char *imp_name;
16114 tree to;
16115 rtx rtl;
16117 if (!dllimport_map)
16118 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16120 in.hash = htab_hash_pointer (decl);
16121 in.base.from = decl;
16122 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16123 h = *loc;
16124 if (h)
16125 return h->to;
16127 *loc = h = ggc_alloc<tree_map> ();
16128 h->hash = in.hash;
16129 h->base.from = decl;
16130 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16131 VAR_DECL, NULL, ptr_type_node);
16132 DECL_ARTIFICIAL (to) = 1;
16133 DECL_IGNORED_P (to) = 1;
16134 DECL_EXTERNAL (to) = 1;
16135 TREE_READONLY (to) = 1;
16137 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16138 name = targetm.strip_name_encoding (name);
16139 if (beimport)
16140 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16141 ? "*__imp_" : "*__imp__";
16142 else
16143 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16144 namelen = strlen (name);
16145 prefixlen = strlen (prefix);
16146 imp_name = (char *) alloca (namelen + prefixlen + 1);
16147 memcpy (imp_name, prefix, prefixlen);
16148 memcpy (imp_name + prefixlen, name, namelen + 1);
16150 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16151 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16152 SET_SYMBOL_REF_DECL (rtl, to);
16153 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16154 if (!beimport)
16156 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16157 #ifdef SUB_TARGET_RECORD_STUB
16158 SUB_TARGET_RECORD_STUB (name);
16159 #endif
16162 rtl = gen_const_mem (Pmode, rtl);
16163 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16165 SET_DECL_RTL (to, rtl);
16166 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16168 return to;
16171 /* Expand SYMBOL into its corresponding far-address symbol.
16172 WANT_REG is true if we require the result be a register. */
16174 static rtx
16175 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16177 tree imp_decl;
16178 rtx x;
16180 gcc_assert (SYMBOL_REF_DECL (symbol));
16181 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16183 x = DECL_RTL (imp_decl);
16184 if (want_reg)
16185 x = force_reg (Pmode, x);
16186 return x;
16189 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16190 true if we require the result be a register. */
16192 static rtx
16193 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16195 tree imp_decl;
16196 rtx x;
16198 gcc_assert (SYMBOL_REF_DECL (symbol));
16199 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16201 x = DECL_RTL (imp_decl);
16202 if (want_reg)
16203 x = force_reg (Pmode, x);
16204 return x;
16207 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16208 is true if we require the result be a register. */
16210 static rtx
16211 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16213 if (!TARGET_PECOFF)
16214 return NULL_RTX;
16216 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16218 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16219 return legitimize_dllimport_symbol (addr, inreg);
16220 if (GET_CODE (addr) == CONST
16221 && GET_CODE (XEXP (addr, 0)) == PLUS
16222 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16223 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16225 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16226 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16230 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16231 return NULL_RTX;
16232 if (GET_CODE (addr) == SYMBOL_REF
16233 && !is_imported_p (addr)
16234 && SYMBOL_REF_EXTERNAL_P (addr)
16235 && SYMBOL_REF_DECL (addr))
16236 return legitimize_pe_coff_extern_decl (addr, inreg);
16238 if (GET_CODE (addr) == CONST
16239 && GET_CODE (XEXP (addr, 0)) == PLUS
16240 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16241 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16242 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16243 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16245 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16246 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16248 return NULL_RTX;
16251 /* Try machine-dependent ways of modifying an illegitimate address
16252 to be legitimate. If we find one, return the new, valid address.
16253 This macro is used in only one place: `memory_address' in explow.c.
16255 OLDX is the address as it was before break_out_memory_refs was called.
16256 In some cases it is useful to look at this to decide what needs to be done.
16258 It is always safe for this macro to do nothing. It exists to recognize
16259 opportunities to optimize the output.
16261 For the 80386, we handle X+REG by loading X into a register R and
16262 using R+REG. R will go in a general reg and indexing will be used.
16263 However, if REG is a broken-out memory address or multiplication,
16264 nothing needs to be done because REG can certainly go in a general reg.
16266 When -fpic is used, special handling is needed for symbolic references.
16267 See comments by legitimize_pic_address in i386.c for details. */
16269 static rtx
16270 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16272 bool changed = false;
16273 unsigned log;
16275 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16276 if (log)
16277 return legitimize_tls_address (x, (enum tls_model) log, false);
16278 if (GET_CODE (x) == CONST
16279 && GET_CODE (XEXP (x, 0)) == PLUS
16280 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16281 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16283 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16284 (enum tls_model) log, false);
16285 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16288 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16290 rtx tmp = legitimize_pe_coff_symbol (x, true);
16291 if (tmp)
16292 return tmp;
16295 if (flag_pic && SYMBOLIC_CONST (x))
16296 return legitimize_pic_address (x, 0);
16298 #if TARGET_MACHO
16299 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16300 return machopic_indirect_data_reference (x, 0);
16301 #endif
16303 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
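/* E.g. (illustrative) (ashift (reg) (const_int 2)) becomes
   (mult (reg) (const_int 4)), which matches the scaled-index form used by
   address decomposition.  */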
16304 if (GET_CODE (x) == ASHIFT
16305 && CONST_INT_P (XEXP (x, 1))
16306 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16308 changed = true;
16309 log = INTVAL (XEXP (x, 1));
16310 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16311 GEN_INT (1 << log));
16314 if (GET_CODE (x) == PLUS)
16316 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16318 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16319 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16320 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16322 changed = true;
16323 log = INTVAL (XEXP (XEXP (x, 0), 1));
16324 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16325 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16326 GEN_INT (1 << log));
16329 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16330 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16331 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16333 changed = true;
16334 log = INTVAL (XEXP (XEXP (x, 1), 1));
16335 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16336 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16337 GEN_INT (1 << log));
16340 /* Put multiply first if it isn't already. */
16341 if (GET_CODE (XEXP (x, 1)) == MULT)
16343 std::swap (XEXP (x, 0), XEXP (x, 1));
16344 changed = true;
16347 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16348 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16349 created by virtual register instantiation, register elimination, and
16350 similar optimizations. */
16351 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16353 changed = true;
16354 x = gen_rtx_PLUS (Pmode,
16355 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16356 XEXP (XEXP (x, 1), 0)),
16357 XEXP (XEXP (x, 1), 1));
16360 /* Canonicalize
16361 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16362 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16363 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16364 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16365 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16366 && CONSTANT_P (XEXP (x, 1)))
16368 rtx constant;
16369 rtx other = NULL_RTX;
16371 if (CONST_INT_P (XEXP (x, 1)))
16373 constant = XEXP (x, 1);
16374 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16376 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16378 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16379 other = XEXP (x, 1);
16381 else
16382 constant = 0;
16384 if (constant)
16386 changed = true;
16387 x = gen_rtx_PLUS (Pmode,
16388 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16389 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16390 plus_constant (Pmode, other,
16391 INTVAL (constant)));
16395 if (changed && ix86_legitimate_address_p (mode, x, false))
16396 return x;
16398 if (GET_CODE (XEXP (x, 0)) == MULT)
16400 changed = true;
16401 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16404 if (GET_CODE (XEXP (x, 1)) == MULT)
16406 changed = true;
16407 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16410 if (changed
16411 && REG_P (XEXP (x, 1))
16412 && REG_P (XEXP (x, 0)))
16413 return x;
16415 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16417 changed = true;
16418 x = legitimize_pic_address (x, 0);
16421 if (changed && ix86_legitimate_address_p (mode, x, false))
16422 return x;
16424 if (REG_P (XEXP (x, 0)))
16426 rtx temp = gen_reg_rtx (Pmode);
16427 rtx val = force_operand (XEXP (x, 1), temp);
16428 if (val != temp)
16430 val = convert_to_mode (Pmode, val, 1);
16431 emit_move_insn (temp, val);
16434 XEXP (x, 1) = temp;
16435 return x;
16438 else if (REG_P (XEXP (x, 1)))
16440 rtx temp = gen_reg_rtx (Pmode);
16441 rtx val = force_operand (XEXP (x, 0), temp);
16442 if (val != temp)
16444 val = convert_to_mode (Pmode, val, 1);
16445 emit_move_insn (temp, val);
16448 XEXP (x, 0) = temp;
16449 return x;
16453 return x;
16456 /* Print an integer constant expression in assembler syntax. Addition
16457 and subtraction are the only arithmetic that may appear in these
16458 expressions. FILE is the stdio stream to write to, X is the rtx, and
16459 CODE is the operand print code from the output string. */
16461 static void
16462 output_pic_addr_const (FILE *file, rtx x, int code)
16464 char buf[256];
16466 switch (GET_CODE (x))
16468 case PC:
16469 gcc_assert (flag_pic);
16470 putc ('.', file);
16471 break;
16473 case SYMBOL_REF:
16474 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16475 output_addr_const (file, x);
16476 else
16478 const char *name = XSTR (x, 0);
16480 /* Mark the decl as referenced so that cgraph will
16481 output the function. */
16482 if (SYMBOL_REF_DECL (x))
16483 mark_decl_referenced (SYMBOL_REF_DECL (x));
16485 #if TARGET_MACHO
16486 if (MACHOPIC_INDIRECT
16487 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16488 name = machopic_indirection_name (x, /*stub_p=*/true);
16489 #endif
16490 assemble_name (file, name);
16492 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16493 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16494 fputs ("@PLT", file);
16495 break;
16497 case LABEL_REF:
16498 x = XEXP (x, 0);
16499 /* FALLTHRU */
16500 case CODE_LABEL:
16501 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16502 assemble_name (asm_out_file, buf);
16503 break;
16505 case CONST_INT:
16506 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16507 break;
16509 case CONST:
16510 /* This used to output parentheses around the expression,
16511 but that does not work on the 386 (either ATT or BSD assembler). */
16512 output_pic_addr_const (file, XEXP (x, 0), code);
16513 break;
16515 case CONST_DOUBLE:
16516 /* We can't handle floating point constants;
16517 TARGET_PRINT_OPERAND must handle them. */
16518 output_operand_lossage ("floating constant misused");
16519 break;
16521 case PLUS:
16522 /* Some assemblers need integer constants to appear first. */
16523 if (CONST_INT_P (XEXP (x, 0)))
16525 output_pic_addr_const (file, XEXP (x, 0), code);
16526 putc ('+', file);
16527 output_pic_addr_const (file, XEXP (x, 1), code);
16529 else
16531 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16532 output_pic_addr_const (file, XEXP (x, 1), code);
16533 putc ('+', file);
16534 output_pic_addr_const (file, XEXP (x, 0), code);
16536 break;
16538 case MINUS:
16539 if (!TARGET_MACHO)
16540 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16541 output_pic_addr_const (file, XEXP (x, 0), code);
16542 putc ('-', file);
16543 output_pic_addr_const (file, XEXP (x, 1), code);
16544 if (!TARGET_MACHO)
16545 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16546 break;
16548 case UNSPEC:
16549 gcc_assert (XVECLEN (x, 0) == 1);
16550 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16551 switch (XINT (x, 1))
16553 case UNSPEC_GOT:
16554 fputs ("@GOT", file);
16555 break;
16556 case UNSPEC_GOTOFF:
16557 fputs ("@GOTOFF", file);
16558 break;
16559 case UNSPEC_PLTOFF:
16560 fputs ("@PLTOFF", file);
16561 break;
16562 case UNSPEC_PCREL:
16563 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16564 "(%rip)" : "[rip]", file);
16565 break;
16566 case UNSPEC_GOTPCREL:
16567 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16568 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16569 break;
16570 case UNSPEC_GOTTPOFF:
16571 /* FIXME: This might be @TPOFF in Sun ld too. */
16572 fputs ("@gottpoff", file);
16573 break;
16574 case UNSPEC_TPOFF:
16575 fputs ("@tpoff", file);
16576 break;
16577 case UNSPEC_NTPOFF:
16578 if (TARGET_64BIT)
16579 fputs ("@tpoff", file);
16580 else
16581 fputs ("@ntpoff", file);
16582 break;
16583 case UNSPEC_DTPOFF:
16584 fputs ("@dtpoff", file);
16585 break;
16586 case UNSPEC_GOTNTPOFF:
16587 if (TARGET_64BIT)
16588 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16589 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16590 else
16591 fputs ("@gotntpoff", file);
16592 break;
16593 case UNSPEC_INDNTPOFF:
16594 fputs ("@indntpoff", file);
16595 break;
16596 #if TARGET_MACHO
16597 case UNSPEC_MACHOPIC_OFFSET:
16598 putc ('-', file);
16599 machopic_output_function_base_name (file);
16600 break;
16601 #endif
16602 default:
16603 output_operand_lossage ("invalid UNSPEC as operand");
16604 break;
16606 break;
16608 default:
16609 output_operand_lossage ("invalid expression as operand");
16613 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16614 We need to emit DTP-relative relocations. */
16616 static void ATTRIBUTE_UNUSED
16617 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16619 fputs (ASM_LONG, file);
16620 output_addr_const (file, x);
16621 fputs ("@dtpoff", file);
16622 switch (size)
16624 case 4:
16625 break;
16626 case 8:
16627 fputs (", 0", file);
16628 break;
16629 default:
16630 gcc_unreachable ();
16634 /* Return true if X is a representation of the PIC register. This copes
16635 with calls from ix86_find_base_term, where the register might have
16636 been replaced by a cselib value. */
16638 static bool
16639 ix86_pic_register_p (rtx x)
16641 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16642 return (pic_offset_table_rtx
16643 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16644 else if (!REG_P (x))
16645 return false;
16646 else if (pic_offset_table_rtx)
16648 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16649 return true;
16650 if (HARD_REGISTER_P (x)
16651 && !HARD_REGISTER_P (pic_offset_table_rtx)
16652 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16653 return true;
16654 return false;
16656 else
16657 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16660 /* Helper function for ix86_delegitimize_address.
16661 Attempt to delegitimize TLS local-exec accesses. */
16663 static rtx
16664 ix86_delegitimize_tls_address (rtx orig_x)
16666 rtx x = orig_x, unspec;
16667 struct ix86_address addr;
16669 if (!TARGET_TLS_DIRECT_SEG_REFS)
16670 return orig_x;
16671 if (MEM_P (x))
16672 x = XEXP (x, 0);
16673 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16674 return orig_x;
16675 if (ix86_decompose_address (x, &addr) == 0
16676 || addr.seg != DEFAULT_TLS_SEG_REG
16677 || addr.disp == NULL_RTX
16678 || GET_CODE (addr.disp) != CONST)
16679 return orig_x;
16680 unspec = XEXP (addr.disp, 0);
16681 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16682 unspec = XEXP (unspec, 0);
16683 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16684 return orig_x;
16685 x = XVECEXP (unspec, 0, 0);
16686 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16687 if (unspec != XEXP (addr.disp, 0))
16688 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16689 if (addr.index)
16691 rtx idx = addr.index;
16692 if (addr.scale != 1)
16693 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16694 x = gen_rtx_PLUS (Pmode, idx, x);
16696 if (addr.base)
16697 x = gen_rtx_PLUS (Pmode, addr.base, x);
16698 if (MEM_P (orig_x))
16699 x = replace_equiv_address_nv (orig_x, x);
16700 return x;
16703 /* In the name of slightly smaller debug output, and to cater to
16704 general assembler lossage, recognize PIC+GOTOFF and turn it back
16705 into a direct symbol reference.
16707 On Darwin, this is necessary to avoid a crash, because Darwin
16708 has a different PIC label for each routine but the DWARF debugging
16709 information is not associated with any particular routine, so it's
16710 necessary to remove references to the PIC label from RTL stored by
16711 the DWARF output code.
16713 This helper is used in the normal ix86_delegitimize_address
16714 entrypoint (e.g. used in the target delegitimization hook) and
16715 in ix86_find_base_term. As a compile-time memory optimization, we
16716 avoid allocating rtxes that will not change anything in the outcome
16717 of the callers (find_base_value and find_base_term). */
16719 static inline rtx
16720 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16722 rtx orig_x = delegitimize_mem_from_attrs (x);
16723 /* addend is NULL or some rtx if x is something+GOTOFF where
16724 something doesn't include the PIC register. */
16725 rtx addend = NULL_RTX;
16726 /* reg_addend is NULL or a multiple of some register. */
16727 rtx reg_addend = NULL_RTX;
16728 /* const_addend is NULL or a const_int. */
16729 rtx const_addend = NULL_RTX;
16730 /* This is the result, or NULL. */
16731 rtx result = NULL_RTX;
16733 x = orig_x;
16735 if (MEM_P (x))
16736 x = XEXP (x, 0);
16738 if (TARGET_64BIT)
16740 if (GET_CODE (x) == CONST
16741 && GET_CODE (XEXP (x, 0)) == PLUS
16742 && GET_MODE (XEXP (x, 0)) == Pmode
16743 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16744 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16745 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16747 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16748 base. A CONST can't be arg_pointer_rtx based. */
16749 if (base_term_p && MEM_P (orig_x))
16750 return orig_x;
16751 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16752 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16753 if (MEM_P (orig_x))
16754 x = replace_equiv_address_nv (orig_x, x);
16755 return x;
16758 if (GET_CODE (x) == CONST
16759 && GET_CODE (XEXP (x, 0)) == UNSPEC
16760 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16761 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16762 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16764 x = XVECEXP (XEXP (x, 0), 0, 0);
16765 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16767 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16768 if (x == NULL_RTX)
16769 return orig_x;
16771 return x;
16774 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16775 return ix86_delegitimize_tls_address (orig_x);
16777 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16778 and -mcmodel=medium -fpic. */
16781 if (GET_CODE (x) != PLUS
16782 || GET_CODE (XEXP (x, 1)) != CONST)
16783 return ix86_delegitimize_tls_address (orig_x);
16785 if (ix86_pic_register_p (XEXP (x, 0)))
16786 /* %ebx + GOT/GOTOFF */
16788 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16790 /* %ebx + %reg * scale + GOT/GOTOFF */
16791 reg_addend = XEXP (x, 0);
16792 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16793 reg_addend = XEXP (reg_addend, 1);
16794 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16795 reg_addend = XEXP (reg_addend, 0);
16796 else
16798 reg_addend = NULL_RTX;
16799 addend = XEXP (x, 0);
16802 else
16803 addend = XEXP (x, 0);
16805 x = XEXP (XEXP (x, 1), 0);
16806 if (GET_CODE (x) == PLUS
16807 && CONST_INT_P (XEXP (x, 1)))
16809 const_addend = XEXP (x, 1);
16810 x = XEXP (x, 0);
16813 if (GET_CODE (x) == UNSPEC
16814 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16815 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16816 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16817 && !MEM_P (orig_x) && !addend)))
16818 result = XVECEXP (x, 0, 0);
16820 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16821 && !MEM_P (orig_x))
16822 result = XVECEXP (x, 0, 0);
16824 if (! result)
16825 return ix86_delegitimize_tls_address (orig_x);
16827 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16828 recurse on the first operand. */
16829 if (const_addend && !base_term_p)
16830 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16831 if (reg_addend)
16832 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16833 if (addend)
16835 /* If the rest of the original X doesn't involve the PIC register, add
16836 the addend and subtract pic_offset_table_rtx. This can happen e.g.
16837 for code like:
16838 leal (%ebx, %ecx, 4), %ecx
16840 movl foo@GOTOFF(%ecx), %edx
16841 in which case we return (%ecx - %ebx) + foo
16842 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
16843 and reload has completed. */
16844 if (pic_offset_table_rtx
16845 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
16846 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
16847 pic_offset_table_rtx),
16848 result);
16849 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
16851 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
16852 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
16853 result = gen_rtx_PLUS (Pmode, tmp, result);
16855 else
16856 return orig_x;
16858 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
16860 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
16861 if (result == NULL_RTX)
16862 return orig_x;
16864 return result;
16867 /* The normal instantiation of the above template. */
16869 static rtx
16870 ix86_delegitimize_address (rtx x)
16872 return ix86_delegitimize_address_1 (x, false);
16875 /* If X is a machine specific address (i.e. a symbol or label being
16876 referenced as a displacement from the GOT implemented using an
16877 UNSPEC), then return the base term. Otherwise return X. */
16879 static rtx
16880 ix86_find_base_term (rtx x)
16882 rtx term;
16884 if (TARGET_64BIT)
16886 if (GET_CODE (x) != CONST)
16887 return x;
16888 term = XEXP (x, 0);
16889 if (GET_CODE (term) == PLUS
16890 && CONST_INT_P (XEXP (term, 1)))
16891 term = XEXP (term, 0);
16892 if (GET_CODE (term) != UNSPEC
16893 || (XINT (term, 1) != UNSPEC_GOTPCREL
16894 && XINT (term, 1) != UNSPEC_PCREL))
16895 return x;
16897 return XVECEXP (term, 0, 0);
16900 return ix86_delegitimize_address_1 (x, true);
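/* Output to FILE the condition-code suffix ("e", "ne", "g", "b", ...)
   corresponding to comparison CODE in mode MODE. REVERSE reverses the
   condition; FP selects the suffix variants suitable after a
   floating-point comparison.  */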
16903 static void
16904 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
16905 bool fp, FILE *file)
16907 const char *suffix;
16909 if (mode == CCFPmode || mode == CCFPUmode)
16911 code = ix86_fp_compare_code_to_integer (code);
16912 mode = CCmode;
16914 if (reverse)
16915 code = reverse_condition (code);
16917 switch (code)
16919 case EQ:
16920 gcc_assert (mode != CCGZmode);
16921 switch (mode)
16923 case E_CCAmode:
16924 suffix = "a";
16925 break;
16926 case E_CCCmode:
16927 suffix = "c";
16928 break;
16929 case E_CCOmode:
16930 suffix = "o";
16931 break;
16932 case E_CCPmode:
16933 suffix = "p";
16934 break;
16935 case E_CCSmode:
16936 suffix = "s";
16937 break;
16938 default:
16939 suffix = "e";
16940 break;
16942 break;
16943 case NE:
16944 gcc_assert (mode != CCGZmode);
16945 switch (mode)
16947 case E_CCAmode:
16948 suffix = "na";
16949 break;
16950 case E_CCCmode:
16951 suffix = "nc";
16952 break;
16953 case E_CCOmode:
16954 suffix = "no";
16955 break;
16956 case E_CCPmode:
16957 suffix = "np";
16958 break;
16959 case E_CCSmode:
16960 suffix = "ns";
16961 break;
16962 default:
16963 suffix = "ne";
16964 break;
16966 break;
16967 case GT:
16968 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
16969 suffix = "g";
16970 break;
16971 case GTU:
16972 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
16973 Those same assemblers have the same but opposite lossage on cmov. */
16974 if (mode == CCmode)
16975 suffix = fp ? "nbe" : "a";
16976 else
16977 gcc_unreachable ();
16978 break;
16979 case LT:
16980 switch (mode)
16982 case E_CCNOmode:
16983 case E_CCGOCmode:
16984 suffix = "s";
16985 break;
16987 case E_CCmode:
16988 case E_CCGCmode:
16989 case E_CCGZmode:
16990 suffix = "l";
16991 break;
16993 default:
16994 gcc_unreachable ();
16996 break;
16997 case LTU:
16998 if (mode == CCmode || mode == CCGZmode)
16999 suffix = "b";
17000 else if (mode == CCCmode)
17001 suffix = fp ? "b" : "c";
17002 else
17003 gcc_unreachable ();
17004 break;
17005 case GE:
17006 switch (mode)
17008 case E_CCNOmode:
17009 case E_CCGOCmode:
17010 suffix = "ns";
17011 break;
17013 case E_CCmode:
17014 case E_CCGCmode:
17015 case E_CCGZmode:
17016 suffix = "ge";
17017 break;
17019 default:
17020 gcc_unreachable ();
17022 break;
17023 case GEU:
17024 if (mode == CCmode || mode == CCGZmode)
17025 suffix = "nb";
17026 else if (mode == CCCmode)
17027 suffix = fp ? "nb" : "nc";
17028 else
17029 gcc_unreachable ();
17030 break;
17031 case LE:
17032 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17033 suffix = "le";
17034 break;
17035 case LEU:
17036 if (mode == CCmode)
17037 suffix = "be";
17038 else
17039 gcc_unreachable ();
17040 break;
17041 case UNORDERED:
17042 suffix = fp ? "u" : "p";
17043 break;
17044 case ORDERED:
17045 suffix = fp ? "nu" : "np";
17046 break;
17047 default:
17048 gcc_unreachable ();
17050 fputs (suffix, file);
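
A quick way to see these condition suffixes from user code (a minimal sketch, not part of this file; the name signed_gt is made up, and the "=@cc<cond>" flag-output constraints assume a reasonably recent GCC targeting x86):

/* Illustrative only: returns a > b via the "g" condition, the same
   suffix put_condition_code emits for GT in CCmode.  */
static inline int
signed_gt (int a, int b)
{
  int r;
  __asm__ ("cmpl %2, %1" : "=@ccg" (r) : "r" (a), "r" (b));
  return r;
}
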
17053 /* Print the name of register X to FILE based on its machine mode and number.
17054 If CODE is 'w', pretend the mode is HImode.
17055 If CODE is 'b', pretend the mode is QImode.
17056 If CODE is 'k', pretend the mode is SImode.
17057 If CODE is 'q', pretend the mode is DImode.
17058 If CODE is 'x', pretend the mode is V4SFmode.
17059 If CODE is 't', pretend the mode is V8SFmode.
17060 If CODE is 'g', pretend the mode is V16SFmode.
17061 If CODE is 'h', pretend the reg is the 'high' byte register.
17062 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17063 If CODE is 'd', duplicate the operand for an AVX instruction.
17066 void
17067 print_reg (rtx x, int code, FILE *file)
17069 const char *reg;
17070 int msize;
17071 unsigned int regno;
17072 bool duplicated;
17074 if (ASSEMBLER_DIALECT == ASM_ATT)
17075 putc ('%', file);
17077 if (x == pc_rtx)
17079 gcc_assert (TARGET_64BIT);
17080 fputs ("rip", file);
17081 return;
17084 if (code == 'y' && STACK_TOP_P (x))
17086 fputs ("st(0)", file);
17087 return;
17090 if (code == 'w')
17091 msize = 2;
17092 else if (code == 'b')
17093 msize = 1;
17094 else if (code == 'k')
17095 msize = 4;
17096 else if (code == 'q')
17097 msize = 8;
17098 else if (code == 'h')
17099 msize = 0;
17100 else if (code == 'x')
17101 msize = 16;
17102 else if (code == 't')
17103 msize = 32;
17104 else if (code == 'g')
17105 msize = 64;
17106 else
17107 msize = GET_MODE_SIZE (GET_MODE (x));
17109 regno = REGNO (x);
17111 if (regno == ARG_POINTER_REGNUM
17112 || regno == FRAME_POINTER_REGNUM
17113 || regno == FPSR_REG
17114 || regno == FPCR_REG)
17116 output_operand_lossage
17117 ("invalid use of register '%s'", reg_names[regno]);
17118 return;
17120 else if (regno == FLAGS_REG)
17122 output_operand_lossage ("invalid use of asm flag output");
17123 return;
17126 duplicated = code == 'd' && TARGET_AVX;
17128 switch (msize)
17130 case 16:
17131 case 12:
17132 case 8:
17133 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17134 warning (0, "unsupported size for integer register");
17135 /* FALLTHRU */
17136 case 4:
17137 if (LEGACY_INT_REGNO_P (regno))
17138 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17139 /* FALLTHRU */
17140 case 2:
17141 normal:
17142 reg = hi_reg_name[regno];
17143 break;
17144 case 1:
17145 if (regno >= ARRAY_SIZE (qi_reg_name))
17146 goto normal;
17147 if (!ANY_QI_REGNO_P (regno))
17148 error ("unsupported size for integer register");
17149 reg = qi_reg_name[regno];
17150 break;
17151 case 0:
17152 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17153 goto normal;
17154 reg = qi_high_reg_name[regno];
17155 break;
17156 case 32:
17157 case 64:
17158 if (SSE_REGNO_P (regno))
17160 gcc_assert (!duplicated);
17161 putc (msize == 32 ? 'y' : 'z', file);
17162 reg = hi_reg_name[regno] + 1;
17163 break;
17165 goto normal;
17166 default:
17167 gcc_unreachable ();
17170 fputs (reg, file);
17172 /* Irritatingly, AMD extended registers use a
17173 different naming convention: "r%d[bwd]". */
17174 if (REX_INT_REGNO_P (regno))
17176 gcc_assert (TARGET_64BIT);
17177 switch (msize)
17179 case 0:
17180 error ("extended registers have no high halves");
17181 break;
17182 case 1:
17183 putc ('b', file);
17184 break;
17185 case 2:
17186 putc ('w', file);
17187 break;
17188 case 4:
17189 putc ('d', file);
17190 break;
17191 case 8:
17192 /* no suffix */
17193 break;
17194 default:
17195 error ("unsupported operand size for extended register");
17196 break;
17198 return;
17201 if (duplicated)
17203 if (ASSEMBLER_DIALECT == ASM_ATT)
17204 fprintf (file, ", %%%s", reg);
17205 else
17206 fprintf (file, ", %s", reg);
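
For a concrete feel of the size overrides handled above, here is a minimal standalone sketch (not GCC code; ax_name_for_size is a made-up helper) of the AT&T spellings print_reg ends up producing for the accumulator:

/* Illustrative only, not GCC code: the AT&T names print_reg produces
   for the accumulator under the common size overrides.  */
static const char *
ax_name_for_size (int msize)
{
  switch (msize)
    {
    case 1:  return "%al";   /* code 'b' */
    case 2:  return "%ax";   /* code 'w' */
    case 4:  return "%eax";  /* code 'k' */
    case 8:  return "%rax";  /* code 'q', 64-bit only */
    default: return "%eax";
    }
}
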
17210 /* Meaning of CODE:
17211 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17212 C -- print opcode suffix for set/cmov insn.
17213 c -- like C, but print reversed condition
17214 F,f -- likewise, but for floating-point.
17215 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17216 otherwise nothing
17217 R -- print embedded rounding and sae.
17218 r -- print only sae.
17219 z -- print the opcode suffix for the size of the current operand.
17220 Z -- likewise, with special suffixes for x87 instructions.
17221 * -- print a star (in certain assembler syntax)
17222 A -- print an absolute memory reference.
17223 E -- print address with DImode register names if TARGET_64BIT.
17224 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17225 s -- print a shift double count, followed by the assembler's argument
17226 delimiter.
17227 b -- print the QImode name of the register for the indicated operand.
17228 %b0 would print %al if operands[0] is reg 0.
17229 w -- likewise, print the HImode name of the register.
17230 k -- likewise, print the SImode name of the register.
17231 q -- likewise, print the DImode name of the register.
17232 x -- likewise, print the V4SFmode name of the register.
17233 t -- likewise, print the V8SFmode name of the register.
17234 g -- likewise, print the V16SFmode name of the register.
17235 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17236 y -- print "st(0)" instead of "st" as a register.
17237 d -- print duplicated register operand for AVX instruction.
17238 D -- print condition for SSE cmp instruction.
17239 P -- if PIC, print an @PLT suffix.
17240 p -- print raw symbol name.
17241 X -- don't print any sort of PIC '@' suffix for a symbol.
17242 & -- print some in-use local-dynamic symbol name.
17243 H -- print a memory address offset by 8; used for sse high-parts
17244 Y -- print condition for XOP pcom* instruction.
17245 + -- print a branch hint as 'cs' or 'ds' prefix
17246 ; -- print a semicolon (after prefixes due to bug in older gas).
17247 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17248 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17249 ! -- print MPX prefix for jxx/call/ret instructions if required.
17252 void
17253 ix86_print_operand (FILE *file, rtx x, int code)
17255 if (code)
17257 switch (code)
17259 case 'A':
17260 switch (ASSEMBLER_DIALECT)
17262 case ASM_ATT:
17263 putc ('*', file);
17264 break;
17266 case ASM_INTEL:
17267 /* Intel syntax. For absolute addresses, registers should not
17268 be surrounded by braces. */
17269 if (!REG_P (x))
17271 putc ('[', file);
17272 ix86_print_operand (file, x, 0);
17273 putc (']', file);
17274 return;
17276 break;
17278 default:
17279 gcc_unreachable ();
17282 ix86_print_operand (file, x, 0);
17283 return;
17285 case 'E':
17286 /* Wrap address in an UNSPEC to declare special handling. */
17287 if (TARGET_64BIT)
17288 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17290 output_address (VOIDmode, x);
17291 return;
17293 case 'L':
17294 if (ASSEMBLER_DIALECT == ASM_ATT)
17295 putc ('l', file);
17296 return;
17298 case 'W':
17299 if (ASSEMBLER_DIALECT == ASM_ATT)
17300 putc ('w', file);
17301 return;
17303 case 'B':
17304 if (ASSEMBLER_DIALECT == ASM_ATT)
17305 putc ('b', file);
17306 return;
17308 case 'Q':
17309 if (ASSEMBLER_DIALECT == ASM_ATT)
17310 putc ('l', file);
17311 return;
17313 case 'S':
17314 if (ASSEMBLER_DIALECT == ASM_ATT)
17315 putc ('s', file);
17316 return;
17318 case 'T':
17319 if (ASSEMBLER_DIALECT == ASM_ATT)
17320 putc ('t', file);
17321 return;
17323 case 'O':
17324 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17325 if (ASSEMBLER_DIALECT != ASM_ATT)
17326 return;
17328 switch (GET_MODE_SIZE (GET_MODE (x)))
17330 case 2:
17331 putc ('w', file);
17332 break;
17334 case 4:
17335 putc ('l', file);
17336 break;
17338 case 8:
17339 putc ('q', file);
17340 break;
17342 default:
17343 output_operand_lossage ("invalid operand size for operand "
17344 "code 'O'");
17345 return;
17348 putc ('.', file);
17349 #endif
17350 return;
17352 case 'z':
17353 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17355 /* Opcodes don't get size suffixes if using Intel opcodes. */
17356 if (ASSEMBLER_DIALECT == ASM_INTEL)
17357 return;
17359 switch (GET_MODE_SIZE (GET_MODE (x)))
17361 case 1:
17362 putc ('b', file);
17363 return;
17365 case 2:
17366 putc ('w', file);
17367 return;
17369 case 4:
17370 putc ('l', file);
17371 return;
17373 case 8:
17374 putc ('q', file);
17375 return;
17377 default:
17378 output_operand_lossage ("invalid operand size for operand "
17379 "code 'z'");
17380 return;
17384 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17385 warning (0, "non-integer operand used with operand code 'z'");
17386 /* FALLTHRU */
17388 case 'Z':
17389 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17390 if (ASSEMBLER_DIALECT == ASM_INTEL)
17391 return;
17393 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17395 switch (GET_MODE_SIZE (GET_MODE (x)))
17397 case 2:
17398 #ifdef HAVE_AS_IX86_FILDS
17399 putc ('s', file);
17400 #endif
17401 return;
17403 case 4:
17404 putc ('l', file);
17405 return;
17407 case 8:
17408 #ifdef HAVE_AS_IX86_FILDQ
17409 putc ('q', file);
17410 #else
17411 fputs ("ll", file);
17412 #endif
17413 return;
17415 default:
17416 break;
17419 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17421 /* 387 opcodes don't get size suffixes
17422 if the operands are registers. */
17423 if (STACK_REG_P (x))
17424 return;
17426 switch (GET_MODE_SIZE (GET_MODE (x)))
17428 case 4:
17429 putc ('s', file);
17430 return;
17432 case 8:
17433 putc ('l', file);
17434 return;
17436 case 12:
17437 case 16:
17438 putc ('t', file);
17439 return;
17441 default:
17442 break;
17445 else
17447 output_operand_lossage ("invalid operand type used with "
17448 "operand code 'Z'");
17449 return;
17452 output_operand_lossage ("invalid operand size for operand code 'Z'");
17453 return;
17455 case 'd':
17456 case 'b':
17457 case 'w':
17458 case 'k':
17459 case 'q':
17460 case 'h':
17461 case 't':
17462 case 'g':
17463 case 'y':
17464 case 'x':
17465 case 'X':
17466 case 'P':
17467 case 'p':
17468 break;
17470 case 's':
17471 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17473 ix86_print_operand (file, x, 0);
17474 fputs (", ", file);
17476 return;
17478 case 'Y':
17479 switch (GET_CODE (x))
17481 case NE:
17482 fputs ("neq", file);
17483 break;
17484 case EQ:
17485 fputs ("eq", file);
17486 break;
17487 case GE:
17488 case GEU:
17489 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17490 break;
17491 case GT:
17492 case GTU:
17493 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17494 break;
17495 case LE:
17496 case LEU:
17497 fputs ("le", file);
17498 break;
17499 case LT:
17500 case LTU:
17501 fputs ("lt", file);
17502 break;
17503 case UNORDERED:
17504 fputs ("unord", file);
17505 break;
17506 case ORDERED:
17507 fputs ("ord", file);
17508 break;
17509 case UNEQ:
17510 fputs ("ueq", file);
17511 break;
17512 case UNGE:
17513 fputs ("nlt", file);
17514 break;
17515 case UNGT:
17516 fputs ("nle", file);
17517 break;
17518 case UNLE:
17519 fputs ("ule", file);
17520 break;
17521 case UNLT:
17522 fputs ("ult", file);
17523 break;
17524 case LTGT:
17525 fputs ("une", file);
17526 break;
17527 default:
17528 output_operand_lossage ("operand is not a condition code, "
17529 "invalid operand code 'Y'");
17530 return;
17532 return;
17534 case 'D':
17535 /* Little bit of braindamage here. The SSE compare instructions
17536 use completely different names for the comparisons than the
17537 fp conditional moves do. */
17538 switch (GET_CODE (x))
17540 case UNEQ:
17541 if (TARGET_AVX)
17543 fputs ("eq_us", file);
17544 break;
17546 /* FALLTHRU */
17547 case EQ:
17548 fputs ("eq", file);
17549 break;
17550 case UNLT:
17551 if (TARGET_AVX)
17553 fputs ("nge", file);
17554 break;
17556 /* FALLTHRU */
17557 case LT:
17558 fputs ("lt", file);
17559 break;
17560 case UNLE:
17561 if (TARGET_AVX)
17563 fputs ("ngt", file);
17564 break;
17566 /* FALLTHRU */
17567 case LE:
17568 fputs ("le", file);
17569 break;
17570 case UNORDERED:
17571 fputs ("unord", file);
17572 break;
17573 case LTGT:
17574 if (TARGET_AVX)
17576 fputs ("neq_oq", file);
17577 break;
17579 /* FALLTHRU */
17580 case NE:
17581 fputs ("neq", file);
17582 break;
17583 case GE:
17584 if (TARGET_AVX)
17586 fputs ("ge", file);
17587 break;
17589 /* FALLTHRU */
17590 case UNGE:
17591 fputs ("nlt", file);
17592 break;
17593 case GT:
17594 if (TARGET_AVX)
17596 fputs ("gt", file);
17597 break;
17599 /* FALLTHRU */
17600 case UNGT:
17601 fputs ("nle", file);
17602 break;
17603 case ORDERED:
17604 fputs ("ord", file);
17605 break;
17606 default:
17607 output_operand_lossage ("operand is not a condition code, "
17608 "invalid operand code 'D'");
17609 return;
17611 return;
17613 case 'F':
17614 case 'f':
17615 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17616 if (ASSEMBLER_DIALECT == ASM_ATT)
17617 putc ('.', file);
17618 gcc_fallthrough ();
17619 #endif
17621 case 'C':
17622 case 'c':
17623 if (!COMPARISON_P (x))
17625 output_operand_lossage ("operand is not a condition code, "
17626 "invalid operand code '%c'", code);
17627 return;
17629 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17630 code == 'c' || code == 'f',
17631 code == 'F' || code == 'f',
17632 file);
17633 return;
17635 case 'H':
17636 if (!offsettable_memref_p (x))
17638 output_operand_lossage ("operand is not an offsettable memory "
17639 "reference, invalid operand code 'H'");
17640 return;
17642 /* It doesn't actually matter what mode we use here, as we're
17643 only going to use this for printing. */
17644 x = adjust_address_nv (x, DImode, 8);
17645 /* Output 'qword ptr' for intel assembler dialect. */
17646 if (ASSEMBLER_DIALECT == ASM_INTEL)
17647 code = 'q';
17648 break;
17650 case 'K':
17651 if (!CONST_INT_P (x))
17653 output_operand_lossage ("operand is not an integer, invalid "
17654 "operand code 'K'");
17655 return;
17658 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17659 #ifdef HAVE_AS_IX86_HLE
17660 fputs ("xacquire ", file);
17661 #else
17662 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17663 #endif
17664 else if (INTVAL (x) & IX86_HLE_RELEASE)
17665 #ifdef HAVE_AS_IX86_HLE
17666 fputs ("xrelease ", file);
17667 #else
17668 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17669 #endif
17670 /* We do not want to print the value of the operand. */
17671 return;
17673 case 'N':
17674 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17675 fputs ("{z}", file);
17676 return;
17678 case 'r':
17679 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17681 output_operand_lossage ("operand is not a specific integer, "
17682 "invalid operand code 'r'");
17683 return;
17686 if (ASSEMBLER_DIALECT == ASM_INTEL)
17687 fputs (", ", file);
17689 fputs ("{sae}", file);
17691 if (ASSEMBLER_DIALECT == ASM_ATT)
17692 fputs (", ", file);
17694 return;
17696 case 'R':
17697 if (!CONST_INT_P (x))
17699 output_operand_lossage ("operand is not an integer, invalid "
17700 "operand code 'R'");
17701 return;
17704 if (ASSEMBLER_DIALECT == ASM_INTEL)
17705 fputs (", ", file);
17707 switch (INTVAL (x))
17709 case ROUND_NEAREST_INT | ROUND_SAE:
17710 fputs ("{rn-sae}", file);
17711 break;
17712 case ROUND_NEG_INF | ROUND_SAE:
17713 fputs ("{rd-sae}", file);
17714 break;
17715 case ROUND_POS_INF | ROUND_SAE:
17716 fputs ("{ru-sae}", file);
17717 break;
17718 case ROUND_ZERO | ROUND_SAE:
17719 fputs ("{rz-sae}", file);
17720 break;
17721 default:
17722 output_operand_lossage ("operand is not a specific integer, "
17723 "invalid operand code 'R'");
17726 if (ASSEMBLER_DIALECT == ASM_ATT)
17727 fputs (", ", file);
17729 return;
17731 case '*':
17732 if (ASSEMBLER_DIALECT == ASM_ATT)
17733 putc ('*', file);
17734 return;
17736 case '&':
17738 const char *name = get_some_local_dynamic_name ();
17739 if (name == NULL)
17740 output_operand_lossage ("'%%&' used without any "
17741 "local dynamic TLS references");
17742 else
17743 assemble_name (file, name);
17744 return;
17747 case '+':
17749 rtx x;
17751 if (!optimize
17752 || optimize_function_for_size_p (cfun)
17753 || !TARGET_BRANCH_PREDICTION_HINTS)
17754 return;
17756 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17757 if (x)
17759 int pred_val = profile_probability::from_reg_br_prob_note
17760 (XINT (x, 0)).to_reg_br_prob_base ();
17762 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17763 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17765 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17766 bool cputaken
17767 = final_forward_branch_p (current_output_insn) == 0;
17769 /* Emit hints only in the case default branch prediction
17770 heuristics would fail. */
17771 if (taken != cputaken)
17773 /* We use 3e (DS) prefix for taken branches and
17774 2e (CS) prefix for not taken branches. */
17775 if (taken)
17776 fputs ("ds ; ", file);
17777 else
17778 fputs ("cs ; ", file);
17782 return;
17785 case ';':
17786 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17787 putc (';', file);
17788 #endif
17789 return;
17791 case '~':
17792 putc (TARGET_AVX2 ? 'i' : 'f', file);
17793 return;
17795 case '^':
17796 if (TARGET_64BIT && Pmode != word_mode)
17797 fputs ("addr32 ", file);
17798 return;
17800 case '!':
17801 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17802 fputs ("bnd ", file);
17803 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17804 fputs ("notrack ", file);
17805 return;
17807 default:
17808 output_operand_lossage ("invalid operand code '%c'", code);
17812 if (REG_P (x))
17813 print_reg (x, code, file);
17815 else if (MEM_P (x))
17817 rtx addr = XEXP (x, 0);
17819 /* No `byte ptr' prefix for call instructions ... */
17820 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
17822 machine_mode mode = GET_MODE (x);
17823 const char *size;
17825 /* Check for explicit size override codes. */
17826 if (code == 'b')
17827 size = "BYTE";
17828 else if (code == 'w')
17829 size = "WORD";
17830 else if (code == 'k')
17831 size = "DWORD";
17832 else if (code == 'q')
17833 size = "QWORD";
17834 else if (code == 'x')
17835 size = "XMMWORD";
17836 else if (code == 't')
17837 size = "YMMWORD";
17838 else if (code == 'g')
17839 size = "ZMMWORD";
17840 else if (mode == BLKmode)
17841 /* ... or BLKmode operands, when not overridden. */
17842 size = NULL;
17843 else
17844 switch (GET_MODE_SIZE (mode))
17846 case 1: size = "BYTE"; break;
17847 case 2: size = "WORD"; break;
17848 case 4: size = "DWORD"; break;
17849 case 8: size = "QWORD"; break;
17850 case 12: size = "TBYTE"; break;
17851 case 16:
17852 if (mode == XFmode)
17853 size = "TBYTE";
17854 else
17855 size = "XMMWORD";
17856 break;
17857 case 32: size = "YMMWORD"; break;
17858 case 64: size = "ZMMWORD"; break;
17859 default:
17860 gcc_unreachable ();
17862 if (size)
17864 fputs (size, file);
17865 fputs (" PTR ", file);
17869 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
17870 output_operand_lossage ("invalid constraints for operand");
17871 else
17872 ix86_print_operand_address_as
17873 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
17876 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
17878 long l;
17880 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
17882 if (ASSEMBLER_DIALECT == ASM_ATT)
17883 putc ('$', file);
17884 /* Sign extend 32bit SFmode immediate to 8 bytes. */
17885 if (code == 'q')
17886 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
17887 (unsigned long long) (int) l);
17888 else
17889 fprintf (file, "0x%08x", (unsigned int) l);
17892 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
17894 long l[2];
17896 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
17898 if (ASSEMBLER_DIALECT == ASM_ATT)
17899 putc ('$', file);
17900 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
17903 /* These float cases don't actually occur as immediate operands. */
17904 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
17906 char dstr[30];
17908 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
17909 fputs (dstr, file);
17912 else
17914 /* We have patterns that allow zero sets of memory, for instance.
17915 In 64-bit mode, we should probably support all 8-byte vectors,
17916 since we can in fact encode that into an immediate. */
17917 if (GET_CODE (x) == CONST_VECTOR)
17919 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
17920 x = const0_rtx;
17923 if (code != 'P' && code != 'p')
17925 if (CONST_INT_P (x))
17927 if (ASSEMBLER_DIALECT == ASM_ATT)
17928 putc ('$', file);
17930 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
17931 || GET_CODE (x) == LABEL_REF)
17933 if (ASSEMBLER_DIALECT == ASM_ATT)
17934 putc ('$', file);
17935 else
17936 fputs ("OFFSET FLAT:", file);
17939 if (CONST_INT_P (x))
17940 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17941 else if (flag_pic || MACHOPIC_INDIRECT)
17942 output_pic_addr_const (file, x, code);
17943 else
17944 output_addr_const (file, x);
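
Several of the single-letter codes documented above double as operand modifiers in user inline asm. A small hedged example, assuming AT&T output and the x86 "Q" constraint; swap_bytes is a made-up name:

/* Illustrative only: %b0 and %h0 name the low and high byte of
   operand 0, so this swaps the two bytes of X ("Q" restricts the
   operand to a register with an addressable high byte).  */
static inline unsigned short
swap_bytes (unsigned short x)
{
  __asm__ ("xchgb %b0, %h0" : "+Q" (x));
  return x;
}
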
17948 static bool
17949 ix86_print_operand_punct_valid_p (unsigned char code)
17951 return (code == '*' || code == '+' || code == '&' || code == ';'
17952 || code == '~' || code == '^' || code == '!');
17955 /* Print a memory operand whose address is ADDR. */
17957 static void
17958 ix86_print_operand_address_as (FILE *file, rtx addr,
17959 addr_space_t as, bool no_rip)
17961 struct ix86_address parts;
17962 rtx base, index, disp;
17963 int scale;
17964 int ok;
17965 bool vsib = false;
17966 int code = 0;
17968 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
17970 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
17971 gcc_assert (parts.index == NULL_RTX);
17972 parts.index = XVECEXP (addr, 0, 1);
17973 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
17974 addr = XVECEXP (addr, 0, 0);
17975 vsib = true;
17977 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
17979 gcc_assert (TARGET_64BIT);
17980 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
17981 code = 'q';
17983 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
17985 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
17986 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
17987 if (parts.base != NULL_RTX)
17989 parts.index = parts.base;
17990 parts.scale = 1;
17992 parts.base = XVECEXP (addr, 0, 0);
17993 addr = XVECEXP (addr, 0, 0);
17995 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
17997 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
17998 gcc_assert (parts.index == NULL_RTX);
17999 parts.index = XVECEXP (addr, 0, 1);
18000 addr = XVECEXP (addr, 0, 0);
18002 else
18003 ok = ix86_decompose_address (addr, &parts);
18005 gcc_assert (ok);
18007 base = parts.base;
18008 index = parts.index;
18009 disp = parts.disp;
18010 scale = parts.scale;
18012 if (ADDR_SPACE_GENERIC_P (as))
18013 as = parts.seg;
18014 else
18015 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18017 if (!ADDR_SPACE_GENERIC_P (as))
18019 const char *string;
18021 if (as == ADDR_SPACE_SEG_FS)
18022 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18023 else if (as == ADDR_SPACE_SEG_GS)
18024 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18025 else
18026 gcc_unreachable ();
18027 fputs (string, file);
18030 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18031 if (TARGET_64BIT && !base && !index && !no_rip)
18033 rtx symbol = disp;
18035 if (GET_CODE (disp) == CONST
18036 && GET_CODE (XEXP (disp, 0)) == PLUS
18037 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18038 symbol = XEXP (XEXP (disp, 0), 0);
18040 if (GET_CODE (symbol) == LABEL_REF
18041 || (GET_CODE (symbol) == SYMBOL_REF
18042 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18043 base = pc_rtx;
18046 if (!base && !index)
18048 /* A displacement-only address requires special attention. */
18049 if (CONST_INT_P (disp))
18051 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18052 fputs ("ds:", file);
18053 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18055 /* Load the external function address via the GOT slot to avoid PLT. */
18056 else if (GET_CODE (disp) == CONST
18057 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18058 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18059 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18060 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18061 output_pic_addr_const (file, disp, 0);
18062 else if (flag_pic)
18063 output_pic_addr_const (file, disp, 0);
18064 else
18065 output_addr_const (file, disp);
18067 else
18069 /* Print SImode register names to force addr32 prefix. */
18070 if (SImode_address_operand (addr, VOIDmode))
18072 if (flag_checking)
18074 gcc_assert (TARGET_64BIT);
18075 switch (GET_CODE (addr))
18077 case SUBREG:
18078 gcc_assert (GET_MODE (addr) == SImode);
18079 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18080 break;
18081 case ZERO_EXTEND:
18082 case AND:
18083 gcc_assert (GET_MODE (addr) == DImode);
18084 break;
18085 default:
18086 gcc_unreachable ();
18089 gcc_assert (!code);
18090 code = 'k';
18092 else if (code == 0
18093 && TARGET_X32
18094 && disp
18095 && CONST_INT_P (disp)
18096 && INTVAL (disp) < -16*1024*1024)
18098 /* X32 runs in 64-bit mode, where displacement, DISP, in
18099 address DISP(%r64), is encoded as 32-bit immediate sign-
18100 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18101 address is %r64 + 0xffffffffbffffd00. When %r64 <
18102 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18103 which is invalid for x32. The correct address is %r64
18104 - 0x40000300 == 0xf7ffdd64. To properly encode
18105 -0x40000300(%r64) for x32, we zero-extend negative
18106 displacement by forcing addr32 prefix which truncates
18107 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18108 zero-extend all negative displacements, including -1(%rsp).
18109 However, for small negative displacements, sign-extension
18110 won't cause overflow. We only zero-extend negative
18111 displacements if they are < -16*1024*1024, which is also used
18112 to check legitimate address displacements for PIC. */
18113 code = 'k';
18116 /* Since the upper 32 bits of RSP are always zero for x32,
18117 we can encode %esp as %rsp to avoid 0x67 prefix if
18118 there is no index register. */
18119 if (TARGET_X32 && Pmode == SImode
18120 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18121 code = 'q';
18123 if (ASSEMBLER_DIALECT == ASM_ATT)
18125 if (disp)
18127 if (flag_pic)
18128 output_pic_addr_const (file, disp, 0);
18129 else if (GET_CODE (disp) == LABEL_REF)
18130 output_asm_label (disp);
18131 else
18132 output_addr_const (file, disp);
18135 putc ('(', file);
18136 if (base)
18137 print_reg (base, code, file);
18138 if (index)
18140 putc (',', file);
18141 print_reg (index, vsib ? 0 : code, file);
18142 if (scale != 1 || vsib)
18143 fprintf (file, ",%d", scale);
18145 putc (')', file);
18147 else
18149 rtx offset = NULL_RTX;
18151 if (disp)
18153 /* Pull out the offset of a symbol; print any symbol itself. */
18154 if (GET_CODE (disp) == CONST
18155 && GET_CODE (XEXP (disp, 0)) == PLUS
18156 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18158 offset = XEXP (XEXP (disp, 0), 1);
18159 disp = gen_rtx_CONST (VOIDmode,
18160 XEXP (XEXP (disp, 0), 0));
18163 if (flag_pic)
18164 output_pic_addr_const (file, disp, 0);
18165 else if (GET_CODE (disp) == LABEL_REF)
18166 output_asm_label (disp);
18167 else if (CONST_INT_P (disp))
18168 offset = disp;
18169 else
18170 output_addr_const (file, disp);
18173 putc ('[', file);
18174 if (base)
18176 print_reg (base, code, file);
18177 if (offset)
18179 if (INTVAL (offset) >= 0)
18180 putc ('+', file);
18181 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18184 else if (offset)
18185 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18186 else
18187 putc ('0', file);
18189 if (index)
18191 putc ('+', file);
18192 print_reg (index, vsib ? 0 : code, file);
18193 if (scale != 1 || vsib)
18194 fprintf (file, "*%d", scale);
18196 putc (']', file);
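
A standalone sketch of the arithmetic in the x32 negative-displacement comment above (values taken from that comment; the program itself is illustrative and not part of GCC):

/* Illustrative only: the x32 negative-displacement example from the
   comment above, verified with plain integer arithmetic.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t base = 0x37ffe064;            /* value in %r64 */
  int64_t disp = -0x40000300;            /* displacement */

  uint64_t sign_extended = base + (uint64_t) disp;   /* 0xfffffffff7ffdd64 */
  uint32_t zero_extended = (uint32_t) sign_extended; /* 0xf7ffdd64 */

  printf ("sign-extended: %#llx  addr32-truncated: %#x\n",
          (unsigned long long) sign_extended, (unsigned) zero_extended);
  return 0;
}
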
18201 static void
18202 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18204 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18207 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18209 static bool
18210 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18212 rtx op;
18214 if (GET_CODE (x) != UNSPEC)
18215 return false;
18217 op = XVECEXP (x, 0, 0);
18218 switch (XINT (x, 1))
18220 case UNSPEC_GOTTPOFF:
18221 output_addr_const (file, op);
18222 /* FIXME: This might be @TPOFF in Sun ld. */
18223 fputs ("@gottpoff", file);
18224 break;
18225 case UNSPEC_TPOFF:
18226 output_addr_const (file, op);
18227 fputs ("@tpoff", file);
18228 break;
18229 case UNSPEC_NTPOFF:
18230 output_addr_const (file, op);
18231 if (TARGET_64BIT)
18232 fputs ("@tpoff", file);
18233 else
18234 fputs ("@ntpoff", file);
18235 break;
18236 case UNSPEC_DTPOFF:
18237 output_addr_const (file, op);
18238 fputs ("@dtpoff", file);
18239 break;
18240 case UNSPEC_GOTNTPOFF:
18241 output_addr_const (file, op);
18242 if (TARGET_64BIT)
18243 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18244 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18245 else
18246 fputs ("@gotntpoff", file);
18247 break;
18248 case UNSPEC_INDNTPOFF:
18249 output_addr_const (file, op);
18250 fputs ("@indntpoff", file);
18251 break;
18252 #if TARGET_MACHO
18253 case UNSPEC_MACHOPIC_OFFSET:
18254 output_addr_const (file, op);
18255 putc ('-', file);
18256 machopic_output_function_base_name (file);
18257 break;
18258 #endif
18260 default:
18261 return false;
18264 return true;
18267 /* Split one or more double-mode RTL references into pairs of half-mode
18268 references. The RTL can be REG, offsettable MEM, integer constant, or
18269 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18270 split and "num" is its length. lo_half and hi_half are output arrays
18271 that parallel "operands". */
18273 void
18274 split_double_mode (machine_mode mode, rtx operands[],
18275 int num, rtx lo_half[], rtx hi_half[])
18277 machine_mode half_mode;
18278 unsigned int byte;
18280 switch (mode)
18282 case E_TImode:
18283 half_mode = DImode;
18284 break;
18285 case E_DImode:
18286 half_mode = SImode;
18287 break;
18288 default:
18289 gcc_unreachable ();
18292 byte = GET_MODE_SIZE (half_mode);
18294 while (num--)
18296 rtx op = operands[num];
18298 /* simplify_subreg refuses to split volatile memory addresses,
18299 but we still have to handle them. */
18300 if (MEM_P (op))
18302 lo_half[num] = adjust_address (op, half_mode, 0);
18303 hi_half[num] = adjust_address (op, half_mode, byte);
18305 else
18307 lo_half[num] = simplify_gen_subreg (half_mode, op,
18308 GET_MODE (op) == VOIDmode
18309 ? mode : GET_MODE (op), 0);
18310 hi_half[num] = simplify_gen_subreg (half_mode, op,
18311 GET_MODE (op) == VOIDmode
18312 ? mode : GET_MODE (op), byte);
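
For the constant case, the subreg splitting above reduces to taking the low and high 32-bit words. A minimal sketch, assuming a little-endian target such as x86 (split_di_constant is a made-up helper, not GCC code):

/* Illustrative only: splitting a DImode constant into SImode lo/hi
   halves on a little-endian target.  */
#include <stdint.h>

static void
split_di_constant (uint64_t value, uint32_t *lo, uint32_t *hi)
{
  *lo = (uint32_t) value;           /* bytes 0..3 */
  *hi = (uint32_t) (value >> 32);   /* bytes 4..7 */
}
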
18317 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18318 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18319 is the expression of the binary operation. The output may either be
18320 emitted here, or returned to the caller, like all output_* functions.
18322 There is no guarantee that the operands are the same mode, as they
18323 might be within FLOAT or FLOAT_EXTEND expressions. */
18325 #ifndef SYSV386_COMPAT
18326 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18327 wants to fix the assemblers because that causes incompatibility
18328 with gcc. No-one wants to fix gcc because that causes
18329 incompatibility with assemblers... You can use the option of
18330 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18331 #define SYSV386_COMPAT 1
18332 #endif
18334 const char *
18335 output_387_binary_op (rtx_insn *insn, rtx *operands)
18337 static char buf[40];
18338 const char *p;
18339 bool is_sse
18340 = (SSE_REG_P (operands[0])
18341 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18343 if (is_sse)
18344 p = "%v";
18345 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18346 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18347 p = "fi";
18348 else
18349 p = "f";
18351 strcpy (buf, p);
18353 switch (GET_CODE (operands[3]))
18355 case PLUS:
18356 p = "add"; break;
18357 case MINUS:
18358 p = "sub"; break;
18359 case MULT:
18360 p = "mul"; break;
18361 case DIV:
18362 p = "div"; break;
18363 default:
18364 gcc_unreachable ();
18367 strcat (buf, p);
18369 if (is_sse)
18371 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18372 strcat (buf, p);
18374 if (TARGET_AVX)
18375 p = "\t{%2, %1, %0|%0, %1, %2}";
18376 else
18377 p = "\t{%2, %0|%0, %2}";
18379 strcat (buf, p);
18380 return buf;
18383 /* Even if we do not want to check the inputs, this documents the input
18384 constraints, which helps in understanding the following code. */
18385 if (flag_checking)
18387 if (STACK_REG_P (operands[0])
18388 && ((REG_P (operands[1])
18389 && REGNO (operands[0]) == REGNO (operands[1])
18390 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18391 || (REG_P (operands[2])
18392 && REGNO (operands[0]) == REGNO (operands[2])
18393 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18394 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18395 ; /* ok */
18396 else
18397 gcc_unreachable ();
18400 switch (GET_CODE (operands[3]))
18402 case MULT:
18403 case PLUS:
18404 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18405 std::swap (operands[1], operands[2]);
18407 /* We now know operands[0] == operands[1]. */
18409 if (MEM_P (operands[2]))
18411 p = "%Z2\t%2";
18412 break;
18415 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18417 if (STACK_TOP_P (operands[0]))
18418 /* How is it that we are storing to a dead operand[2]?
18419 Well, presumably operands[1] is dead too. We can't
18420 store the result to st(0) as st(0) gets popped on this
18421 instruction. Instead store to operands[2] (which I
18422 think has to be st(1)). st(1) will be popped later.
18423 gcc <= 2.8.1 didn't have this check and generated
18424 assembly code that the Unixware assembler rejected. */
18425 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18426 else
18427 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18428 break;
18431 if (STACK_TOP_P (operands[0]))
18432 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18433 else
18434 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18435 break;
18437 case MINUS:
18438 case DIV:
18439 if (MEM_P (operands[1]))
18441 p = "r%Z1\t%1";
18442 break;
18445 if (MEM_P (operands[2]))
18447 p = "%Z2\t%2";
18448 break;
18451 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18453 #if SYSV386_COMPAT
18454 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18455 derived assemblers, confusingly reverse the direction of
18456 the operation for fsub{r} and fdiv{r} when the
18457 destination register is not st(0). The Intel assembler
18458 doesn't have this brain damage. Read !SYSV386_COMPAT to
18459 figure out what the hardware really does. */
18460 if (STACK_TOP_P (operands[0]))
18461 p = "{p\t%0, %2|rp\t%2, %0}";
18462 else
18463 p = "{rp\t%2, %0|p\t%0, %2}";
18464 #else
18465 if (STACK_TOP_P (operands[0]))
18466 /* As above for fmul/fadd, we can't store to st(0). */
18467 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18468 else
18469 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18470 #endif
18471 break;
18474 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18476 #if SYSV386_COMPAT
18477 if (STACK_TOP_P (operands[0]))
18478 p = "{rp\t%0, %1|p\t%1, %0}";
18479 else
18480 p = "{p\t%1, %0|rp\t%0, %1}";
18481 #else
18482 if (STACK_TOP_P (operands[0]))
18483 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18484 else
18485 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18486 #endif
18487 break;
18490 if (STACK_TOP_P (operands[0]))
18492 if (STACK_TOP_P (operands[1]))
18493 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18494 else
18495 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18496 break;
18498 else if (STACK_TOP_P (operands[1]))
18500 #if SYSV386_COMPAT
18501 p = "{\t%1, %0|r\t%0, %1}";
18502 #else
18503 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18504 #endif
18506 else
18508 #if SYSV386_COMPAT
18509 p = "{r\t%2, %0|\t%0, %2}";
18510 #else
18511 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18512 #endif
18514 break;
18516 default:
18517 gcc_unreachable ();
18520 strcat (buf, p);
18521 return buf;
18524 /* Return needed mode for entity in optimize_mode_switching pass. */
18526 static int
18527 ix86_dirflag_mode_needed (rtx_insn *insn)
18529 if (CALL_P (insn))
18531 if (cfun->machine->func_type == TYPE_NORMAL)
18532 return X86_DIRFLAG_ANY;
18533 else
18534 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18535 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18538 if (recog_memoized (insn) < 0)
18539 return X86_DIRFLAG_ANY;
18541 if (get_attr_type (insn) == TYPE_STR)
18543 /* Emit cld instruction if stringops are used in the function. */
18544 if (cfun->machine->func_type == TYPE_NORMAL)
18545 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18546 else
18547 return X86_DIRFLAG_RESET;
18550 return X86_DIRFLAG_ANY;
18553 /* Check if a 256bit AVX register is referenced inside of EXP. */
18555 static bool
18556 ix86_check_avx256_register (const_rtx exp)
18558 if (SUBREG_P (exp))
18559 exp = SUBREG_REG (exp);
18561 return (REG_P (exp)
18562 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
18565 /* Return needed mode for entity in optimize_mode_switching pass. */
18567 static int
18568 ix86_avx_u128_mode_needed (rtx_insn *insn)
18570 if (CALL_P (insn))
18572 rtx link;
18574 /* Needed mode is set to AVX_U128_CLEAN if there are
18575 no 256bit modes used in function arguments. */
18576 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18577 link;
18578 link = XEXP (link, 1))
18580 if (GET_CODE (XEXP (link, 0)) == USE)
18582 rtx arg = XEXP (XEXP (link, 0), 0);
18584 if (ix86_check_avx256_register (arg))
18585 return AVX_U128_DIRTY;
18589 return AVX_U128_CLEAN;
18592 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
18593 changes state only when a 256bit register is written to, but we need
18594 to prevent the compiler from moving the optimal insertion point above
18595 an eventual read from a 256bit register. */
18596 subrtx_iterator::array_type array;
18597 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18598 if (ix86_check_avx256_register (*iter))
18599 return AVX_U128_DIRTY;
18601 return AVX_U128_ANY;
18604 /* Return mode that i387 must be switched into
18605 prior to the execution of insn. */
18607 static int
18608 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18610 enum attr_i387_cw mode;
18612 /* The mode UNINITIALIZED is used to store the control word after a
18613 function call or ASM pattern. The mode ANY specifies that the function
18614 has no requirements on the control word and makes no changes in the
18615 bits we are interested in. */
18617 if (CALL_P (insn)
18618 || (NONJUMP_INSN_P (insn)
18619 && (asm_noperands (PATTERN (insn)) >= 0
18620 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18621 return I387_CW_UNINITIALIZED;
18623 if (recog_memoized (insn) < 0)
18624 return I387_CW_ANY;
18626 mode = get_attr_i387_cw (insn);
18628 switch (entity)
18630 case I387_TRUNC:
18631 if (mode == I387_CW_TRUNC)
18632 return mode;
18633 break;
18635 case I387_FLOOR:
18636 if (mode == I387_CW_FLOOR)
18637 return mode;
18638 break;
18640 case I387_CEIL:
18641 if (mode == I387_CW_CEIL)
18642 return mode;
18643 break;
18645 case I387_MASK_PM:
18646 if (mode == I387_CW_MASK_PM)
18647 return mode;
18648 break;
18650 default:
18651 gcc_unreachable ();
18654 return I387_CW_ANY;
18657 /* Return mode that entity must be switched into
18658 prior to the execution of insn. */
18660 static int
18661 ix86_mode_needed (int entity, rtx_insn *insn)
18663 switch (entity)
18665 case X86_DIRFLAG:
18666 return ix86_dirflag_mode_needed (insn);
18667 case AVX_U128:
18668 return ix86_avx_u128_mode_needed (insn);
18669 case I387_TRUNC:
18670 case I387_FLOOR:
18671 case I387_CEIL:
18672 case I387_MASK_PM:
18673 return ix86_i387_mode_needed (entity, insn);
18674 default:
18675 gcc_unreachable ();
18677 return 0;
18680 /* Check if a 256bit AVX register is referenced in stores. */
18682 static void
18683 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
18685 if (ix86_check_avx256_register (dest))
18687 bool *used = (bool *) data;
18688 *used = true;
18692 /* Calculate mode of upper 128bit AVX registers after the insn. */
18694 static int
18695 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18697 rtx pat = PATTERN (insn);
18699 if (vzeroupper_operation (pat, VOIDmode)
18700 || vzeroall_operation (pat, VOIDmode))
18701 return AVX_U128_CLEAN;
18703 /* We know that the state is clean after a CALL insn if no
18704 256bit registers are used in the function return register. */
18705 if (CALL_P (insn))
18707 bool avx_reg256_found = false;
18708 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
18710 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18713 /* Otherwise, return current mode. Remember that if insn
18714 references AVX 256bit registers, the mode was already changed
18715 to DIRTY from MODE_NEEDED. */
18716 return mode;
18719 /* Return the mode that an insn results in. */
18721 static int
18722 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18724 switch (entity)
18726 case X86_DIRFLAG:
18727 return mode;
18728 case AVX_U128:
18729 return ix86_avx_u128_mode_after (mode, insn);
18730 case I387_TRUNC:
18731 case I387_FLOOR:
18732 case I387_CEIL:
18733 case I387_MASK_PM:
18734 return mode;
18735 default:
18736 gcc_unreachable ();
18740 static int
18741 ix86_dirflag_mode_entry (void)
18743 /* For TARGET_CLD or in the interrupt handler we can't assume
18744 direction flag state at function entry. */
18745 if (TARGET_CLD
18746 || cfun->machine->func_type != TYPE_NORMAL)
18747 return X86_DIRFLAG_ANY;
18749 return X86_DIRFLAG_RESET;
18752 static int
18753 ix86_avx_u128_mode_entry (void)
18755 tree arg;
18757 /* Entry mode is set to AVX_U128_DIRTY if there are
18758 256bit modes used in function arguments. */
18759 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18760 arg = TREE_CHAIN (arg))
18762 rtx incoming = DECL_INCOMING_RTL (arg);
18764 if (incoming && ix86_check_avx256_register (incoming))
18765 return AVX_U128_DIRTY;
18768 return AVX_U128_CLEAN;
18771 /* Return a mode that ENTITY is assumed to be
18772 switched to at function entry. */
18774 static int
18775 ix86_mode_entry (int entity)
18777 switch (entity)
18779 case X86_DIRFLAG:
18780 return ix86_dirflag_mode_entry ();
18781 case AVX_U128:
18782 return ix86_avx_u128_mode_entry ();
18783 case I387_TRUNC:
18784 case I387_FLOOR:
18785 case I387_CEIL:
18786 case I387_MASK_PM:
18787 return I387_CW_ANY;
18788 default:
18789 gcc_unreachable ();
18793 static int
18794 ix86_avx_u128_mode_exit (void)
18796 rtx reg = crtl->return_rtx;
18798 /* Exit mode is set to AVX_U128_DIRTY if there are
18799 256bit modes used in the function return register. */
18800 if (reg && ix86_check_avx256_register (reg))
18801 return AVX_U128_DIRTY;
18803 return AVX_U128_CLEAN;
18806 /* Return a mode that ENTITY is assumed to be
18807 switched to at function exit. */
18809 static int
18810 ix86_mode_exit (int entity)
18812 switch (entity)
18814 case X86_DIRFLAG:
18815 return X86_DIRFLAG_ANY;
18816 case AVX_U128:
18817 return ix86_avx_u128_mode_exit ();
18818 case I387_TRUNC:
18819 case I387_FLOOR:
18820 case I387_CEIL:
18821 case I387_MASK_PM:
18822 return I387_CW_ANY;
18823 default:
18824 gcc_unreachable ();
18828 static int
18829 ix86_mode_priority (int, int n)
18831 return n;
18834 /* Output code to initialize control word copies used by trunc?f?i and
18835 rounding patterns. CURRENT_MODE is set to the current control word,
18836 while NEW_MODE is set to the new control word. */
18838 static void
18839 emit_i387_cw_initialization (int mode)
18841 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
18842 rtx new_mode;
18844 enum ix86_stack_slot slot;
18846 rtx reg = gen_reg_rtx (HImode);
18848 emit_insn (gen_x86_fnstcw_1 (stored_mode));
18849 emit_move_insn (reg, copy_rtx (stored_mode));
18851 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
18852 || optimize_insn_for_size_p ())
18854 switch (mode)
18856 case I387_CW_TRUNC:
18857 /* round toward zero (truncate) */
18858 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
18859 slot = SLOT_CW_TRUNC;
18860 break;
18862 case I387_CW_FLOOR:
18863 /* round down toward -oo */
18864 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
18865 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
18866 slot = SLOT_CW_FLOOR;
18867 break;
18869 case I387_CW_CEIL:
18870 /* round up toward +oo */
18871 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
18872 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
18873 slot = SLOT_CW_CEIL;
18874 break;
18876 case I387_CW_MASK_PM:
18877 /* mask precision exception for nearbyint() */
18878 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
18879 slot = SLOT_CW_MASK_PM;
18880 break;
18882 default:
18883 gcc_unreachable ();
18886 else
18888 switch (mode)
18890 case I387_CW_TRUNC:
18891 /* round toward zero (truncate) */
18892 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
18893 slot = SLOT_CW_TRUNC;
18894 break;
18896 case I387_CW_FLOOR:
18897 /* round down toward -oo */
18898 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
18899 slot = SLOT_CW_FLOOR;
18900 break;
18902 case I387_CW_CEIL:
18903 /* round up toward +oo */
18904 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
18905 slot = SLOT_CW_CEIL;
18906 break;
18908 case I387_CW_MASK_PM:
18909 /* mask precision exception for nearbyint() */
18910 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
18911 slot = SLOT_CW_MASK_PM;
18912 break;
18914 default:
18915 gcc_unreachable ();
18919 gcc_assert (slot < MAX_386_STACK_LOCALS);
18921 new_mode = assign_386_stack_local (HImode, slot);
18922 emit_move_insn (new_mode, reg);
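
The masks used above address fixed fields of the x87 control word. A small standalone sketch of the same bit manipulation (the cw_* helper names are made up; only the constants come from the code above):

/* Illustrative only: the x87 control-word fields targeted above.
   Bits 10-11 are the rounding control (00 nearest, 01 down, 10 up,
   11 truncate); bit 5 masks the precision exception.  */
#include <stdint.h>

static uint16_t
cw_for_trunc (uint16_t cw)
{
  return (uint16_t) (cw | 0x0c00);
}

static uint16_t
cw_for_floor (uint16_t cw)
{
  return (uint16_t) ((cw & ~0x0c00) | 0x0400);
}

static uint16_t
cw_for_ceil (uint16_t cw)
{
  return (uint16_t) ((cw & ~0x0c00) | 0x0800);
}

static uint16_t
cw_mask_precision (uint16_t cw)
{
  return (uint16_t) (cw | 0x0020);
}
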
18925 /* Emit vzeroupper. */
18927 void
18928 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
18930 int i;
18932 /* Cancel automatic vzeroupper insertion if there are
18933 live call-saved SSE registers at the insertion point. */
18935 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
18936 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
18937 return;
18939 if (TARGET_64BIT)
18940 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
18941 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
18942 return;
18944 emit_insn (gen_avx_vzeroupper ());
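
The user-visible counterpart of this insertion is the _mm256_zeroupper intrinsic. A hedged sketch, assuming compilation with AVX enabled; legacy_sse_routine and use_avx_then_call_sse are placeholder names:

/* Illustrative only: the intrinsic equivalent of the vzeroupper
   insertion above.  Clearing the upper lanes after 256-bit AVX work
   avoids the AVX/SSE transition penalty when legacy SSE code runs next.  */
#include <immintrin.h>

extern void legacy_sse_routine (void);

void
use_avx_then_call_sse (float *dst, const float *a, const float *b)
{
  __m256 v = _mm256_add_ps (_mm256_loadu_ps (a), _mm256_loadu_ps (b));
  _mm256_storeu_ps (dst, v);

  _mm256_zeroupper ();   /* what the compiler-inserted vzeroupper does */
  legacy_sse_routine ();
}
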
18949 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
18950 is the set of hard registers live at the point where the insn(s)
18951 are to be inserted. */
18953 static void
18954 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
18955 HARD_REG_SET regs_live)
18957 switch (entity)
18959 case X86_DIRFLAG:
18960 if (mode == X86_DIRFLAG_RESET)
18961 emit_insn (gen_cld ());
18962 break;
18963 case AVX_U128:
18964 if (mode == AVX_U128_CLEAN)
18965 ix86_avx_emit_vzeroupper (regs_live);
18966 break;
18967 case I387_TRUNC:
18968 case I387_FLOOR:
18969 case I387_CEIL:
18970 case I387_MASK_PM:
18971 if (mode != I387_CW_ANY
18972 && mode != I387_CW_UNINITIALIZED)
18973 emit_i387_cw_initialization (mode);
18974 break;
18975 default:
18976 gcc_unreachable ();
18980 /* Output code for INSN to convert a float to a signed int. OPERANDS
18981 are the insn operands. The output may be [HSD]Imode and the input
18982 operand may be [SDX]Fmode. */
18984 const char *
18985 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
18987 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
18988 bool dimode_p = GET_MODE (operands[0]) == DImode;
18989 int round_mode = get_attr_i387_cw (insn);
18991 static char buf[40];
18992 const char *p;
18994 /* Jump through a hoop or two for DImode, since the hardware has no
18995 non-popping instruction. We used to do this a different way, but
18996 that was somewhat fragile and broke with post-reload splitters. */
18997 if ((dimode_p || fisttp) && !stack_top_dies)
18998 output_asm_insn ("fld\t%y1", operands);
19000 gcc_assert (STACK_TOP_P (operands[1]));
19001 gcc_assert (MEM_P (operands[0]));
19002 gcc_assert (GET_MODE (operands[1]) != TFmode);
19004 if (fisttp)
19005 return "fisttp%Z0\t%0";
19007 strcpy (buf, "fist");
19009 if (round_mode != I387_CW_ANY)
19010 output_asm_insn ("fldcw\t%3", operands);
19012 p = "p%Z0\t%0";
19013 strcat (buf, p + !(stack_top_dies || dimode_p));
19015 output_asm_insn (buf, operands);
19017 if (round_mode != I387_CW_ANY)
19018 output_asm_insn ("fldcw\t%2", operands);
19020 return "";
19023 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19024 have the values zero or one, indicates the ffreep insn's operand
19025 from the OPERANDS array. */
19027 static const char *
19028 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19030 if (TARGET_USE_FFREEP)
19031 #ifdef HAVE_AS_IX86_FFREEP
19032 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19033 #else
19035 static char retval[32];
19036 int regno = REGNO (operands[opno]);
19038 gcc_assert (STACK_REGNO_P (regno));
19040 regno -= FIRST_STACK_REG;
19042 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19043 return retval;
19045 #endif
19047 return opno ? "fstp\t%y1" : "fstp\t%y0";
19051 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19052 should be used. UNORDERED_P is true when fucom should be used. */
19054 const char *
19055 output_fp_compare (rtx_insn *insn, rtx *operands,
19056 bool eflags_p, bool unordered_p)
19058 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19059 bool stack_top_dies;
19061 static char buf[40];
19062 const char *p;
19064 gcc_assert (STACK_TOP_P (xops[0]));
19066 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19068 if (eflags_p)
19070 p = unordered_p ? "fucomi" : "fcomi";
19071 strcpy (buf, p);
19073 p = "p\t{%y1, %0|%0, %y1}";
19074 strcat (buf, p + !stack_top_dies);
19076 return buf;
19079 if (STACK_REG_P (xops[1])
19080 && stack_top_dies
19081 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19083 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19085 /* If the top of the 387 stack dies and the other operand
19086 is also a stack register that dies, then this must be a
19087 `fcompp' float compare. */
19088 p = unordered_p ? "fucompp" : "fcompp";
19089 strcpy (buf, p);
19091 else if (const0_operand (xops[1], VOIDmode))
19093 gcc_assert (!unordered_p);
19094 strcpy (buf, "ftst");
19096 else
19098 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19100 gcc_assert (!unordered_p);
19101 p = "ficom";
19103 else
19104 p = unordered_p ? "fucom" : "fcom";
19106 strcpy (buf, p);
19108 p = "p%Z2\t%y2";
19109 strcat (buf, p + !stack_top_dies);
19112 output_asm_insn (buf, operands);
19113 return "fnstsw\t%0";
19116 void
19117 ix86_output_addr_vec_elt (FILE *file, int value)
19119 const char *directive = ASM_LONG;
19121 #ifdef ASM_QUAD
19122 if (TARGET_LP64)
19123 directive = ASM_QUAD;
19124 #else
19125 gcc_assert (!TARGET_64BIT);
19126 #endif
19128 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19131 void
19132 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19134 const char *directive = ASM_LONG;
19136 #ifdef ASM_QUAD
19137 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19138 directive = ASM_QUAD;
19139 #else
19140 gcc_assert (!TARGET_64BIT);
19141 #endif
19142 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19143 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19144 fprintf (file, "%s%s%d-%s%d\n",
19145 directive, LPREFIX, value, LPREFIX, rel);
19146 else if (HAVE_AS_GOTOFF_IN_DATA)
19147 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19148 #if TARGET_MACHO
19149 else if (TARGET_MACHO)
19151 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19152 machopic_output_function_base_name (file);
19153 putc ('\n', file);
19155 #endif
19156 else
19157 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19158 GOT_SYMBOL_NAME, LPREFIX, value);
19161 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19162 for the target. */
19164 void
19165 ix86_expand_clear (rtx dest)
19167 rtx tmp;
19169 /* We play register width games, which are only valid after reload. */
19170 gcc_assert (reload_completed);
19172 /* Avoid HImode and its attendant prefix byte. */
19173 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19174 dest = gen_rtx_REG (SImode, REGNO (dest));
19175 tmp = gen_rtx_SET (dest, const0_rtx);
19177 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19179 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19180 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19183 emit_insn (tmp);
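
A sketch of the two clearing idioms this function chooses between (the function names are made up; the sizes refer to the usual 32-bit encodings):

/* Illustrative only: "xorl %eax, %eax" is 2 bytes but clobbers the
   flags (hence the FLAGS_REG clobber added to the parallel above);
   "movl $0, %eax" is 5 bytes and preserves the flags.  */
void
clear_with_xor (void)
{
  __asm__ volatile ("xorl %%eax, %%eax" ::: "eax", "cc");
}

void
clear_with_mov (void)
{
  __asm__ volatile ("movl $0, %%eax" ::: "eax");
}
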
19186 /* X is an unchanging MEM. If it is a constant pool reference, return
19187 the constant pool rtx, else NULL. */
19190 maybe_get_pool_constant (rtx x)
19192 x = ix86_delegitimize_address (XEXP (x, 0));
19194 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19195 return get_pool_constant (x);
19197 return NULL_RTX;
19200 void
19201 ix86_expand_move (machine_mode mode, rtx operands[])
19203 rtx op0, op1;
19204 rtx tmp, addend = NULL_RTX;
19205 enum tls_model model;
19207 op0 = operands[0];
19208 op1 = operands[1];
19210 switch (GET_CODE (op1))
19212 case CONST:
19213 tmp = XEXP (op1, 0);
19215 if (GET_CODE (tmp) != PLUS
19216 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19217 break;
19219 op1 = XEXP (tmp, 0);
19220 addend = XEXP (tmp, 1);
19221 /* FALLTHRU */
19223 case SYMBOL_REF:
19224 model = SYMBOL_REF_TLS_MODEL (op1);
19226 if (model)
19227 op1 = legitimize_tls_address (op1, model, true);
19228 else if (ix86_force_load_from_GOT_p (op1))
19230 /* Load the external function address via GOT slot to avoid PLT. */
19231 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19232 (TARGET_64BIT
19233 ? UNSPEC_GOTPCREL
19234 : UNSPEC_GOT));
19235 op1 = gen_rtx_CONST (Pmode, op1);
19236 op1 = gen_const_mem (Pmode, op1);
19237 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19239 else
19241 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19242 if (tmp)
19244 op1 = tmp;
19245 if (!addend)
19246 break;
19248 else
19250 op1 = operands[1];
19251 break;
19255 if (addend)
19257 op1 = force_operand (op1, NULL_RTX);
19258 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19259 op0, 1, OPTAB_DIRECT);
19261 else
19262 op1 = force_operand (op1, op0);
19264 if (op1 == op0)
19265 return;
19267 op1 = convert_to_mode (mode, op1, 1);
19269 default:
19270 break;
19273 if ((flag_pic || MACHOPIC_INDIRECT)
19274 && symbolic_operand (op1, mode))
19276 if (TARGET_MACHO && !TARGET_64BIT)
19278 #if TARGET_MACHO
19279 /* dynamic-no-pic */
19280 if (MACHOPIC_INDIRECT)
19282 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19283 ? op0 : gen_reg_rtx (Pmode);
19284 op1 = machopic_indirect_data_reference (op1, temp);
19285 if (MACHOPIC_PURE)
19286 op1 = machopic_legitimize_pic_address (op1, mode,
19287 temp == op1 ? 0 : temp);
19289 if (op0 != op1 && GET_CODE (op0) != MEM)
19291 rtx insn = gen_rtx_SET (op0, op1);
19292 emit_insn (insn);
19293 return;
19295 if (GET_CODE (op0) == MEM)
19296 op1 = force_reg (Pmode, op1);
19297 else
19299 rtx temp = op0;
19300 if (GET_CODE (temp) != REG)
19301 temp = gen_reg_rtx (Pmode);
19302 temp = legitimize_pic_address (op1, temp);
19303 if (temp == op0)
19304 return;
19305 op1 = temp;
19307 /* dynamic-no-pic */
19308 #endif
19310 else
19312 if (MEM_P (op0))
19313 op1 = force_reg (mode, op1);
19314 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19316 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19317 op1 = legitimize_pic_address (op1, reg);
19318 if (op0 == op1)
19319 return;
19320 op1 = convert_to_mode (mode, op1, 1);
19324 else
19326 if (MEM_P (op0)
19327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19328 || !push_operand (op0, mode))
19329 && MEM_P (op1))
19330 op1 = force_reg (mode, op1);
19332 if (push_operand (op0, mode)
19333 && ! general_no_elim_operand (op1, mode))
19334 op1 = copy_to_mode_reg (mode, op1);
19336 /* In 64-bit compilation, force large constants into a register
19337 so that they can be CSEd. */
19338 if (can_create_pseudo_p ()
19339 && (mode == DImode) && TARGET_64BIT
19340 && immediate_operand (op1, mode)
19341 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19342 && !register_operand (op0, mode)
19343 && optimize)
19344 op1 = copy_to_mode_reg (mode, op1);
19346 if (can_create_pseudo_p ()
19347 && CONST_DOUBLE_P (op1))
19349 /* If we are loading a floating point constant to a register,
19350 force the value to memory now, since we'll get better code
19351 out of the back end. */
19353 op1 = validize_mem (force_const_mem (mode, op1));
19354 if (!register_operand (op0, mode))
19356 rtx temp = gen_reg_rtx (mode);
19357 emit_insn (gen_rtx_SET (temp, op1));
19358 emit_move_insn (op0, temp);
19359 return;
19364 emit_insn (gen_rtx_SET (op0, op1));
19367 void
19368 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19370 rtx op0 = operands[0], op1 = operands[1];
19371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19372 psABI, since the biggest alignment for the IA MCU psABI is 4 bytes. */
19373 unsigned int align = (TARGET_IAMCU
19374 ? GET_MODE_BITSIZE (mode)
19375 : GET_MODE_ALIGNMENT (mode));
19377 if (push_operand (op0, VOIDmode))
19378 op0 = emit_move_resolve_push (mode, op0);
19380 /* Force constants other than zero into memory. We do not know how
19381 the instructions used to build constants modify the upper 64 bits
19382 of the register; once we have that information we may be able
19383 to handle some of them more efficiently. */
19384 if (can_create_pseudo_p ()
19385 && (CONSTANT_P (op1)
19386 || (SUBREG_P (op1)
19387 && CONSTANT_P (SUBREG_REG (op1))))
19388 && ((register_operand (op0, mode)
19389 && !standard_sse_constant_p (op1, mode))
19390 /* ix86_expand_vector_move_misalign() does not like constants. */
19391 || (SSE_REG_MODE_P (mode)
19392 && MEM_P (op0)
19393 && MEM_ALIGN (op0) < align)))
19395 if (SUBREG_P (op1))
19397 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19398 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19399 if (r)
19400 r = validize_mem (r);
19401 else
19402 r = force_reg (imode, SUBREG_REG (op1));
19403 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19405 else
19406 op1 = validize_mem (force_const_mem (mode, op1));
19409 /* We need to check memory alignment for SSE mode since an attribute
19410 can make operands unaligned. */
19411 if (can_create_pseudo_p ()
19412 && SSE_REG_MODE_P (mode)
19413 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19414 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19416 rtx tmp[2];
19418 /* ix86_expand_vector_move_misalign() does not like both
19419 arguments in memory. */
19420 if (!register_operand (op0, mode)
19421 && !register_operand (op1, mode))
19422 op1 = force_reg (mode, op1);
19424 tmp[0] = op0; tmp[1] = op1;
19425 ix86_expand_vector_move_misalign (mode, tmp);
19426 return;
19429 /* Make operand1 a register if it isn't already. */
19430 if (can_create_pseudo_p ()
19431 && !register_operand (op0, mode)
19432 && !register_operand (op1, mode))
19434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19435 return;
19438 emit_insn (gen_rtx_SET (op0, op1));
19441 /* Split 32-byte AVX unaligned load and store if needed. */
19443 static void
19444 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19446 rtx m;
19447 rtx (*extract) (rtx, rtx, rtx);
19448 machine_mode mode;
19450 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19451 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19453 emit_insn (gen_rtx_SET (op0, op1));
19454 return;
19457 rtx orig_op0 = NULL_RTX;
19458 mode = GET_MODE (op0);
19459 switch (GET_MODE_CLASS (mode))
19461 case MODE_VECTOR_INT:
19462 case MODE_INT:
19463 if (mode != V32QImode)
19465 if (!MEM_P (op0))
19467 orig_op0 = op0;
19468 op0 = gen_reg_rtx (V32QImode);
19470 else
19471 op0 = gen_lowpart (V32QImode, op0);
19472 op1 = gen_lowpart (V32QImode, op1);
19473 mode = V32QImode;
19475 break;
19476 case MODE_VECTOR_FLOAT:
19477 break;
19478 default:
19479 gcc_unreachable ();
19482 switch (mode)
19484 default:
19485 gcc_unreachable ();
19486 case E_V32QImode:
19487 extract = gen_avx_vextractf128v32qi;
19488 mode = V16QImode;
19489 break;
19490 case E_V8SFmode:
19491 extract = gen_avx_vextractf128v8sf;
19492 mode = V4SFmode;
19493 break;
19494 case E_V4DFmode:
19495 extract = gen_avx_vextractf128v4df;
19496 mode = V2DFmode;
19497 break;
19500 if (MEM_P (op1))
19502 rtx r = gen_reg_rtx (mode);
19503 m = adjust_address (op1, mode, 0);
19504 emit_move_insn (r, m);
19505 m = adjust_address (op1, mode, 16);
19506 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19507 emit_move_insn (op0, r);
19509 else if (MEM_P (op0))
19511 m = adjust_address (op0, mode, 0);
19512 emit_insn (extract (m, op1, const0_rtx));
19513 m = adjust_address (op0, mode, 16);
19514 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19516 else
19517 gcc_unreachable ();
19519 if (orig_op0)
19520 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19523 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19524 straight to ix86_expand_vector_move. */
19525 /* Code generation for scalar reg-reg moves of single and double precision data:
19526 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19527 movaps reg, reg
19528 else
19529 movss reg, reg
19530 if (x86_sse_partial_reg_dependency == true)
19531 movapd reg, reg
19532 else
19533 movsd reg, reg
19535 Code generation for scalar loads of double precision data:
19536 if (x86_sse_split_regs == true)
19537 movlpd mem, reg (gas syntax)
19538 else
19539 movsd mem, reg
19541 Code generation for unaligned packed loads of single precision data
19542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19543 if (x86_sse_unaligned_move_optimal)
19544 movups mem, reg
19546 if (x86_sse_partial_reg_dependency == true)
19548 xorps reg, reg
19549 movlps mem, reg
19550 movhps mem+8, reg
19552 else
19554 movlps mem, reg
19555 movhps mem+8, reg
19558 Code generation for unaligned packed loads of double precision data
19559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19560 if (x86_sse_unaligned_move_optimal)
19561 movupd mem, reg
19563 if (x86_sse_split_regs == true)
19565 movlpd mem, reg
19566 movhpd mem+8, reg
19568 else
19570 movsd mem, reg
19571 movhpd mem+8, reg
19575 void
19576 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19578 rtx op0, op1, m;
19580 op0 = operands[0];
19581 op1 = operands[1];
19583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19584 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19586 emit_insn (gen_rtx_SET (op0, op1));
19587 return;
19590 if (TARGET_AVX)
19592 if (GET_MODE_SIZE (mode) == 32)
19593 ix86_avx256_split_vector_move_misalign (op0, op1);
19594 else
19595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19596 emit_insn (gen_rtx_SET (op0, op1));
19597 return;
19600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19603 emit_insn (gen_rtx_SET (op0, op1));
19604 return;
19607 /* ??? If we have typed data, then it would appear that using
19608 movdqu is the only way to get unaligned data loaded with
19609 integer type. */
19610 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19612 emit_insn (gen_rtx_SET (op0, op1));
19613 return;
19616 if (MEM_P (op1))
19618 if (TARGET_SSE2 && mode == V2DFmode)
19620 rtx zero;
19622 /* When SSE registers are split into halves, we can avoid
19623 writing to the top half twice. */
19624 if (TARGET_SSE_SPLIT_REGS)
19626 emit_clobber (op0);
19627 zero = op0;
19629 else
19631 /* ??? Not sure about the best option for the Intel chips.
19632 The following would seem to satisfy; the register is
19633 entirely cleared, breaking the dependency chain. We
19634 then store to the upper half, with a dependency depth
19635 of one. A rumor has it that Intel recommends two movsd
19636 followed by an unpacklpd, but this is unconfirmed. And
19637 given that the dependency depth of the unpacklpd would
19638 still be one, I'm not sure why this would be better. */
19639 zero = CONST0_RTX (V2DFmode);
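/* Load the two DFmode halves separately via the loadlpd and loadhpd
   patterns, merging into ZERO resp. the partially built OP0.  */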
19642 m = adjust_address (op1, DFmode, 0);
19643 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19644 m = adjust_address (op1, DFmode, 8);
19645 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19647 else
19649 rtx t;
19651 if (mode != V4SFmode)
19652 t = gen_reg_rtx (V4SFmode);
19653 else
19654 t = op0;
19656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19657 emit_move_insn (t, CONST0_RTX (V4SFmode));
19658 else
19659 emit_clobber (t);
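/* Load the two V2SFmode halves separately via the loadlps and loadhps
   patterns.  */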
19661 m = adjust_address (op1, V2SFmode, 0);
19662 emit_insn (gen_sse_loadlps (t, t, m));
19663 m = adjust_address (op1, V2SFmode, 8);
19664 emit_insn (gen_sse_loadhps (t, t, m));
19665 if (mode != V4SFmode)
19666 emit_move_insn (op0, gen_lowpart (mode, t));
19669 else if (MEM_P (op0))
19671 if (TARGET_SSE2 && mode == V2DFmode)
19673 m = adjust_address (op0, DFmode, 0);
19674 emit_insn (gen_sse2_storelpd (m, op1));
19675 m = adjust_address (op0, DFmode, 8);
19676 emit_insn (gen_sse2_storehpd (m, op1));
19678 else
19680 if (mode != V4SFmode)
19681 op1 = gen_lowpart (V4SFmode, op1);
19683 m = adjust_address (op0, V2SFmode, 0);
19684 emit_insn (gen_sse_storelps (m, op1));
19685 m = adjust_address (op0, V2SFmode, 8);
19686 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19689 else
19690 gcc_unreachable ();
19693 /* Helper function of ix86_fixup_binary_operands to canonicalize
19694 operand order. Returns true if the operands should be swapped. */
19696 static bool
19697 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19698 rtx operands[])
19700 rtx dst = operands[0];
19701 rtx src1 = operands[1];
19702 rtx src2 = operands[2];
19704 /* If the operation is not commutative, we can't do anything. */
19705 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
19706 return false;
19708 /* Highest priority is that src1 should match dst. */
19709 if (rtx_equal_p (dst, src1))
19710 return false;
19711 if (rtx_equal_p (dst, src2))
19712 return true;
19714 /* Next highest priority is that immediate constants come second. */
19715 if (immediate_operand (src2, mode))
19716 return false;
19717 if (immediate_operand (src1, mode))
19718 return true;
19720 /* Lowest priority is that memory references should come second. */
19721 if (MEM_P (src2))
19722 return false;
19723 if (MEM_P (src1))
19724 return true;
19726 return false;
19730 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19731 destination to use for the operation. If different from the true
19732 destination in operands[0], a copy operation will be required. */
19735 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19736 rtx operands[])
19738 rtx dst = operands[0];
19739 rtx src1 = operands[1];
19740 rtx src2 = operands[2];
19742 /* Canonicalize operand order. */
19743 if (ix86_swap_binary_operands_p (code, mode, operands))
19745 /* It is invalid to swap operands of different modes. */
19746 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19748 std::swap (src1, src2);
19751 /* Both source operands cannot be in memory. */
19752 if (MEM_P (src1) && MEM_P (src2))
19754 /* Optimization: Only read from memory once. */
19755 if (rtx_equal_p (src1, src2))
19757 src2 = force_reg (mode, src2);
19758 src1 = src2;
19760 else if (rtx_equal_p (dst, src1))
19761 src2 = force_reg (mode, src2);
19762 else
19763 src1 = force_reg (mode, src1);
19766 /* If the destination is memory, and we do not have matching source
19767 operands, do things in registers. */
19768 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19769 dst = gen_reg_rtx (mode);
19771 /* Source 1 cannot be a constant. */
19772 if (CONSTANT_P (src1))
19773 src1 = force_reg (mode, src1);
19775 /* Source 1 cannot be a non-matching memory. */
19776 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19777 src1 = force_reg (mode, src1);
19779 /* Improve address combine. */
19780 if (code == PLUS
19781 && GET_MODE_CLASS (mode) == MODE_INT
19782 && MEM_P (src2))
19783 src2 = force_reg (mode, src2);
19785 operands[1] = src1;
19786 operands[2] = src2;
19787 return dst;
19790 /* Similarly, but assume that the destination has already been
19791 set up properly. */
19793 void
19794 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19795 machine_mode mode, rtx operands[])
19797 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19798 gcc_assert (dst == operands[0]);
19801 /* Attempt to expand a binary operator. Make the expansion closer to the
19802 actual machine than just general_operand, which will allow 3 separate
19803 memory references (one output, two inputs) in a single insn. */
19805 void
19806 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19807 rtx operands[])
19809 rtx src1, src2, dst, op, clob;
19811 dst = ix86_fixup_binary_operands (code, mode, operands);
19812 src1 = operands[1];
19813 src2 = operands[2];
19815 /* Emit the instruction. */
19817 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19819 if (reload_completed
19820 && code == PLUS
19821 && !rtx_equal_p (dst, src1))
19823 /* This is going to be an LEA; avoid splitting it later. */
19824 emit_insn (op);
19826 else
19828 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19829 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
19832 /* Fix up the destination if needed. */
19833 if (dst != operands[0])
19834 emit_move_insn (operands[0], dst);
19837 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
19838 the given OPERANDS. */
19840 void
19841 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
19842 rtx operands[])
19844 rtx op1 = NULL_RTX, op2 = NULL_RTX;
19845 if (SUBREG_P (operands[1]))
19847 op1 = operands[1];
19848 op2 = operands[2];
19850 else if (SUBREG_P (operands[2]))
19852 op1 = operands[2];
19853 op2 = operands[1];
19855 /* Optimize (__m128i) d | (__m128i) e and similar code
19856 when d and e are float vectors into a float vector logical
19857 insn. In C/C++, without using intrinsics, there is no other way
19858 to express a vector logical operation on float vectors than
19859 to cast them temporarily to integer vectors. */
19860 if (op1
19861 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
19862 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
19863 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
19864 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
19865 && SUBREG_BYTE (op1) == 0
19866 && (GET_CODE (op2) == CONST_VECTOR
19867 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
19868 && SUBREG_BYTE (op2) == 0))
19869 && can_create_pseudo_p ())
19871 rtx dst;
19872 switch (GET_MODE (SUBREG_REG (op1)))
19874 case E_V4SFmode:
19875 case E_V8SFmode:
19876 case E_V16SFmode:
19877 case E_V2DFmode:
19878 case E_V4DFmode:
19879 case E_V8DFmode:
19880 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
19881 if (GET_CODE (op2) == CONST_VECTOR)
19883 op2 = gen_lowpart (GET_MODE (dst), op2);
19884 op2 = force_reg (GET_MODE (dst), op2);
19886 else
19888 op1 = operands[1];
19889 op2 = SUBREG_REG (operands[2]);
19890 if (!vector_operand (op2, GET_MODE (dst)))
19891 op2 = force_reg (GET_MODE (dst), op2);
19893 op1 = SUBREG_REG (op1);
19894 if (!vector_operand (op1, GET_MODE (dst)))
19895 op1 = force_reg (GET_MODE (dst), op1);
19896 emit_insn (gen_rtx_SET (dst,
19897 gen_rtx_fmt_ee (code, GET_MODE (dst),
19898 op1, op2)));
19899 emit_move_insn (operands[0], gen_lowpart (mode, dst));
19900 return;
19901 default:
19902 break;
19905 if (!vector_operand (operands[1], mode))
19906 operands[1] = force_reg (mode, operands[1]);
19907 if (!vector_operand (operands[2], mode))
19908 operands[2] = force_reg (mode, operands[2]);
19909 ix86_fixup_binary_operands_no_copy (code, mode, operands);
19910 emit_insn (gen_rtx_SET (operands[0],
19911 gen_rtx_fmt_ee (code, mode, operands[1],
19912 operands[2])));
19915 /* Return TRUE or FALSE depending on whether the binary operator meets the
19916 appropriate constraints. */
19918 bool
19919 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
19920 rtx operands[3])
19922 rtx dst = operands[0];
19923 rtx src1 = operands[1];
19924 rtx src2 = operands[2];
19926 /* Both source operands cannot be in memory. */
19927 if (MEM_P (src1) && MEM_P (src2))
19928 return false;
19930 /* Canonicalize operand order for commutative operators. */
19931 if (ix86_swap_binary_operands_p (code, mode, operands))
19932 std::swap (src1, src2);
19934 /* If the destination is memory, we must have a matching source operand. */
19935 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19936 return false;
19938 /* Source 1 cannot be a constant. */
19939 if (CONSTANT_P (src1))
19940 return false;
19942 /* Source 1 cannot be a non-matching memory. */
19943 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19944 /* Support "andhi/andsi/anddi" as a zero-extending move. */
19945 return (code == AND
19946 && (mode == HImode
19947 || mode == SImode
19948 || (TARGET_64BIT && mode == DImode))
19949 && satisfies_constraint_L (src2));
19951 return true;
19954 /* Attempt to expand a unary operator. Make the expansion closer to the
19955 actual machine than just general_operand, which will allow 2 separate
19956 memory references (one output, one input) in a single insn. */
19958 void
19959 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
19960 rtx operands[])
19962 bool matching_memory = false;
19963 rtx src, dst, op, clob;
19965 dst = operands[0];
19966 src = operands[1];
19968 /* If the destination is memory, and we do not have matching source
19969 operands, do things in registers. */
19970 if (MEM_P (dst))
19972 if (rtx_equal_p (dst, src))
19973 matching_memory = true;
19974 else
19975 dst = gen_reg_rtx (mode);
19978 /* When source operand is memory, destination must match. */
19979 if (MEM_P (src) && !matching_memory)
19980 src = force_reg (mode, src);
19982 /* Emit the instruction. */
19984 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
19986 if (code == NOT)
19987 emit_insn (op);
19988 else
19990 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19991 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
19994 /* Fix up the destination if needed. */
19995 if (dst != operands[0])
19996 emit_move_insn (operands[0], dst);
19999 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20000 divisor are within the range [0-255]. */
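/* Roughly, the emitted sequence is (labels illustrative only):
     scratch = dividend | divisor;
     if ((scratch & ~0xff) == 0) goto qimode;
     <full-width signed or unsigned divide>; goto done;
   qimode:
     <8-bit unsigned divide via udivmodhiqi3; AL holds the quotient and
      AH the remainder, which are then zero-extended into the outputs>;
   done:;  */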
20002 void
20003 ix86_split_idivmod (machine_mode mode, rtx operands[],
20004 bool signed_p)
20006 rtx_code_label *end_label, *qimode_label;
20007 rtx div, mod;
20008 rtx_insn *insn;
20009 rtx scratch, tmp0, tmp1, tmp2;
20010 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20011 rtx (*gen_zero_extend) (rtx, rtx);
20012 rtx (*gen_test_ccno_1) (rtx, rtx);
20014 switch (mode)
20016 case E_SImode:
20017 if (GET_MODE (operands[0]) == SImode)
20019 if (GET_MODE (operands[1]) == SImode)
20020 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20021 else
20022 gen_divmod4_1
20023 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20024 gen_zero_extend = gen_zero_extendqisi2;
20026 else
20028 gen_divmod4_1
20029 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20030 gen_zero_extend = gen_zero_extendqidi2;
20032 gen_test_ccno_1 = gen_testsi_ccno_1;
20033 break;
20034 case E_DImode:
20035 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20036 gen_test_ccno_1 = gen_testdi_ccno_1;
20037 gen_zero_extend = gen_zero_extendqidi2;
20038 break;
20039 default:
20040 gcc_unreachable ();
20043 end_label = gen_label_rtx ();
20044 qimode_label = gen_label_rtx ();
20046 scratch = gen_reg_rtx (mode);
20048 /* Use 8bit unsigned divmod if dividend and divisor are within
20049 the range [0-255]. */
20050 emit_move_insn (scratch, operands[2]);
20051 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20052 scratch, 1, OPTAB_DIRECT);
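/* SCRATCH now holds operands[2] | operands[3]; testing it against -0x100
   sets ZF iff no bit above bit 7 is set, i.e. iff both dividend and
   divisor fit in 8 bits.  */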
20053 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20054 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20055 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20056 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20057 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20058 pc_rtx);
20059 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20060 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20061 JUMP_LABEL (insn) = qimode_label;
20063 /* Generate original signed/unsigned divmod. */
20064 div = gen_divmod4_1 (operands[0], operands[1],
20065 operands[2], operands[3]);
20066 emit_insn (div);
20068 /* Branch to the end. */
20069 emit_jump_insn (gen_jump (end_label));
20070 emit_barrier ();
20072 /* Generate 8bit unsigned divide. */
20073 emit_label (qimode_label);
20074 /* Don't use operands[0] for result of 8bit divide since not all
20075 registers support QImode ZERO_EXTRACT. */
20076 tmp0 = lowpart_subreg (HImode, scratch, mode);
20077 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20078 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20079 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20081 if (signed_p)
20083 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20084 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20086 else
20088 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20089 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20091 if (mode == SImode)
20093 if (GET_MODE (operands[0]) != SImode)
20094 div = gen_rtx_ZERO_EXTEND (DImode, div);
20095 if (GET_MODE (operands[1]) != SImode)
20096 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20099 /* Extract remainder from AH. */
20100 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20101 tmp0, GEN_INT (8), GEN_INT (8));
20102 if (REG_P (operands[1]))
20103 insn = emit_move_insn (operands[1], tmp1);
20104 else
20106 /* Need a new scratch register since the old one has result
20107 of 8bit divide. */
20108 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20109 emit_move_insn (scratch, tmp1);
20110 insn = emit_move_insn (operands[1], scratch);
20112 set_unique_reg_note (insn, REG_EQUAL, mod);
20114 /* Zero extend quotient from AL. */
20115 tmp1 = gen_lowpart (QImode, tmp0);
20116 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20117 set_unique_reg_note (insn, REG_EQUAL, div);
20119 emit_label (end_label);
20122 #define LEA_MAX_STALL (3)
20123 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20125 /* Increase given DISTANCE in half-cycles according to
20126 dependencies between PREV and NEXT instructions.
20127 Add 1 half-cycle if there is no dependency and
20128 go to the next cycle if there is some dependency. */
20130 static unsigned int
20131 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20133 df_ref def, use;
20135 if (!prev || !next)
20136 return distance + (distance & 1) + 2;
20138 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20139 return distance + 1;
20141 FOR_EACH_INSN_USE (use, next)
20142 FOR_EACH_INSN_DEF (def, prev)
20143 if (!DF_REF_IS_ARTIFICIAL (def)
20144 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20145 return distance + (distance & 1) + 2;
20147 return distance + 1;
20150 /* Function checks if instruction INSN defines register number
20151 REGNO1 or REGNO2. */
20153 static bool
20154 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20155 rtx_insn *insn)
20157 df_ref def;
20159 FOR_EACH_INSN_DEF (def, insn)
20160 if (DF_REF_REG_DEF_P (def)
20161 && !DF_REF_IS_ARTIFICIAL (def)
20162 && (regno1 == DF_REF_REGNO (def)
20163 || regno2 == DF_REF_REGNO (def)))
20164 return true;
20166 return false;
20169 /* Function checks if instruction INSN uses register number
20170 REGNO as a part of address expression. */
20172 static bool
20173 insn_uses_reg_mem (unsigned int regno, rtx insn)
20175 df_ref use;
20177 FOR_EACH_INSN_USE (use, insn)
20178 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20179 return true;
20181 return false;
20184 /* Search backward for non-agu definition of register number REGNO1
20185 or register number REGNO2 in basic block starting from instruction
20186 START up to head of basic block or instruction INSN.
20188 Function puts true value into *FOUND var if definition was found
20189 and false otherwise.
20191 Distance in half-cycles between START and found instruction or head
20192 of BB is added to DISTANCE and returned. */
20194 static int
20195 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20196 rtx_insn *insn, int distance,
20197 rtx_insn *start, bool *found)
20199 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20200 rtx_insn *prev = start;
20201 rtx_insn *next = NULL;
20203 *found = false;
20205 while (prev
20206 && prev != insn
20207 && distance < LEA_SEARCH_THRESHOLD)
20209 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20211 distance = increase_distance (prev, next, distance);
20212 if (insn_defines_reg (regno1, regno2, prev))
20214 if (recog_memoized (prev) < 0
20215 || get_attr_type (prev) != TYPE_LEA)
20217 *found = true;
20218 return distance;
20222 next = prev;
20224 if (prev == BB_HEAD (bb))
20225 break;
20227 prev = PREV_INSN (prev);
20230 return distance;
20233 /* Search backward for non-agu definition of register number REGNO1
20234 or register number REGNO2 in INSN's basic block until
20235 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20236 2. Reach neighbor BBs boundary, or
20237 3. Reach agu definition.
20238 Returns the distance between the non-agu definition point and INSN.
20239 If no definition point, returns -1. */
20241 static int
20242 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20243 rtx_insn *insn)
20245 basic_block bb = BLOCK_FOR_INSN (insn);
20246 int distance = 0;
20247 bool found = false;
20249 if (insn != BB_HEAD (bb))
20250 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20251 distance, PREV_INSN (insn),
20252 &found);
20254 if (!found && distance < LEA_SEARCH_THRESHOLD)
20256 edge e;
20257 edge_iterator ei;
20258 bool simple_loop = false;
20260 FOR_EACH_EDGE (e, ei, bb->preds)
20261 if (e->src == bb)
20263 simple_loop = true;
20264 break;
20267 if (simple_loop)
20268 distance = distance_non_agu_define_in_bb (regno1, regno2,
20269 insn, distance,
20270 BB_END (bb), &found);
20271 else
20273 int shortest_dist = -1;
20274 bool found_in_bb = false;
20276 FOR_EACH_EDGE (e, ei, bb->preds)
20278 int bb_dist
20279 = distance_non_agu_define_in_bb (regno1, regno2,
20280 insn, distance,
20281 BB_END (e->src),
20282 &found_in_bb);
20283 if (found_in_bb)
20285 if (shortest_dist < 0)
20286 shortest_dist = bb_dist;
20287 else if (bb_dist > 0)
20288 shortest_dist = MIN (bb_dist, shortest_dist);
20290 found = true;
20294 distance = shortest_dist;
20298 /* get_attr_type may modify recog data. We want to make sure
20299 that recog data is valid for instruction INSN, on which
20300 distance_non_agu_define is called. INSN is unchanged here. */
20301 extract_insn_cached (insn);
20303 if (!found)
20304 return -1;
20306 return distance >> 1;
20309 /* Return the distance in half-cycles between INSN and the next
20310 insn that uses register number REGNO in a memory address, added
20311 to DISTANCE. Return -1 if REGNO is set.
20313 Put true value into *FOUND if register usage was found and
20314 false otherwise.
20315 Put true value into *REDEFINED if register redefinition was
20316 found and false otherwise. */
20318 static int
20319 distance_agu_use_in_bb (unsigned int regno,
20320 rtx_insn *insn, int distance, rtx_insn *start,
20321 bool *found, bool *redefined)
20323 basic_block bb = NULL;
20324 rtx_insn *next = start;
20325 rtx_insn *prev = NULL;
20327 *found = false;
20328 *redefined = false;
20330 if (start != NULL_RTX)
20332 bb = BLOCK_FOR_INSN (start);
20333 if (start != BB_HEAD (bb))
20334 /* If insn and start belong to the same bb, set prev to insn,
20335 so the call to increase_distance will increase the distance
20336 between insns by 1. */
20337 prev = insn;
20340 while (next
20341 && next != insn
20342 && distance < LEA_SEARCH_THRESHOLD)
20344 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20346 distance = increase_distance(prev, next, distance);
20347 if (insn_uses_reg_mem (regno, next))
20349 /* Return DISTANCE if OP0 is used in memory
20350 address in NEXT. */
20351 *found = true;
20352 return distance;
20355 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20357 /* Return -1 if OP0 is set in NEXT. */
20358 *redefined = true;
20359 return -1;
20362 prev = next;
20365 if (next == BB_END (bb))
20366 break;
20368 next = NEXT_INSN (next);
20371 return distance;
20374 /* Return the distance between INSN and the next insn that uses
20375 register number REGNO0 in a memory address. Return -1 if no such
20376 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20378 static int
20379 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20381 basic_block bb = BLOCK_FOR_INSN (insn);
20382 int distance = 0;
20383 bool found = false;
20384 bool redefined = false;
20386 if (insn != BB_END (bb))
20387 distance = distance_agu_use_in_bb (regno0, insn, distance,
20388 NEXT_INSN (insn),
20389 &found, &redefined);
20391 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20393 edge e;
20394 edge_iterator ei;
20395 bool simple_loop = false;
20397 FOR_EACH_EDGE (e, ei, bb->succs)
20398 if (e->dest == bb)
20400 simple_loop = true;
20401 break;
20404 if (simple_loop)
20405 distance = distance_agu_use_in_bb (regno0, insn,
20406 distance, BB_HEAD (bb),
20407 &found, &redefined);
20408 else
20410 int shortest_dist = -1;
20411 bool found_in_bb = false;
20412 bool redefined_in_bb = false;
20414 FOR_EACH_EDGE (e, ei, bb->succs)
20416 int bb_dist
20417 = distance_agu_use_in_bb (regno0, insn,
20418 distance, BB_HEAD (e->dest),
20419 &found_in_bb, &redefined_in_bb);
20420 if (found_in_bb)
20422 if (shortest_dist < 0)
20423 shortest_dist = bb_dist;
20424 else if (bb_dist > 0)
20425 shortest_dist = MIN (bb_dist, shortest_dist);
20427 found = true;
20431 distance = shortest_dist;
20435 if (!found || redefined)
20436 return -1;
20438 return distance >> 1;
20441 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20442 there is a dilemma of choosing LEA or ADD.
20443 Negative value: ADD is preferred over LEA
20444 Zero: Neutral
20445 Positive value: LEA is preferred over ADD. */
20446 #define IX86_LEA_PRIORITY 0
20448 /* Return true if using lea INSN has a performance advantage
20449 over a sequence of instructions. The instruction sequence has
20450 SPLIT_COST cycles higher latency than the lea latency. */
20452 static bool
20453 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20454 unsigned int regno2, int split_cost, bool has_scale)
20456 int dist_define, dist_use;
20458 /* For Silvermont, if a 2-source or 3-source LEA is used for
20459 non-destructive destination purposes, or for the ability
20460 to use SCALE, the use of LEA is justified. */
20461 if (TARGET_SILVERMONT || TARGET_INTEL)
20463 if (has_scale)
20464 return true;
20465 if (split_cost < 1)
20466 return false;
20467 if (regno0 == regno1 || regno0 == regno2)
20468 return false;
20469 return true;
20472 dist_define = distance_non_agu_define (regno1, regno2, insn);
20473 dist_use = distance_agu_use (regno0, insn);
20475 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20477 /* If there is no non-AGU operand definition, no AGU
20478 operand usage and the split cost is 0, then both the lea
20479 and non-lea variants have the same priority. Currently
20480 we prefer lea for 64-bit code and non-lea for 32-bit
20481 code. */
20482 if (dist_use < 0 && split_cost == 0)
20483 return TARGET_64BIT || IX86_LEA_PRIORITY;
20484 else
20485 return true;
20488 /* With a longer definition distance, lea is preferable.
20489 Here we adjust the distance to take into account the splitting
20490 cost and the lea priority. */
20491 dist_define += split_cost + IX86_LEA_PRIORITY;
20493 /* If there is no use in a memory address then we just check
20494 that the split cost exceeds the AGU stall. */
20495 if (dist_use < 0)
20496 return dist_define > LEA_MAX_STALL;
20498 /* If this insn has both backward non-agu dependence and forward
20499 agu dependence, the one with the shorter distance takes effect. */
20500 return dist_define >= dist_use;
20503 /* Return true if it is legal to clobber flags by INSN and
20504 false otherwise. */
20506 static bool
20507 ix86_ok_to_clobber_flags (rtx_insn *insn)
20509 basic_block bb = BLOCK_FOR_INSN (insn);
20510 df_ref use;
20511 bitmap live;
20513 while (insn)
20515 if (NONDEBUG_INSN_P (insn))
20517 FOR_EACH_INSN_USE (use, insn)
20518 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20519 return false;
20521 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20522 return true;
20525 if (insn == BB_END (bb))
20526 break;
20528 insn = NEXT_INSN (insn);
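/* We reached the end of the basic block without seeing either a use or a
   new definition of the flags; clobbering them is safe only if they are
   not live on exit from the block.  */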
20531 live = df_get_live_out(bb);
20532 return !REGNO_REG_SET_P (live, FLAGS_REG);
20535 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20536 move and add to avoid AGU stalls. */
20538 bool
20539 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20541 unsigned int regno0, regno1, regno2;
20543 /* Check if we need to optimize. */
20544 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20545 return false;
20547 /* Check it is correct to split here. */
20548 if (!ix86_ok_to_clobber_flags(insn))
20549 return false;
20551 regno0 = true_regnum (operands[0]);
20552 regno1 = true_regnum (operands[1]);
20553 regno2 = true_regnum (operands[2]);
20555 /* We need to split only adds with a non-destructive
20556 destination operand. */
20557 if (regno0 == regno1 || regno0 == regno2)
20558 return false;
20559 else
20560 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20563 /* Return true if we should emit lea instruction instead of mov
20564 instruction. */
20566 bool
20567 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20569 unsigned int regno0, regno1;
20571 /* Check if we need to optimize. */
20572 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20573 return false;
20575 /* Use lea for reg to reg moves only. */
20576 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20577 return false;
20579 regno0 = true_regnum (operands[0]);
20580 regno1 = true_regnum (operands[1]);
20582 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20585 /* Return true if we need to split lea into a sequence of
20586 instructions to avoid AGU stalls. */
20588 bool
20589 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20591 unsigned int regno0, regno1, regno2;
20592 int split_cost;
20593 struct ix86_address parts;
20594 int ok;
20596 /* Check we need to optimize. */
20597 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20598 return false;
20600 /* The "at least two components" test below might not catch simple
20601 move or zero extension insns if parts.base is non-NULL and parts.disp
20602 is const0_rtx as the only components in the address, e.g. if the
20603 register is %rbp or %r13. As this test is much cheaper and moves or
20604 zero extensions are the common case, do this check first. */
20605 if (REG_P (operands[1])
20606 || (SImode_address_operand (operands[1], VOIDmode)
20607 && REG_P (XEXP (operands[1], 0))))
20608 return false;
20610 /* Check if it is OK to split here. */
20611 if (!ix86_ok_to_clobber_flags (insn))
20612 return false;
20614 ok = ix86_decompose_address (operands[1], &parts);
20615 gcc_assert (ok);
20617 /* There should be at least two components in the address. */
20618 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20619 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20620 return false;
20622 /* We should not split into add if a non-legitimate pic
20623 operand is used as the displacement. */
20624 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20625 return false;
20627 regno0 = true_regnum (operands[0]);
20628 regno1 = INVALID_REGNUM;
20629 regno2 = INVALID_REGNUM;
20631 if (parts.base)
20632 regno1 = true_regnum (parts.base);
20633 if (parts.index)
20634 regno2 = true_regnum (parts.index);
20636 split_cost = 0;
20638 /* Compute how many cycles we will add to execution time
20639 if we split the lea into a sequence of instructions. */
20640 if (parts.base || parts.index)
20642 /* Have to use a mov instruction if the non-destructive
20643 destination form is used. */
20644 if (regno1 != regno0 && regno2 != regno0)
20645 split_cost += 1;
20647 /* Have to add index to base if both exist. */
20648 if (parts.base && parts.index)
20649 split_cost += 1;
20651 /* Have to use shift and adds if scale is 2 or greater. */
20652 if (parts.scale > 1)
20654 if (regno0 != regno1)
20655 split_cost += 1;
20656 else if (regno2 == regno0)
20657 split_cost += 4;
20658 else
20659 split_cost += parts.scale;
20662 /* Have to use an add instruction with an immediate if
20663 disp is nonzero. */
20664 if (parts.disp && parts.disp != const0_rtx)
20665 split_cost += 1;
20667 /* Subtract the price of lea. */
20668 split_cost -= 1;
20671 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20672 parts.scale > 1);
20675 /* Emit x86 binary operand CODE in mode MODE, where the first operand
20676 matches destination. RTX includes clobber of FLAGS_REG. */
20678 static void
20679 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20680 rtx dst, rtx src)
20682 rtx op, clob;
20684 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20685 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20687 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20690 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2. */
20692 static bool
20693 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20695 rtx_insn *prev = insn;
20696 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20698 if (insn == start)
20699 return false;
20700 while (prev && prev != start)
20702 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20704 prev = PREV_INSN (prev);
20705 continue;
20707 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20708 return true;
20709 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20710 return false;
20711 prev = PREV_INSN (prev);
20714 /* None of the regs is defined in the bb. */
20715 return false;
20718 /* Split lea instructions into a sequence of instructions
20719 which are executed on the ALU to avoid AGU stalls.
20720 It is assumed that it is allowed to clobber the flags register
20721 at the lea position. */
20723 void
20724 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20726 unsigned int regno0, regno1, regno2;
20727 struct ix86_address parts;
20728 rtx target, tmp;
20729 int ok, adds;
20731 ok = ix86_decompose_address (operands[1], &parts);
20732 gcc_assert (ok);
20734 target = gen_lowpart (mode, operands[0]);
20736 regno0 = true_regnum (target);
20737 regno1 = INVALID_REGNUM;
20738 regno2 = INVALID_REGNUM;
20740 if (parts.base)
20742 parts.base = gen_lowpart (mode, parts.base);
20743 regno1 = true_regnum (parts.base);
20746 if (parts.index)
20748 parts.index = gen_lowpart (mode, parts.index);
20749 regno2 = true_regnum (parts.index);
20752 if (parts.disp)
20753 parts.disp = gen_lowpart (mode, parts.disp);
20755 if (parts.scale > 1)
20757 /* Case r1 = r1 + ... */
20758 if (regno1 == regno0)
20760 /* If we have a case r1 = r1 + C * r2 then we
20761 would have to use multiplication, which is very
20762 expensive. Assume the cost model is wrong if we
20763 have such a case here. */
20764 gcc_assert (regno2 != regno0);
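/* TARGET already holds the base register (regno0 == regno1), so adding
   PARTS.INDEX SCALE times yields base + scale * index without a multiply.  */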
20766 for (adds = parts.scale; adds > 0; adds--)
20767 ix86_emit_binop (PLUS, mode, target, parts.index);
20769 else
20771 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20772 if (regno0 != regno2)
20773 emit_insn (gen_rtx_SET (target, parts.index));
20775 /* Use shift for scaling. */
20776 ix86_emit_binop (ASHIFT, mode, target,
20777 GEN_INT (exact_log2 (parts.scale)));
20779 if (parts.base)
20780 ix86_emit_binop (PLUS, mode, target, parts.base);
20782 if (parts.disp && parts.disp != const0_rtx)
20783 ix86_emit_binop (PLUS, mode, target, parts.disp);
20786 else if (!parts.base && !parts.index)
20788 gcc_assert(parts.disp);
20789 emit_insn (gen_rtx_SET (target, parts.disp));
20791 else
20793 if (!parts.base)
20795 if (regno0 != regno2)
20796 emit_insn (gen_rtx_SET (target, parts.index));
20798 else if (!parts.index)
20800 if (regno0 != regno1)
20801 emit_insn (gen_rtx_SET (target, parts.base));
20803 else
20805 if (regno0 == regno1)
20806 tmp = parts.index;
20807 else if (regno0 == regno2)
20808 tmp = parts.base;
20809 else
20811 rtx tmp1;
20813 /* Find better operand for SET instruction, depending
20814 on which definition is farther from the insn. */
20815 if (find_nearest_reg_def (insn, regno1, regno2))
20816 tmp = parts.index, tmp1 = parts.base;
20817 else
20818 tmp = parts.base, tmp1 = parts.index;
20820 emit_insn (gen_rtx_SET (target, tmp));
20822 if (parts.disp && parts.disp != const0_rtx)
20823 ix86_emit_binop (PLUS, mode, target, parts.disp);
20825 ix86_emit_binop (PLUS, mode, target, tmp1);
20826 return;
20829 ix86_emit_binop (PLUS, mode, target, tmp);
20832 if (parts.disp && parts.disp != const0_rtx)
20833 ix86_emit_binop (PLUS, mode, target, parts.disp);
20837 /* Return true if it is ok to optimize an ADD operation to a LEA
20838 operation to avoid flag register consumption. For most processors,
20839 ADD is faster than LEA. For processors like BONNELL, if the
20840 destination register of the LEA holds an actual address which will be
20841 used soon, LEA is better; otherwise ADD is better. */
20843 bool
20844 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
20846 unsigned int regno0 = true_regnum (operands[0]);
20847 unsigned int regno1 = true_regnum (operands[1]);
20848 unsigned int regno2 = true_regnum (operands[2]);
20850 /* If a = b + c, (a!=b && a!=c), must use lea form. */
20851 if (regno0 != regno1 && regno0 != regno2)
20852 return true;
20854 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20855 return false;
20857 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
20860 /* Return true if destination reg of SET_BODY is shift count of
20861 USE_BODY. */
20863 static bool
20864 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
20866 rtx set_dest;
20867 rtx shift_rtx;
20868 int i;
20870 /* Retrieve destination of SET_BODY. */
20871 switch (GET_CODE (set_body))
20873 case SET:
20874 set_dest = SET_DEST (set_body);
20875 if (!set_dest || !REG_P (set_dest))
20876 return false;
20877 break;
20878 case PARALLEL:
20879 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
20880 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
20881 use_body))
20882 return true;
20883 /* FALLTHROUGH */
20884 default:
20885 return false;
20888 /* Retrieve shift count of USE_BODY. */
20889 switch (GET_CODE (use_body))
20891 case SET:
20892 shift_rtx = XEXP (use_body, 1);
20893 break;
20894 case PARALLEL:
20895 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
20896 if (ix86_dep_by_shift_count_body (set_body,
20897 XVECEXP (use_body, 0, i)))
20898 return true;
20899 /* FALLTHROUGH */
20900 default:
20901 return false;
20904 if (shift_rtx
20905 && (GET_CODE (shift_rtx) == ASHIFT
20906 || GET_CODE (shift_rtx) == LSHIFTRT
20907 || GET_CODE (shift_rtx) == ASHIFTRT
20908 || GET_CODE (shift_rtx) == ROTATE
20909 || GET_CODE (shift_rtx) == ROTATERT))
20911 rtx shift_count = XEXP (shift_rtx, 1);
20913 /* Return true if shift count is dest of SET_BODY. */
20914 if (REG_P (shift_count))
20916 /* Add a check since this can be invoked before register
20917 allocation in the pre-reload scheduler. */
20918 if (reload_completed
20919 && true_regnum (set_dest) == true_regnum (shift_count))
20920 return true;
20921 else if (REGNO(set_dest) == REGNO(shift_count))
20922 return true;
20926 return false;
20929 /* Return true if destination reg of SET_INSN is shift count of
20930 USE_INSN. */
20932 bool
20933 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
20935 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
20936 PATTERN (use_insn));
20939 /* Return TRUE or FALSE depending on whether the unary operator meets the
20940 appropriate constraints. */
20942 bool
20943 ix86_unary_operator_ok (enum rtx_code,
20944 machine_mode,
20945 rtx operands[2])
20947 /* If one of operands is memory, source and destination must match. */
20948 if ((MEM_P (operands[0])
20949 || MEM_P (operands[1]))
20950 && ! rtx_equal_p (operands[0], operands[1]))
20951 return false;
20952 return true;
20955 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
20956 are ok, keeping in mind the possible movddup alternative. */
20958 bool
20959 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
20961 if (MEM_P (operands[0]))
20962 return rtx_equal_p (operands[0], operands[1 + high]);
20963 if (MEM_P (operands[1]) && MEM_P (operands[2]))
20964 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
20965 return true;
20968 /* Post-reload splitter for converting an SF or DFmode value in an
20969 SSE register into an unsigned SImode. */
20971 void
20972 ix86_split_convert_uns_si_sse (rtx operands[])
20974 machine_mode vecmode;
20975 rtx value, large, zero_or_two31, input, two31, x;
20977 large = operands[1];
20978 zero_or_two31 = operands[2];
20979 input = operands[3];
20980 two31 = operands[4];
20981 vecmode = GET_MODE (large);
20982 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
20984 /* Load up the value into the low element. We must ensure that the other
20985 elements are valid floats -- zero is the easiest such value. */
20986 if (MEM_P (input))
20988 if (vecmode == V4SFmode)
20989 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
20990 else
20991 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
20993 else
20995 input = gen_rtx_REG (vecmode, REGNO (input));
20996 emit_move_insn (value, CONST0_RTX (vecmode));
20997 if (vecmode == V4SFmode)
20998 emit_insn (gen_sse_movss (value, value, input));
20999 else
21000 emit_insn (gen_sse2_movsd (value, value, input));
21003 emit_move_insn (large, two31);
21004 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
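/* If VALUE >= 2^31, subtract 2^31 before the signed conversion and set the
   sign bit of the result afterwards: LARGE becomes an all-ones mask for such
   elements, ZERO_OR_TWO31 selects 2^31 or 0.0, and shifting LARGE left by 31
   leaves exactly the 0x80000000 correction to XOR in at the end.  */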
21006 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21007 emit_insn (gen_rtx_SET (large, x));
21009 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21010 emit_insn (gen_rtx_SET (zero_or_two31, x));
21012 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21013 emit_insn (gen_rtx_SET (value, x));
21015 large = gen_rtx_REG (V4SImode, REGNO (large));
21016 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21018 x = gen_rtx_REG (V4SImode, REGNO (value));
21019 if (vecmode == V4SFmode)
21020 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21021 else
21022 emit_insn (gen_sse2_cvttpd2dq (x, value));
21023 value = x;
21025 emit_insn (gen_xorv4si3 (value, value, large));
21028 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21029 Expects the 64-bit DImode to be supplied in a pair of integral
21030 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21031 -mfpmath=sse, !optimize_size only. */
21033 void
21034 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21036 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21037 rtx int_xmm, fp_xmm;
21038 rtx biases, exponents;
21039 rtx x;
21041 int_xmm = gen_reg_rtx (V4SImode);
21042 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21043 emit_insn (gen_movdi_to_sse (int_xmm, input));
21044 else if (TARGET_SSE_SPLIT_REGS)
21046 emit_clobber (int_xmm);
21047 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21049 else
21051 x = gen_reg_rtx (V2DImode);
21052 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21053 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21056 x = gen_rtx_CONST_VECTOR (V4SImode,
21057 gen_rtvec (4, GEN_INT (0x43300000UL),
21058 GEN_INT (0x45300000UL),
21059 const0_rtx, const0_rtx));
21060 exponents = validize_mem (force_const_mem (V4SImode, x));
21062 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21063 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21065 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21066 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21067 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21068 (0x1.0p84 + double(fp_value_hi_xmm)).
21069 Note these exponents differ by 32. */
21071 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21073 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21074 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21075 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21076 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21077 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21078 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21079 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21080 biases = validize_mem (force_const_mem (V2DFmode, biases));
21081 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21083 /* Add the upper and lower DFmode values together. */
21084 if (TARGET_SSE3)
21085 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21086 else
21088 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21089 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21090 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21093 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21096 /* Not used, but eases macroization of patterns. */
21097 void
21098 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21100 gcc_unreachable ();
21103 /* Convert an unsigned SImode value into a DFmode. Only currently used
21104 for SSE, but applicable anywhere. */
21106 void
21107 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21109 REAL_VALUE_TYPE TWO31r;
21110 rtx x, fp;
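/* Bias the unsigned input by -2^31 so that it can be converted with the
   signed floatsidf2 pattern, then add 2^31.0 back; DFmode represents the
   result exactly.  */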
21112 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21113 NULL, 1, OPTAB_DIRECT);
21115 fp = gen_reg_rtx (DFmode);
21116 emit_insn (gen_floatsidf2 (fp, x));
21118 real_ldexp (&TWO31r, &dconst1, 31);
21119 x = const_double_from_real_value (TWO31r, DFmode);
21121 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21122 if (x != target)
21123 emit_move_insn (target, x);
21126 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21127 32-bit mode; otherwise we have a direct convert instruction. */
21129 void
21130 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21132 REAL_VALUE_TYPE TWO32r;
21133 rtx fp_lo, fp_hi, x;
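/* Convert the two SImode halves separately and combine them as
   (double) (signed) hi * 2^32 + (double) (unsigned) lo.  */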
21135 fp_lo = gen_reg_rtx (DFmode);
21136 fp_hi = gen_reg_rtx (DFmode);
21138 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21140 real_ldexp (&TWO32r, &dconst1, 32);
21141 x = const_double_from_real_value (TWO32r, DFmode);
21142 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21144 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21146 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21147 0, OPTAB_DIRECT);
21148 if (x != target)
21149 emit_move_insn (target, x);
21152 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21153 For x86_32, -mfpmath=sse, !optimize_size only. */
21154 void
21155 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21157 REAL_VALUE_TYPE ONE16r;
21158 rtx fp_hi, fp_lo, int_hi, int_lo, x;
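/* Split the input into 16-bit halves; each half converts to SFmode exactly,
   the scaling by 2^16 only adjusts the exponent, and the final addition
   rounds once.  */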
21160 real_ldexp (&ONE16r, &dconst1, 16);
21161 x = const_double_from_real_value (ONE16r, SFmode);
21162 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21163 NULL, 0, OPTAB_DIRECT);
21164 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21165 NULL, 0, OPTAB_DIRECT);
21166 fp_hi = gen_reg_rtx (SFmode);
21167 fp_lo = gen_reg_rtx (SFmode);
21168 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21169 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21170 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21171 0, OPTAB_DIRECT);
21172 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21173 0, OPTAB_DIRECT);
21174 if (!rtx_equal_p (target, fp_hi))
21175 emit_move_insn (target, fp_hi);
21178 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21179 a vector of unsigned ints VAL to vector of floats TARGET. */
21181 void
21182 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21184 rtx tmp[8];
21185 REAL_VALUE_TYPE TWO16r;
21186 machine_mode intmode = GET_MODE (val);
21187 machine_mode fltmode = GET_MODE (target);
21188 rtx (*cvt) (rtx, rtx);
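/* As in the scalar case, split each element into 16-bit halves, convert both
   with the signed vector conversion and recombine as hi * 2^16 + lo, so
   elements with the sign bit set are converted correctly.  */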
21190 if (intmode == V4SImode)
21191 cvt = gen_floatv4siv4sf2;
21192 else
21193 cvt = gen_floatv8siv8sf2;
21194 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21195 tmp[0] = force_reg (intmode, tmp[0]);
21196 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21197 OPTAB_DIRECT);
21198 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21199 NULL_RTX, 1, OPTAB_DIRECT);
21200 tmp[3] = gen_reg_rtx (fltmode);
21201 emit_insn (cvt (tmp[3], tmp[1]));
21202 tmp[4] = gen_reg_rtx (fltmode);
21203 emit_insn (cvt (tmp[4], tmp[2]));
21204 real_ldexp (&TWO16r, &dconst1, 16);
21205 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21206 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21207 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21208 OPTAB_DIRECT);
21209 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21210 OPTAB_DIRECT);
21211 if (tmp[7] != target)
21212 emit_move_insn (target, tmp[7]);
21215 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21216 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21217 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21218 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21221 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21223 REAL_VALUE_TYPE TWO31r;
21224 rtx two31r, tmp[4];
21225 machine_mode mode = GET_MODE (val);
21226 machine_mode scalarmode = GET_MODE_INNER (mode);
21227 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21228 rtx (*cmp) (rtx, rtx, rtx, rtx);
21229 int i;
21231 for (i = 0; i < 3; i++)
21232 tmp[i] = gen_reg_rtx (mode);
21233 real_ldexp (&TWO31r, &dconst1, 31);
21234 two31r = const_double_from_real_value (TWO31r, scalarmode);
21235 two31r = ix86_build_const_vector (mode, 1, two31r);
21236 two31r = force_reg (mode, two31r);
21237 switch (mode)
21239 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21240 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21241 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21242 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21243 default: gcc_unreachable ();
21245 tmp[3] = gen_rtx_LE (mode, two31r, val);
21246 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21247 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21248 0, OPTAB_DIRECT);
21249 if (intmode == V4SImode || TARGET_AVX2)
21250 *xorp = expand_simple_binop (intmode, ASHIFT,
21251 gen_lowpart (intmode, tmp[0]),
21252 GEN_INT (31), NULL_RTX, 0,
21253 OPTAB_DIRECT);
21254 else
21256 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21257 two31 = ix86_build_const_vector (intmode, 1, two31);
21258 *xorp = expand_simple_binop (intmode, AND,
21259 gen_lowpart (intmode, tmp[0]),
21260 two31, NULL_RTX, 0,
21261 OPTAB_DIRECT);
21263 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21264 0, OPTAB_DIRECT);
21267 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21268 then replicate the value for all elements of the vector
21269 register. */
21272 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21274 int i, n_elt;
21275 rtvec v;
21276 machine_mode scalar_mode;
21278 switch (mode)
21280 case E_V64QImode:
21281 case E_V32QImode:
21282 case E_V16QImode:
21283 case E_V32HImode:
21284 case E_V16HImode:
21285 case E_V8HImode:
21286 case E_V16SImode:
21287 case E_V8SImode:
21288 case E_V4SImode:
21289 case E_V8DImode:
21290 case E_V4DImode:
21291 case E_V2DImode:
21292 gcc_assert (vect);
21293 /* FALLTHRU */
21294 case E_V16SFmode:
21295 case E_V8SFmode:
21296 case E_V4SFmode:
21297 case E_V8DFmode:
21298 case E_V4DFmode:
21299 case E_V2DFmode:
21300 n_elt = GET_MODE_NUNITS (mode);
21301 v = rtvec_alloc (n_elt);
21302 scalar_mode = GET_MODE_INNER (mode);
21304 RTVEC_ELT (v, 0) = value;
21306 for (i = 1; i < n_elt; ++i)
21307 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21309 return gen_rtx_CONST_VECTOR (mode, v);
21311 default:
21312 gcc_unreachable ();
21316 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21317 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21318 for an SSE register. If VECT is true, then replicate the mask for
21319 all elements of the vector register. If INVERT is true, then create
21320 a mask excluding the sign bit. */
21323 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21325 machine_mode vec_mode, imode;
21326 wide_int w;
21327 rtx mask, v;
21329 switch (mode)
21331 case E_V16SImode:
21332 case E_V16SFmode:
21333 case E_V8SImode:
21334 case E_V4SImode:
21335 case E_V8SFmode:
21336 case E_V4SFmode:
21337 vec_mode = mode;
21338 imode = SImode;
21339 break;
21341 case E_V8DImode:
21342 case E_V4DImode:
21343 case E_V2DImode:
21344 case E_V8DFmode:
21345 case E_V4DFmode:
21346 case E_V2DFmode:
21347 vec_mode = mode;
21348 imode = DImode;
21349 break;
21351 case E_TImode:
21352 case E_TFmode:
21353 vec_mode = VOIDmode;
21354 imode = TImode;
21355 break;
21357 default:
21358 gcc_unreachable ();
21361 machine_mode inner_mode = GET_MODE_INNER (mode);
21362 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21363 GET_MODE_BITSIZE (inner_mode));
21364 if (invert)
21365 w = wi::bit_not (w);
21367 /* Force this value into the low part of a fp vector constant. */
21368 mask = immed_wide_int_const (w, imode);
21369 mask = gen_lowpart (inner_mode, mask);
21371 if (vec_mode == VOIDmode)
21372 return force_reg (inner_mode, mask);
21374 v = ix86_build_const_vector (vec_mode, vect, mask);
21375 return force_reg (vec_mode, v);
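/* As a concrete example of the mask built above: for SFmode elements it
   is 0x80000000 per element (0x7fffffff when INVERT), and for DFmode
   elements 0x8000000000000000 resp. 0x7fffffffffffffff.  The callers
   combine it with AND/XOR/ANDN to manipulate only the sign bit.  */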
21378 /* Generate code for floating point ABS or NEG. */
21380 void
21381 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21382 rtx operands[])
21384 rtx mask, set, dst, src;
21385 bool use_sse = false;
21386 bool vector_mode = VECTOR_MODE_P (mode);
21387 machine_mode vmode = mode;
21389 if (vector_mode)
21390 use_sse = true;
21391 else if (mode == TFmode)
21392 use_sse = true;
21393 else if (TARGET_SSE_MATH)
21395 use_sse = SSE_FLOAT_MODE_P (mode);
21396 if (mode == SFmode)
21397 vmode = V4SFmode;
21398 else if (mode == DFmode)
21399 vmode = V2DFmode;
21402 /* NEG and ABS performed with SSE use bitwise mask operations.
21403 Create the appropriate mask now. */
21404 if (use_sse)
21405 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21406 else
21407 mask = NULL_RTX;
21409 dst = operands[0];
21410 src = operands[1];
21412 set = gen_rtx_fmt_e (code, mode, src);
21413 set = gen_rtx_SET (dst, set);
21415 if (mask)
21417 rtx use, clob;
21418 rtvec par;
21420 use = gen_rtx_USE (VOIDmode, mask);
21421 if (vector_mode)
21422 par = gen_rtvec (2, set, use);
21423 else
21425 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21426 par = gen_rtvec (3, set, use, clob);
21428 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21430 else
21431 emit_insn (set);
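/* Note on the pattern emitted above: with SSE the insn carrying the mask
   USE is intended to be split into a single bitwise operation -- NEG into
   an XOR with the sign-bit mask and ABS into an AND with the inverted
   mask -- so no arithmetic instruction is needed.  */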
21434 /* Expand a copysign operation. Special case operand 0 being a constant. */
21436 void
21437 ix86_expand_copysign (rtx operands[])
21439 machine_mode mode, vmode;
21440 rtx dest, op0, op1, mask, nmask;
21442 dest = operands[0];
21443 op0 = operands[1];
21444 op1 = operands[2];
21446 mode = GET_MODE (dest);
21448 if (mode == SFmode)
21449 vmode = V4SFmode;
21450 else if (mode == DFmode)
21451 vmode = V2DFmode;
21452 else
21453 vmode = mode;
21455 if (CONST_DOUBLE_P (op0))
21457 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21459 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21460 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21462 if (mode == SFmode || mode == DFmode)
21464 if (op0 == CONST0_RTX (mode))
21465 op0 = CONST0_RTX (vmode);
21466 else
21468 rtx v = ix86_build_const_vector (vmode, false, op0);
21470 op0 = force_reg (vmode, v);
21473 else if (op0 != CONST0_RTX (mode))
21474 op0 = force_reg (mode, op0);
21476 mask = ix86_build_signbit_mask (vmode, 0, 0);
21478 if (mode == SFmode)
21479 copysign_insn = gen_copysignsf3_const;
21480 else if (mode == DFmode)
21481 copysign_insn = gen_copysigndf3_const;
21482 else
21483 copysign_insn = gen_copysigntf3_const;
21485 emit_insn (copysign_insn (dest, op0, op1, mask));
21487 else
21489 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21491 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21492 mask = ix86_build_signbit_mask (vmode, 0, 0);
21494 if (mode == SFmode)
21495 copysign_insn = gen_copysignsf3_var;
21496 else if (mode == DFmode)
21497 copysign_insn = gen_copysigndf3_var;
21498 else
21499 copysign_insn = gen_copysigntf3_var;
21501 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
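/* Both forms above implement the usual bitwise identity
   copysign (x, y) = (x & ~signmask) | (y & signmask), with MASK holding
   the sign bit and NMASK its complement; the split routines below
   materialize the AND/IOR sequence.  */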
21505 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21506 be a constant, and so has already been expanded into a vector constant. */
21508 void
21509 ix86_split_copysign_const (rtx operands[])
21511 machine_mode mode, vmode;
21512 rtx dest, op0, mask, x;
21514 dest = operands[0];
21515 op0 = operands[1];
21516 mask = operands[3];
21518 mode = GET_MODE (dest);
21519 vmode = GET_MODE (mask);
21521 dest = lowpart_subreg (vmode, dest, mode);
21522 x = gen_rtx_AND (vmode, dest, mask);
21523 emit_insn (gen_rtx_SET (dest, x));
21525 if (op0 != CONST0_RTX (vmode))
21527 x = gen_rtx_IOR (vmode, dest, op0);
21528 emit_insn (gen_rtx_SET (dest, x));
21532 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21533 so we have to do two masks. */
21535 void
21536 ix86_split_copysign_var (rtx operands[])
21538 machine_mode mode, vmode;
21539 rtx dest, scratch, op0, op1, mask, nmask, x;
21541 dest = operands[0];
21542 scratch = operands[1];
21543 op0 = operands[2];
21544 op1 = operands[3];
21545 nmask = operands[4];
21546 mask = operands[5];
21548 mode = GET_MODE (dest);
21549 vmode = GET_MODE (mask);
21551 if (rtx_equal_p (op0, op1))
21553 /* Shouldn't happen often (it's useless, obviously), but when it does
21554 we'd generate incorrect code if we continue below. */
21555 emit_move_insn (dest, op0);
21556 return;
21559 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21561 gcc_assert (REGNO (op1) == REGNO (scratch));
21563 x = gen_rtx_AND (vmode, scratch, mask);
21564 emit_insn (gen_rtx_SET (scratch, x));
21566 dest = mask;
21567 op0 = lowpart_subreg (vmode, op0, mode);
21568 x = gen_rtx_NOT (vmode, dest);
21569 x = gen_rtx_AND (vmode, x, op0);
21570 emit_insn (gen_rtx_SET (dest, x));
21572 else
21574 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21576 x = gen_rtx_AND (vmode, scratch, mask);
21578 else /* alternative 2,4 */
21580 gcc_assert (REGNO (mask) == REGNO (scratch));
21581 op1 = lowpart_subreg (vmode, op1, mode);
21582 x = gen_rtx_AND (vmode, scratch, op1);
21584 emit_insn (gen_rtx_SET (scratch, x));
21586 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21588 dest = lowpart_subreg (vmode, op0, mode);
21589 x = gen_rtx_AND (vmode, dest, nmask);
21591 else /* alternative 3,4 */
21593 gcc_assert (REGNO (nmask) == REGNO (dest));
21594 dest = nmask;
21595 op0 = lowpart_subreg (vmode, op0, mode);
21596 x = gen_rtx_AND (vmode, dest, op0);
21598 emit_insn (gen_rtx_SET (dest, x));
21601 x = gen_rtx_IOR (vmode, dest, scratch);
21602 emit_insn (gen_rtx_SET (dest, x));
21605 /* Return TRUE or FALSE depending on whether the first SET in INSN
21606 has source and destination with matching CC modes and whether the
21607 CC mode is at least as constrained as REQ_MODE.  */
21609 bool
21610 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21612 rtx set;
21613 machine_mode set_mode;
21615 set = PATTERN (insn);
21616 if (GET_CODE (set) == PARALLEL)
21617 set = XVECEXP (set, 0, 0);
21618 gcc_assert (GET_CODE (set) == SET);
21619 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21621 set_mode = GET_MODE (SET_DEST (set));
21622 switch (set_mode)
21624 case E_CCNOmode:
21625 if (req_mode != CCNOmode
21626 && (req_mode != CCmode
21627 || XEXP (SET_SRC (set), 1) != const0_rtx))
21628 return false;
21629 break;
21630 case E_CCmode:
21631 if (req_mode == CCGCmode)
21632 return false;
21633 /* FALLTHRU */
21634 case E_CCGCmode:
21635 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21636 return false;
21637 /* FALLTHRU */
21638 case E_CCGOCmode:
21639 if (req_mode == CCZmode)
21640 return false;
21641 /* FALLTHRU */
21642 case E_CCZmode:
21643 break;
21645 case E_CCGZmode:
21647 case E_CCAmode:
21648 case E_CCCmode:
21649 case E_CCOmode:
21650 case E_CCPmode:
21651 case E_CCSmode:
21652 if (set_mode != req_mode)
21653 return false;
21654 break;
21656 default:
21657 gcc_unreachable ();
21660 return GET_MODE (SET_SRC (set)) == set_mode;
21663 /* Generate insn patterns to do an integer compare of OPERANDS. */
21665 static rtx
21666 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21668 machine_mode cmpmode;
21669 rtx tmp, flags;
21671 cmpmode = SELECT_CC_MODE (code, op0, op1);
21672 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21674 /* This is very simple, but making the interface the same as in the
21675 FP case makes the rest of the code easier. */
21676 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21677 emit_insn (gen_rtx_SET (flags, tmp));
21679 /* Return the test that should be put into the flags user, i.e.
21680 the bcc, scc, or cmov instruction. */
21681 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21684 /* Figure out whether to use ordered or unordered fp comparisons.
21685 Return the appropriate mode to use. */
21687 machine_mode
21688 ix86_fp_compare_mode (enum rtx_code code)
21690 if (!TARGET_IEEE_FP)
21691 return CCFPmode;
21693 switch (code)
21695 case GT:
21696 case GE:
21697 case LT:
21698 case LE:
21699 return CCFPmode;
21701 case EQ:
21702 case NE:
21704 case LTGT:
21705 case UNORDERED:
21706 case ORDERED:
21707 case UNLT:
21708 case UNLE:
21709 case UNGT:
21710 case UNGE:
21711 case UNEQ:
21712 return CCFPUmode;
21714 default:
21715 gcc_unreachable ();
21719 machine_mode
21720 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21722 machine_mode mode = GET_MODE (op0);
21724 if (SCALAR_FLOAT_MODE_P (mode))
21726 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21727 return ix86_fp_compare_mode (code);
21730 switch (code)
21732 /* Only zero flag is needed. */
21733 case EQ: /* ZF=0 */
21734 case NE: /* ZF!=0 */
21735 return CCZmode;
21736 /* Codes needing carry flag. */
21737 case GEU: /* CF=0 */
21738 case LTU: /* CF=1 */
21739 /* Detect overflow checks. They need just the carry flag. */
21740 if (GET_CODE (op0) == PLUS
21741 && (rtx_equal_p (op1, XEXP (op0, 0))
21742 || rtx_equal_p (op1, XEXP (op0, 1))))
21743 return CCCmode;
21744 else
21745 return CCmode;
21746 case GTU: /* CF=0 & ZF=0 */
21747 case LEU: /* CF=1 | ZF=1 */
21748 return CCmode;
21749 /* Codes possibly doable only with the sign flag when
21750 comparing against zero.  */
21751 case GE: /* SF=OF or SF=0 */
21752 case LT: /* SF<>OF or SF=1 */
21753 if (op1 == const0_rtx)
21754 return CCGOCmode;
21755 else
21756 /* For other cases Carry flag is not required. */
21757 return CCGCmode;
21758 /* Codes doable only with the sign flag when comparing
21759 against zero, but we lack a jump instruction for it,
21760 so we need to use relational tests against overflow,
21761 which therefore needs to be zero.  */
21762 case GT: /* ZF=0 & SF=OF */
21763 case LE: /* ZF=1 | SF<>OF */
21764 if (op1 == const0_rtx)
21765 return CCNOmode;
21766 else
21767 return CCGCmode;
21768 /* The strcmp pattern does (use flags), and combine may ask us for the
21769 proper mode.  */
21770 case USE:
21771 return CCmode;
21772 default:
21773 gcc_unreachable ();
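/* For example, an overflow check such as (ltu:SI (plus:SI a b) a) gets
   CCCmode above, because the unsigned comparison of a+b against one of
   the addends is answered by the carry flag alone.  */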
21777 /* Return the fixed registers used for condition codes. */
21779 static bool
21780 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21782 *p1 = FLAGS_REG;
21783 *p2 = FPSR_REG;
21784 return true;
21787 /* If two condition code modes are compatible, return a condition code
21788 mode which is compatible with both. Otherwise, return
21789 VOIDmode. */
21791 static machine_mode
21792 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21794 if (m1 == m2)
21795 return m1;
21797 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21798 return VOIDmode;
21800 if ((m1 == CCGCmode && m2 == CCGOCmode)
21801 || (m1 == CCGOCmode && m2 == CCGCmode))
21802 return CCGCmode;
21804 if ((m1 == CCNOmode && m2 == CCGOCmode)
21805 || (m1 == CCGOCmode && m2 == CCNOmode))
21806 return CCNOmode;
21808 if (m1 == CCZmode
21809 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21810 return m2;
21811 else if (m2 == CCZmode
21812 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21813 return m1;
21815 switch (m1)
21817 default:
21818 gcc_unreachable ();
21820 case E_CCmode:
21821 case E_CCGCmode:
21822 case E_CCGOCmode:
21823 case E_CCNOmode:
21824 case E_CCAmode:
21825 case E_CCCmode:
21826 case E_CCOmode:
21827 case E_CCPmode:
21828 case E_CCSmode:
21829 case E_CCZmode:
21830 switch (m2)
21832 default:
21833 return VOIDmode;
21835 case E_CCmode:
21836 case E_CCGCmode:
21837 case E_CCGOCmode:
21838 case E_CCNOmode:
21839 case E_CCAmode:
21840 case E_CCCmode:
21841 case E_CCOmode:
21842 case E_CCPmode:
21843 case E_CCSmode:
21844 case E_CCZmode:
21845 return CCmode;
21848 case E_CCFPmode:
21849 case E_CCFPUmode:
21850 /* These are only compatible with themselves, which we already
21851 checked above. */
21852 return VOIDmode;
21857 /* Return a comparison we can do that is equivalent to
21858 swap_condition (code), apart possibly from orderedness.
21859 But never change orderedness if TARGET_IEEE_FP, returning
21860 UNKNOWN in that case if necessary.  */
21862 static enum rtx_code
21863 ix86_fp_swap_condition (enum rtx_code code)
21865 switch (code)
21867 case GT: /* GTU - CF=0 & ZF=0 */
21868 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
21869 case GE: /* GEU - CF=0 */
21870 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
21871 case UNLT: /* LTU - CF=1 */
21872 return TARGET_IEEE_FP ? UNKNOWN : GT;
21873 case UNLE: /* LEU - CF=1 | ZF=1 */
21874 return TARGET_IEEE_FP ? UNKNOWN : GE;
21875 default:
21876 return swap_condition (code);
21880 /* Return the cost of comparison CODE using the best strategy for performance.
21881 All of the following functions use the number of instructions as a cost metric.
21882 In the future this should be tweaked to compute bytes for optimize_size and
21883 to take into account the performance of various instructions on various CPUs.  */
21885 static int
21886 ix86_fp_comparison_cost (enum rtx_code code)
21888 int arith_cost;
21890 /* The cost of code using bit-twiddling on %ah. */
21891 switch (code)
21893 case UNLE:
21894 case UNLT:
21895 case LTGT:
21896 case GT:
21897 case GE:
21898 case UNORDERED:
21899 case ORDERED:
21900 case UNEQ:
21901 arith_cost = 4;
21902 break;
21903 case LT:
21904 case NE:
21905 case EQ:
21906 case UNGE:
21907 arith_cost = TARGET_IEEE_FP ? 5 : 4;
21908 break;
21909 case LE:
21910 case UNGT:
21911 arith_cost = TARGET_IEEE_FP ? 6 : 4;
21912 break;
21913 default:
21914 gcc_unreachable ();
21917 switch (ix86_fp_comparison_strategy (code))
21919 case IX86_FPCMP_COMI:
21920 return arith_cost > 4 ? 3 : 2;
21921 case IX86_FPCMP_SAHF:
21922 return arith_cost > 4 ? 4 : 3;
21923 default:
21924 return arith_cost;
21928 /* Return the strategy to use for a floating-point comparison.  We assume that
21929 fcomi is always preferable where available, since that is also true when
21930 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
21932 enum ix86_fpcmp_strategy
21933 ix86_fp_comparison_strategy (enum rtx_code)
21935 /* Do fcomi/sahf based test when profitable. */
21937 if (TARGET_CMOVE)
21938 return IX86_FPCMP_COMI;
21940 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
21941 return IX86_FPCMP_SAHF;
21943 return IX86_FPCMP_ARITH;
21946 /* Swap, force into registers, or otherwise massage the two operands
21947 to a fp comparison. The operands are updated in place; the new
21948 comparison code is returned. */
21950 static enum rtx_code
21951 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
21953 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
21954 rtx op0 = *pop0, op1 = *pop1;
21955 machine_mode op_mode = GET_MODE (op0);
21956 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
21958 /* All of the unordered compare instructions only work on registers.
21959 The same is true of the fcomi compare instructions. The XFmode
21960 compare instructions require registers except when comparing
21961 against zero or when converting operand 1 from fixed point to
21962 floating point. */
21964 if (!is_sse
21965 && (fpcmp_mode == CCFPUmode
21966 || (op_mode == XFmode
21967 && ! (standard_80387_constant_p (op0) == 1
21968 || standard_80387_constant_p (op1) == 1)
21969 && GET_CODE (op1) != FLOAT)
21970 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
21972 op0 = force_reg (op_mode, op0);
21973 op1 = force_reg (op_mode, op1);
21975 else
21977 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
21978 things around if they appear profitable, otherwise force op0
21979 into a register. */
21981 if (standard_80387_constant_p (op0) == 0
21982 || (MEM_P (op0)
21983 && ! (standard_80387_constant_p (op1) == 0
21984 || MEM_P (op1))))
21986 enum rtx_code new_code = ix86_fp_swap_condition (code);
21987 if (new_code != UNKNOWN)
21989 std::swap (op0, op1);
21990 code = new_code;
21994 if (!REG_P (op0))
21995 op0 = force_reg (op_mode, op0);
21997 if (CONSTANT_P (op1))
21999 int tmp = standard_80387_constant_p (op1);
22000 if (tmp == 0)
22001 op1 = validize_mem (force_const_mem (op_mode, op1));
22002 else if (tmp == 1)
22004 if (TARGET_CMOVE)
22005 op1 = force_reg (op_mode, op1);
22007 else
22008 op1 = force_reg (op_mode, op1);
22012 /* Try to rearrange the comparison to make it cheaper. */
22013 if (ix86_fp_comparison_cost (code)
22014 > ix86_fp_comparison_cost (swap_condition (code))
22015 && (REG_P (op1) || can_create_pseudo_p ()))
22017 std::swap (op0, op1);
22018 code = swap_condition (code);
22019 if (!REG_P (op0))
22020 op0 = force_reg (op_mode, op0);
22023 *pop0 = op0;
22024 *pop1 = op1;
22025 return code;
22028 /* Convert comparison codes we use to represent FP comparison to integer
22029 code that will result in proper branch. Return UNKNOWN if no such code
22030 is available. */
22032 enum rtx_code
22033 ix86_fp_compare_code_to_integer (enum rtx_code code)
22035 switch (code)
22037 case GT:
22038 return GTU;
22039 case GE:
22040 return GEU;
22041 case ORDERED:
22042 case UNORDERED:
22043 return code;
22044 case UNEQ:
22045 return EQ;
22046 case UNLT:
22047 return LTU;
22048 case UNLE:
22049 return LEU;
22050 case LTGT:
22051 return NE;
22052 default:
22053 return UNKNOWN;
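/* The mapping above reflects how fcomi/fucomi (or fnstsw+sahf) deposit
   the x87 condition bits C3/C2/C0 into ZF/PF/CF, so e.g. an ordered GT
   is tested with the unsigned "above" condition GTU.  */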
22057 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22059 static rtx
22060 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22062 machine_mode fpcmp_mode, intcmp_mode;
22063 rtx tmp, tmp2;
22065 fpcmp_mode = ix86_fp_compare_mode (code);
22066 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22068 /* Do fcomi/sahf based test when profitable. */
22069 switch (ix86_fp_comparison_strategy (code))
22071 case IX86_FPCMP_COMI:
22072 intcmp_mode = fpcmp_mode;
22073 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22074 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22075 emit_insn (tmp);
22076 break;
22078 case IX86_FPCMP_SAHF:
22079 intcmp_mode = fpcmp_mode;
22080 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22081 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22083 if (!scratch)
22084 scratch = gen_reg_rtx (HImode);
22085 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22086 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22087 break;
22089 case IX86_FPCMP_ARITH:
22090 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22091 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22092 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22093 if (!scratch)
22094 scratch = gen_reg_rtx (HImode);
22095 emit_insn (gen_rtx_SET (scratch, tmp2));
22097 /* In the unordered case, we have to check C2 for NaNs, which
22098 doesn't happen to work out to anything nice combination-wise.
22099 So do some bit twiddling on the value we've got in AH to come
22100 up with an appropriate set of condition codes. */
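/* After fnstsw, the status word bits C0, C2 and C3 sit in AH at 0x01,
   0x04 and 0x40 respectively, which is why the masks below are
   combinations such as 0x45 (C3|C2|C0), 0x05 (C2|C0) and 0x40 (C3).  */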
22102 intcmp_mode = CCNOmode;
22103 switch (code)
22105 case GT:
22106 case UNGT:
22107 if (code == GT || !TARGET_IEEE_FP)
22109 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22110 code = EQ;
22112 else
22114 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22115 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22116 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22117 intcmp_mode = CCmode;
22118 code = GEU;
22120 break;
22121 case LT:
22122 case UNLT:
22123 if (code == LT && TARGET_IEEE_FP)
22125 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22126 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22127 intcmp_mode = CCmode;
22128 code = EQ;
22130 else
22132 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22133 code = NE;
22135 break;
22136 case GE:
22137 case UNGE:
22138 if (code == GE || !TARGET_IEEE_FP)
22140 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22141 code = EQ;
22143 else
22145 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22146 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22147 code = NE;
22149 break;
22150 case LE:
22151 case UNLE:
22152 if (code == LE && TARGET_IEEE_FP)
22154 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22155 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22156 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22157 intcmp_mode = CCmode;
22158 code = LTU;
22160 else
22162 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22163 code = NE;
22165 break;
22166 case EQ:
22167 case UNEQ:
22168 if (code == EQ && TARGET_IEEE_FP)
22170 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22171 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22172 intcmp_mode = CCmode;
22173 code = EQ;
22175 else
22177 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22178 code = NE;
22180 break;
22181 case NE:
22182 case LTGT:
22183 if (code == NE && TARGET_IEEE_FP)
22185 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22186 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22187 GEN_INT (0x40)));
22188 code = NE;
22190 else
22192 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22193 code = EQ;
22195 break;
22197 case UNORDERED:
22198 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22199 code = NE;
22200 break;
22201 case ORDERED:
22202 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22203 code = EQ;
22204 break;
22206 default:
22207 gcc_unreachable ();
22209 break;
22211 default:
22212 gcc_unreachable();
22215 /* Return the test that should be put into the flags user, i.e.
22216 the bcc, scc, or cmov instruction. */
22217 return gen_rtx_fmt_ee (code, VOIDmode,
22218 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22219 const0_rtx);
22222 static rtx
22223 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22225 rtx ret;
22227 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22228 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22230 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22232 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22233 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22235 else
22236 ret = ix86_expand_int_compare (code, op0, op1);
22238 return ret;
22241 void
22242 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22244 machine_mode mode = GET_MODE (op0);
22245 rtx tmp;
22247 /* Handle the special case of a vector comparison with a boolean result;
22248 transform it using the ptest instruction. */
22249 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22251 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22252 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22254 gcc_assert (code == EQ || code == NE);
22255 /* Generate XOR since we can't check that one operand is zero vector. */
22256 tmp = gen_reg_rtx (mode);
22257 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22258 tmp = gen_lowpart (p_mode, tmp);
22259 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22260 gen_rtx_UNSPEC (CCmode,
22261 gen_rtvec (2, tmp, tmp),
22262 UNSPEC_PTEST)));
22263 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22264 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22265 gen_rtx_LABEL_REF (VOIDmode, label),
22266 pc_rtx);
22267 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22268 return;
22271 switch (mode)
22273 case E_SFmode:
22274 case E_DFmode:
22275 case E_XFmode:
22276 case E_QImode:
22277 case E_HImode:
22278 case E_SImode:
22279 simple:
22280 tmp = ix86_expand_compare (code, op0, op1);
22281 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22282 gen_rtx_LABEL_REF (VOIDmode, label),
22283 pc_rtx);
22284 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22285 return;
22287 case E_DImode:
22288 if (TARGET_64BIT)
22289 goto simple;
22290 /* For a 32-bit target, a DImode comparison may be performed in
22291 SSE registers.  To allow this we must avoid the split into
22292 SImode, which is achieved by doing the xor in DImode and
22293 then comparing against zero (which the STV pass recognizes).
22294 We don't compare using xor when optimizing
22295 for size.  */
22296 if (!optimize_insn_for_size_p ()
22297 && TARGET_STV
22298 && (code == EQ || code == NE))
22300 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22301 op1 = const0_rtx;
22303 /* FALLTHRU */
22304 case E_TImode:
22305 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
22307 rtx lo[2], hi[2];
22308 rtx_code_label *label2;
22309 enum rtx_code code1, code2, code3;
22310 machine_mode submode;
22312 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22314 std::swap (op0, op1);
22315 code = swap_condition (code);
22318 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22319 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22321 submode = mode == DImode ? SImode : DImode;
22323 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22324 avoid two branches. This costs one extra insn, so disable when
22325 optimizing for size. */
22327 if ((code == EQ || code == NE)
22328 && (!optimize_insn_for_size_p ()
22329 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22331 rtx xor0, xor1;
22333 xor1 = hi[0];
22334 if (hi[1] != const0_rtx)
22335 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22336 NULL_RTX, 0, OPTAB_WIDEN);
22338 xor0 = lo[0];
22339 if (lo[1] != const0_rtx)
22340 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22341 NULL_RTX, 0, OPTAB_WIDEN);
22343 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22344 NULL_RTX, 0, OPTAB_WIDEN);
22346 ix86_expand_branch (code, tmp, const0_rtx, label);
22347 return;
22350 /* Otherwise, if we are doing a less-than or greater-or-equal-than
22351 comparison, op1 is a constant and its low word is zero, then we
22352 can just examine the high word.  Similarly for a low word of -1
22353 and less-or-equal-than or greater-than. */
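/* For example, a 64-bit comparison a < (5 << 32) on a 32-bit target has
   a zero low word in the constant, so only hi(a) < 5 needs testing.  */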
22355 if (CONST_INT_P (hi[1]))
22356 switch (code)
22358 case LT: case LTU: case GE: case GEU:
22359 if (lo[1] == const0_rtx)
22361 ix86_expand_branch (code, hi[0], hi[1], label);
22362 return;
22364 break;
22365 case LE: case LEU: case GT: case GTU:
22366 if (lo[1] == constm1_rtx)
22368 ix86_expand_branch (code, hi[0], hi[1], label);
22369 return;
22371 break;
22372 default:
22373 break;
22376 /* Emulate comparisons that do not depend on Zero flag with
22377 double-word subtraction. Note that only Overflow, Sign
22378 and Carry flags are valid, so swap arguments and condition
22379 of comparisons that would otherwise test Zero flag. */
22381 switch (code)
22383 case LE: case LEU: case GT: case GTU:
22384 std::swap (lo[0], lo[1]);
22385 std::swap (hi[0], hi[1]);
22386 code = swap_condition (code);
22387 /* FALLTHRU */
22389 case LT: case LTU: case GE: case GEU:
22391 rtx (*cmp_insn) (rtx, rtx);
22392 rtx (*sbb_insn) (rtx, rtx, rtx);
22393 bool uns = (code == LTU || code == GEU);
22395 if (TARGET_64BIT)
22397 cmp_insn = gen_cmpdi_1;
22398 sbb_insn
22399 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22401 else
22403 cmp_insn = gen_cmpsi_1;
22404 sbb_insn
22405 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22408 if (!nonimmediate_operand (lo[0], submode))
22409 lo[0] = force_reg (submode, lo[0]);
22410 if (!x86_64_general_operand (lo[1], submode))
22411 lo[1] = force_reg (submode, lo[1]);
22413 if (!register_operand (hi[0], submode))
22414 hi[0] = force_reg (submode, hi[0]);
22415 if ((uns && !nonimmediate_operand (hi[1], submode))
22416 || (!uns && !x86_64_general_operand (hi[1], submode)))
22417 hi[1] = force_reg (submode, hi[1]);
22419 emit_insn (cmp_insn (lo[0], lo[1]));
22420 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22422 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22424 ix86_expand_branch (code, tmp, const0_rtx, label);
22425 return;
22428 default:
22429 break;
22432 /* Otherwise, we need two or three jumps. */
22434 label2 = gen_label_rtx ();
22436 code1 = code;
22437 code2 = swap_condition (code);
22438 code3 = unsigned_condition (code);
22440 switch (code)
22442 case LT: case GT: case LTU: case GTU:
22443 break;
22445 case LE: code1 = LT; code2 = GT; break;
22446 case GE: code1 = GT; code2 = LT; break;
22447 case LEU: code1 = LTU; code2 = GTU; break;
22448 case GEU: code1 = GTU; code2 = LTU; break;
22450 case EQ: code1 = UNKNOWN; code2 = NE; break;
22451 case NE: code2 = UNKNOWN; break;
22453 default:
22454 gcc_unreachable ();
22458 * a < b =>
22459 * if (hi(a) < hi(b)) goto true;
22460 * if (hi(a) > hi(b)) goto false;
22461 * if (lo(a) < lo(b)) goto true;
22462 * false:
22465 if (code1 != UNKNOWN)
22466 ix86_expand_branch (code1, hi[0], hi[1], label);
22467 if (code2 != UNKNOWN)
22468 ix86_expand_branch (code2, hi[0], hi[1], label2);
22470 ix86_expand_branch (code3, lo[0], lo[1], label);
22472 if (code2 != UNKNOWN)
22473 emit_label (label2);
22474 return;
22477 default:
22478 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22479 goto simple;
22483 void
22484 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22486 rtx ret;
22488 gcc_assert (GET_MODE (dest) == QImode);
22490 ret = ix86_expand_compare (code, op0, op1);
22491 PUT_MODE (ret, QImode);
22492 emit_insn (gen_rtx_SET (dest, ret));
22495 /* Expand comparison setting or clearing carry flag. Return true when
22496 successful and set pop for the operation. */
22497 static bool
22498 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22500 machine_mode mode =
22501 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22503 /* Do not handle double-word mode compares, which go through a special path. */
22504 if (mode == (TARGET_64BIT ? TImode : DImode))
22505 return false;
22507 if (SCALAR_FLOAT_MODE_P (mode))
22509 rtx compare_op;
22510 rtx_insn *compare_seq;
22512 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22514 /* Shortcut: the following common codes never translate
22515 into carry flag compares. */
22516 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22517 || code == ORDERED || code == UNORDERED)
22518 return false;
22520 /* These comparisons require zero flag; swap operands so they won't. */
22521 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22522 && !TARGET_IEEE_FP)
22524 std::swap (op0, op1);
22525 code = swap_condition (code);
22528 /* Try to expand the comparison and verify that we end up with a
22529 carry-flag-based comparison.  This fails to be true only when
22530 we decide to expand the comparison using arithmetic, which is
22531 not too common a scenario. */
22532 start_sequence ();
22533 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22534 compare_seq = get_insns ();
22535 end_sequence ();
22537 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22538 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22539 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22540 else
22541 code = GET_CODE (compare_op);
22543 if (code != LTU && code != GEU)
22544 return false;
22546 emit_insn (compare_seq);
22547 *pop = compare_op;
22548 return true;
22551 if (!INTEGRAL_MODE_P (mode))
22552 return false;
22554 switch (code)
22556 case LTU:
22557 case GEU:
22558 break;
22560 /* Convert a==0 into (unsigned)a<1. */
22561 case EQ:
22562 case NE:
22563 if (op1 != const0_rtx)
22564 return false;
22565 op1 = const1_rtx;
22566 code = (code == EQ ? LTU : GEU);
22567 break;
22569 /* Convert a>b into b<a or a>=b+1. */
22570 case GTU:
22571 case LEU:
22572 if (CONST_INT_P (op1))
22574 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22575 /* Bail out on overflow.  We could still swap the operands, but that
22576 would force loading of the constant into a register. */
22577 if (op1 == const0_rtx
22578 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22579 return false;
22580 code = (code == GTU ? GEU : LTU);
22582 else
22584 std::swap (op0, op1);
22585 code = (code == GTU ? LTU : GEU);
22587 break;
22589 /* Convert a>=0 into (unsigned)a<0x80000000. */
22590 case LT:
22591 case GE:
22592 if (mode == DImode || op1 != const0_rtx)
22593 return false;
22594 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22595 code = (code == LT ? GEU : LTU);
22596 break;
22597 case LE:
22598 case GT:
22599 if (mode == DImode || op1 != constm1_rtx)
22600 return false;
22601 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22602 code = (code == LE ? GEU : LTU);
22603 break;
22605 default:
22606 return false;
22608 /* Swapping operands may cause a constant to appear as the first operand. */
22609 if (!nonimmediate_operand (op0, VOIDmode))
22611 if (!can_create_pseudo_p ())
22612 return false;
22613 op0 = force_reg (mode, op0);
22615 *pop = ix86_expand_compare (code, op0, op1);
22616 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22617 return true;
22620 bool
22621 ix86_expand_int_movcc (rtx operands[])
22623 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22624 rtx_insn *compare_seq;
22625 rtx compare_op;
22626 machine_mode mode = GET_MODE (operands[0]);
22627 bool sign_bit_compare_p = false;
22628 rtx op0 = XEXP (operands[1], 0);
22629 rtx op1 = XEXP (operands[1], 1);
22631 if (GET_MODE (op0) == TImode
22632 || (GET_MODE (op0) == DImode
22633 && !TARGET_64BIT))
22634 return false;
22636 start_sequence ();
22637 compare_op = ix86_expand_compare (code, op0, op1);
22638 compare_seq = get_insns ();
22639 end_sequence ();
22641 compare_code = GET_CODE (compare_op);
22643 if ((op1 == const0_rtx && (code == GE || code == LT))
22644 || (op1 == constm1_rtx && (code == GT || code == LE)))
22645 sign_bit_compare_p = true;
22647 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22648 HImode insns, we'd be swallowed in word prefix ops. */
22650 if ((mode != HImode || TARGET_FAST_PREFIX)
22651 && (mode != (TARGET_64BIT ? TImode : DImode))
22652 && CONST_INT_P (operands[2])
22653 && CONST_INT_P (operands[3]))
22655 rtx out = operands[0];
22656 HOST_WIDE_INT ct = INTVAL (operands[2]);
22657 HOST_WIDE_INT cf = INTVAL (operands[3]);
22658 HOST_WIDE_INT diff;
22660 diff = ct - cf;
22661 /* Sign bit compares are better done using shifts than by using
22662 sbb. */
22663 if (sign_bit_compare_p
22664 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22666 /* Detect overlap between destination and compare sources. */
22667 rtx tmp = out;
22669 if (!sign_bit_compare_p)
22671 rtx flags;
22672 bool fpcmp = false;
22674 compare_code = GET_CODE (compare_op);
22676 flags = XEXP (compare_op, 0);
22678 if (GET_MODE (flags) == CCFPmode
22679 || GET_MODE (flags) == CCFPUmode)
22681 fpcmp = true;
22682 compare_code
22683 = ix86_fp_compare_code_to_integer (compare_code);
22686 /* To simplify rest of code, restrict to the GEU case. */
22687 if (compare_code == LTU)
22689 std::swap (ct, cf);
22690 compare_code = reverse_condition (compare_code);
22691 code = reverse_condition (code);
22693 else
22695 if (fpcmp)
22696 PUT_CODE (compare_op,
22697 reverse_condition_maybe_unordered
22698 (GET_CODE (compare_op)));
22699 else
22700 PUT_CODE (compare_op,
22701 reverse_condition (GET_CODE (compare_op)));
22703 diff = ct - cf;
22705 if (reg_overlap_mentioned_p (out, op0)
22706 || reg_overlap_mentioned_p (out, op1))
22707 tmp = gen_reg_rtx (mode);
22709 if (mode == DImode)
22710 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22711 else
22712 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22713 flags, compare_op));
22715 else
22717 if (code == GT || code == GE)
22718 code = reverse_condition (code);
22719 else
22721 std::swap (ct, cf);
22722 diff = ct - cf;
22724 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22727 if (diff == 1)
22730 * cmpl op0,op1
22731 * sbbl dest,dest
22732 * [addl dest, ct]
22734 * Size 5 - 8.
22736 if (ct)
22737 tmp = expand_simple_binop (mode, PLUS,
22738 tmp, GEN_INT (ct),
22739 copy_rtx (tmp), 1, OPTAB_DIRECT);
22741 else if (cf == -1)
22744 * cmpl op0,op1
22745 * sbbl dest,dest
22746 * orl $ct, dest
22748 * Size 8.
22750 tmp = expand_simple_binop (mode, IOR,
22751 tmp, GEN_INT (ct),
22752 copy_rtx (tmp), 1, OPTAB_DIRECT);
22754 else if (diff == -1 && ct)
22757 * cmpl op0,op1
22758 * sbbl dest,dest
22759 * notl dest
22760 * [addl dest, cf]
22762 * Size 8 - 11.
22764 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22765 if (cf)
22766 tmp = expand_simple_binop (mode, PLUS,
22767 copy_rtx (tmp), GEN_INT (cf),
22768 copy_rtx (tmp), 1, OPTAB_DIRECT);
22770 else
22773 * cmpl op0,op1
22774 * sbbl dest,dest
22775 * [notl dest]
22776 * andl cf - ct, dest
22777 * [addl dest, ct]
22779 * Size 8 - 11.
22782 if (cf == 0)
22784 cf = ct;
22785 ct = 0;
22786 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22789 tmp = expand_simple_binop (mode, AND,
22790 copy_rtx (tmp),
22791 gen_int_mode (cf - ct, mode),
22792 copy_rtx (tmp), 1, OPTAB_DIRECT);
22793 if (ct)
22794 tmp = expand_simple_binop (mode, PLUS,
22795 copy_rtx (tmp), GEN_INT (ct),
22796 copy_rtx (tmp), 1, OPTAB_DIRECT);
22799 if (!rtx_equal_p (tmp, out))
22800 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22802 return true;
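/* The sequences above rely on the classic idiom that after
   "cmp op0,op1", "sbb dest,dest" leaves dest equal to -1 when the
   compare set the carry flag and 0 otherwise, so the two constants can
   then be produced with plain add/or/and/not arithmetic on that mask.  */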
22805 if (diff < 0)
22807 machine_mode cmp_mode = GET_MODE (op0);
22808 enum rtx_code new_code;
22810 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22812 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22814 /* We may be reversing an unordered compare to a normal compare, which
22815 is not valid in general (we may convert a non-trapping condition
22816 to a trapping one), however on i386 we currently emit all
22817 comparisons unordered. */
22818 new_code = reverse_condition_maybe_unordered (code);
22820 else
22821 new_code = ix86_reverse_condition (code, cmp_mode);
22822 if (new_code != UNKNOWN)
22824 std::swap (ct, cf);
22825 diff = -diff;
22826 code = new_code;
22830 compare_code = UNKNOWN;
22831 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
22832 && CONST_INT_P (op1))
22834 if (op1 == const0_rtx
22835 && (code == LT || code == GE))
22836 compare_code = code;
22837 else if (op1 == constm1_rtx)
22839 if (code == LE)
22840 compare_code = LT;
22841 else if (code == GT)
22842 compare_code = GE;
22846 /* Optimize dest = (op0 < 0) ? -1 : cf. */
22847 if (compare_code != UNKNOWN
22848 && GET_MODE (op0) == GET_MODE (out)
22849 && (cf == -1 || ct == -1))
22851 /* If lea code below could be used, only optimize
22852 if it results in a 2 insn sequence. */
22854 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
22855 || diff == 3 || diff == 5 || diff == 9)
22856 || (compare_code == LT && ct == -1)
22857 || (compare_code == GE && cf == -1))
22860 * notl op1 (if necessary)
22861 * sarl $31, op1
22862 * orl cf, op1
22864 if (ct != -1)
22866 cf = ct;
22867 ct = -1;
22868 code = reverse_condition (code);
22871 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
22873 out = expand_simple_binop (mode, IOR,
22874 out, GEN_INT (cf),
22875 out, 1, OPTAB_DIRECT);
22876 if (out != operands[0])
22877 emit_move_insn (operands[0], out);
22879 return true;
22884 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
22885 || diff == 3 || diff == 5 || diff == 9)
22886 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
22887 && (mode != DImode
22888 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
22891 * xorl dest,dest
22892 * cmpl op1,op2
22893 * setcc dest
22894 * lea cf(dest*(ct-cf)),dest
22896 * Size 14.
22898 * This also catches the degenerate setcc-only case.
22901 rtx tmp;
22902 int nops;
22904 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
22906 nops = 0;
22907 /* On x86_64 the lea instruction operates on Pmode, so we need
22908 to get the arithmetic done in the proper mode to match. */
22909 if (diff == 1)
22910 tmp = copy_rtx (out);
22911 else
22913 rtx out1;
22914 out1 = copy_rtx (out);
22915 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
22916 nops++;
22917 if (diff & 1)
22919 tmp = gen_rtx_PLUS (mode, tmp, out1);
22920 nops++;
22923 if (cf != 0)
22925 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
22926 nops++;
22928 if (!rtx_equal_p (tmp, out))
22930 if (nops == 1)
22931 out = force_operand (tmp, copy_rtx (out));
22932 else
22933 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
22935 if (!rtx_equal_p (out, operands[0]))
22936 emit_move_insn (operands[0], copy_rtx (out));
22938 return true;
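/* For example, dest = (a < b) ? 5 : 2 gives ct = 5, cf = 2 and diff = 3,
   so the 0/1 setcc result feeds lea 2(dest,dest,2), yielding 2 or 5
   without a branch.  */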
22942 * General case: Jumpful:
22943 * xorl dest,dest cmpl op1, op2
22944 * cmpl op1, op2 movl ct, dest
22945 * setcc dest jcc 1f
22946 * decl dest movl cf, dest
22947 * andl (cf-ct),dest 1:
22948 * addl ct,dest
22950 * Size 20. Size 14.
22952 * This is reasonably steep, but branch mispredict costs are
22953 * high on modern cpus, so consider failing only if optimizing
22954 * for space.
22957 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
22958 && BRANCH_COST (optimize_insn_for_speed_p (),
22959 false) >= 2)
22961 if (cf == 0)
22963 machine_mode cmp_mode = GET_MODE (op0);
22964 enum rtx_code new_code;
22966 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22968 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22970 /* We may be reversing an unordered compare to a normal compare,
22971 which is not valid in general (we may convert a non-trapping
22972 condition to a trapping one), however on i386 we currently
22973 emit all comparisons unordered. */
22974 new_code = reverse_condition_maybe_unordered (code);
22976 else
22978 new_code = ix86_reverse_condition (code, cmp_mode);
22979 if (compare_code != UNKNOWN && new_code != UNKNOWN)
22980 compare_code = reverse_condition (compare_code);
22983 if (new_code != UNKNOWN)
22985 cf = ct;
22986 ct = 0;
22987 code = new_code;
22991 if (compare_code != UNKNOWN)
22993 /* notl op1 (if needed)
22994 sarl $31, op1
22995 andl (cf-ct), op1
22996 addl ct, op1
22998 For x < 0 (resp. x <= -1) there will be no notl,
22999 so if possible swap the constants to get rid of the
23000 complement.
23001 True/false will be -1/0 while code below (store flag
23002 followed by decrement) is 0/-1, so the constants need
23003 to be exchanged once more. */
23005 if (compare_code == GE || !cf)
23007 code = reverse_condition (code);
23008 compare_code = LT;
23010 else
23011 std::swap (ct, cf);
23013 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23015 else
23017 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23019 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23020 constm1_rtx,
23021 copy_rtx (out), 1, OPTAB_DIRECT);
23024 out = expand_simple_binop (mode, AND, copy_rtx (out),
23025 gen_int_mode (cf - ct, mode),
23026 copy_rtx (out), 1, OPTAB_DIRECT);
23027 if (ct)
23028 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23029 copy_rtx (out), 1, OPTAB_DIRECT);
23030 if (!rtx_equal_p (out, operands[0]))
23031 emit_move_insn (operands[0], copy_rtx (out));
23033 return true;
23037 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23039 /* Try a few things more with specific constants and a variable. */
23041 optab op;
23042 rtx var, orig_out, out, tmp;
23044 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23045 return false;
23047 /* If one of the two operands is an interesting constant, load a
23048 constant with the above and mask it in with a logical operation. */
23050 if (CONST_INT_P (operands[2]))
23052 var = operands[3];
23053 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23054 operands[3] = constm1_rtx, op = and_optab;
23055 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23056 operands[3] = const0_rtx, op = ior_optab;
23057 else
23058 return false;
23060 else if (CONST_INT_P (operands[3]))
23062 var = operands[2];
23063 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23064 operands[2] = constm1_rtx, op = and_optab;
23065 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23066 operands[2] = const0_rtx, op = ior_optab;
23067 else
23068 return false;
23070 else
23071 return false;
23073 orig_out = operands[0];
23074 tmp = gen_reg_rtx (mode);
23075 operands[0] = tmp;
23077 /* Recurse to get the constant loaded. */
23078 if (!ix86_expand_int_movcc (operands))
23079 return false;
23081 /* Mask in the interesting variable. */
23082 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23083 OPTAB_WIDEN);
23084 if (!rtx_equal_p (out, orig_out))
23085 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23087 return true;
23091 * For comparison with above,
23093 * movl cf,dest
23094 * movl ct,tmp
23095 * cmpl op1,op2
23096 * cmovcc tmp,dest
23098 * Size 15.
23101 if (! nonimmediate_operand (operands[2], mode))
23102 operands[2] = force_reg (mode, operands[2]);
23103 if (! nonimmediate_operand (operands[3], mode))
23104 operands[3] = force_reg (mode, operands[3]);
23106 if (! register_operand (operands[2], VOIDmode)
23107 && (mode == QImode
23108 || ! register_operand (operands[3], VOIDmode)))
23109 operands[2] = force_reg (mode, operands[2]);
23111 if (mode == QImode
23112 && ! register_operand (operands[3], VOIDmode))
23113 operands[3] = force_reg (mode, operands[3]);
23115 emit_insn (compare_seq);
23116 emit_insn (gen_rtx_SET (operands[0],
23117 gen_rtx_IF_THEN_ELSE (mode,
23118 compare_op, operands[2],
23119 operands[3])));
23120 return true;
23123 /* Swap, force into registers, or otherwise massage the two operands
23124 to an sse comparison with a mask result. Thus we differ a bit from
23125 ix86_prepare_fp_compare_args which expects to produce a flags result.
23127 The DEST operand exists to help determine whether to commute commutative
23128 operators. The POP0/POP1 operands are updated in place. The new
23129 comparison code is returned, or UNKNOWN if not implementable. */
23131 static enum rtx_code
23132 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23133 rtx *pop0, rtx *pop1)
23135 switch (code)
23137 case LTGT:
23138 case UNEQ:
23139 /* AVX supports all the needed comparisons. */
23140 if (TARGET_AVX)
23141 break;
23142 /* We have no LTGT as an operator. We could implement it with
23143 NE & ORDERED, but this requires an extra temporary. It's
23144 not clear that it's worth it. */
23145 return UNKNOWN;
23147 case LT:
23148 case LE:
23149 case UNGT:
23150 case UNGE:
23151 /* These are supported directly. */
23152 break;
23154 case EQ:
23155 case NE:
23156 case UNORDERED:
23157 case ORDERED:
23158 /* AVX has 3 operand comparisons, no need to swap anything. */
23159 if (TARGET_AVX)
23160 break;
23161 /* For commutative operators, try to canonicalize the destination
23162 operand to be first in the comparison - this helps reload to
23163 avoid extra moves. */
23164 if (!dest || !rtx_equal_p (dest, *pop1))
23165 break;
23166 /* FALLTHRU */
23168 case GE:
23169 case GT:
23170 case UNLE:
23171 case UNLT:
23172 /* These are not supported directly before AVX, and furthermore
23173 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23174 comparison operands to transform into something that is
23175 supported. */
23176 std::swap (*pop0, *pop1);
23177 code = swap_condition (code);
23178 break;
23180 default:
23181 gcc_unreachable ();
23184 return code;
23187 /* Detect conditional moves that exactly match min/max operational
23188 semantics. Note that this is IEEE safe, as long as we don't
23189 interchange the operands.
23191 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23192 and TRUE if the operation is successful and instructions are emitted. */
23194 static bool
23195 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23196 rtx cmp_op1, rtx if_true, rtx if_false)
23198 machine_mode mode;
23199 bool is_min;
23200 rtx tmp;
23202 if (code == LT)
23204 else if (code == UNGE)
23205 std::swap (if_true, if_false);
23206 else
23207 return false;
23209 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23210 is_min = true;
23211 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23212 is_min = false;
23213 else
23214 return false;
23216 mode = GET_MODE (dest);
23218 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23219 but MODE may be a vector mode and thus not appropriate. */
23220 if (!flag_finite_math_only || flag_signed_zeros)
23222 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23223 rtvec v;
23225 if_true = force_reg (mode, if_true);
23226 v = gen_rtvec (2, if_true, if_false);
23227 tmp = gen_rtx_UNSPEC (mode, v, u);
23229 else
23231 code = is_min ? SMIN : SMAX;
23232 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23235 emit_insn (gen_rtx_SET (dest, tmp));
23236 return true;
23239 /* Expand an sse vector comparison. Return the register with the result. */
23241 static rtx
23242 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23243 rtx op_true, rtx op_false)
23245 machine_mode mode = GET_MODE (dest);
23246 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23248 /* In the general case the result of the comparison can differ from the operands' type. */
23249 machine_mode cmp_mode;
23251 /* In AVX512F the result of comparison is an integer mask. */
23252 bool maskcmp = false;
23253 rtx x;
23255 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23257 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23258 cmp_mode = int_mode_for_size (nbits, 0).require ();
23259 maskcmp = true;
23261 else
23262 cmp_mode = cmp_ops_mode;
23265 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23266 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23267 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23269 if (optimize
23270 || (maskcmp && cmp_mode != mode)
23271 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23272 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23273 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23275 /* Compare patterns for int modes are unspec in AVX512F only. */
23276 if (maskcmp && (code == GT || code == EQ))
23278 rtx (*gen)(rtx, rtx, rtx);
23280 switch (cmp_ops_mode)
23282 case E_V64QImode:
23283 gcc_assert (TARGET_AVX512BW);
23284 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23285 break;
23286 case E_V32HImode:
23287 gcc_assert (TARGET_AVX512BW);
23288 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23289 break;
23290 case E_V16SImode:
23291 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23292 break;
23293 case E_V8DImode:
23294 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23295 break;
23296 default:
23297 gen = NULL;
23300 if (gen)
23302 emit_insn (gen (dest, cmp_op0, cmp_op1));
23303 return dest;
23306 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23308 if (cmp_mode != mode && !maskcmp)
23310 x = force_reg (cmp_ops_mode, x);
23311 convert_move (dest, x, false);
23313 else
23314 emit_insn (gen_rtx_SET (dest, x));
23316 return dest;
23319 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23320 operations. This is used for both scalar and vector conditional moves. */
23322 void
23323 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23325 machine_mode mode = GET_MODE (dest);
23326 machine_mode cmpmode = GET_MODE (cmp);
23328 /* In AVX512F the result of comparison is an integer mask. */
23329 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23331 rtx t2, t3, x;
23333 /* If we have an integer mask and an FP value then we need
23334 to cast the mask to FP mode. */
23335 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23337 cmp = force_reg (cmpmode, cmp);
23338 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23341 if (vector_all_ones_operand (op_true, mode)
23342 && rtx_equal_p (op_false, CONST0_RTX (mode))
23343 && !maskcmp)
23345 emit_insn (gen_rtx_SET (dest, cmp));
23347 else if (op_false == CONST0_RTX (mode)
23348 && !maskcmp)
23350 op_true = force_reg (mode, op_true);
23351 x = gen_rtx_AND (mode, cmp, op_true);
23352 emit_insn (gen_rtx_SET (dest, x));
23354 else if (op_true == CONST0_RTX (mode)
23355 && !maskcmp)
23357 op_false = force_reg (mode, op_false);
23358 x = gen_rtx_NOT (mode, cmp);
23359 x = gen_rtx_AND (mode, x, op_false);
23360 emit_insn (gen_rtx_SET (dest, x));
23362 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23363 && !maskcmp)
23365 op_false = force_reg (mode, op_false);
23366 x = gen_rtx_IOR (mode, cmp, op_false);
23367 emit_insn (gen_rtx_SET (dest, x));
23369 else if (TARGET_XOP
23370 && !maskcmp)
23372 op_true = force_reg (mode, op_true);
23374 if (!nonimmediate_operand (op_false, mode))
23375 op_false = force_reg (mode, op_false);
23377 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23378 op_true,
23379 op_false)));
23381 else
23383 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23384 rtx d = dest;
23386 if (!nonimmediate_operand (op_true, mode))
23387 op_true = force_reg (mode, op_true);
23389 op_false = force_reg (mode, op_false);
23391 switch (mode)
23393 case E_V4SFmode:
23394 if (TARGET_SSE4_1)
23395 gen = gen_sse4_1_blendvps;
23396 break;
23397 case E_V2DFmode:
23398 if (TARGET_SSE4_1)
23399 gen = gen_sse4_1_blendvpd;
23400 break;
23401 case E_V16QImode:
23402 case E_V8HImode:
23403 case E_V4SImode:
23404 case E_V2DImode:
23405 if (TARGET_SSE4_1)
23407 gen = gen_sse4_1_pblendvb;
23408 if (mode != V16QImode)
23409 d = gen_reg_rtx (V16QImode);
23410 op_false = gen_lowpart (V16QImode, op_false);
23411 op_true = gen_lowpart (V16QImode, op_true);
23412 cmp = gen_lowpart (V16QImode, cmp);
23414 break;
23415 case E_V8SFmode:
23416 if (TARGET_AVX)
23417 gen = gen_avx_blendvps256;
23418 break;
23419 case E_V4DFmode:
23420 if (TARGET_AVX)
23421 gen = gen_avx_blendvpd256;
23422 break;
23423 case E_V32QImode:
23424 case E_V16HImode:
23425 case E_V8SImode:
23426 case E_V4DImode:
23427 if (TARGET_AVX2)
23429 gen = gen_avx2_pblendvb;
23430 if (mode != V32QImode)
23431 d = gen_reg_rtx (V32QImode);
23432 op_false = gen_lowpart (V32QImode, op_false);
23433 op_true = gen_lowpart (V32QImode, op_true);
23434 cmp = gen_lowpart (V32QImode, cmp);
23436 break;
23438 case E_V64QImode:
23439 gen = gen_avx512bw_blendmv64qi;
23440 break;
23441 case E_V32HImode:
23442 gen = gen_avx512bw_blendmv32hi;
23443 break;
23444 case E_V16SImode:
23445 gen = gen_avx512f_blendmv16si;
23446 break;
23447 case E_V8DImode:
23448 gen = gen_avx512f_blendmv8di;
23449 break;
23450 case E_V8DFmode:
23451 gen = gen_avx512f_blendmv8df;
23452 break;
23453 case E_V16SFmode:
23454 gen = gen_avx512f_blendmv16sf;
23455 break;
23457 default:
23458 break;
23461 if (gen != NULL)
23463 emit_insn (gen (d, op_false, op_true, cmp));
23464 if (d != dest)
23465 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23467 else
23469 op_true = force_reg (mode, op_true);
23471 t2 = gen_reg_rtx (mode);
23472 if (optimize)
23473 t3 = gen_reg_rtx (mode);
23474 else
23475 t3 = dest;
23477 x = gen_rtx_AND (mode, op_true, cmp);
23478 emit_insn (gen_rtx_SET (t2, x));
23480 x = gen_rtx_NOT (mode, cmp);
23481 x = gen_rtx_AND (mode, x, op_false);
23482 emit_insn (gen_rtx_SET (t3, x));
23484 x = gen_rtx_IOR (mode, t3, t2);
23485 emit_insn (gen_rtx_SET (dest, x));
23490 /* Expand a floating-point conditional move. Return true if successful. */
23492 bool
23493 ix86_expand_fp_movcc (rtx operands[])
23495 machine_mode mode = GET_MODE (operands[0]);
23496 enum rtx_code code = GET_CODE (operands[1]);
23497 rtx tmp, compare_op;
23498 rtx op0 = XEXP (operands[1], 0);
23499 rtx op1 = XEXP (operands[1], 1);
23501 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23503 machine_mode cmode;
23505 /* Since we've no cmove for sse registers, don't force bad register
23506 allocation just to gain access to it. Deny movcc when the
23507 comparison mode doesn't match the move mode. */
23508 cmode = GET_MODE (op0);
23509 if (cmode == VOIDmode)
23510 cmode = GET_MODE (op1);
23511 if (cmode != mode)
23512 return false;
23514 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23515 if (code == UNKNOWN)
23516 return false;
23518 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23519 operands[2], operands[3]))
23520 return true;
23522 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23523 operands[2], operands[3]);
23524 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23525 return true;
23528 if (GET_MODE (op0) == TImode
23529 || (GET_MODE (op0) == DImode
23530 && !TARGET_64BIT))
23531 return false;
23533 /* The floating point conditional move instructions don't directly
23534 support conditions resulting from a signed integer comparison. */
23536 compare_op = ix86_expand_compare (code, op0, op1);
23537 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23539 tmp = gen_reg_rtx (QImode);
23540 ix86_expand_setcc (tmp, code, op0, op1);
23542 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23545 emit_insn (gen_rtx_SET (operands[0],
23546 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23547 operands[2], operands[3])));
23549 return true;
23552 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23554 static int
23555 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23557 switch (code)
23559 case EQ:
23560 return 0;
23561 case LT:
23562 case LTU:
23563 return 1;
23564 case LE:
23565 case LEU:
23566 return 2;
23567 case NE:
23568 return 4;
23569 case GE:
23570 case GEU:
23571 return 5;
23572 case GT:
23573 case GTU:
23574 return 6;
23575 default:
23576 gcc_unreachable ();
23580 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23582 static int
23583 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23585 switch (code)
23587 case EQ:
23588 return 0x00;
23589 case NE:
23590 return 0x04;
23591 case GT:
23592 return 0x0e;
23593 case LE:
23594 return 0x02;
23595 case GE:
23596 return 0x0d;
23597 case LT:
23598 return 0x01;
23599 case UNLE:
23600 return 0x0a;
23601 case UNLT:
23602 return 0x09;
23603 case UNGE:
23604 return 0x05;
23605 case UNGT:
23606 return 0x06;
23607 case UNEQ:
23608 return 0x18;
23609 case LTGT:
23610 return 0x0c;
23611 case ORDERED:
23612 return 0x07;
23613 case UNORDERED:
23614 return 0x03;
23615 default:
23616 gcc_unreachable ();
23620 /* Return immediate value to be used in UNSPEC_PCMP
23621 for comparison CODE in MODE. */
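/* As a rough illustration of the helpers above: for an integer mode,
   EQ maps to 0, LT/LTU to 1 and GT/GTU to 6, while for a floating-point
   mode LT maps to 0x01, UNORDERED to 0x03 and NE to 0x04 - presumably the
   predicate immediates expected by the AVX-512 vpcmp/vcmp patterns.  */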
23623 static int
23624 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23626 if (FLOAT_MODE_P (mode))
23627 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23628 return ix86_int_cmp_code_to_pcmp_immediate (code);
23631 /* Expand AVX-512 vector comparison. */
23633 bool
23634 ix86_expand_mask_vec_cmp (rtx operands[])
23636 machine_mode mask_mode = GET_MODE (operands[0]);
23637 machine_mode cmp_mode = GET_MODE (operands[2]);
23638 enum rtx_code code = GET_CODE (operands[1]);
23639 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23640 int unspec_code;
23641 rtx unspec;
23643 switch (code)
23645 case LEU:
23646 case GTU:
23647 case GEU:
23648 case LTU:
23649 unspec_code = UNSPEC_UNSIGNED_PCMP;
23650 break;
23652 default:
23653 unspec_code = UNSPEC_PCMP;
23656 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23657 operands[3], imm),
23658 unspec_code);
23659 emit_insn (gen_rtx_SET (operands[0], unspec));
23661 return true;
23664 /* Expand fp vector comparison. */
23666 bool
23667 ix86_expand_fp_vec_cmp (rtx operands[])
23669 enum rtx_code code = GET_CODE (operands[1]);
23670 rtx cmp;
23672 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23673 &operands[2], &operands[3]);
23674 if (code == UNKNOWN)
23676 rtx temp;
23677 switch (GET_CODE (operands[1]))
23679 case LTGT:
23680 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23681 operands[3], NULL, NULL);
23682 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23683 operands[3], NULL, NULL);
23684 code = AND;
23685 break;
23686 case UNEQ:
23687 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23688 operands[3], NULL, NULL);
23689 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23690 operands[3], NULL, NULL);
23691 code = IOR;
23692 break;
23693 default:
23694 gcc_unreachable ();
23696 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23697 OPTAB_DIRECT);
23699 else
23700 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23701 operands[1], operands[2]);
23703 if (operands[0] != cmp)
23704 emit_move_insn (operands[0], cmp);
23706 return true;
23709 static rtx
23710 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23711 rtx op_true, rtx op_false, bool *negate)
23713 machine_mode data_mode = GET_MODE (dest);
23714 machine_mode mode = GET_MODE (cop0);
23715 rtx x;
23717 *negate = false;
23719 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23720 if (TARGET_XOP
23721 && (mode == V16QImode || mode == V8HImode
23722 || mode == V4SImode || mode == V2DImode))
23724 else
23726 /* Canonicalize the comparison to EQ, GT, GTU. */
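/* Illustrative restatement of the cases handled below: a <= b is
   rewritten as NOT (a > b) and a >= b as NOT (b > a), with *negate
   recording the pending inversion, while a < b is simply emitted
   as b > a.  */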
23727 switch (code)
23729 case EQ:
23730 case GT:
23731 case GTU:
23732 break;
23734 case NE:
23735 case LE:
23736 case LEU:
23737 code = reverse_condition (code);
23738 *negate = true;
23739 break;
23741 case GE:
23742 case GEU:
23743 code = reverse_condition (code);
23744 *negate = true;
23745 /* FALLTHRU */
23747 case LT:
23748 case LTU:
23749 std::swap (cop0, cop1);
23750 code = swap_condition (code);
23751 break;
23753 default:
23754 gcc_unreachable ();
23757 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23758 if (mode == V2DImode)
23760 switch (code)
23762 case EQ:
23763 /* SSE4.1 supports EQ. */
23764 if (!TARGET_SSE4_1)
23765 return NULL;
23766 break;
23768 case GT:
23769 case GTU:
23770 /* SSE4.2 supports GT/GTU. */
23771 if (!TARGET_SSE4_2)
23772 return NULL;
23773 break;
23775 default:
23776 gcc_unreachable ();
23780 /* Unsigned parallel compare is not supported by the hardware.
23781 Play some tricks to turn this into a signed comparison
23782 against 0. */
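/* As a concrete illustration: for 32/64-bit elements the sign-bit mask is
   subtracted from both operands, which flips their sign bits, so the
   unsigned compare becomes a signed GT on the biased values; for 8/16-bit
   elements, a >u b exactly when the unsigned saturating subtraction
   (a -us b) is nonzero, which is what the US_MINUS path below exploits.  */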
23783 if (code == GTU)
23785 cop0 = force_reg (mode, cop0);
23787 switch (mode)
23789 case E_V16SImode:
23790 case E_V8DImode:
23791 case E_V8SImode:
23792 case E_V4DImode:
23793 case E_V4SImode:
23794 case E_V2DImode:
23796 rtx t1, t2, mask;
23797 rtx (*gen_sub3) (rtx, rtx, rtx);
23799 switch (mode)
23801 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23802 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23803 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23804 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23805 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23806 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23807 default:
23808 gcc_unreachable ();
23810 /* Subtract (-(INT MAX) - 1) from both operands to make
23811 them signed. */
23812 mask = ix86_build_signbit_mask (mode, true, false);
23813 t1 = gen_reg_rtx (mode);
23814 emit_insn (gen_sub3 (t1, cop0, mask));
23816 t2 = gen_reg_rtx (mode);
23817 emit_insn (gen_sub3 (t2, cop1, mask));
23819 cop0 = t1;
23820 cop1 = t2;
23821 code = GT;
23823 break;
23825 case E_V64QImode:
23826 case E_V32HImode:
23827 case E_V32QImode:
23828 case E_V16HImode:
23829 case E_V16QImode:
23830 case E_V8HImode:
23831 /* Perform a parallel unsigned saturating subtraction. */
23832 x = gen_reg_rtx (mode);
23833 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
23834 cop1)));
23836 cop0 = x;
23837 cop1 = CONST0_RTX (mode);
23838 code = EQ;
23839 *negate = !*negate;
23840 break;
23842 default:
23843 gcc_unreachable ();
23848 if (*negate)
23849 std::swap (op_true, op_false);
23851 /* Allow the comparison to be done in one mode, but the movcc to
23852 happen in another mode. */
23853 if (data_mode == mode)
23855 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
23856 op_true, op_false);
23858 else
23860 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
23861 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
23862 op_true, op_false);
23863 if (GET_MODE (x) == mode)
23864 x = gen_lowpart (data_mode, x);
23867 return x;
23870 /* Expand integer vector comparison. */
23872 bool
23873 ix86_expand_int_vec_cmp (rtx operands[])
23875 rtx_code code = GET_CODE (operands[1]);
23876 bool negate = false;
23877 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
23878 operands[3], NULL, NULL, &negate);
23880 if (!cmp)
23881 return false;
23883 if (negate)
23884 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
23885 CONST0_RTX (GET_MODE (cmp)),
23886 NULL, NULL, &negate);
23888 gcc_assert (!negate);
23890 if (operands[0] != cmp)
23891 emit_move_insn (operands[0], cmp);
23893 return true;
23896 /* Expand a floating-point vector conditional move; a vcond operation
23897 rather than a movcc operation. */
23899 bool
23900 ix86_expand_fp_vcond (rtx operands[])
23902 enum rtx_code code = GET_CODE (operands[3]);
23903 rtx cmp;
23905 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23906 &operands[4], &operands[5]);
23907 if (code == UNKNOWN)
23909 rtx temp;
23910 switch (GET_CODE (operands[3]))
23912 case LTGT:
23913 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
23914 operands[5], operands[0], operands[0]);
23915 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
23916 operands[5], operands[1], operands[2]);
23917 code = AND;
23918 break;
23919 case UNEQ:
23920 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
23921 operands[5], operands[0], operands[0]);
23922 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
23923 operands[5], operands[1], operands[2]);
23924 code = IOR;
23925 break;
23926 default:
23927 gcc_unreachable ();
23929 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23930 OPTAB_DIRECT);
23931 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
23932 return true;
23935 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
23936 operands[5], operands[1], operands[2]))
23937 return true;
23939 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
23940 operands[1], operands[2]);
23941 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
23942 return true;
23945 /* Expand a signed/unsigned integral vector conditional move. */
23947 bool
23948 ix86_expand_int_vcond (rtx operands[])
23950 machine_mode data_mode = GET_MODE (operands[0]);
23951 machine_mode mode = GET_MODE (operands[4]);
23952 enum rtx_code code = GET_CODE (operands[3]);
23953 bool negate = false;
23954 rtx x, cop0, cop1;
23956 cop0 = operands[4];
23957 cop1 = operands[5];
23959 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
23960 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
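/* Worked example: for a 32-bit element x = -5, an arithmetic shift right
   by 31 gives -1 (all ones) and a logical shift right by 31 gives 1,
   while for x = 7 both shifts give 0 - exactly the masks described
   above.  */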
23961 if ((code == LT || code == GE)
23962 && data_mode == mode
23963 && cop1 == CONST0_RTX (mode)
23964 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
23965 && GET_MODE_UNIT_SIZE (data_mode) > 1
23966 && GET_MODE_UNIT_SIZE (data_mode) <= 8
23967 && (GET_MODE_SIZE (data_mode) == 16
23968 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
23970 rtx negop = operands[2 - (code == LT)];
23971 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
23972 if (negop == CONST1_RTX (data_mode))
23974 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
23975 operands[0], 1, OPTAB_DIRECT);
23976 if (res != operands[0])
23977 emit_move_insn (operands[0], res);
23978 return true;
23980 else if (GET_MODE_INNER (data_mode) != DImode
23981 && vector_all_ones_operand (negop, data_mode))
23983 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
23984 operands[0], 0, OPTAB_DIRECT);
23985 if (res != operands[0])
23986 emit_move_insn (operands[0], res);
23987 return true;
23991 if (!nonimmediate_operand (cop1, mode))
23992 cop1 = force_reg (mode, cop1);
23993 if (!general_operand (operands[1], data_mode))
23994 operands[1] = force_reg (data_mode, operands[1]);
23995 if (!general_operand (operands[2], data_mode))
23996 operands[2] = force_reg (data_mode, operands[2]);
23998 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
23999 operands[1], operands[2], &negate);
24001 if (!x)
24002 return false;
24004 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24005 operands[2-negate]);
24006 return true;
24009 /* AVX512F does support 64-byte integer vector operations,
24010 thus the longest vector we are faced with is V64QImode. */
24011 #define MAX_VECT_LEN 64
24013 struct expand_vec_perm_d
24015 rtx target, op0, op1;
24016 unsigned char perm[MAX_VECT_LEN];
24017 machine_mode vmode;
24018 unsigned char nelt;
24019 bool one_operand_p;
24020 bool testing_p;
24023 static bool
24024 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24025 struct expand_vec_perm_d *d)
24027 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24028 expander, so args are either in d, or in op0, op1 etc. */
24029 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24030 machine_mode maskmode = mode;
24031 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24033 switch (mode)
24035 case E_V8HImode:
24036 if (TARGET_AVX512VL && TARGET_AVX512BW)
24037 gen = gen_avx512vl_vpermi2varv8hi3;
24038 break;
24039 case E_V16HImode:
24040 if (TARGET_AVX512VL && TARGET_AVX512BW)
24041 gen = gen_avx512vl_vpermi2varv16hi3;
24042 break;
24043 case E_V64QImode:
24044 if (TARGET_AVX512VBMI)
24045 gen = gen_avx512bw_vpermi2varv64qi3;
24046 break;
24047 case E_V32HImode:
24048 if (TARGET_AVX512BW)
24049 gen = gen_avx512bw_vpermi2varv32hi3;
24050 break;
24051 case E_V4SImode:
24052 if (TARGET_AVX512VL)
24053 gen = gen_avx512vl_vpermi2varv4si3;
24054 break;
24055 case E_V8SImode:
24056 if (TARGET_AVX512VL)
24057 gen = gen_avx512vl_vpermi2varv8si3;
24058 break;
24059 case E_V16SImode:
24060 if (TARGET_AVX512F)
24061 gen = gen_avx512f_vpermi2varv16si3;
24062 break;
24063 case E_V4SFmode:
24064 if (TARGET_AVX512VL)
24066 gen = gen_avx512vl_vpermi2varv4sf3;
24067 maskmode = V4SImode;
24069 break;
24070 case E_V8SFmode:
24071 if (TARGET_AVX512VL)
24073 gen = gen_avx512vl_vpermi2varv8sf3;
24074 maskmode = V8SImode;
24076 break;
24077 case E_V16SFmode:
24078 if (TARGET_AVX512F)
24080 gen = gen_avx512f_vpermi2varv16sf3;
24081 maskmode = V16SImode;
24083 break;
24084 case E_V2DImode:
24085 if (TARGET_AVX512VL)
24086 gen = gen_avx512vl_vpermi2varv2di3;
24087 break;
24088 case E_V4DImode:
24089 if (TARGET_AVX512VL)
24090 gen = gen_avx512vl_vpermi2varv4di3;
24091 break;
24092 case E_V8DImode:
24093 if (TARGET_AVX512F)
24094 gen = gen_avx512f_vpermi2varv8di3;
24095 break;
24096 case E_V2DFmode:
24097 if (TARGET_AVX512VL)
24099 gen = gen_avx512vl_vpermi2varv2df3;
24100 maskmode = V2DImode;
24102 break;
24103 case E_V4DFmode:
24104 if (TARGET_AVX512VL)
24106 gen = gen_avx512vl_vpermi2varv4df3;
24107 maskmode = V4DImode;
24109 break;
24110 case E_V8DFmode:
24111 if (TARGET_AVX512F)
24113 gen = gen_avx512f_vpermi2varv8df3;
24114 maskmode = V8DImode;
24116 break;
24117 default:
24118 break;
24121 if (gen == NULL)
24122 return false;
24124 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24125 expander, so args are either in d, or in op0, op1 etc. */
24126 if (d)
24128 rtx vec[64];
24129 target = d->target;
24130 op0 = d->op0;
24131 op1 = d->op1;
24132 for (int i = 0; i < d->nelt; ++i)
24133 vec[i] = GEN_INT (d->perm[i]);
24134 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24137 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24138 return true;
24141 /* Expand a variable vector permutation. */
24143 void
24144 ix86_expand_vec_perm (rtx operands[])
24146 rtx target = operands[0];
24147 rtx op0 = operands[1];
24148 rtx op1 = operands[2];
24149 rtx mask = operands[3];
24150 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24151 machine_mode mode = GET_MODE (op0);
24152 machine_mode maskmode = GET_MODE (mask);
24153 int w, e, i;
24154 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24156 /* Number of elements in the vector. */
24157 w = GET_MODE_NUNITS (mode);
24158 e = GET_MODE_UNIT_SIZE (mode);
24159 gcc_assert (w <= 64);
24161 if (TARGET_AVX512F && one_operand_shuffle)
24163 rtx (*gen) (rtx, rtx, rtx) = NULL;
24164 switch (mode)
24166 case E_V16SImode:
24167 gen = gen_avx512f_permvarv16si;
24168 break;
24169 case E_V16SFmode:
24170 gen = gen_avx512f_permvarv16sf;
24171 break;
24172 case E_V8DImode:
24173 gen = gen_avx512f_permvarv8di;
24174 break;
24175 case E_V8DFmode:
24176 gen = gen_avx512f_permvarv8df;
24177 break;
24178 default:
24179 break;
24181 if (gen != NULL)
24183 emit_insn (gen (target, op0, mask));
24184 return;
24188 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24189 return;
24191 if (TARGET_AVX2)
24193 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24195 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24196 a constant shuffle operand. With a tiny bit of effort we can
24197 use VPERMD instead. A re-interpretation stall for V4DFmode is
24198 unfortunate but there's no avoiding it.
24199 Similarly for V16HImode we don't have instructions for variable
24200 shuffling, while for V32QImode we can, after preparing suitable
24201 masks, use vpshufb; vpshufb; vpermq; vpor. */
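/* Rough sketch of the index rewrite done below for the V4DImode case:
   a mask of { 3 0 2 1 } is first widened to { 3 3 0 0 2 2 1 1 },
   doubled to { 6 6 0 0 4 4 2 2 } and then has { 0 1 0 1 ... } added,
   giving the V8SImode control { 6 7 0 1 4 5 2 3 } that selects the
   same 64-bit elements via VPERMD.  */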
24203 if (mode == V16HImode)
24205 maskmode = mode = V32QImode;
24206 w = 32;
24207 e = 1;
24209 else
24211 maskmode = mode = V8SImode;
24212 w = 8;
24213 e = 4;
24215 t1 = gen_reg_rtx (maskmode);
24217 /* Replicate the low bits of the V4DImode mask into V8SImode:
24218 mask = { A B C D }
24219 t1 = { A A B B C C D D }. */
24220 for (i = 0; i < w / 2; ++i)
24221 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24222 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24223 vt = force_reg (maskmode, vt);
24224 mask = gen_lowpart (maskmode, mask);
24225 if (maskmode == V8SImode)
24226 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24227 else
24228 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24230 /* Multiply the shuffle indices by two. */
24231 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24232 OPTAB_DIRECT);
24234 /* Add one to the odd shuffle indices:
24235 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24236 for (i = 0; i < w / 2; ++i)
24238 vec[i * 2] = const0_rtx;
24239 vec[i * 2 + 1] = const1_rtx;
24241 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24242 vt = validize_mem (force_const_mem (maskmode, vt));
24243 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24244 OPTAB_DIRECT);
24246 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24247 operands[3] = mask = t1;
24248 target = gen_reg_rtx (mode);
24249 op0 = gen_lowpart (mode, op0);
24250 op1 = gen_lowpart (mode, op1);
24253 switch (mode)
24255 case E_V8SImode:
24256 /* The VPERMD and VPERMPS instructions already properly ignore
24257 the high bits of the shuffle elements. No need for us to
24258 perform an AND ourselves. */
24259 if (one_operand_shuffle)
24261 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24262 if (target != operands[0])
24263 emit_move_insn (operands[0],
24264 gen_lowpart (GET_MODE (operands[0]), target));
24266 else
24268 t1 = gen_reg_rtx (V8SImode);
24269 t2 = gen_reg_rtx (V8SImode);
24270 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24271 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24272 goto merge_two;
24274 return;
24276 case E_V8SFmode:
24277 mask = gen_lowpart (V8SImode, mask);
24278 if (one_operand_shuffle)
24279 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24280 else
24282 t1 = gen_reg_rtx (V8SFmode);
24283 t2 = gen_reg_rtx (V8SFmode);
24284 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24285 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24286 goto merge_two;
24288 return;
24290 case E_V4SImode:
24291 /* By combining the two 128-bit input vectors into one 256-bit
24292 input vector, we can use VPERMD and VPERMPS for the full
24293 two-operand shuffle. */
24294 t1 = gen_reg_rtx (V8SImode);
24295 t2 = gen_reg_rtx (V8SImode);
24296 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24297 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24298 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24299 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24300 return;
24302 case E_V4SFmode:
24303 t1 = gen_reg_rtx (V8SFmode);
24304 t2 = gen_reg_rtx (V8SImode);
24305 mask = gen_lowpart (V4SImode, mask);
24306 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24307 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24308 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24309 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24310 return;
24312 case E_V32QImode:
24313 t1 = gen_reg_rtx (V32QImode);
24314 t2 = gen_reg_rtx (V32QImode);
24315 t3 = gen_reg_rtx (V32QImode);
24316 vt2 = GEN_INT (-128);
24317 for (i = 0; i < 32; i++)
24318 vec[i] = vt2;
24319 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24320 vt = force_reg (V32QImode, vt);
24321 for (i = 0; i < 32; i++)
24322 vec[i] = i < 16 ? vt2 : const0_rtx;
24323 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24324 vt2 = force_reg (V32QImode, vt2);
24325 /* From mask create two adjusted masks, which contain the same
24326 bits as mask in the low 7 bits of each vector element.
24327 The first mask will have the most significant bit clear
24328 if it requests an element from the same 128-bit lane
24329 and the MSB set if it requests an element from the other 128-bit lane.
24330 The second mask will have the opposite values of the MSB,
24331 and additionally will have its 128-bit lanes swapped.
24332 E.g. a { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24333 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24334 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24335 stands for the other 12 bytes. */
24336 /* The bit that tells whether an element is from the same lane or the
24337 other lane is bit 4, so shift it up by 3 to the MSB position. */
24338 t5 = gen_reg_rtx (V4DImode);
24339 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24340 GEN_INT (3)));
24341 /* Clear MSB bits from the mask just in case it had them set. */
24342 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24343 /* After this t1 will have MSB set for elements from the other lane. */
24344 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24345 /* Clear bits other than MSB. */
24346 emit_insn (gen_andv32qi3 (t1, t1, vt));
24347 /* Or in the lower bits from mask into t3. */
24348 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24349 /* And invert MSB bits in t1, so MSB is set for elements from the same
24350 lane. */
24351 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24352 /* Swap 128-bit lanes in t3. */
24353 t6 = gen_reg_rtx (V4DImode);
24354 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24355 const2_rtx, GEN_INT (3),
24356 const0_rtx, const1_rtx));
24357 /* And or in the lower bits from mask into t1. */
24358 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24359 if (one_operand_shuffle)
24361 /* Each of these shuffles will put 0s in places where an
24362 element from the other 128-bit lane is needed, and otherwise
24363 will shuffle in the requested value. */
24364 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24365 gen_lowpart (V32QImode, t6)));
24366 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24367 /* For t3 the 128-bit lanes are swapped again. */
24368 t7 = gen_reg_rtx (V4DImode);
24369 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24370 const2_rtx, GEN_INT (3),
24371 const0_rtx, const1_rtx));
24372 /* And oring both together leads to the result. */
24373 emit_insn (gen_iorv32qi3 (target, t1,
24374 gen_lowpart (V32QImode, t7)));
24375 if (target != operands[0])
24376 emit_move_insn (operands[0],
24377 gen_lowpart (GET_MODE (operands[0]), target));
24378 return;
24381 t4 = gen_reg_rtx (V32QImode);
24382 /* Similar to the one_operand_shuffle code above, just
24383 repeated twice, once for each operand. The merge_two:
24384 code will merge the two results together. */
24385 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24386 gen_lowpart (V32QImode, t6)));
24387 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24388 gen_lowpart (V32QImode, t6)));
24389 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24390 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24391 t7 = gen_reg_rtx (V4DImode);
24392 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24393 const2_rtx, GEN_INT (3),
24394 const0_rtx, const1_rtx));
24395 t8 = gen_reg_rtx (V4DImode);
24396 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24397 const2_rtx, GEN_INT (3),
24398 const0_rtx, const1_rtx));
24399 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24400 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24401 t1 = t4;
24402 t2 = t3;
24403 goto merge_two;
24405 default:
24406 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24407 break;
24411 if (TARGET_XOP)
24413 /* The XOP VPPERM insn supports three inputs. By ignoring the
24414 one_operand_shuffle special case, we avoid creating another
24415 set of constant vectors in memory. */
24416 one_operand_shuffle = false;
24418 /* mask = mask & {2*w-1, ...} */
24419 vt = GEN_INT (2*w - 1);
24421 else
24423 /* mask = mask & {w-1, ...} */
24424 vt = GEN_INT (w - 1);
24427 for (i = 0; i < w; i++)
24428 vec[i] = vt;
24429 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24430 mask = expand_simple_binop (maskmode, AND, mask, vt,
24431 NULL_RTX, 0, OPTAB_DIRECT);
24433 /* For non-QImode operations, convert the word permutation control
24434 into a byte permutation control. */
24435 if (mode != V16QImode)
24437 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24438 GEN_INT (exact_log2 (e)),
24439 NULL_RTX, 0, OPTAB_DIRECT);
24441 /* Convert mask to vector of chars. */
24442 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24444 /* Replicate each of the input bytes into byte positions:
24445 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24446 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24447 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24448 for (i = 0; i < 16; ++i)
24449 vec[i] = GEN_INT (i/e * e);
24450 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24451 vt = validize_mem (force_const_mem (V16QImode, vt));
24452 if (TARGET_XOP)
24453 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24454 else
24455 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24457 /* Convert it into the byte positions by doing
24458 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24459 for (i = 0; i < 16; ++i)
24460 vec[i] = GEN_INT (i % e);
24461 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24462 vt = validize_mem (force_const_mem (V16QImode, vt));
24463 emit_insn (gen_addv16qi3 (mask, mask, vt));
24466 /* The actual shuffle operations all operate on V16QImode. */
24467 op0 = gen_lowpart (V16QImode, op0);
24468 op1 = gen_lowpart (V16QImode, op1);
24470 if (TARGET_XOP)
24472 if (GET_MODE (target) != V16QImode)
24473 target = gen_reg_rtx (V16QImode);
24474 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24475 if (target != operands[0])
24476 emit_move_insn (operands[0],
24477 gen_lowpart (GET_MODE (operands[0]), target));
24479 else if (one_operand_shuffle)
24481 if (GET_MODE (target) != V16QImode)
24482 target = gen_reg_rtx (V16QImode);
24483 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24484 if (target != operands[0])
24485 emit_move_insn (operands[0],
24486 gen_lowpart (GET_MODE (operands[0]), target));
24488 else
24490 rtx xops[6];
24491 bool ok;
24493 /* Shuffle the two input vectors independently. */
24494 t1 = gen_reg_rtx (V16QImode);
24495 t2 = gen_reg_rtx (V16QImode);
24496 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24497 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24499 merge_two:
24500 /* Then merge them together. The key is whether any given control
24501 element contained a bit set that indicates the second word. */
24502 mask = operands[3];
24503 vt = GEN_INT (w);
24504 if (maskmode == V2DImode && !TARGET_SSE4_1)
24506 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24507 more shuffle to convert the V2DI input mask into a V4SI
24508 input mask, at which point the masking that ix86_expand_int_vcond
24509 performs will work as desired. */
24510 rtx t3 = gen_reg_rtx (V4SImode);
24511 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24512 const0_rtx, const0_rtx,
24513 const2_rtx, const2_rtx));
24514 mask = t3;
24515 maskmode = V4SImode;
24516 e = w = 4;
24519 for (i = 0; i < w; i++)
24520 vec[i] = vt;
24521 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24522 vt = force_reg (maskmode, vt);
24523 mask = expand_simple_binop (maskmode, AND, mask, vt,
24524 NULL_RTX, 0, OPTAB_DIRECT);
24526 if (GET_MODE (target) != mode)
24527 target = gen_reg_rtx (mode);
24528 xops[0] = target;
24529 xops[1] = gen_lowpart (mode, t2);
24530 xops[2] = gen_lowpart (mode, t1);
24531 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24532 xops[4] = mask;
24533 xops[5] = vt;
24534 ok = ix86_expand_int_vcond (xops);
24535 gcc_assert (ok);
24536 if (target != operands[0])
24537 emit_move_insn (operands[0],
24538 gen_lowpart (GET_MODE (operands[0]), target));
24542 /* Unpack SRC into the next wider integer vector type, storing it in DEST.
24543 UNSIGNED_P is true if we should do zero extension, else sign extension.
24544 HIGH_P is true if we want the N/2 high elements, else the low elements. */
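/* For instance, unpacking a V16QImode SRC with UNSIGNED_P and !HIGH_P
   zero-extends its low eight bytes into a V8HImode DEST (pmovzxbw on
   SSE4.1); without SSE4.1 the same effect is obtained below by
   interleaving SRC with zero (or, in the signed case, with a sign mask
   produced by a GT comparison against zero).  */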
24546 void
24547 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24549 machine_mode imode = GET_MODE (src);
24550 rtx tmp;
24552 if (TARGET_SSE4_1)
24554 rtx (*unpack)(rtx, rtx);
24555 rtx (*extract)(rtx, rtx) = NULL;
24556 machine_mode halfmode = BLKmode;
24558 switch (imode)
24560 case E_V64QImode:
24561 if (unsigned_p)
24562 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24563 else
24564 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24565 halfmode = V32QImode;
24566 extract
24567 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24568 break;
24569 case E_V32QImode:
24570 if (unsigned_p)
24571 unpack = gen_avx2_zero_extendv16qiv16hi2;
24572 else
24573 unpack = gen_avx2_sign_extendv16qiv16hi2;
24574 halfmode = V16QImode;
24575 extract
24576 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24577 break;
24578 case E_V32HImode:
24579 if (unsigned_p)
24580 unpack = gen_avx512f_zero_extendv16hiv16si2;
24581 else
24582 unpack = gen_avx512f_sign_extendv16hiv16si2;
24583 halfmode = V16HImode;
24584 extract
24585 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24586 break;
24587 case E_V16HImode:
24588 if (unsigned_p)
24589 unpack = gen_avx2_zero_extendv8hiv8si2;
24590 else
24591 unpack = gen_avx2_sign_extendv8hiv8si2;
24592 halfmode = V8HImode;
24593 extract
24594 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24595 break;
24596 case E_V16SImode:
24597 if (unsigned_p)
24598 unpack = gen_avx512f_zero_extendv8siv8di2;
24599 else
24600 unpack = gen_avx512f_sign_extendv8siv8di2;
24601 halfmode = V8SImode;
24602 extract
24603 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24604 break;
24605 case E_V8SImode:
24606 if (unsigned_p)
24607 unpack = gen_avx2_zero_extendv4siv4di2;
24608 else
24609 unpack = gen_avx2_sign_extendv4siv4di2;
24610 halfmode = V4SImode;
24611 extract
24612 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24613 break;
24614 case E_V16QImode:
24615 if (unsigned_p)
24616 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24617 else
24618 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24619 break;
24620 case E_V8HImode:
24621 if (unsigned_p)
24622 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24623 else
24624 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24625 break;
24626 case E_V4SImode:
24627 if (unsigned_p)
24628 unpack = gen_sse4_1_zero_extendv2siv2di2;
24629 else
24630 unpack = gen_sse4_1_sign_extendv2siv2di2;
24631 break;
24632 default:
24633 gcc_unreachable ();
24636 if (GET_MODE_SIZE (imode) >= 32)
24638 tmp = gen_reg_rtx (halfmode);
24639 emit_insn (extract (tmp, src));
24641 else if (high_p)
24643 /* Shift higher 8 bytes to lower 8 bytes. */
24644 tmp = gen_reg_rtx (V1TImode);
24645 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24646 GEN_INT (64)));
24647 tmp = gen_lowpart (imode, tmp);
24649 else
24650 tmp = src;
24652 emit_insn (unpack (dest, tmp));
24654 else
24656 rtx (*unpack)(rtx, rtx, rtx);
24658 switch (imode)
24660 case E_V16QImode:
24661 if (high_p)
24662 unpack = gen_vec_interleave_highv16qi;
24663 else
24664 unpack = gen_vec_interleave_lowv16qi;
24665 break;
24666 case E_V8HImode:
24667 if (high_p)
24668 unpack = gen_vec_interleave_highv8hi;
24669 else
24670 unpack = gen_vec_interleave_lowv8hi;
24671 break;
24672 case E_V4SImode:
24673 if (high_p)
24674 unpack = gen_vec_interleave_highv4si;
24675 else
24676 unpack = gen_vec_interleave_lowv4si;
24677 break;
24678 default:
24679 gcc_unreachable ();
24682 if (unsigned_p)
24683 tmp = force_reg (imode, CONST0_RTX (imode));
24684 else
24685 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24686 src, pc_rtx, pc_rtx);
24688 rtx tmp2 = gen_reg_rtx (imode);
24689 emit_insn (unpack (tmp2, src, tmp));
24690 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24694 /* Expand conditional increment or decrement using adc/sbb instructions.
24695 The default case using setcc followed by the conditional move can be
24696 done by generic code. */
24697 bool
24698 ix86_expand_int_addcc (rtx operands[])
24700 enum rtx_code code = GET_CODE (operands[1]);
24701 rtx flags;
24702 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24703 rtx compare_op;
24704 rtx val = const0_rtx;
24705 bool fpcmp = false;
24706 machine_mode mode;
24707 rtx op0 = XEXP (operands[1], 0);
24708 rtx op1 = XEXP (operands[1], 1);
24710 if (operands[3] != const1_rtx
24711 && operands[3] != constm1_rtx)
24712 return false;
24713 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24714 return false;
24715 code = GET_CODE (compare_op);
24717 flags = XEXP (compare_op, 0);
24719 if (GET_MODE (flags) == CCFPmode
24720 || GET_MODE (flags) == CCFPUmode)
24722 fpcmp = true;
24723 code = ix86_fp_compare_code_to_integer (code);
24726 if (code != LTU)
24728 val = constm1_rtx;
24729 if (fpcmp)
24730 PUT_CODE (compare_op,
24731 reverse_condition_maybe_unordered
24732 (GET_CODE (compare_op)));
24733 else
24734 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24737 mode = GET_MODE (operands[0]);
24739 /* Construct either adc or sbb insn. */
24740 if ((code == LTU) == (operands[3] == constm1_rtx))
24742 switch (mode)
24744 case E_QImode:
24745 insn = gen_subqi3_carry;
24746 break;
24747 case E_HImode:
24748 insn = gen_subhi3_carry;
24749 break;
24750 case E_SImode:
24751 insn = gen_subsi3_carry;
24752 break;
24753 case E_DImode:
24754 insn = gen_subdi3_carry;
24755 break;
24756 default:
24757 gcc_unreachable ();
24760 else
24762 switch (mode)
24764 case E_QImode:
24765 insn = gen_addqi3_carry;
24766 break;
24767 case E_HImode:
24768 insn = gen_addhi3_carry;
24769 break;
24770 case E_SImode:
24771 insn = gen_addsi3_carry;
24772 break;
24773 case E_DImode:
24774 insn = gen_adddi3_carry;
24775 break;
24776 default:
24777 gcc_unreachable ();
24780 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24782 return true;
24786 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
24787 but works for floating-point parameters and non-offsettable memories.
24788 For pushes, it returns just stack offsets; the values will be saved
24789 in the right order. Maximally four parts are generated. */
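/* By way of example, given the sizes computed below: on a 32-bit target
   a DImode or DFmode operand splits into two SImode parts, XFmode into
   three and TFmode into four, while on a 64-bit target XFmode and
   TFmode each split into two parts.  */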
24791 static int
24792 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24794 int size;
24796 if (!TARGET_64BIT)
24797 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24798 else
24799 size = (GET_MODE_SIZE (mode) + 4) / 8;
24801 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24802 gcc_assert (size >= 2 && size <= 4);
24804 /* Optimize constant pool reference to immediates. This is used by fp
24805 moves, that force all constants to memory to allow combining. */
24806 if (MEM_P (operand) && MEM_READONLY_P (operand))
24808 rtx tmp = maybe_get_pool_constant (operand);
24809 if (tmp)
24810 operand = tmp;
24813 if (MEM_P (operand) && !offsettable_memref_p (operand))
24815 /* The only non-offsettable memories we handle are pushes. */
24816 int ok = push_operand (operand, VOIDmode);
24818 gcc_assert (ok);
24820 operand = copy_rtx (operand);
24821 PUT_MODE (operand, word_mode);
24822 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24823 return size;
24826 if (GET_CODE (operand) == CONST_VECTOR)
24828 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24829 /* Caution: if we looked through a constant pool memory above,
24830 the operand may actually have a different mode now. That's
24831 ok, since we want to pun this all the way back to an integer. */
24832 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
24833 gcc_assert (operand != NULL);
24834 mode = imode;
24837 if (!TARGET_64BIT)
24839 if (mode == DImode)
24840 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
24841 else
24843 int i;
24845 if (REG_P (operand))
24847 gcc_assert (reload_completed);
24848 for (i = 0; i < size; i++)
24849 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
24851 else if (offsettable_memref_p (operand))
24853 operand = adjust_address (operand, SImode, 0);
24854 parts[0] = operand;
24855 for (i = 1; i < size; i++)
24856 parts[i] = adjust_address (operand, SImode, 4 * i);
24858 else if (CONST_DOUBLE_P (operand))
24860 const REAL_VALUE_TYPE *r;
24861 long l[4];
24863 r = CONST_DOUBLE_REAL_VALUE (operand);
24864 switch (mode)
24866 case E_TFmode:
24867 real_to_target (l, r, mode);
24868 parts[3] = gen_int_mode (l[3], SImode);
24869 parts[2] = gen_int_mode (l[2], SImode);
24870 break;
24871 case E_XFmode:
24872 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
24873 long double may not be 80-bit. */
24874 real_to_target (l, r, mode);
24875 parts[2] = gen_int_mode (l[2], SImode);
24876 break;
24877 case E_DFmode:
24878 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
24879 break;
24880 default:
24881 gcc_unreachable ();
24883 parts[1] = gen_int_mode (l[1], SImode);
24884 parts[0] = gen_int_mode (l[0], SImode);
24886 else
24887 gcc_unreachable ();
24890 else
24892 if (mode == TImode)
24893 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
24894 if (mode == XFmode || mode == TFmode)
24896 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
24897 if (REG_P (operand))
24899 gcc_assert (reload_completed);
24900 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
24901 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
24903 else if (offsettable_memref_p (operand))
24905 operand = adjust_address (operand, DImode, 0);
24906 parts[0] = operand;
24907 parts[1] = adjust_address (operand, upper_mode, 8);
24909 else if (CONST_DOUBLE_P (operand))
24911 long l[4];
24913 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
24915 /* real_to_target puts 32-bit pieces in each long. */
24916 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
24917 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
24918 << 32), DImode);
24920 if (upper_mode == SImode)
24921 parts[1] = gen_int_mode (l[2], SImode);
24922 else
24923 parts[1]
24924 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
24925 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
24926 << 32), DImode);
24928 else
24929 gcc_unreachable ();
24933 return size;
24936 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
24937 All required insns are emitted here. Operands 2 onwards receive the
24938 destination parts in the correct order; operands 6 onwards receive
24939 the corresponding source parts. */
24941 void
24942 ix86_split_long_move (rtx operands[])
24944 rtx part[2][4];
24945 int nparts, i, j;
24946 int push = 0;
24947 int collisions = 0;
24948 machine_mode mode = GET_MODE (operands[0]);
24949 bool collisionparts[4];
24951 /* The DFmode expanders may ask us to move a double.
24952 For a 64-bit target this is a single move. By hiding the fact
24953 here we simplify the i386.md splitters. */
24954 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
24956 /* Optimize constant pool reference to immediates. This is used by
24957 fp moves, that force all constants to memory to allow combining. */
24959 if (MEM_P (operands[1])
24960 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
24961 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
24962 operands[1] = get_pool_constant (XEXP (operands[1], 0));
24963 if (push_operand (operands[0], VOIDmode))
24965 operands[0] = copy_rtx (operands[0]);
24966 PUT_MODE (operands[0], word_mode);
24968 else
24969 operands[0] = gen_lowpart (DImode, operands[0]);
24970 operands[1] = gen_lowpart (DImode, operands[1]);
24971 emit_move_insn (operands[0], operands[1]);
24972 return;
24975 /* The only non-offsettable memory we handle is push. */
24976 if (push_operand (operands[0], VOIDmode))
24977 push = 1;
24978 else
24979 gcc_assert (!MEM_P (operands[0])
24980 || offsettable_memref_p (operands[0]));
24982 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
24983 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
24985 /* When emitting push, take care for source operands on the stack. */
24986 if (push && MEM_P (operands[1])
24987 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
24989 rtx src_base = XEXP (part[1][nparts - 1], 0);
24991 /* Compensate for the stack decrement by 4. */
24992 if (!TARGET_64BIT && nparts == 3
24993 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
24994 src_base = plus_constant (Pmode, src_base, 4);
24996 /* src_base refers to the stack pointer and is
24997 automatically decreased by emitted push. */
24998 for (i = 0; i < nparts; i++)
24999 part[1][i] = change_address (part[1][i],
25000 GET_MODE (part[1][i]), src_base);
25003 /* We need to do the copy in the right order in case an address register
25004 of the source overlaps the destination. */
25005 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25007 rtx tmp;
25009 for (i = 0; i < nparts; i++)
25011 collisionparts[i]
25012 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25013 if (collisionparts[i])
25014 collisions++;
25017 /* Collision in the middle part can be handled by reordering. */
25018 if (collisions == 1 && nparts == 3 && collisionparts [1])
25020 std::swap (part[0][1], part[0][2]);
25021 std::swap (part[1][1], part[1][2]);
25023 else if (collisions == 1
25024 && nparts == 4
25025 && (collisionparts [1] || collisionparts [2]))
25027 if (collisionparts [1])
25029 std::swap (part[0][1], part[0][2]);
25030 std::swap (part[1][1], part[1][2]);
25032 else
25034 std::swap (part[0][2], part[0][3]);
25035 std::swap (part[1][2], part[1][3]);
25039 /* If there are more collisions, we can't handle it by reordering.
25040 Do an lea to the last part and use only one colliding move. */
25041 else if (collisions > 1)
25043 rtx base, addr;
25045 collisions = 1;
25047 base = part[0][nparts - 1];
25049 /* Handle the case when the last part isn't valid for lea.
25050 Happens in 64-bit mode storing the 12-byte XFmode. */
25051 if (GET_MODE (base) != Pmode)
25052 base = gen_rtx_REG (Pmode, REGNO (base));
25054 addr = XEXP (part[1][0], 0);
25055 if (TARGET_TLS_DIRECT_SEG_REFS)
25057 struct ix86_address parts;
25058 int ok = ix86_decompose_address (addr, &parts);
25059 gcc_assert (ok);
25060 /* It is not valid to use %gs: or %fs: in lea. */
25061 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25063 emit_insn (gen_rtx_SET (base, addr));
25064 part[1][0] = replace_equiv_address (part[1][0], base);
25065 for (i = 1; i < nparts; i++)
25067 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25068 part[1][i] = replace_equiv_address (part[1][i], tmp);
25073 if (push)
25075 if (!TARGET_64BIT)
25077 if (nparts == 3)
25079 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25080 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25081 stack_pointer_rtx, GEN_INT (-4)));
25082 emit_move_insn (part[0][2], part[1][2]);
25084 else if (nparts == 4)
25086 emit_move_insn (part[0][3], part[1][3]);
25087 emit_move_insn (part[0][2], part[1][2]);
25090 else
25092 /* In 64-bit mode we don't have a 32-bit push available. In case this is
25093 a register, it is OK - we will just use the larger counterpart. We also
25094 retype the memory - this comes from an attempt to avoid a REX prefix on
25095 moving the second half of a TFmode value. */
25096 if (GET_MODE (part[1][1]) == SImode)
25098 switch (GET_CODE (part[1][1]))
25100 case MEM:
25101 part[1][1] = adjust_address (part[1][1], DImode, 0);
25102 break;
25104 case REG:
25105 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25106 break;
25108 default:
25109 gcc_unreachable ();
25112 if (GET_MODE (part[1][0]) == SImode)
25113 part[1][0] = part[1][1];
25116 emit_move_insn (part[0][1], part[1][1]);
25117 emit_move_insn (part[0][0], part[1][0]);
25118 return;
25121 /* Choose correct order to not overwrite the source before it is copied. */
25122 if ((REG_P (part[0][0])
25123 && REG_P (part[1][1])
25124 && (REGNO (part[0][0]) == REGNO (part[1][1])
25125 || (nparts == 3
25126 && REGNO (part[0][0]) == REGNO (part[1][2]))
25127 || (nparts == 4
25128 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25129 || (collisions > 0
25130 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25132 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25134 operands[2 + i] = part[0][j];
25135 operands[6 + i] = part[1][j];
25138 else
25140 for (i = 0; i < nparts; i++)
25142 operands[2 + i] = part[0][i];
25143 operands[6 + i] = part[1][i];
25147 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25148 if (optimize_insn_for_size_p ())
25150 for (j = 0; j < nparts - 1; j++)
25151 if (CONST_INT_P (operands[6 + j])
25152 && operands[6 + j] != const0_rtx
25153 && REG_P (operands[2 + j]))
25154 for (i = j; i < nparts - 1; i++)
25155 if (CONST_INT_P (operands[7 + i])
25156 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25157 operands[7 + i] = operands[2 + j];
25160 for (i = 0; i < nparts; i++)
25161 emit_move_insn (operands[2 + i], operands[6 + i]);
25163 return;
25166 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25167 left shift by a constant, either using a single shift or
25168 a sequence of add instructions. */
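/* For example, when the add sequence is chosen with COUNT == 2, the
   operand is added to itself twice - each add doubles it, i.e. shifts
   it left by one - which the cost test below prefers whenever two adds
   are no more expensive than a single shift by a constant.  */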
25170 static void
25171 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25173 rtx (*insn)(rtx, rtx, rtx);
25175 if (count == 1
25176 || (count * ix86_cost->add <= ix86_cost->shift_const
25177 && !optimize_insn_for_size_p ()))
25179 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25180 while (count-- > 0)
25181 emit_insn (insn (operand, operand, operand));
25183 else
25185 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25186 emit_insn (insn (operand, operand, GEN_INT (count)));
25190 void
25191 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25193 rtx (*gen_ashl3)(rtx, rtx, rtx);
25194 rtx (*gen_shld)(rtx, rtx, rtx);
25195 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25197 rtx low[2], high[2];
25198 int count;
25200 if (CONST_INT_P (operands[2]))
25202 split_double_mode (mode, operands, 2, low, high);
25203 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25205 if (count >= half_width)
25207 emit_move_insn (high[0], low[1]);
25208 emit_move_insn (low[0], const0_rtx);
25210 if (count > half_width)
25211 ix86_expand_ashl_const (high[0], count - half_width, mode);
25213 else
25215 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25217 if (!rtx_equal_p (operands[0], operands[1]))
25218 emit_move_insn (operands[0], operands[1]);
25220 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25221 ix86_expand_ashl_const (low[0], count, mode);
25223 return;
25226 split_double_mode (mode, operands, 1, low, high);
25228 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25230 if (operands[1] == const1_rtx)
25232 /* Assuming we've chosen QImode-capable registers, 1 << N
25233 can be done with two 32/64-bit shifts, no branches, no cmoves. */
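/* Worked example (32-bit target, DImode shift): for 1 << 37, bit 5 of
   the shift count is set, so a 1 is placed in the high half and a 0 in
   the low half; the two 32-bit shifts by the count that follow (the
   hardware masks the count to 5 bits, and 37 & 31 == 5) then leave
   1 << 5 in the high half, i.e. the 64-bit value 1 << 37.  */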
25234 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25236 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25238 ix86_expand_clear (low[0]);
25239 ix86_expand_clear (high[0]);
25240 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25242 d = gen_lowpart (QImode, low[0]);
25243 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25244 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25245 emit_insn (gen_rtx_SET (d, s));
25247 d = gen_lowpart (QImode, high[0]);
25248 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25249 s = gen_rtx_NE (QImode, flags, const0_rtx);
25250 emit_insn (gen_rtx_SET (d, s));
25253 /* Otherwise, we can get the same results by manually performing
25254 a bit extract operation on bit 5/6, and then performing the two
25255 shifts. The two methods of getting 0/1 into low/high are exactly
25256 the same size. Avoiding the shift in the bit extract case helps
25257 pentium4 a bit; no one else seems to care much either way. */
25258 else
25260 machine_mode half_mode;
25261 rtx (*gen_lshr3)(rtx, rtx, rtx);
25262 rtx (*gen_and3)(rtx, rtx, rtx);
25263 rtx (*gen_xor3)(rtx, rtx, rtx);
25264 HOST_WIDE_INT bits;
25265 rtx x;
25267 if (mode == DImode)
25269 half_mode = SImode;
25270 gen_lshr3 = gen_lshrsi3;
25271 gen_and3 = gen_andsi3;
25272 gen_xor3 = gen_xorsi3;
25273 bits = 5;
25275 else
25277 half_mode = DImode;
25278 gen_lshr3 = gen_lshrdi3;
25279 gen_and3 = gen_anddi3;
25280 gen_xor3 = gen_xordi3;
25281 bits = 6;
25284 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25285 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25286 else
25287 x = gen_lowpart (half_mode, operands[2]);
25288 emit_insn (gen_rtx_SET (high[0], x));
25290 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25291 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25292 emit_move_insn (low[0], high[0]);
25293 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25296 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25297 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25298 return;
25301 if (operands[1] == constm1_rtx)
25303 /* For -1 << N, we can avoid the shld instruction, because we
25304 know that we're shifting 0...31/63 ones into a -1. */
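/* E.g. for -1 << 4 in DImode on a 32-bit target, the high half can stay
   -1 (the ones shifted in from the low half keep it all-ones) and only
   the low half needs the shift, giving low = 0xfffffff0,
   high = 0xffffffff; counts of 32 or more are fixed up afterwards by
   the gen_x86_shift_adj sequence below.  */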
25305 emit_move_insn (low[0], constm1_rtx);
25306 if (optimize_insn_for_size_p ())
25307 emit_move_insn (high[0], low[0]);
25308 else
25309 emit_move_insn (high[0], constm1_rtx);
25311 else
25313 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25315 if (!rtx_equal_p (operands[0], operands[1]))
25316 emit_move_insn (operands[0], operands[1]);
25318 split_double_mode (mode, operands, 1, low, high);
25319 emit_insn (gen_shld (high[0], low[0], operands[2]));
25322 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25324 if (TARGET_CMOVE && scratch)
25326 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25327 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25329 ix86_expand_clear (scratch);
25330 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25332 else
25334 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25335 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25337 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25341 void
25342 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25344 rtx (*gen_ashr3)(rtx, rtx, rtx)
25345 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25346 rtx (*gen_shrd)(rtx, rtx, rtx);
25347 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25349 rtx low[2], high[2];
25350 int count;
25352 if (CONST_INT_P (operands[2]))
25354 split_double_mode (mode, operands, 2, low, high);
25355 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25357 if (count == GET_MODE_BITSIZE (mode) - 1)
25359 emit_move_insn (high[0], high[1]);
25360 emit_insn (gen_ashr3 (high[0], high[0],
25361 GEN_INT (half_width - 1)));
25362 emit_move_insn (low[0], high[0]);
25365 else if (count >= half_width)
25367 emit_move_insn (low[0], high[1]);
25368 emit_move_insn (high[0], low[0]);
25369 emit_insn (gen_ashr3 (high[0], high[0],
25370 GEN_INT (half_width - 1)));
25372 if (count > half_width)
25373 emit_insn (gen_ashr3 (low[0], low[0],
25374 GEN_INT (count - half_width)));
25376 else
25378 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25380 if (!rtx_equal_p (operands[0], operands[1]))
25381 emit_move_insn (operands[0], operands[1]);
25383 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25384 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25387 else
25389 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25391 if (!rtx_equal_p (operands[0], operands[1]))
25392 emit_move_insn (operands[0], operands[1]);
25394 split_double_mode (mode, operands, 1, low, high);
25396 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25397 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25399 if (TARGET_CMOVE && scratch)
25401 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25402 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25404 emit_move_insn (scratch, high[0]);
25405 emit_insn (gen_ashr3 (scratch, scratch,
25406 GEN_INT (half_width - 1)));
25407 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25408 scratch));
25410 else
25412 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25413 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25415 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25420 void
25421 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25423 rtx (*gen_lshr3)(rtx, rtx, rtx)
25424 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25425 rtx (*gen_shrd)(rtx, rtx, rtx);
25426 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25428 rtx low[2], high[2];
25429 int count;
25431 if (CONST_INT_P (operands[2]))
25433 split_double_mode (mode, operands, 2, low, high);
25434 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25436 if (count >= half_width)
25438 emit_move_insn (low[0], high[1]);
25439 ix86_expand_clear (high[0]);
25441 if (count > half_width)
25442 emit_insn (gen_lshr3 (low[0], low[0],
25443 GEN_INT (count - half_width)));
25445 else
25447 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25449 if (!rtx_equal_p (operands[0], operands[1]))
25450 emit_move_insn (operands[0], operands[1]);
25452 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25453 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25456 else
25458 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25460 if (!rtx_equal_p (operands[0], operands[1]))
25461 emit_move_insn (operands[0], operands[1]);
25463 split_double_mode (mode, operands, 1, low, high);
25465 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25466 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25468 if (TARGET_CMOVE && scratch)
25470 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25471 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25473 ix86_expand_clear (scratch);
25474 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25475 scratch));
25477 else
25479 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25480 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25482 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25487 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25488 static void
25489 predict_jump (int prob)
25491 rtx_insn *insn = get_last_insn ();
25492 gcc_assert (JUMP_P (insn));
25493 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25496 /* Helper function for the string operations below. Test whether
25497 VARIABLE is aligned to VALUE bytes. If so, jump to the label. */
25498 static rtx_code_label *
25499 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25501 rtx_code_label *label = gen_label_rtx ();
25502 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25503 if (GET_MODE (variable) == DImode)
25504 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25505 else
25506 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25507 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25508 1, label);
25509 if (epilogue)
25510 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25511 else
25512 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25513 return label;
25516 /* Adjust COUNTER by the VALUE. */
25517 static void
25518 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25520 rtx (*gen_add)(rtx, rtx, rtx)
25521 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25523 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25526 /* Zero extend possibly SImode EXP to Pmode register. */
25528 ix86_zero_extend_to_Pmode (rtx exp)
25530 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25533 /* Divide COUNTREG by SCALE. */
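/* SCALE is expected to be a power of two; a non-constant COUNTREG is divided
   by shifting it right by exact_log2 (SCALE), as in the body below.  */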
25534 static rtx
25535 scale_counter (rtx countreg, int scale)
25537 rtx sc;
25539 if (scale == 1)
25540 return countreg;
25541 if (CONST_INT_P (countreg))
25542 return GEN_INT (INTVAL (countreg) / scale);
25543 gcc_assert (REG_P (countreg));
25545 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25546 GEN_INT (exact_log2 (scale)),
25547 NULL, 1, OPTAB_DIRECT);
25548 return sc;
25551 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25552 DImode for constant loop counts. */
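/* For example, on 64-bit targets a constant count that does not fit in
   32 bits (INTVAL (count_exp) & ~0xffffffff is nonzero) forces DImode,
   while smaller constants use SImode.  */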
25554 static machine_mode
25555 counter_mode (rtx count_exp)
25557 if (GET_MODE (count_exp) != VOIDmode)
25558 return GET_MODE (count_exp);
25559 if (!CONST_INT_P (count_exp))
25560 return Pmode;
25561 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25562 return DImode;
25563 return SImode;
25566 /* Copy the address to a Pmode register. This is used for x32 to
25567 truncate DImode TLS address to a SImode register. */
25569 static rtx
25570 ix86_copy_addr_to_reg (rtx addr)
25572 rtx reg;
25573 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25575 reg = copy_addr_to_reg (addr);
25576 REG_POINTER (reg) = 1;
25577 return reg;
25579 else
25581 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25582 reg = copy_to_mode_reg (DImode, addr);
25583 REG_POINTER (reg) = 1;
25584 return gen_rtx_SUBREG (SImode, reg, 0);
25588 /* When ISSETMEM is FALSE, output a simple loop that copies the memory pointed
25589 to by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
25590 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
25591 equivalent loop that sets the memory to VALUE (supposed to be in MODE).
25593 The size is rounded down to a whole number of chunks moved at once.
25594 SRCMEM and DESTMEM provide the MEM rtxes used to feed proper aliasing info. */
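/* As a reader aid, the emitted sequence is roughly the following sketch,
   where PIECE stands for GET_MODE_SIZE (MODE) * UNROLL and the zero-size
   guard is only emitted for one-byte chunks:

       size = count & -PIECE;
       if (size == 0) goto out;
       iter = 0;
     top:
       copy or set UNROLL chunks of MODE at DESTPTR + iter (and SRCPTR + iter);
       iter += PIECE;
       if (iter < size) goto top;
       DESTPTR += iter;
       SRCPTR += iter;   (memcpy only)
     out:  */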
25597 static void
25598 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25599 rtx destptr, rtx srcptr, rtx value,
25600 rtx count, machine_mode mode, int unroll,
25601 int expected_size, bool issetmem)
25603 rtx_code_label *out_label, *top_label;
25604 rtx iter, tmp;
25605 machine_mode iter_mode = counter_mode (count);
25606 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25607 rtx piece_size = GEN_INT (piece_size_n);
25608 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25609 rtx size;
25610 int i;
25612 top_label = gen_label_rtx ();
25613 out_label = gen_label_rtx ();
25614 iter = gen_reg_rtx (iter_mode);
25616 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25617 NULL, 1, OPTAB_DIRECT);
25618 /* Those two should combine. */
25619 if (piece_size == const1_rtx)
25621 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25622 true, out_label);
25623 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25625 emit_move_insn (iter, const0_rtx);
25627 emit_label (top_label);
25629 tmp = convert_modes (Pmode, iter_mode, iter, true);
25631 /* This assert could be relaxed - in that case we'd need to compute the
25632 largest power of two that divides PIECE_SIZE_N and pass it to
25633 offset_address. */
25634 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25635 destmem = offset_address (destmem, tmp, piece_size_n);
25636 destmem = adjust_address (destmem, mode, 0);
25638 if (!issetmem)
25640 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25641 srcmem = adjust_address (srcmem, mode, 0);
25643 /* When unrolling for chips that reorder memory reads and writes,
25644 we can save registers by using a single temporary.
25645 Also, using 4 temporaries is overkill in 32-bit mode. */
25646 if (!TARGET_64BIT && 0)
25648 for (i = 0; i < unroll; i++)
25650 if (i)
25652 destmem =
25653 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25654 srcmem =
25655 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25657 emit_move_insn (destmem, srcmem);
25660 else
25662 rtx tmpreg[4];
25663 gcc_assert (unroll <= 4);
25664 for (i = 0; i < unroll; i++)
25666 tmpreg[i] = gen_reg_rtx (mode);
25667 if (i)
25669 srcmem =
25670 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25672 emit_move_insn (tmpreg[i], srcmem);
25674 for (i = 0; i < unroll; i++)
25676 if (i)
25678 destmem =
25679 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25681 emit_move_insn (destmem, tmpreg[i]);
25685 else
25686 for (i = 0; i < unroll; i++)
25688 if (i)
25689 destmem =
25690 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25691 emit_move_insn (destmem, value);
25694 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25695 true, OPTAB_LIB_WIDEN);
25696 if (tmp != iter)
25697 emit_move_insn (iter, tmp);
25699 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25700 true, top_label);
25701 if (expected_size != -1)
25703 expected_size /= GET_MODE_SIZE (mode) * unroll;
25704 if (expected_size == 0)
25705 predict_jump (0);
25706 else if (expected_size > REG_BR_PROB_BASE)
25707 predict_jump (REG_BR_PROB_BASE - 1);
25708 else
25709 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25711 else
25712 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25713 iter = ix86_zero_extend_to_Pmode (iter);
25714 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25715 true, OPTAB_LIB_WIDEN);
25716 if (tmp != destptr)
25717 emit_move_insn (destptr, tmp);
25718 if (!issetmem)
25720 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25721 true, OPTAB_LIB_WIDEN);
25722 if (tmp != srcptr)
25723 emit_move_insn (srcptr, tmp);
25725 emit_label (out_label);
25728 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
25729 argument.  When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25730 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25731 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25732 ORIG_VALUE is the original value passed to memset to fill the memory with.
25733 Other arguments have the same meaning as for the previous function. */
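/* The expansion ultimately emits a single rep-prefixed string instruction:
   "rep stos{b,l,q}" for memset or "rep movs{b,l,q}" for memcpy, with the
   count register pre-scaled by scale_counter to the number of MODE-sized
   chunks.  */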
25735 static void
25736 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25737 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25738 rtx count,
25739 machine_mode mode, bool issetmem)
25741 rtx destexp;
25742 rtx srcexp;
25743 rtx countreg;
25744 HOST_WIDE_INT rounded_count;
25746 /* If possible, it is shorter to use rep movs.
25747 TODO: Maybe it is better to move this logic to decide_alg. */
25748 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25749 && (!issetmem || orig_value == const0_rtx))
25750 mode = SImode;
25752 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25753 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25755 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25756 GET_MODE_SIZE (mode)));
25757 if (mode != QImode)
25759 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25760 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25761 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25763 else
25764 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25765 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25767 rounded_count
25768 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25769 destmem = shallow_copy_rtx (destmem);
25770 set_mem_size (destmem, rounded_count);
25772 else if (MEM_SIZE_KNOWN_P (destmem))
25773 clear_mem_size (destmem);
25775 if (issetmem)
25777 value = force_reg (mode, gen_lowpart (mode, value));
25778 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25780 else
25782 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25783 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25784 if (mode != QImode)
25786 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25787 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25788 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25790 else
25791 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25792 if (CONST_INT_P (count))
25794 rounded_count
25795 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25796 srcmem = shallow_copy_rtx (srcmem);
25797 set_mem_size (srcmem, rounded_count);
25799 else
25801 if (MEM_SIZE_KNOWN_P (srcmem))
25802 clear_mem_size (srcmem);
25804 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25805 destexp, srcexp));
25809 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25810 DESTMEM.
25811 SRCMEM is passed by pointer so that it can be updated on return.
25812 The return value is the updated DESTMEM. */
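/* SIZE_TO_MOVE must be a multiple of the piece size chosen below (the
   gcc_assert enforces this); the callers only pass power-of-two sizes, so
   the widest usable move mode always divides SIZE_TO_MOVE evenly.  */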
25813 static rtx
25814 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25815 HOST_WIDE_INT size_to_move)
25817 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25818 enum insn_code code;
25819 machine_mode move_mode;
25820 int piece_size, i;
25822 /* Find the widest mode in which we could perform moves.
25823 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
25824 it until a move of that size is supported. */
25825 piece_size = 1 << floor_log2 (size_to_move);
25826 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25827 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25829 gcc_assert (piece_size > 1);
25830 piece_size >>= 1;
25833 /* Find the corresponding vector mode with the same size as MOVE_MODE.
25834 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
25835 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
25837 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
25838 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
25839 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25841 move_mode = word_mode;
25842 piece_size = GET_MODE_SIZE (move_mode);
25843 code = optab_handler (mov_optab, move_mode);
25846 gcc_assert (code != CODE_FOR_nothing);
25848 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
25849 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
25851 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
25852 gcc_assert (size_to_move % piece_size == 0);
25853 adjust = GEN_INT (piece_size);
25854 for (i = 0; i < size_to_move; i += piece_size)
25856 /* We move from memory to memory, so we'll need to do it via
25857 a temporary register. */
25858 tempreg = gen_reg_rtx (move_mode);
25859 emit_insn (GEN_FCN (code) (tempreg, src));
25860 emit_insn (GEN_FCN (code) (dst, tempreg));
25862 emit_move_insn (destptr,
25863 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
25864 emit_move_insn (srcptr,
25865 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
25867 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
25868 piece_size);
25869 src = adjust_automodify_address_nv (src, move_mode, srcptr,
25870 piece_size);
25873 /* Update DST and SRC rtx. */
25874 *srcmem = src;
25875 return dst;
25878 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
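/* Strategy: for a compile-time COUNT the epilogue is fully unrolled, one
   emit_memmov call per set bit of COUNT % MAX_SIZE; for unknown counts a
   byte-copy loop is used when MAX_SIZE > 8; otherwise bits 4, 2 and 1 of
   COUNT are tested and a conditional SImode/HImode/QImode move is emitted
   for each.  */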
25879 static void
25880 expand_movmem_epilogue (rtx destmem, rtx srcmem,
25881 rtx destptr, rtx srcptr, rtx count, int max_size)
25883 rtx src, dest;
25884 if (CONST_INT_P (count))
25886 HOST_WIDE_INT countval = INTVAL (count);
25887 HOST_WIDE_INT epilogue_size = countval % max_size;
25888 int i;
25890 /* For now MAX_SIZE should be a power of 2. This assert could be
25891 relaxed, but it'll require a bit more complicated epilogue
25892 expanding. */
25893 gcc_assert ((max_size & (max_size - 1)) == 0);
25894 for (i = max_size; i >= 1; i >>= 1)
25896 if (epilogue_size & i)
25897 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
25899 return;
25901 if (max_size > 8)
25903 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
25904 count, 1, OPTAB_DIRECT);
25905 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
25906 count, QImode, 1, 4, false);
25907 return;
25910 /* When single-instruction stringops are available, we can cheaply advance the
25911 dest and src pointers.  Otherwise we save code size by maintaining an offset
25912 (zero is readily available from the preceding rep operation) and using x86 addressing modes.
25914 if (TARGET_SINGLE_STRINGOP)
25916 if (max_size > 4)
25918 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
25919 src = change_address (srcmem, SImode, srcptr);
25920 dest = change_address (destmem, SImode, destptr);
25921 emit_insn (gen_strmov (destptr, dest, srcptr, src));
25922 emit_label (label);
25923 LABEL_NUSES (label) = 1;
25925 if (max_size > 2)
25927 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
25928 src = change_address (srcmem, HImode, srcptr);
25929 dest = change_address (destmem, HImode, destptr);
25930 emit_insn (gen_strmov (destptr, dest, srcptr, src));
25931 emit_label (label);
25932 LABEL_NUSES (label) = 1;
25934 if (max_size > 1)
25936 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
25937 src = change_address (srcmem, QImode, srcptr);
25938 dest = change_address (destmem, QImode, destptr);
25939 emit_insn (gen_strmov (destptr, dest, srcptr, src));
25940 emit_label (label);
25941 LABEL_NUSES (label) = 1;
25944 else
25946 rtx offset = force_reg (Pmode, const0_rtx);
25947 rtx tmp;
25949 if (max_size > 4)
25951 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
25952 src = change_address (srcmem, SImode, srcptr);
25953 dest = change_address (destmem, SImode, destptr);
25954 emit_move_insn (dest, src);
25955 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
25956 true, OPTAB_LIB_WIDEN);
25957 if (tmp != offset)
25958 emit_move_insn (offset, tmp);
25959 emit_label (label);
25960 LABEL_NUSES (label) = 1;
25962 if (max_size > 2)
25964 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
25965 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
25966 src = change_address (srcmem, HImode, tmp);
25967 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
25968 dest = change_address (destmem, HImode, tmp);
25969 emit_move_insn (dest, src);
25970 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
25971 true, OPTAB_LIB_WIDEN);
25972 if (tmp != offset)
25973 emit_move_insn (offset, tmp);
25974 emit_label (label);
25975 LABEL_NUSES (label) = 1;
25977 if (max_size > 1)
25979 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
25980 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
25981 src = change_address (srcmem, QImode, tmp);
25982 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
25983 dest = change_address (destmem, QImode, tmp);
25984 emit_move_insn (dest, src);
25985 emit_label (label);
25986 LABEL_NUSES (label) = 1;
25991 /* This function emits stores to fill SIZE_TO_MOVE bytes starting from DESTMEM
25992 with the value PROMOTED_VAL.
25993 DESTPTR is advanced as the stores are emitted.
25994 The return value is the updated DESTMEM. */
25995 static rtx
25996 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
25997 HOST_WIDE_INT size_to_move)
25999 rtx dst = destmem, adjust;
26000 enum insn_code code;
26001 machine_mode move_mode;
26002 int piece_size, i;
26004 /* Find the widest mode in which we could perform the stores.
26005 Start from the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE is
26006 smaller than that mode's size. */
26007 move_mode = GET_MODE (promoted_val);
26008 if (move_mode == VOIDmode)
26009 move_mode = QImode;
26010 if (size_to_move < GET_MODE_SIZE (move_mode))
26012 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26013 move_mode = int_mode_for_size (move_bits, 0).require ();
26014 promoted_val = gen_lowpart (move_mode, promoted_val);
26016 piece_size = GET_MODE_SIZE (move_mode);
26017 code = optab_handler (mov_optab, move_mode);
26018 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26020 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26022 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26023 gcc_assert (size_to_move % piece_size == 0);
26024 adjust = GEN_INT (piece_size);
26025 for (i = 0; i < size_to_move; i += piece_size)
26027 if (piece_size <= GET_MODE_SIZE (word_mode))
26029 emit_insn (gen_strset (destptr, dst, promoted_val));
26030 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26031 piece_size);
26032 continue;
26035 emit_insn (GEN_FCN (code) (dst, promoted_val));
26037 emit_move_insn (destptr,
26038 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26040 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26041 piece_size);
26044 /* Update DST rtx. */
26045 return dst;
26047 /* Output code to set at most count & (max_size - 1) bytes starting at DEST, using a QImode loop. */
26048 static void
26049 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26050 rtx count, int max_size)
26052 count =
26053 expand_simple_binop (counter_mode (count), AND, count,
26054 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26055 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26056 gen_lowpart (QImode, value), count, QImode,
26057 1, max_size / 2, true);
26060 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
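/* As with the memcpy epilogue above, a compile-time COUNT is handled by
   fully unrolled stores (one per set bit of COUNT % MAX_SIZE, using
   VEC_VALUE for the wide pieces when available); unknown counts fall back
   to a byte loop when MAX_SIZE > 32, or to conditional 16/8/4/2/1-byte
   stores guarded by ix86_expand_aligntest.  */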
26061 static void
26062 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26063 rtx count, int max_size)
26065 rtx dest;
26067 if (CONST_INT_P (count))
26069 HOST_WIDE_INT countval = INTVAL (count);
26070 HOST_WIDE_INT epilogue_size = countval % max_size;
26071 int i;
26073 /* For now MAX_SIZE should be a power of 2. This assert could be
26074 relaxed, but it'll require a bit more complicated epilogue
26075 expanding. */
26076 gcc_assert ((max_size & (max_size - 1)) == 0);
26077 for (i = max_size; i >= 1; i >>= 1)
26079 if (epilogue_size & i)
26081 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26082 destmem = emit_memset (destmem, destptr, vec_value, i);
26083 else
26084 destmem = emit_memset (destmem, destptr, value, i);
26087 return;
26089 if (max_size > 32)
26091 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26092 return;
26094 if (max_size > 16)
26096 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26097 if (TARGET_64BIT)
26099 dest = change_address (destmem, DImode, destptr);
26100 emit_insn (gen_strset (destptr, dest, value));
26101 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26102 emit_insn (gen_strset (destptr, dest, value));
26104 else
26106 dest = change_address (destmem, SImode, destptr);
26107 emit_insn (gen_strset (destptr, dest, value));
26108 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26109 emit_insn (gen_strset (destptr, dest, value));
26110 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26111 emit_insn (gen_strset (destptr, dest, value));
26112 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26113 emit_insn (gen_strset (destptr, dest, value));
26115 emit_label (label);
26116 LABEL_NUSES (label) = 1;
26118 if (max_size > 8)
26120 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26121 if (TARGET_64BIT)
26123 dest = change_address (destmem, DImode, destptr);
26124 emit_insn (gen_strset (destptr, dest, value));
26126 else
26128 dest = change_address (destmem, SImode, destptr);
26129 emit_insn (gen_strset (destptr, dest, value));
26130 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26131 emit_insn (gen_strset (destptr, dest, value));
26133 emit_label (label);
26134 LABEL_NUSES (label) = 1;
26136 if (max_size > 4)
26138 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26139 dest = change_address (destmem, SImode, destptr);
26140 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26141 emit_label (label);
26142 LABEL_NUSES (label) = 1;
26144 if (max_size > 2)
26146 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26147 dest = change_address (destmem, HImode, destptr);
26148 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26149 emit_label (label);
26150 LABEL_NUSES (label) = 1;
26152 if (max_size > 1)
26154 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26155 dest = change_address (destmem, QImode, destptr);
26156 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26157 emit_label (label);
26158 LABEL_NUSES (label) = 1;
26162 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM or
26163 set enough bytes of DESTMEM to align it to DESIRED_ALIGNMENT.  The original
26164 alignment is ALIGN.  Depending on ISSETMEM, either the arguments SRCMEM/SRCPTR
26165 or VALUE/VEC_VALUE are ignored.
26166 The return value is the updated DESTMEM. */
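/* The loop below walks the power-of-two sizes from 1 up to
   DESIRED_ALIGNMENT / 2; for every size not already guaranteed by ALIGN it
   emits an alignment test on DESTPTR followed by a conditional copy or set
   of that many bytes, adjusts COUNT, and records the newly proven MEM
   alignment.  */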
26167 static rtx
26168 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26169 rtx destptr, rtx srcptr, rtx value,
26170 rtx vec_value, rtx count, int align,
26171 int desired_alignment, bool issetmem)
26173 int i;
26174 for (i = 1; i < desired_alignment; i <<= 1)
26176 if (align <= i)
26178 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26179 if (issetmem)
26181 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26182 destmem = emit_memset (destmem, destptr, vec_value, i);
26183 else
26184 destmem = emit_memset (destmem, destptr, value, i);
26186 else
26187 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26188 ix86_adjust_counter (count, i);
26189 emit_label (label);
26190 LABEL_NUSES (label) = 1;
26191 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26194 return destmem;
26197 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26198 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26199 and jump to DONE_LABEL. */
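/* The trick used here: SIZE bytes are copied (or set) from the start of the
   block and another SIZE bytes are copied so that they end exactly at
   DESTPTR + COUNT.  The two ranges may overlap, which is harmless, and
   together they cover any COUNT in the SIZE..2*SIZE-1 range.  */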
26200 static void
26201 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26202 rtx destptr, rtx srcptr,
26203 rtx value, rtx vec_value,
26204 rtx count, int size,
26205 rtx done_label, bool issetmem)
26207 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26208 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26209 rtx modesize;
26210 int n;
26212 /* If we do not have vector value to copy, we must reduce size. */
26213 if (issetmem)
26215 if (!vec_value)
26217 if (GET_MODE (value) == VOIDmode && size > 8)
26218 mode = Pmode;
26219 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26220 mode = GET_MODE (value);
26222 else
26223 mode = GET_MODE (vec_value), value = vec_value;
26225 else
26227 /* Choose appropriate vector mode. */
26228 if (size >= 32)
26229 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26230 else if (size >= 16)
26231 mode = TARGET_SSE ? V16QImode : DImode;
26232 srcmem = change_address (srcmem, mode, srcptr);
26234 destmem = change_address (destmem, mode, destptr);
26235 modesize = GEN_INT (GET_MODE_SIZE (mode));
26236 gcc_assert (GET_MODE_SIZE (mode) <= size);
26237 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26239 if (issetmem)
26240 emit_move_insn (destmem, gen_lowpart (mode, value));
26241 else
26243 emit_move_insn (destmem, srcmem);
26244 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26246 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26249 destmem = offset_address (destmem, count, 1);
26250 destmem = offset_address (destmem, GEN_INT (-2 * size),
26251 GET_MODE_SIZE (mode));
26252 if (!issetmem)
26254 srcmem = offset_address (srcmem, count, 1);
26255 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26256 GET_MODE_SIZE (mode));
26258 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26260 if (issetmem)
26261 emit_move_insn (destmem, gen_lowpart (mode, value));
26262 else
26264 emit_move_insn (destmem, srcmem);
26265 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26267 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26269 emit_jump_insn (gen_jump (done_label));
26270 emit_barrier ();
26272 emit_label (label);
26273 LABEL_NUSES (label) = 1;
26276 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26277 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26278 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
26279 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
26280 DONE_LABEL is a label after the whole copying sequence. The label is created
26281 on demand if *DONE_LABEL is NULL.
26282 MIN_SIZE is the minimal size of the block copied.  This value gets adjusted for
26283 new bounds after the initial copies.
26285 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26286 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
26287 we will dispatch to a library call for large blocks.
26289 In pseudocode we do:
26291 if (COUNT < SIZE)
26293 Assume that SIZE is 4. Bigger sizes are handled analogously
26294 if (COUNT & 4)
26296 copy 4 bytes from SRCPTR to DESTPTR
26297 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26298 goto done_label
26300 if (!COUNT)
26301 goto done_label;
26302 copy 1 byte from SRCPTR to DESTPTR
26303 if (COUNT & 2)
26305 copy 2 bytes from SRCPTR to DESTPTR
26306 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26309 else
26311 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26312 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26314 OLD_DESTPTR = DESTPTR;
26315 Align DESTPTR up to DESIRED_ALIGN
26316 SRCPTR += DESTPTR - OLD_DESTPTR
26317 COUNT -= DEST_PTR - OLD_DESTPTR
26318 if (DYNAMIC_CHECK)
26319 Round COUNT down to multiple of SIZE
26320 << optional caller supplied zero size guard is here >>
26321 << optional caller supplied dynamic check is here >>
26322 << caller supplied main copy loop is here >>
26324 done_label:
26326 static void
26327 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26328 rtx *destptr, rtx *srcptr,
26329 machine_mode mode,
26330 rtx value, rtx vec_value,
26331 rtx *count,
26332 rtx_code_label **done_label,
26333 int size,
26334 int desired_align,
26335 int align,
26336 unsigned HOST_WIDE_INT *min_size,
26337 bool dynamic_check,
26338 bool issetmem)
26340 rtx_code_label *loop_label = NULL, *label;
26341 int n;
26342 rtx modesize;
26343 int prolog_size = 0;
26344 rtx mode_value;
26346 /* Choose the proper value to copy. */
26347 if (issetmem && VECTOR_MODE_P (mode))
26348 mode_value = vec_value;
26349 else
26350 mode_value = value;
26351 gcc_assert (GET_MODE_SIZE (mode) <= size);
26353 /* See if block is big or small, handle small blocks. */
26354 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26356 int size2 = size;
26357 loop_label = gen_label_rtx ();
26359 if (!*done_label)
26360 *done_label = gen_label_rtx ();
26362 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26363 1, loop_label);
26364 size2 >>= 1;
26366 /* Handle sizes > 3. */
26367 for (;size2 > 2; size2 >>= 1)
26368 expand_small_movmem_or_setmem (destmem, srcmem,
26369 *destptr, *srcptr,
26370 value, vec_value,
26371 *count,
26372 size2, *done_label, issetmem);
26373 /* Nothing to copy? Jump to DONE_LABEL if so */
26374 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26375 1, *done_label);
26377 /* Do a byte copy. */
26378 destmem = change_address (destmem, QImode, *destptr);
26379 if (issetmem)
26380 emit_move_insn (destmem, gen_lowpart (QImode, value));
26381 else
26383 srcmem = change_address (srcmem, QImode, *srcptr);
26384 emit_move_insn (destmem, srcmem);
26387 /* Handle sizes 2 and 3. */
26388 label = ix86_expand_aligntest (*count, 2, false);
26389 destmem = change_address (destmem, HImode, *destptr);
26390 destmem = offset_address (destmem, *count, 1);
26391 destmem = offset_address (destmem, GEN_INT (-2), 2);
26392 if (issetmem)
26393 emit_move_insn (destmem, gen_lowpart (HImode, value));
26394 else
26396 srcmem = change_address (srcmem, HImode, *srcptr);
26397 srcmem = offset_address (srcmem, *count, 1);
26398 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26399 emit_move_insn (destmem, srcmem);
26402 emit_label (label);
26403 LABEL_NUSES (label) = 1;
26404 emit_jump_insn (gen_jump (*done_label));
26405 emit_barrier ();
26407 else
26408 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26409 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26411 /* Start memcpy for COUNT >= SIZE. */
26412 if (loop_label)
26414 emit_label (loop_label);
26415 LABEL_NUSES (loop_label) = 1;
26418 /* Copy first desired_align bytes. */
26419 if (!issetmem)
26420 srcmem = change_address (srcmem, mode, *srcptr);
26421 destmem = change_address (destmem, mode, *destptr);
26422 modesize = GEN_INT (GET_MODE_SIZE (mode));
26423 for (n = 0; prolog_size < desired_align - align; n++)
26425 if (issetmem)
26426 emit_move_insn (destmem, mode_value);
26427 else
26429 emit_move_insn (destmem, srcmem);
26430 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26432 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26433 prolog_size += GET_MODE_SIZE (mode);
26437 /* Copy last SIZE bytes. */
26438 destmem = offset_address (destmem, *count, 1);
26439 destmem = offset_address (destmem,
26440 GEN_INT (-size - prolog_size),
26442 if (issetmem)
26443 emit_move_insn (destmem, mode_value);
26444 else
26446 srcmem = offset_address (srcmem, *count, 1);
26447 srcmem = offset_address (srcmem,
26448 GEN_INT (-size - prolog_size),
26450 emit_move_insn (destmem, srcmem);
26452 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26454 destmem = offset_address (destmem, modesize, 1);
26455 if (issetmem)
26456 emit_move_insn (destmem, mode_value);
26457 else
26459 srcmem = offset_address (srcmem, modesize, 1);
26460 emit_move_insn (destmem, srcmem);
26464 /* Align destination. */
26465 if (desired_align > 1 && desired_align > align)
26467 rtx saveddest = *destptr;
26469 gcc_assert (desired_align <= size);
26470 /* Align destptr up, place it to new register. */
26471 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26472 GEN_INT (prolog_size),
26473 NULL_RTX, 1, OPTAB_DIRECT);
26474 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26475 REG_POINTER (*destptr) = 1;
26476 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26477 GEN_INT (-desired_align),
26478 *destptr, 1, OPTAB_DIRECT);
26479 /* See how many bytes we skipped. */
26480 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26481 *destptr,
26482 saveddest, 1, OPTAB_DIRECT);
26483 /* Adjust srcptr and count. */
26484 if (!issetmem)
26485 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26486 saveddest, *srcptr, 1, OPTAB_DIRECT);
26487 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26488 saveddest, *count, 1, OPTAB_DIRECT);
26489 /* We copied at most size + prolog_size. */
26490 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26491 *min_size
26492 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26493 else
26494 *min_size = 0;
26496 /* Our loops always round down the block size, but for dispatch to the
26497 library we need the precise value. */
26498 if (dynamic_check)
26499 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26500 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26502 else
26504 gcc_assert (prolog_size == 0);
26505 /* Decrease count, so we won't end up copying last word twice. */
26506 if (!CONST_INT_P (*count))
26507 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26508 constm1_rtx, *count, 1, OPTAB_DIRECT);
26509 else
26510 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26511 (unsigned HOST_WIDE_INT)size));
26512 if (*min_size)
26513 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26518 /* This function is like the previous one, except here we know how many bytes
26519 need to be copied. That allows us to update alignment not only of DST, which
26520 is returned, but also of SRC, which is passed as a pointer for that
26521 reason. */
26522 static rtx
26523 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26524 rtx srcreg, rtx value, rtx vec_value,
26525 int desired_align, int align_bytes,
26526 bool issetmem)
26528 rtx src = NULL;
26529 rtx orig_dst = dst;
26530 rtx orig_src = NULL;
26531 int piece_size = 1;
26532 int copied_bytes = 0;
26534 if (!issetmem)
26536 gcc_assert (srcp != NULL);
26537 src = *srcp;
26538 orig_src = src;
26541 for (piece_size = 1;
26542 piece_size <= desired_align && copied_bytes < align_bytes;
26543 piece_size <<= 1)
26545 if (align_bytes & piece_size)
26547 if (issetmem)
26549 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26550 dst = emit_memset (dst, destreg, vec_value, piece_size);
26551 else
26552 dst = emit_memset (dst, destreg, value, piece_size);
26554 else
26555 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26556 copied_bytes += piece_size;
26559 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26560 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26561 if (MEM_SIZE_KNOWN_P (orig_dst))
26562 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26564 if (!issetmem)
26566 int src_align_bytes = get_mem_align_offset (src, desired_align
26567 * BITS_PER_UNIT);
26568 if (src_align_bytes >= 0)
26569 src_align_bytes = desired_align - src_align_bytes;
26570 if (src_align_bytes >= 0)
26572 unsigned int src_align;
26573 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26575 if ((src_align_bytes & (src_align - 1))
26576 == (align_bytes & (src_align - 1)))
26577 break;
26579 if (src_align > (unsigned int) desired_align)
26580 src_align = desired_align;
26581 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26582 set_mem_align (src, src_align * BITS_PER_UNIT);
26584 if (MEM_SIZE_KNOWN_P (orig_src))
26585 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26586 *srcp = src;
26589 return dst;
26592 /* Return true if ALG can be used in current context.
26593 Assume we expand memset if MEMSET is true. */
26594 static bool
26595 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26597 if (alg == no_stringop)
26598 return false;
26599 if (alg == vector_loop)
26600 return TARGET_SSE || TARGET_AVX;
26601 /* Algorithms using the rep prefix want at least edi and ecx;
26602 additionally, memset wants eax and memcpy wants esi. Don't
26603 consider such algorithms if the user has appropriated those
26604 registers for their own purposes, or if we have a non-default
26605 address space, since some string insns cannot override the segment. */
26606 if (alg == rep_prefix_1_byte
26607 || alg == rep_prefix_4_byte
26608 || alg == rep_prefix_8_byte)
26610 if (have_as)
26611 return false;
26612 if (fixed_regs[CX_REG]
26613 || fixed_regs[DI_REG]
26614 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26615 return false;
26617 return true;
26620 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
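/* The decision consults the per-CPU stringop_algs tables in the active cost
   structure (cost->memset / cost->memcpy), honors an explicit
   ix86_stringop_alg selection when it is usable, and may set *DYNAMIC_CHECK
   to request a runtime size test that dispatches large blocks to a library
   call.  */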
26621 static enum stringop_alg
26622 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26623 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26624 bool memset, bool zero_memset, bool have_as,
26625 int *dynamic_check, bool *noalign, bool recur)
26627 const struct stringop_algs *algs;
26628 bool optimize_for_speed;
26629 int max = 0;
26630 const struct processor_costs *cost;
26631 int i;
26632 bool any_alg_usable_p = false;
26634 *noalign = false;
26635 *dynamic_check = -1;
26637 /* Even if the string operation call is cold, we still might spend a lot
26638 of time processing large blocks. */
26639 if (optimize_function_for_size_p (cfun)
26640 || (optimize_insn_for_size_p ()
26641 && (max_size < 256
26642 || (expected_size != -1 && expected_size < 256))))
26643 optimize_for_speed = false;
26644 else
26645 optimize_for_speed = true;
26647 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26648 if (memset)
26649 algs = &cost->memset[TARGET_64BIT != 0];
26650 else
26651 algs = &cost->memcpy[TARGET_64BIT != 0];
26653 /* Find the maximal size for which an inline (non-libcall) algorithm is defined. */
26654 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26656 enum stringop_alg candidate = algs->size[i].alg;
26657 bool usable = alg_usable_p (candidate, memset, have_as);
26658 any_alg_usable_p |= usable;
26660 if (candidate != libcall && candidate && usable)
26661 max = algs->size[i].max;
26664 /* If the expected size is not known but the max size is small enough
26665 so that the inline version is a win, set the expected size into
26666 the range. */
26667 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26668 && expected_size == -1)
26669 expected_size = min_size / 2 + max_size / 2;
26671 /* If user specified the algorithm, honor it if possible. */
26672 if (ix86_stringop_alg != no_stringop
26673 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26674 return ix86_stringop_alg;
26675 /* rep; movq or rep; movl is the smallest variant. */
26676 else if (!optimize_for_speed)
26678 *noalign = true;
26679 if (!count || (count & 3) || (memset && !zero_memset))
26680 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26681 ? rep_prefix_1_byte : loop_1_byte;
26682 else
26683 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26684 ? rep_prefix_4_byte : loop;
26686 /* Very tiny blocks are best handled via the loop; REP is expensive to
26687 set up. */
26688 else if (expected_size != -1 && expected_size < 4)
26689 return loop_1_byte;
26690 else if (expected_size != -1)
26692 enum stringop_alg alg = libcall;
26693 bool alg_noalign = false;
26694 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26696 /* We get here if the algorithms that were not libcall-based
26697 were rep-prefix based and we are unable to use rep prefixes
26698 based on global register usage. Break out of the loop and
26699 use the heuristic below. */
26700 if (algs->size[i].max == 0)
26701 break;
26702 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26704 enum stringop_alg candidate = algs->size[i].alg;
26706 if (candidate != libcall
26707 && alg_usable_p (candidate, memset, have_as))
26709 alg = candidate;
26710 alg_noalign = algs->size[i].noalign;
26712 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26713 last non-libcall inline algorithm. */
26714 if (TARGET_INLINE_ALL_STRINGOPS)
26716 /* When the current size is best to be copied by a libcall,
26717 but we are still forced to inline, run the heuristic below
26718 that will pick code for medium sized blocks. */
26719 if (alg != libcall)
26721 *noalign = alg_noalign;
26722 return alg;
26724 else if (!any_alg_usable_p)
26725 break;
26727 else if (alg_usable_p (candidate, memset, have_as))
26729 *noalign = algs->size[i].noalign;
26730 return candidate;
26735 /* When asked to inline the call anyway, try to pick a meaningful choice.
26736 We look for the maximal size of block that is faster to copy by hand and
26737 take blocks of at most that size, guessing that the average size will
26738 be roughly half of the block.
26740 If this turns out to be bad, we might simply specify the preferred
26741 choice in ix86_costs. */
26742 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26743 && (algs->unknown_size == libcall
26744 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26746 enum stringop_alg alg;
26747 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26749 /* If there aren't any usable algorithms or if recursing already,
26750 then recursing on smaller sizes or same size isn't going to
26751 find anything. Just return the simple byte-at-a-time copy loop. */
26752 if (!any_alg_usable_p || recur)
26754 /* Pick something reasonable. */
26755 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26756 *dynamic_check = 128;
26757 return loop_1_byte;
26759 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26760 zero_memset, have_as, dynamic_check, noalign, true);
26761 gcc_assert (*dynamic_check == -1);
26762 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26763 *dynamic_check = max;
26764 else
26765 gcc_assert (alg != libcall);
26766 return alg;
26768 return (alg_usable_p (algs->unknown_size, memset, have_as)
26769 ? algs->unknown_size : libcall);
26772 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26773 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26774 static int
26775 decide_alignment (int align,
26776 enum stringop_alg alg,
26777 int expected_size,
26778 machine_mode move_mode)
26780 int desired_align = 0;
26782 gcc_assert (alg != no_stringop);
26784 if (alg == libcall)
26785 return 0;
26786 if (move_mode == VOIDmode)
26787 return 0;
26789 desired_align = GET_MODE_SIZE (move_mode);
26790 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
26791 copying a whole cacheline at once. */
26792 if (TARGET_PENTIUMPRO
26793 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26794 desired_align = 8;
26796 if (optimize_size)
26797 desired_align = 1;
26798 if (desired_align < align)
26799 desired_align = align;
26800 if (expected_size != -1 && expected_size < 4)
26801 desired_align = align;
26803 return desired_align;
26807 /* Helper function for memset.  For the QImode value 0xXY produce
26808 0xXYXYXYXY of the width specified by MODE.  This is essentially
26809 a * 0x10101010, but we can do slightly better than
26810 synth_mult by unwinding the sequence by hand on CPUs with
26811 slow multiply. */
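/* For example, promoting the constant 0x41 to SImode yields a register
   loaded with 0x41414141 (computed at compile time via v |= v << 8;
   v |= v << 16); a non-constant QImode value is widened either with a
   multiply by 0x01010101 or with the shift/or (or insv) sequence below,
   whichever the cost tables say is cheaper.  */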
26812 static rtx
26813 promote_duplicated_reg (machine_mode mode, rtx val)
26815 machine_mode valmode = GET_MODE (val);
26816 rtx tmp;
26817 int nops = mode == DImode ? 3 : 2;
26819 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26820 if (val == const0_rtx)
26821 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26822 if (CONST_INT_P (val))
26824 HOST_WIDE_INT v = INTVAL (val) & 255;
26826 v |= v << 8;
26827 v |= v << 16;
26828 if (mode == DImode)
26829 v |= (v << 16) << 16;
26830 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
26833 if (valmode == VOIDmode)
26834 valmode = QImode;
26835 if (valmode != QImode)
26836 val = gen_lowpart (QImode, val);
26837 if (mode == QImode)
26838 return val;
26839 if (!TARGET_PARTIAL_REG_STALL)
26840 nops--;
26841 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
26842 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
26843 <= (ix86_cost->shift_const + ix86_cost->add) * nops
26844 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
26846 rtx reg = convert_modes (mode, QImode, val, true);
26847 tmp = promote_duplicated_reg (mode, const1_rtx);
26848 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
26849 OPTAB_DIRECT);
26851 else
26853 rtx reg = convert_modes (mode, QImode, val, true);
26855 if (!TARGET_PARTIAL_REG_STALL)
26856 if (mode == SImode)
26857 emit_insn (gen_insvsi_1 (reg, reg));
26858 else
26859 emit_insn (gen_insvdi_1 (reg, reg));
26860 else
26862 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
26863 NULL, 1, OPTAB_DIRECT);
26864 reg =
26865 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
26867 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
26868 NULL, 1, OPTAB_DIRECT);
26869 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
26870 if (mode == SImode)
26871 return reg;
26872 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
26873 NULL, 1, OPTAB_DIRECT);
26874 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
26875 return reg;
26879 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
26880 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
26881 alignment from ALIGN to DESIRED_ALIGN. */
26882 static rtx
26883 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
26884 int align)
26886 rtx promoted_val;
26888 if (TARGET_64BIT
26889 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
26890 promoted_val = promote_duplicated_reg (DImode, val);
26891 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
26892 promoted_val = promote_duplicated_reg (SImode, val);
26893 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
26894 promoted_val = promote_duplicated_reg (HImode, val);
26895 else
26896 promoted_val = val;
26898 return promoted_val;
26901 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
26902 operations when profitable. The code depends upon architecture, block size
26903 and alignment, but always has one of the following overall structures:
26905 Aligned move sequence:
26907 1) Prologue guard: Conditional that jumps up to epilogues for small
26908 blocks that can be handled by epilogue alone. This is faster
26909 but also needed for correctness, since the prologue assumes the block
26910 is larger than the desired alignment.
26912 Optional dynamic check for size and libcall for large
26913 blocks is emitted here too, with -minline-stringops-dynamically.
26915 2) Prologue: copy first few bytes in order to get destination
26916 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
26917 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
26918 copied. We emit either a jump tree on power of two sized
26919 blocks, or a byte loop.
26921 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
26922 with specified algorithm.
26924 4) Epilogue: code copying tail of the block that is too small to be
26925 handled by main body (or up to size guarded by prologue guard).
26927 Misaligned move sequence
26929 1) misaligned move prologue/epilogue containing:
26930 a) Prologue handling small memory blocks and jumping to done_label
26931 (skipped if blocks are known to be large enough)
26932 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
26933 needed by single possibly misaligned move
26934 (skipped if alignment is not needed)
26935 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
26937 2) Zero size guard dispatching to done_label, if needed
26939 3) Dispatch to a library call, if needed.
26941 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
26942 with specified algorithm. */
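/* Returns true if the expansion succeeded; returning false (e.g. when
   decide_alg picks a libcall or the constant count is implausibly large)
   lets the caller fall back to a library call.  */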
26943 bool
26944 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
26945 rtx align_exp, rtx expected_align_exp,
26946 rtx expected_size_exp, rtx min_size_exp,
26947 rtx max_size_exp, rtx probable_max_size_exp,
26948 bool issetmem)
26950 rtx destreg;
26951 rtx srcreg = NULL;
26952 rtx_code_label *label = NULL;
26953 rtx tmp;
26954 rtx_code_label *jump_around_label = NULL;
26955 HOST_WIDE_INT align = 1;
26956 unsigned HOST_WIDE_INT count = 0;
26957 HOST_WIDE_INT expected_size = -1;
26958 int size_needed = 0, epilogue_size_needed;
26959 int desired_align = 0, align_bytes = 0;
26960 enum stringop_alg alg;
26961 rtx promoted_val = NULL;
26962 rtx vec_promoted_val = NULL;
26963 bool force_loopy_epilogue = false;
26964 int dynamic_check;
26965 bool need_zero_guard = false;
26966 bool noalign;
26967 machine_mode move_mode = VOIDmode;
26968 machine_mode wider_mode;
26969 int unroll_factor = 1;
26970 /* TODO: Once value ranges are available, fill in proper data. */
26971 unsigned HOST_WIDE_INT min_size = 0;
26972 unsigned HOST_WIDE_INT max_size = -1;
26973 unsigned HOST_WIDE_INT probable_max_size = -1;
26974 bool misaligned_prologue_used = false;
26975 bool have_as;
26977 if (CONST_INT_P (align_exp))
26978 align = INTVAL (align_exp);
26979 /* i386 can do misaligned access at a reasonably increased cost. */
26980 if (CONST_INT_P (expected_align_exp)
26981 && INTVAL (expected_align_exp) > align)
26982 align = INTVAL (expected_align_exp);
26983 /* ALIGN is the minimum of destination and source alignment, but we care here
26984 just about destination alignment. */
26985 else if (!issetmem
26986 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
26987 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
26989 if (CONST_INT_P (count_exp))
26991 min_size = max_size = probable_max_size = count = expected_size
26992 = INTVAL (count_exp);
26993 /* When COUNT is 0, there is nothing to do. */
26994 if (!count)
26995 return true;
26997 else
26999 if (min_size_exp)
27000 min_size = INTVAL (min_size_exp);
27001 if (max_size_exp)
27002 max_size = INTVAL (max_size_exp);
27003 if (probable_max_size_exp)
27004 probable_max_size = INTVAL (probable_max_size_exp);
27005 if (CONST_INT_P (expected_size_exp))
27006 expected_size = INTVAL (expected_size_exp);
27009 /* Make sure we don't need to care about overflow later on. */
27010 if (count > (HOST_WIDE_INT_1U << 30))
27011 return false;
27013 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27014 if (!issetmem)
27015 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27017 /* Step 0: Decide on preferred algorithm, desired alignment and
27018 size of chunks to be copied by main loop. */
27019 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27020 issetmem,
27021 issetmem && val_exp == const0_rtx, have_as,
27022 &dynamic_check, &noalign, false);
27023 if (alg == libcall)
27024 return false;
27025 gcc_assert (alg != no_stringop);
27027 /* For now the vector version of memset is generated only for memory zeroing, as
27028 creating the promoted vector value is very cheap in this case. */
27029 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27030 alg = unrolled_loop;
27032 if (!count)
27033 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27034 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27035 if (!issetmem)
27036 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27038 unroll_factor = 1;
27039 move_mode = word_mode;
27040 switch (alg)
27042 case libcall:
27043 case no_stringop:
27044 case last_alg:
27045 gcc_unreachable ();
27046 case loop_1_byte:
27047 need_zero_guard = true;
27048 move_mode = QImode;
27049 break;
27050 case loop:
27051 need_zero_guard = true;
27052 break;
27053 case unrolled_loop:
27054 need_zero_guard = true;
27055 unroll_factor = (TARGET_64BIT ? 4 : 2);
27056 break;
27057 case vector_loop:
27058 need_zero_guard = true;
27059 unroll_factor = 4;
27060 /* Find the widest supported mode. */
27061 move_mode = word_mode;
27062 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27063 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27064 move_mode = wider_mode;
27066 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27067 move_mode = TImode;
27069 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27070 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27071 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27073 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27074 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27075 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27076 move_mode = word_mode;
27078 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27079 break;
27080 case rep_prefix_8_byte:
27081 move_mode = DImode;
27082 break;
27083 case rep_prefix_4_byte:
27084 move_mode = SImode;
27085 break;
27086 case rep_prefix_1_byte:
27087 move_mode = QImode;
27088 break;
27090 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27091 epilogue_size_needed = size_needed;
27093 /* If we are going to call any library calls conditionally, make sure any
27094 pending stack adjustment happens before the first conditional branch,
27095 otherwise it will be emitted before the library call only and won't
27096 happen from the other branches. */
27097 if (dynamic_check != -1)
27098 do_pending_stack_adjust ();
27100 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27101 if (!TARGET_ALIGN_STRINGOPS || noalign)
27102 align = desired_align;
27104 /* Step 1: Prologue guard. */
27106 /* Alignment code needs count to be in register. */
27107 if (CONST_INT_P (count_exp) && desired_align > align)
27109 if (INTVAL (count_exp) > desired_align
27110 && INTVAL (count_exp) > size_needed)
27112 align_bytes
27113 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27114 if (align_bytes <= 0)
27115 align_bytes = 0;
27116 else
27117 align_bytes = desired_align - align_bytes;
27119 if (align_bytes == 0)
27120 count_exp = force_reg (counter_mode (count_exp), count_exp);
27122 gcc_assert (desired_align >= 1 && align >= 1);
27124 /* Misaligned move sequences handle both prologue and epilogue at once.
27125 Default code generation results in smaller code for large alignments
27126 and also avoids redundant work when sizes are known precisely. */
27127 misaligned_prologue_used
27128 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27129 && MAX (desired_align, epilogue_size_needed) <= 32
27130 && desired_align <= epilogue_size_needed
27131 && ((desired_align > align && !align_bytes)
27132 || (!count && epilogue_size_needed > 1)));
27134 /* Do the cheap promotion to allow better CSE across the
27135 main loop and epilogue (i.e. one load of the big constant in
27136 front of all the code).
27137 For now the misaligned move sequences do not have a fast path
27138 without broadcasting. */
27139 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27141 if (alg == vector_loop)
27143 gcc_assert (val_exp == const0_rtx);
27144 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27145 promoted_val = promote_duplicated_reg_to_size (val_exp,
27146 GET_MODE_SIZE (word_mode),
27147 desired_align, align);
27149 else
27151 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27152 desired_align, align);
27155 /* Misaligned move sequences handle both prologues and epilogues at once.
27156 Default code generation results in smaller code for large alignments and
27157 also avoids redundant work when sizes are known precisely. */
27158 if (misaligned_prologue_used)
27160 /* Misaligned move prologue handles small blocks by itself. */
27161 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27162 (dst, src, &destreg, &srcreg,
27163 move_mode, promoted_val, vec_promoted_val,
27164 &count_exp,
27165 &jump_around_label,
27166 desired_align < align
27167 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27168 desired_align, align, &min_size, dynamic_check, issetmem);
27169 if (!issetmem)
27170 src = change_address (src, BLKmode, srcreg);
27171 dst = change_address (dst, BLKmode, destreg);
27172 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27173 epilogue_size_needed = 0;
27174 if (need_zero_guard
27175 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27177 /* It is possible that we copied enough so the main loop will not
27178 execute. */
27179 gcc_assert (size_needed > 1);
27180 if (jump_around_label == NULL_RTX)
27181 jump_around_label = gen_label_rtx ();
27182 emit_cmp_and_jump_insns (count_exp,
27183 GEN_INT (size_needed),
27184 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27185 if (expected_size == -1
27186 || expected_size < (desired_align - align) / 2 + size_needed)
27187 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27188 else
27189 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27192 /* Ensure that alignment prologue won't copy past end of block. */
27193 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27195 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27196 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27197 Make sure it is power of 2. */
27198 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27200 /* To improve the performance of small blocks, we jump around the VAL
27201 promoting code.  This means that if the promoted VAL is not constant,
27202 we might not use it in the epilogue and have to use the byte
27203 loop variant. */
27204 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27205 force_loopy_epilogue = true;
27206 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27207 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27209 /* If main algorithm works on QImode, no epilogue is needed.
27210 For small sizes just don't align anything. */
27211 if (size_needed == 1)
27212 desired_align = align;
27213 else
27214 goto epilogue;
27216 else if (!count
27217 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27219 label = gen_label_rtx ();
27220 emit_cmp_and_jump_insns (count_exp,
27221 GEN_INT (epilogue_size_needed),
27222 LTU, 0, counter_mode (count_exp), 1, label);
27223 if (expected_size == -1 || expected_size < epilogue_size_needed)
27224 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27225 else
27226 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27230 /* Emit code to decide at run time whether a library call or inline code
27231 should be used. */
27232 if (dynamic_check != -1)
27234 if (!issetmem && CONST_INT_P (count_exp))
27236 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27238 emit_block_copy_via_libcall (dst, src, count_exp);
27239 count_exp = const0_rtx;
27240 goto epilogue;
27243 else
27245 rtx_code_label *hot_label = gen_label_rtx ();
27246 if (jump_around_label == NULL_RTX)
27247 jump_around_label = gen_label_rtx ();
27248 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27249 LEU, 0, counter_mode (count_exp),
27250 1, hot_label);
27251 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27252 if (issetmem)
27253 set_storage_via_libcall (dst, count_exp, val_exp);
27254 else
27255 emit_block_copy_via_libcall (dst, src, count_exp);
27256 emit_jump (jump_around_label);
27257 emit_label (hot_label);
27261 /* Step 2: Alignment prologue. */
27262 /* Do the expensive promotion once we branched off the small blocks. */
27263 if (issetmem && !promoted_val)
27264 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27265 desired_align, align);
27267 if (desired_align > align && !misaligned_prologue_used)
27269 if (align_bytes == 0)
27271 /* Except for the first move in the prologue, we no longer know
27272 the constant offset in the aliasing info. It doesn't seem worth
27273 the pain to maintain it for the first move, so throw away
27274 the info early. */
27275 dst = change_address (dst, BLKmode, destreg);
27276 if (!issetmem)
27277 src = change_address (src, BLKmode, srcreg);
27278 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27279 promoted_val, vec_promoted_val,
27280 count_exp, align, desired_align,
27281 issetmem);
27282 /* At most desired_align - align bytes are copied. */
27283 if (min_size < (unsigned)(desired_align - align))
27284 min_size = 0;
27285 else
27286 min_size -= desired_align - align;
27288 else
27290 /* If we know how many bytes need to be stored before dst is
27291 sufficiently aligned, maintain aliasing info accurately. */
27292 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27293 srcreg,
27294 promoted_val,
27295 vec_promoted_val,
27296 desired_align,
27297 align_bytes,
27298 issetmem);
27300 count_exp = plus_constant (counter_mode (count_exp),
27301 count_exp, -align_bytes);
27302 count -= align_bytes;
27303 min_size -= align_bytes;
27304 max_size -= align_bytes;
27306 if (need_zero_guard
27307 && min_size < (unsigned HOST_WIDE_INT) size_needed
27308 && (count < (unsigned HOST_WIDE_INT) size_needed
27309 || (align_bytes == 0
27310 && count < ((unsigned HOST_WIDE_INT) size_needed
27311 + desired_align - align))))
27313 /* It is possible that we copied enough so the main loop will not
27314 execute. */
27315 gcc_assert (size_needed > 1);
27316 if (label == NULL_RTX)
27317 label = gen_label_rtx ();
27318 emit_cmp_and_jump_insns (count_exp,
27319 GEN_INT (size_needed),
27320 LTU, 0, counter_mode (count_exp), 1, label);
27321 if (expected_size == -1
27322 || expected_size < (desired_align - align) / 2 + size_needed)
27323 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27324 else
27325 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27328 if (label && size_needed == 1)
27330 emit_label (label);
27331 LABEL_NUSES (label) = 1;
27332 label = NULL;
27333 epilogue_size_needed = 1;
27334 if (issetmem)
27335 promoted_val = val_exp;
27337 else if (label == NULL_RTX && !misaligned_prologue_used)
27338 epilogue_size_needed = size_needed;
27340 /* Step 3: Main loop. */
27342 switch (alg)
27344 case libcall:
27345 case no_stringop:
27346 case last_alg:
27347 gcc_unreachable ();
27348 case loop_1_byte:
27349 case loop:
27350 case unrolled_loop:
27351 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27352 count_exp, move_mode, unroll_factor,
27353 expected_size, issetmem);
27354 break;
27355 case vector_loop:
27356 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27357 vec_promoted_val, count_exp, move_mode,
27358 unroll_factor, expected_size, issetmem);
27359 break;
27360 case rep_prefix_8_byte:
27361 case rep_prefix_4_byte:
27362 case rep_prefix_1_byte:
27363 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27364 val_exp, count_exp, move_mode, issetmem);
27365 break;
27367 /* Properly adjust the offset of src and dest memory for aliasing. */
27368 if (CONST_INT_P (count_exp))
27370 if (!issetmem)
27371 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27372 (count / size_needed) * size_needed);
27373 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27374 (count / size_needed) * size_needed);
27376 else
27378 if (!issetmem)
27379 src = change_address (src, BLKmode, srcreg);
27380 dst = change_address (dst, BLKmode, destreg);
27383 /* Step 4: Epilogue to copy the remaining bytes. */
27384 epilogue:
27385 if (label)
27387 /* When the main loop is done, COUNT_EXP might hold original count,
27388 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27389 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27390 bytes. Compensate if needed. */
27392 if (size_needed < epilogue_size_needed)
27394 tmp =
27395 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27396 GEN_INT (size_needed - 1), count_exp, 1,
27397 OPTAB_DIRECT);
27398 if (tmp != count_exp)
27399 emit_move_insn (count_exp, tmp);
27401 emit_label (label);
27402 LABEL_NUSES (label) = 1;
27405 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27407 if (force_loopy_epilogue)
27408 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27409 epilogue_size_needed);
27410 else
27412 if (issetmem)
27413 expand_setmem_epilogue (dst, destreg, promoted_val,
27414 vec_promoted_val, count_exp,
27415 epilogue_size_needed);
27416 else
27417 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27418 epilogue_size_needed);
27421 if (jump_around_label)
27422 emit_label (jump_around_label);
27423 return true;
27427 /* Expand the appropriate insns for doing strlen if not just doing
27428 repnz; scasb
27430 out = result, initialized with the start address
27431 align_rtx = alignment of the address.
27432 scratch = scratch register, initialized with the start address when
27433 not aligned, otherwise undefined
27435 This is just the body. It needs the initializations mentioned above and
27436 some address computation at the end. These things are done in i386.md. */
27438 static void
27439 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27441 int align;
27442 rtx tmp;
27443 rtx_code_label *align_2_label = NULL;
27444 rtx_code_label *align_3_label = NULL;
27445 rtx_code_label *align_4_label = gen_label_rtx ();
27446 rtx_code_label *end_0_label = gen_label_rtx ();
27447 rtx mem;
27448 rtx tmpreg = gen_reg_rtx (SImode);
27449 rtx scratch = gen_reg_rtx (SImode);
27450 rtx cmp;
27452 align = 0;
27453 if (CONST_INT_P (align_rtx))
27454 align = INTVAL (align_rtx);
27456 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27458 /* Is there a known alignment and is it less than 4? */
27459 if (align < 4)
27461 rtx scratch1 = gen_reg_rtx (Pmode);
27462 emit_move_insn (scratch1, out);
27463 /* Is there a known alignment and is it not 2? */
27464 if (align != 2)
27466 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27467 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27469 /* Leave just the 3 lower bits. */
27470 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27471 NULL_RTX, 0, OPTAB_WIDEN);
27473 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27474 Pmode, 1, align_4_label);
27475 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27476 Pmode, 1, align_2_label);
27477 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27478 Pmode, 1, align_3_label);
27480 else
27482 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27483 check if it is aligned to a 4-byte boundary. */
27485 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27486 NULL_RTX, 0, OPTAB_WIDEN);
27488 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27489 Pmode, 1, align_4_label);
27492 mem = change_address (src, QImode, out);
27494 /* Now compare the bytes. */
27496 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27497 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27498 QImode, 1, end_0_label);
27500 /* Increment the address. */
27501 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27503 /* Not needed with an alignment of 2 */
27504 if (align != 2)
27506 emit_label (align_2_label);
27508 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27509 end_0_label);
27511 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27513 emit_label (align_3_label);
27516 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27517 end_0_label);
27519 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27522 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27523 align this loop; doing so only bloats the code and does not help
27524 speed. */
27525 emit_label (align_4_label);
27527 mem = change_address (src, SImode, out);
27528 emit_move_insn (scratch, mem);
27529 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27531 /* This formula yields a nonzero result iff one of the bytes is zero.
27532 This saves three branches inside the loop and many cycles. */
27534 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27535 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27536 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27537 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27538 gen_int_mode (0x80808080, SImode)));
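/* TMPREG now holds ((w - 0x01010101) & ~w) & 0x80808080, where w is the
   word just loaded; this is nonzero exactly when some byte of w is zero.  */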
27539 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27540 align_4_label);
27542 if (TARGET_CMOVE)
27544 rtx reg = gen_reg_rtx (SImode);
27545 rtx reg2 = gen_reg_rtx (Pmode);
27546 emit_move_insn (reg, tmpreg);
27547 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27549 /* If zero is not in the first two bytes, move two bytes forward. */
27550 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27551 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27552 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27553 emit_insn (gen_rtx_SET (tmpreg,
27554 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27555 reg,
27556 tmpreg)));
27557 /* Emit lea manually to avoid clobbering of flags. */
27558 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27560 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27561 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27562 emit_insn (gen_rtx_SET (out,
27563 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27564 reg2,
27565 out)));
27567 else
27569 rtx_code_label *end_2_label = gen_label_rtx ();
27570 /* Is zero in the first two bytes? */
27572 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27573 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27574 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27575 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27576 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27577 pc_rtx);
27578 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27579 JUMP_LABEL (tmp) = end_2_label;
27581 /* Not in the first two. Move two bytes forward. */
27582 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27583 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27585 emit_label (end_2_label);
27589 /* Avoid branch in fixing the byte. */
27590 tmpreg = gen_lowpart (QImode, tmpreg);
27591 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27592 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27593 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27594 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27596 emit_label (end_0_label);
27599 /* Expand strlen. */
27601 bool
27602 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27604 rtx addr, scratch1, scratch2, scratch3, scratch4;
27606 /* The generic case of the strlen expander is long. Avoid expanding it
27607 unless TARGET_INLINE_ALL_STRINGOPS. */
27609 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27610 && !TARGET_INLINE_ALL_STRINGOPS
27611 && !optimize_insn_for_size_p ()
27612 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27613 return false;
27615 addr = force_reg (Pmode, XEXP (src, 0));
27616 scratch1 = gen_reg_rtx (Pmode);
27618 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27619 && !optimize_insn_for_size_p ())
27621 /* It seems that some optimizer does not combine a call like
27622 foo(strlen(bar), strlen(bar));
27623 when the move and the subtraction are done here. It does calculate
27624 the length just once when these instructions are done inside of
27625 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
27626 and this uses one fewer register for the lifetime of
27627 output_strlen_unroll(), this is better. */
27629 emit_move_insn (out, addr);
27631 ix86_expand_strlensi_unroll_1 (out, src, align);
27633 /* strlensi_unroll_1 returns the address of the zero at the end of
27634 the string, like memchr(), so compute the length by subtracting
27635 the start address. */
27636 emit_insn (ix86_gen_sub3 (out, out, addr));
27638 else
27640 rtx unspec;
27642 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27643 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27644 return false;
27645 /* Can't use this for non-default address spaces. */
27646 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27647 return false;
27649 scratch2 = gen_reg_rtx (Pmode);
27650 scratch3 = gen_reg_rtx (Pmode);
27651 scratch4 = force_reg (Pmode, constm1_rtx);
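/* This is the usual repnz scasb idiom: the count register starts at -1,
   so after scanning LEN bytes plus the terminator it holds -(LEN + 2),
   and LEN is recovered below as ~count - 1.  */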
27653 emit_move_insn (scratch3, addr);
27654 eoschar = force_reg (QImode, eoschar);
27656 src = replace_equiv_address_nv (src, scratch3);
27658 /* If .md starts supporting :P, this can be done in .md. */
27659 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27660 scratch4), UNSPEC_SCAS);
27661 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27662 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27663 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27665 return true;
27668 /* For a given symbol (function), construct code to compute the address of
27669 its PLT entry in the large x86-64 PIC model. */
27670 static rtx
27671 construct_plt_address (rtx symbol)
27673 rtx tmp, unspec;
27675 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27676 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27677 gcc_assert (Pmode == DImode);
27679 tmp = gen_reg_rtx (Pmode);
27680 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27682 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27683 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27684 return tmp;
27688 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27689 rtx callarg2,
27690 rtx pop, bool sibcall)
27692 rtx vec[3];
27693 rtx use = NULL, call;
27694 unsigned int vec_len = 0;
27695 tree fndecl;
27697 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27699 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27700 if (fndecl
27701 && (lookup_attribute ("interrupt",
27702 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27703 error ("interrupt service routine can't be called directly");
27705 else
27706 fndecl = NULL_TREE;
27708 if (pop == const0_rtx)
27709 pop = NULL;
27710 gcc_assert (!TARGET_64BIT || !pop);
27712 if (TARGET_MACHO && !TARGET_64BIT)
27714 #if TARGET_MACHO
27715 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27716 fnaddr = machopic_indirect_call_target (fnaddr);
27717 #endif
27719 else
27721 /* Static functions and indirect calls don't need the PIC register. Also,
27722 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
27723 attribute, making it an indirect call. */
27724 rtx addr = XEXP (fnaddr, 0);
27725 if (flag_pic
27726 && GET_CODE (addr) == SYMBOL_REF
27727 && !SYMBOL_REF_LOCAL_P (addr))
27729 if (flag_plt
27730 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27731 || !lookup_attribute ("noplt",
27732 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27734 if (!TARGET_64BIT
27735 || (ix86_cmodel == CM_LARGE_PIC
27736 && DEFAULT_ABI != MS_ABI))
27738 use_reg (&use, gen_rtx_REG (Pmode,
27739 REAL_PIC_OFFSET_TABLE_REGNUM));
27740 if (ix86_use_pseudo_pic_reg ())
27741 emit_move_insn (gen_rtx_REG (Pmode,
27742 REAL_PIC_OFFSET_TABLE_REGNUM),
27743 pic_offset_table_rtx);
27746 else if (!TARGET_PECOFF && !TARGET_MACHO)
27748 if (TARGET_64BIT)
27750 fnaddr = gen_rtx_UNSPEC (Pmode,
27751 gen_rtvec (1, addr),
27752 UNSPEC_GOTPCREL);
27753 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27755 else
27757 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27758 UNSPEC_GOT);
27759 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27760 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27761 fnaddr);
27763 fnaddr = gen_const_mem (Pmode, fnaddr);
27764 /* Pmode may not be the same as word_mode for x32, which
27765 doesn't support indirect branch via 32-bit memory slot.
27766 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27767 indirect branch via x32 GOT slot is OK. */
27768 if (GET_MODE (fnaddr) != word_mode)
27769 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27770 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27775 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27776 parameters passed in vector registers. */
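/* In the 64-bit SysV ABI, %al tells a varargs callee how many vector
   registers hold arguments (an upper bound suffices); here CALLARG2
   carries that count.  */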
27777 if (TARGET_64BIT
27778 && (INTVAL (callarg2) > 0
27779 || (INTVAL (callarg2) == 0
27780 && (TARGET_SSE || !flag_skip_rax_setup))))
27782 rtx al = gen_rtx_REG (QImode, AX_REG);
27783 emit_move_insn (al, callarg2);
27784 use_reg (&use, al);
27787 if (ix86_cmodel == CM_LARGE_PIC
27788 && !TARGET_PECOFF
27789 && MEM_P (fnaddr)
27790 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27791 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27792 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27793 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27794 branch via x32 GOT slot is OK. */
27795 else if (!(TARGET_X32
27796 && MEM_P (fnaddr)
27797 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27798 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27799 && (sibcall
27800 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27801 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27803 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27804 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27807 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27809 if (retval)
27811 /* We should add bound registers as destinations in case a
27812 pointer with bounds may be returned. */
27813 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27815 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27816 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27817 if (GET_CODE (retval) == PARALLEL)
27819 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27820 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27821 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27822 retval = chkp_join_splitted_slot (retval, par);
27824 else
27826 retval = gen_rtx_PARALLEL (VOIDmode,
27827 gen_rtvec (3, retval, b0, b1));
27828 chkp_put_regs_to_expr_list (retval);
27832 call = gen_rtx_SET (retval, call);
27834 vec[vec_len++] = call;
27836 if (pop)
27838 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
27839 pop = gen_rtx_SET (stack_pointer_rtx, pop);
27840 vec[vec_len++] = pop;
27843 if (cfun->machine->no_caller_saved_registers
27844 && (!fndecl
27845 || (!TREE_THIS_VOLATILE (fndecl)
27846 && !lookup_attribute ("no_caller_saved_registers",
27847 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
27849 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
27850 bool is_64bit_ms_abi = (TARGET_64BIT
27851 && ix86_function_abi (fndecl) == MS_ABI);
27852 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
27854 /* If the current function has no caller-saved registers, add all
27855 registers that are clobbered by a call that returns. */
27856 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
27857 if (!fixed_regs[i]
27858 && (ix86_call_used_regs[i] == 1
27859 || (ix86_call_used_regs[i] & c_mask))
27860 && !STACK_REGNO_P (i)
27861 && !MMX_REGNO_P (i))
27862 clobber_reg (&use,
27863 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
27865 else if (TARGET_64BIT_MS_ABI
27866 && (!callarg2 || INTVAL (callarg2) != -2))
27868 unsigned i;
27870 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
27872 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
27873 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
27875 clobber_reg (&use, gen_rtx_REG (mode, regno));
27878 /* Set here, but it may get cleared later. */
27879 if (TARGET_CALL_MS2SYSV_XLOGUES)
27881 if (!TARGET_SSE)
27884 /* Don't break hot-patched functions. */
27885 else if (ix86_function_ms_hook_prologue (current_function_decl))
27888 /* TODO: Cases not yet examined. */
27889 else if (flag_split_stack)
27890 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
27892 else
27894 gcc_assert (!reload_completed);
27895 cfun->machine->call_ms2sysv = true;
27900 if (vec_len > 1)
27901 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
27902 call = emit_call_insn (call);
27903 if (use)
27904 CALL_INSN_FUNCTION_USAGE (call) = use;
27906 return call;
27909 /* Return true if the function being called was marked with attribute
27910 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
27911 to handle the non-PIC case in the backend because there is no easy
27912 interface for the front-end to force non-PLT calls to use the GOT.
27913 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
27914 to call the function marked "noplt" indirectly. */
27916 static bool
27917 ix86_nopic_noplt_attribute_p (rtx call_op)
27919 if (flag_pic || ix86_cmodel == CM_LARGE
27920 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
27921 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
27922 || SYMBOL_REF_LOCAL_P (call_op))
27923 return false;
27925 tree symbol_decl = SYMBOL_REF_DECL (call_op);
27927 if (!flag_plt
27928 || (symbol_decl != NULL_TREE
27929 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
27930 return true;
27932 return false;
27935 /* Output the assembly for a call instruction. */
27937 const char *
27938 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
27940 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
27941 bool seh_nop_p = false;
27942 const char *xasm;
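/* In the templates below, the "{att|intel}" braces pick the spelling
   appropriate for the assembler dialect in use.  */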
27944 if (SIBLING_CALL_P (insn))
27946 if (direct_p)
27948 if (ix86_nopic_noplt_attribute_p (call_op))
27950 if (TARGET_64BIT)
27951 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
27952 else
27953 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
27955 else
27956 xasm = "%!jmp\t%P0";
27958 /* SEH epilogue detection requires the indirect branch case
27959 to include REX.W. */
27960 else if (TARGET_SEH)
27961 xasm = "%!rex.W jmp\t%A0";
27962 else
27963 xasm = "%!jmp\t%A0";
27965 output_asm_insn (xasm, &call_op);
27966 return "";
27969 /* SEH unwinding can require an extra nop to be emitted in several
27970 circumstances. Determine if we have one of those. */
27971 if (TARGET_SEH)
27973 rtx_insn *i;
27975 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
27977 /* If we get to another real insn, we don't need the nop. */
27978 if (INSN_P (i))
27979 break;
27981 /* If we get to the epilogue note, prevent a catch region from
27982 being adjacent to the standard epilogue sequence. With non-call
27983 exceptions, we'll have done this during epilogue emission. */
27984 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
27985 && !flag_non_call_exceptions
27986 && !can_throw_internal (insn))
27988 seh_nop_p = true;
27989 break;
27993 /* If we didn't find a real insn following the call, prevent the
27994 unwinder from looking into the next function. */
27995 if (i == NULL)
27996 seh_nop_p = true;
27999 if (direct_p)
28001 if (ix86_nopic_noplt_attribute_p (call_op))
28003 if (TARGET_64BIT)
28004 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28005 else
28006 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28008 else
28009 xasm = "%!call\t%P0";
28011 else
28012 xasm = "%!call\t%A0";
28014 output_asm_insn (xasm, &call_op);
28016 if (seh_nop_p)
28017 return "nop";
28019 return "";
28022 /* Clear stack slot assignments remembered from previous functions.
28023 This is called from INIT_EXPANDERS once before RTL is emitted for each
28024 function. */
28026 static struct machine_function *
28027 ix86_init_machine_status (void)
28029 struct machine_function *f;
28031 f = ggc_cleared_alloc<machine_function> ();
28032 f->call_abi = ix86_abi;
28034 return f;
28037 /* Return a MEM corresponding to a stack slot with mode MODE.
28038 Allocate a new slot if necessary.
28040 The RTL for a function can have several slots available: N is
28041 which slot to use. */
28044 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28046 struct stack_local_entry *s;
28048 gcc_assert (n < MAX_386_STACK_LOCALS);
28050 for (s = ix86_stack_locals; s; s = s->next)
28051 if (s->mode == mode && s->n == n)
28052 return validize_mem (copy_rtx (s->rtl));
28054 s = ggc_alloc<stack_local_entry> ();
28055 s->n = n;
28056 s->mode = mode;
28057 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28059 s->next = ix86_stack_locals;
28060 ix86_stack_locals = s;
28061 return validize_mem (copy_rtx (s->rtl));
28064 static void
28065 ix86_instantiate_decls (void)
28067 struct stack_local_entry *s;
28069 for (s = ix86_stack_locals; s; s = s->next)
28070 if (s->rtl != NULL_RTX)
28071 instantiate_decl_rtl (s->rtl);
28074 /* Return the number used for encoding REG, in the range 0..7. */
28076 static int
28077 reg_encoded_number (rtx reg)
28079 unsigned regno = REGNO (reg);
28080 switch (regno)
28082 case AX_REG:
28083 return 0;
28084 case CX_REG:
28085 return 1;
28086 case DX_REG:
28087 return 2;
28088 case BX_REG:
28089 return 3;
28090 case SP_REG:
28091 return 4;
28092 case BP_REG:
28093 return 5;
28094 case SI_REG:
28095 return 6;
28096 case DI_REG:
28097 return 7;
28098 default:
28099 break;
28101 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28102 return regno - FIRST_STACK_REG;
28103 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28104 return regno - FIRST_SSE_REG;
28105 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28106 return regno - FIRST_MMX_REG;
28107 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28108 return regno - FIRST_REX_SSE_REG;
28109 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28110 return regno - FIRST_REX_INT_REG;
28111 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28112 return regno - FIRST_MASK_REG;
28113 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28114 return regno - FIRST_BND_REG;
28115 return -1;
28118 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28119 in its encoding if it could be relevant for ROP mitigation, otherwise
28120 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28121 used for calculating it into them. */
28123 static int
28124 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28125 int *popno0 = 0, int *popno1 = 0)
28127 if (asm_noperands (PATTERN (insn)) >= 0)
28128 return -1;
28129 int has_modrm = get_attr_modrm (insn);
28130 if (!has_modrm)
28131 return -1;
28132 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28133 rtx op0, op1;
28134 switch (cls)
28136 case MODRM_CLASS_OP02:
28137 gcc_assert (noperands >= 3);
28138 if (popno0)
28140 *popno0 = 0;
28141 *popno1 = 2;
28143 op0 = operands[0];
28144 op1 = operands[2];
28145 break;
28146 case MODRM_CLASS_OP01:
28147 gcc_assert (noperands >= 2);
28148 if (popno0)
28150 *popno0 = 0;
28151 *popno1 = 1;
28153 op0 = operands[0];
28154 op1 = operands[1];
28155 break;
28156 default:
28157 return -1;
28159 if (REG_P (op0) && REG_P (op1))
28161 int enc0 = reg_encoded_number (op0);
28162 int enc1 = reg_encoded_number (op1);
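/* Register-to-register modr/m byte: mod = 0b11 (the 0xc0), reg = ENC1,
   r/m = ENC0.  */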
28163 return 0xc0 + (enc1 << 3) + enc0;
28165 return -1;
28168 /* Check whether x86 address PARTS is a pc-relative address. */
28170 bool
28171 ix86_rip_relative_addr_p (struct ix86_address *parts)
28173 rtx base, index, disp;
28175 base = parts->base;
28176 index = parts->index;
28177 disp = parts->disp;
28179 if (disp && !base && !index)
28181 if (TARGET_64BIT)
28183 rtx symbol = disp;
28185 if (GET_CODE (disp) == CONST)
28186 symbol = XEXP (disp, 0);
28187 if (GET_CODE (symbol) == PLUS
28188 && CONST_INT_P (XEXP (symbol, 1)))
28189 symbol = XEXP (symbol, 0);
28191 if (GET_CODE (symbol) == LABEL_REF
28192 || (GET_CODE (symbol) == SYMBOL_REF
28193 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28194 || (GET_CODE (symbol) == UNSPEC
28195 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28196 || XINT (symbol, 1) == UNSPEC_PCREL
28197 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28198 return true;
28201 return false;
28204 /* Calculate the length of the memory address in the instruction encoding.
28205 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28206 or other prefixes. We never generate addr32 prefix for LEA insn. */
28209 memory_address_length (rtx addr, bool lea)
28211 struct ix86_address parts;
28212 rtx base, index, disp;
28213 int len;
28214 int ok;
28216 if (GET_CODE (addr) == PRE_DEC
28217 || GET_CODE (addr) == POST_INC
28218 || GET_CODE (addr) == PRE_MODIFY
28219 || GET_CODE (addr) == POST_MODIFY)
28220 return 0;
28222 ok = ix86_decompose_address (addr, &parts);
28223 gcc_assert (ok);
28225 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28227 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28228 if (TARGET_64BIT && !lea
28229 && (SImode_address_operand (addr, VOIDmode)
28230 || (parts.base && GET_MODE (parts.base) == SImode)
28231 || (parts.index && GET_MODE (parts.index) == SImode)))
28232 len++;
28234 base = parts.base;
28235 index = parts.index;
28236 disp = parts.disp;
28238 if (base && SUBREG_P (base))
28239 base = SUBREG_REG (base);
28240 if (index && SUBREG_P (index))
28241 index = SUBREG_REG (index);
28243 gcc_assert (base == NULL_RTX || REG_P (base));
28244 gcc_assert (index == NULL_RTX || REG_P (index));
28246 /* Rule of thumb:
28247 - esp as the base always wants an index,
28248 - ebp as the base always wants a displacement,
28249 - r12 as the base always wants an index,
28250 - r13 as the base always wants a displacement. */
28252 /* Register Indirect. */
28253 if (base && !index && !disp)
28255 /* esp (for its index) and ebp (for its displacement) need
28256 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28257 code. */
28258 if (base == arg_pointer_rtx
28259 || base == frame_pointer_rtx
28260 || REGNO (base) == SP_REG
28261 || REGNO (base) == BP_REG
28262 || REGNO (base) == R12_REG
28263 || REGNO (base) == R13_REG)
28264 len++;
28267 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28268 is not disp32, but disp32(%rip), so for disp32
28269 SIB byte is needed, unless print_operand_address
28270 optimizes it into disp32(%rip) or (%rip) is implied
28271 by UNSPEC. */
28272 else if (disp && !base && !index)
28274 len += 4;
28275 if (!ix86_rip_relative_addr_p (&parts))
28276 len++;
28278 else
28280 /* Find the length of the displacement constant. */
28281 if (disp)
28283 if (base && satisfies_constraint_K (disp))
28284 len += 1;
28285 else
28286 len += 4;
28288 /* ebp always wants a displacement. Similarly r13. */
28289 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28290 len++;
28292 /* An index requires the two-byte modrm form.... */
28293 if (index
28294 /* ...like esp (or r12), which always wants an index. */
28295 || base == arg_pointer_rtx
28296 || base == frame_pointer_rtx
28297 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28298 len++;
28301 return len;
28304 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28305 is set, expect that the insn has an 8-bit immediate alternative. */
28307 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28309 int len = 0;
28310 int i;
28311 extract_insn_cached (insn);
28312 for (i = recog_data.n_operands - 1; i >= 0; --i)
28313 if (CONSTANT_P (recog_data.operand[i]))
28315 enum attr_mode mode = get_attr_mode (insn);
28317 gcc_assert (!len);
28318 if (shortform && CONST_INT_P (recog_data.operand[i]))
28320 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28321 switch (mode)
28323 case MODE_QI:
28324 len = 1;
28325 continue;
28326 case MODE_HI:
28327 ival = trunc_int_for_mode (ival, HImode);
28328 break;
28329 case MODE_SI:
28330 ival = trunc_int_for_mode (ival, SImode);
28331 break;
28332 default:
28333 break;
28335 if (IN_RANGE (ival, -128, 127))
28337 len = 1;
28338 continue;
28341 switch (mode)
28343 case MODE_QI:
28344 len = 1;
28345 break;
28346 case MODE_HI:
28347 len = 2;
28348 break;
28349 case MODE_SI:
28350 len = 4;
28351 break;
28352 /* Immediates for DImode instructions are encoded
28353 as 32bit sign extended values. */
28354 case MODE_DI:
28355 len = 4;
28356 break;
28357 default:
28358 fatal_insn ("unknown insn mode", insn);
28361 return len;
28364 /* Compute default value for "length_address" attribute. */
28366 ix86_attr_length_address_default (rtx_insn *insn)
28368 int i;
28370 if (get_attr_type (insn) == TYPE_LEA)
28372 rtx set = PATTERN (insn), addr;
28374 if (GET_CODE (set) == PARALLEL)
28375 set = XVECEXP (set, 0, 0);
28377 gcc_assert (GET_CODE (set) == SET);
28379 addr = SET_SRC (set);
28381 return memory_address_length (addr, true);
28384 extract_insn_cached (insn);
28385 for (i = recog_data.n_operands - 1; i >= 0; --i)
28387 rtx op = recog_data.operand[i];
28388 if (MEM_P (op))
28390 constrain_operands_cached (insn, reload_completed);
28391 if (which_alternative != -1)
28393 const char *constraints = recog_data.constraints[i];
28394 int alt = which_alternative;
28396 while (*constraints == '=' || *constraints == '+')
28397 constraints++;
28398 while (alt-- > 0)
28399 while (*constraints++ != ',')
28401 /* Skip ignored operands. */
28402 if (*constraints == 'X')
28403 continue;
28406 int len = memory_address_length (XEXP (op, 0), false);
28408 /* Account for segment prefix for non-default addr spaces. */
28409 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28410 len++;
28412 return len;
28415 return 0;
28418 /* Compute default value for "length_vex" attribute. It includes
28419 2 or 3 byte VEX prefix and 1 opcode byte. */
28422 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28423 bool has_vex_w)
28425 int i;
28427 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX.W bit
28428 requires the 3-byte VEX prefix. */
28429 if (!has_0f_opcode || has_vex_w)
28430 return 3 + 1;
28432 /* We can always use 2 byte VEX prefix in 32bit. */
28433 if (!TARGET_64BIT)
28434 return 2 + 1;
28436 extract_insn_cached (insn);
28438 for (i = recog_data.n_operands - 1; i >= 0; --i)
28439 if (REG_P (recog_data.operand[i]))
28441 /* REX.W bit uses 3 byte VEX prefix. */
28442 if (GET_MODE (recog_data.operand[i]) == DImode
28443 && GENERAL_REG_P (recog_data.operand[i]))
28444 return 3 + 1;
28446 else
28448 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28449 if (MEM_P (recog_data.operand[i])
28450 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28451 return 3 + 1;
28454 return 2 + 1;
28458 static bool
28459 ix86_class_likely_spilled_p (reg_class_t);
28461 /* Return true if the lhs of INSN is a HW function argument register, and
28462 set IS_SPILLED to true if it is a likely spilled HW register. */
28463 static bool
28464 insn_is_function_arg (rtx insn, bool* is_spilled)
28466 rtx dst;
28468 if (!NONDEBUG_INSN_P (insn))
28469 return false;
28470 /* Call instructions are not movable; ignore them. */
28471 if (CALL_P (insn))
28472 return false;
28473 insn = PATTERN (insn);
28474 if (GET_CODE (insn) == PARALLEL)
28475 insn = XVECEXP (insn, 0, 0);
28476 if (GET_CODE (insn) != SET)
28477 return false;
28478 dst = SET_DEST (insn);
28479 if (REG_P (dst) && HARD_REGISTER_P (dst)
28480 && ix86_function_arg_regno_p (REGNO (dst)))
28482 /* Is it likely spilled HW register? */
28483 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28484 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28485 *is_spilled = true;
28486 return true;
28488 return false;
28491 /* Add output dependencies for a chain of adjacent function arguments, but
28492 only if there is a move to a likely spilled HW register. Return the first
28493 argument if at least one dependence was added, or NULL otherwise. */
28494 static rtx_insn *
28495 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28497 rtx_insn *insn;
28498 rtx_insn *last = call;
28499 rtx_insn *first_arg = NULL;
28500 bool is_spilled = false;
28502 head = PREV_INSN (head);
28504 /* Find the argument-passing instruction nearest to the call. */
28505 while (true)
28507 last = PREV_INSN (last);
28508 if (last == head)
28509 return NULL;
28510 if (!NONDEBUG_INSN_P (last))
28511 continue;
28512 if (insn_is_function_arg (last, &is_spilled))
28513 break;
28514 return NULL;
28517 first_arg = last;
28518 while (true)
28520 insn = PREV_INSN (last);
28521 if (!INSN_P (insn))
28522 break;
28523 if (insn == head)
28524 break;
28525 if (!NONDEBUG_INSN_P (insn))
28527 last = insn;
28528 continue;
28530 if (insn_is_function_arg (insn, &is_spilled))
28532 /* Add an output dependence between two function arguments if the chain
28533 of output arguments contains likely spilled HW registers. */
28534 if (is_spilled)
28535 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28536 first_arg = last = insn;
28538 else
28539 break;
28541 if (!is_spilled)
28542 return NULL;
28543 return first_arg;
28546 /* Add output or anti dependency from insn to first_arg to restrict its code
28547 motion. */
28548 static void
28549 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28551 rtx set;
28552 rtx tmp;
28554 /* Add anti dependencies for bounds stores. */
28555 if (INSN_P (insn)
28556 && GET_CODE (PATTERN (insn)) == PARALLEL
28557 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28558 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28560 add_dependence (first_arg, insn, REG_DEP_ANTI);
28561 return;
28564 set = single_set (insn);
28565 if (!set)
28566 return;
28567 tmp = SET_DEST (set);
28568 if (REG_P (tmp))
28570 /* Add output dependency to the first function argument. */
28571 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28572 return;
28574 /* Add anti dependency. */
28575 add_dependence (first_arg, insn, REG_DEP_ANTI);
28578 /* Avoid cross-block motion of a function argument by adding a dependency
28579 from the first non-jump instruction in BB. */
28580 static void
28581 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28583 rtx_insn *insn = BB_END (bb);
28585 while (insn)
28587 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28589 rtx set = single_set (insn);
28590 if (set)
28592 avoid_func_arg_motion (arg, insn);
28593 return;
28596 if (insn == BB_HEAD (bb))
28597 return;
28598 insn = PREV_INSN (insn);
28602 /* Hook for pre-reload schedule - avoid motion of function arguments
28603 passed in likely spilled HW registers. */
28604 static void
28605 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28607 rtx_insn *insn;
28608 rtx_insn *first_arg = NULL;
28609 if (reload_completed)
28610 return;
28611 while (head != tail && DEBUG_INSN_P (head))
28612 head = NEXT_INSN (head);
28613 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28614 if (INSN_P (insn) && CALL_P (insn))
28616 first_arg = add_parameter_dependencies (insn, head);
28617 if (first_arg)
28619 /* Add a dependee for the first argument to predecessors, but only
28620 if the region contains more than one block. */
28621 basic_block bb = BLOCK_FOR_INSN (insn);
28622 int rgn = CONTAINING_RGN (bb->index);
28623 int nr_blks = RGN_NR_BLOCKS (rgn);
28624 /* Skip trivial regions and region head blocks that can have
28625 predecessors outside of region. */
28626 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28628 edge e;
28629 edge_iterator ei;
28631 /* Regions are SCCs with the exception of selective
28632 scheduling with pipelining of outer blocks enabled.
28633 So also check that immediate predecessors of a non-head
28634 block are in the same region. */
28635 FOR_EACH_EDGE (e, ei, bb->preds)
28637 /* Avoid creating loop-carried dependencies by using the
28638 topological ordering in the region. */
28639 if (rgn == CONTAINING_RGN (e->src->index)
28640 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28641 add_dependee_for_func_arg (first_arg, e->src);
28644 insn = first_arg;
28645 if (insn == head)
28646 break;
28649 else if (first_arg)
28650 avoid_func_arg_motion (first_arg, insn);
28653 /* Hook for pre-reload schedule - set priority of moves from likely spilled
28654 HW registers to maximum, to schedule them as soon as possible. These are
28655 moves from function argument registers at the top of the function entry
28656 and moves from function return value registers after a call. */
28657 static int
28658 ix86_adjust_priority (rtx_insn *insn, int priority)
28660 rtx set;
28662 if (reload_completed)
28663 return priority;
28665 if (!NONDEBUG_INSN_P (insn))
28666 return priority;
28668 set = single_set (insn);
28669 if (set)
28671 rtx tmp = SET_SRC (set);
28672 if (REG_P (tmp)
28673 && HARD_REGISTER_P (tmp)
28674 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28675 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28676 return current_sched_info->sched_max_insns_priority;
28679 return priority;
28682 /* Prepare for scheduling pass. */
28683 static void
28684 ix86_sched_init_global (FILE *, int, int)
28686 /* Install scheduling hooks for current CPU. Some of these hooks are used
28687 in time-critical parts of the scheduler, so we only set them up when
28688 they are actually used. */
28689 switch (ix86_tune)
28691 case PROCESSOR_CORE2:
28692 case PROCESSOR_NEHALEM:
28693 case PROCESSOR_SANDYBRIDGE:
28694 case PROCESSOR_HASWELL:
28695 /* Do not perform multipass scheduling for pre-reload schedule
28696 to save compile time. */
28697 if (reload_completed)
28699 ix86_core2i7_init_hooks ();
28700 break;
28702 /* Fall through. */
28703 default:
28704 targetm.sched.dfa_post_advance_cycle = NULL;
28705 targetm.sched.first_cycle_multipass_init = NULL;
28706 targetm.sched.first_cycle_multipass_begin = NULL;
28707 targetm.sched.first_cycle_multipass_issue = NULL;
28708 targetm.sched.first_cycle_multipass_backtrack = NULL;
28709 targetm.sched.first_cycle_multipass_end = NULL;
28710 targetm.sched.first_cycle_multipass_fini = NULL;
28711 break;
28716 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28718 static HOST_WIDE_INT
28719 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28721 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28722 || TREE_CODE (exp) == INTEGER_CST)
28724 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
28725 return 64;
28726 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
28727 return 128;
28729 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28730 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28731 return BITS_PER_WORD;
28733 return align;
28736 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28737 the data type, and ALIGN is the alignment that the object would
28738 ordinarily have. */
28740 static int
28741 iamcu_alignment (tree type, int align)
28743 machine_mode mode;
28745 if (align < 32 || TYPE_USER_ALIGN (type))
28746 return align;
28748 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28749 bytes. */
28750 mode = TYPE_MODE (strip_array_types (type));
28751 switch (GET_MODE_CLASS (mode))
28753 case MODE_INT:
28754 case MODE_COMPLEX_INT:
28755 case MODE_COMPLEX_FLOAT:
28756 case MODE_FLOAT:
28757 case MODE_DECIMAL_FLOAT:
28758 return 32;
28759 default:
28760 return align;
28764 /* Compute the alignment for a static variable.
28765 TYPE is the data type, and ALIGN is the alignment that
28766 the object would ordinarily have. The value of this function is used
28767 instead of that alignment to align the object. */
28770 ix86_data_alignment (tree type, int align, bool opt)
28772 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28773 for symbols from other compilation units or symbols that don't need
28774 to bind locally. In order to preserve some ABI compatibility with
28775 those compilers, ensure we don't decrease alignment from what we
28776 used to assume. */
28778 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
28780 /* A data structure equal to or greater than the size of a cache line
28781 (64 bytes in the Pentium 4 and other recent Intel processors, including
28782 processors based on the Intel Core microarchitecture) should be aligned
28783 so that its base address is a multiple of the cache line size. */
28785 int max_align
28786 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
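/* PREFETCH_BLOCK is a byte count (the prefetch/cache-line size), so
   multiplying by 8 yields bits, matching MAX_OFILE_ALIGNMENT and the
   bit-valued alignments used here.  */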
28788 if (max_align < BITS_PER_WORD)
28789 max_align = BITS_PER_WORD;
28791 switch (ix86_align_data_type)
28793 case ix86_align_data_type_abi: opt = false; break;
28794 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
28795 case ix86_align_data_type_cacheline: break;
28798 if (TARGET_IAMCU)
28799 align = iamcu_alignment (type, align);
28801 if (opt
28802 && AGGREGATE_TYPE_P (type)
28803 && TYPE_SIZE (type)
28804 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
28806 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
28807 && align < max_align_compat)
28808 align = max_align_compat;
28809 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
28810 && align < max_align)
28811 align = max_align;
28814 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
28815 to a 16-byte boundary. */
28816 if (TARGET_64BIT)
28818 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
28819 && TYPE_SIZE (type)
28820 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
28821 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
28822 && align < 128)
28823 return 128;
28826 if (!opt)
28827 return align;
28829 if (TREE_CODE (type) == ARRAY_TYPE)
28831 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
28832 return 64;
28833 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
28834 return 128;
28836 else if (TREE_CODE (type) == COMPLEX_TYPE)
28839 if (TYPE_MODE (type) == DCmode && align < 64)
28840 return 64;
28841 if ((TYPE_MODE (type) == XCmode
28842 || TYPE_MODE (type) == TCmode) && align < 128)
28843 return 128;
28845 else if ((TREE_CODE (type) == RECORD_TYPE
28846 || TREE_CODE (type) == UNION_TYPE
28847 || TREE_CODE (type) == QUAL_UNION_TYPE)
28848 && TYPE_FIELDS (type))
28850 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
28851 return 64;
28852 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
28853 return 128;
28855 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
28856 || TREE_CODE (type) == INTEGER_TYPE)
28858 if (TYPE_MODE (type) == DFmode && align < 64)
28859 return 64;
28860 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
28861 return 128;
28864 return align;
28867 /* Compute the alignment for a local variable or a stack slot. EXP is
28868 the data type or decl itself, MODE is the widest mode available and
28869 ALIGN is the alignment that the object would ordinarily have. The
28870 value of this macro is used instead of that alignment to align the
28871 object. */
28873 unsigned int
28874 ix86_local_alignment (tree exp, machine_mode mode,
28875 unsigned int align)
28877 tree type, decl;
28879 if (exp && DECL_P (exp))
28881 type = TREE_TYPE (exp);
28882 decl = exp;
28884 else
28886 type = exp;
28887 decl = NULL;
28890 /* Don't do dynamic stack realignment for long long objects with
28891 -mpreferred-stack-boundary=2. */
28892 if (!TARGET_64BIT
28893 && align == 64
28894 && ix86_preferred_stack_boundary < 64
28895 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
28896 && (!type || !TYPE_USER_ALIGN (type))
28897 && (!decl || !DECL_USER_ALIGN (decl)))
28898 align = 32;
28900 /* If TYPE is NULL, we are allocating a stack slot for caller-save
28901 register in MODE. We will return the largest alignment of XF
28902 and DF. */
28903 if (!type)
28905 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
28906 align = GET_MODE_ALIGNMENT (DFmode);
28907 return align;
28910 /* Don't increase alignment for Intel MCU psABI. */
28911 if (TARGET_IAMCU)
28912 return align;
28914 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
28915 to a 16-byte boundary. The exact wording is:
28917 An array uses the same alignment as its elements, except that a local or
28918 global array variable of length at least 16 bytes or
28919 a C99 variable-length array variable always has alignment of at least 16 bytes.
28921 This was added to allow use of aligned SSE instructions on arrays. This
28922 rule is meant for static storage (where the compiler cannot do the analysis
28923 by itself). We follow it for automatic variables only when convenient.
28924 We fully control everything in the function being compiled, and functions
28925 from other units cannot rely on the alignment.
28927 Exclude the va_list type. It is the common case of a local array where
28928 we cannot benefit from the alignment.
28930 TODO: Probably one should optimize for size only when the variable does not escape. */
28931 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
28932 && TARGET_SSE)
28934 if (AGGREGATE_TYPE_P (type)
28935 && (va_list_type_node == NULL_TREE
28936 || (TYPE_MAIN_VARIANT (type)
28937 != TYPE_MAIN_VARIANT (va_list_type_node)))
28938 && TYPE_SIZE (type)
28939 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
28940 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
28941 && align < 128)
28942 return 128;
28944 if (TREE_CODE (type) == ARRAY_TYPE)
28946 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
28947 return 64;
28948 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
28949 return 128;
28951 else if (TREE_CODE (type) == COMPLEX_TYPE)
28953 if (TYPE_MODE (type) == DCmode && align < 64)
28954 return 64;
28955 if ((TYPE_MODE (type) == XCmode
28956 || TYPE_MODE (type) == TCmode) && align < 128)
28957 return 128;
28959 else if ((TREE_CODE (type) == RECORD_TYPE
28960 || TREE_CODE (type) == UNION_TYPE
28961 || TREE_CODE (type) == QUAL_UNION_TYPE)
28962 && TYPE_FIELDS (type))
28964 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
28965 return 64;
28966 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
28967 return 128;
28969 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
28970 || TREE_CODE (type) == INTEGER_TYPE)
28973 if (TYPE_MODE (type) == DFmode && align < 64)
28974 return 64;
28975 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
28976 return 128;
28978 return align;
28981 /* Compute the minimum required alignment for dynamic stack realignment
28982 purposes for a local variable, parameter or a stack slot. EXP is
28983 the data type or decl itself, MODE is its mode and ALIGN is the
28984 alignment that the object would ordinarily have. */
28986 unsigned int
28987 ix86_minimum_alignment (tree exp, machine_mode mode,
28988 unsigned int align)
28990 tree type, decl;
28992 if (exp && DECL_P (exp))
28994 type = TREE_TYPE (exp);
28995 decl = exp;
28997 else
28999 type = exp;
29000 decl = NULL;
29003 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29004 return align;
29006 /* Don't do dynamic stack realignment for long long objects with
29007 -mpreferred-stack-boundary=2. */
29008 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29009 && (!type || !TYPE_USER_ALIGN (type))
29010 && (!decl || !DECL_USER_ALIGN (decl)))
29012 gcc_checking_assert (!TARGET_STV);
29013 return 32;
29016 return align;
29019 /* Find a location for the static chain incoming to a nested function.
29020 This is a register, unless all free registers are used by arguments. */
29022 static rtx
29023 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29025 unsigned regno;
29027 /* While this function won't be called by the middle-end when a static
29028 chain isn't needed, it's also used throughout the backend so it's
29029 easiest to keep this check centralized. */
29030 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
29031 return NULL;
29033 if (TARGET_64BIT)
29035 /* We always use R10 in 64-bit mode. */
29036 regno = R10_REG;
29038 else
29040 const_tree fntype, fndecl;
29041 unsigned int ccvt;
29043 /* By default in 32-bit mode we use ECX to pass the static chain. */
29044 regno = CX_REG;
29046 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29048 fntype = TREE_TYPE (fndecl_or_type);
29049 fndecl = fndecl_or_type;
29051 else
29053 fntype = fndecl_or_type;
29054 fndecl = NULL;
29057 ccvt = ix86_get_callcvt (fntype);
29058 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29060 /* Fastcall functions use ecx/edx for arguments, which leaves
29061 us with EAX for the static chain.
29062 Thiscall functions use ecx for arguments, which also
29063 leaves us with EAX for the static chain. */
29064 regno = AX_REG;
29066 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29068 /* Thiscall functions use ecx for arguments, which leaves
29069 us with EAX and EDX for the static chain.
29070 We use EAX for ABI compatibility. */
29071 regno = AX_REG;
29073 else if (ix86_function_regparm (fntype, fndecl) == 3)
29075 /* For regparm 3, we have no free call-clobbered registers in
29076 which to store the static chain. In order to implement this,
29077 we have the trampoline push the static chain to the stack.
29078 However, we can't push a value below the return address when
29079 we call the nested function directly, so we have to use an
29080 alternate entry point. For this we use ESI, and have the
29081 alternate entry point push ESI, so that things appear the
29082 same once we're executing the nested function. */
29083 if (incoming_p)
29085 if (fndecl == current_function_decl
29086 && !ix86_static_chain_on_stack)
29088 gcc_assert (!reload_completed);
29089 ix86_static_chain_on_stack = true;
29091 return gen_frame_mem (SImode,
29092 plus_constant (Pmode,
29093 arg_pointer_rtx, -8));
29095 regno = SI_REG;
29099 return gen_rtx_REG (Pmode, regno);
29102 /* Emit RTL insns to initialize the variable parts of a trampoline.
29103 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29104 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29105 to be passed to the target function. */
29107 static void
29108 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29110 rtx mem, fnaddr;
29111 int opcode;
29112 int offset = 0;
29114 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29116 if (TARGET_64BIT)
29118 int size;
29120 /* Load the function address into r11. Try to load the address using
29121 the shorter movl instead of movabs. We may want to support
29122 movq for kernel mode, but the kernel does not use trampolines at
29123 the moment. FNADDR is a 32-bit address and may not be in
29124 DImode when ptr_mode == SImode. Always use movl in this
29125 case. */
29126 if (ptr_mode == SImode
29127 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29129 fnaddr = copy_addr_to_reg (fnaddr);
29131 mem = adjust_address (m_tramp, HImode, offset);
29132 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
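/* Stored little-endian, 0xbb41 is the byte sequence 0x41 0xbb: a REX.B
   prefix followed by the mov-immediate opcode, i.e. movl $imm32, %r11d.  */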
29134 mem = adjust_address (m_tramp, SImode, offset + 2);
29135 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29136 offset += 6;
29138 else
29140 mem = adjust_address (m_tramp, HImode, offset);
29141 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29143 mem = adjust_address (m_tramp, DImode, offset + 2);
29144 emit_move_insn (mem, fnaddr);
29145 offset += 10;
29148 /* Load static chain using movabs to r10. Use the shorter movl
29149 instead of movabs when ptr_mode == SImode. */
29150 if (ptr_mode == SImode)
29152 opcode = 0xba41;
29153 size = 6;
29155 else
29157 opcode = 0xba49;
29158 size = 10;
29161 mem = adjust_address (m_tramp, HImode, offset);
29162 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29164 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29165 emit_move_insn (mem, chain_value);
29166 offset += size;
29168 /* Jump to r11; the last (unused) byte is a nop, only there to
29169 pad the write out to a single 32-bit store. */
29170 mem = adjust_address (m_tramp, SImode, offset);
29171 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29172 offset += 4;
29174 else
29176 rtx disp, chain;
29178 /* Depending on the static chain location, either load a register
29179 with a constant, or push the constant to the stack. All of the
29180 instructions are the same size. */
29181 chain = ix86_static_chain (fndecl, true);
29182 if (REG_P (chain))
29184 switch (REGNO (chain))
29186 case AX_REG:
29187 opcode = 0xb8; break;
29188 case CX_REG:
29189 opcode = 0xb9; break;
29190 default:
29191 gcc_unreachable ();
29194 else
29195 opcode = 0x68;
29197 mem = adjust_address (m_tramp, QImode, offset);
29198 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29200 mem = adjust_address (m_tramp, SImode, offset + 1);
29201 emit_move_insn (mem, chain_value);
29202 offset += 5;
29204 mem = adjust_address (m_tramp, QImode, offset);
29205 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29207 mem = adjust_address (m_tramp, SImode, offset + 1);
29209 /* Compute offset from the end of the jmp to the target function.
29210 In the case in which the trampoline stores the static chain on
29211 the stack, we need to skip the first insn which pushes the
29212 (call-saved) register static chain; this push is 1 byte. */
29213 offset += 5;
29214 disp = expand_binop (SImode, sub_optab, fnaddr,
29215 plus_constant (Pmode, XEXP (m_tramp, 0),
29216 offset - (MEM_P (chain) ? 1 : 0)),
29217 NULL_RTX, 1, OPTAB_DIRECT);
29218 emit_move_insn (mem, disp);
29221 gcc_assert (offset <= TRAMPOLINE_SIZE);
29223 #ifdef HAVE_ENABLE_EXECUTE_STACK
29224 #ifdef CHECK_EXECUTE_STACK_ENABLED
29225 if (CHECK_EXECUTE_STACK_ENABLED)
29226 #endif
29227 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29228 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29229 #endif
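/* Illustrative layout (not part of the original source): the bytes written
   into the trampoline by the code above.  Offsets assume the 64-bit branch
   with ptr_mode == DImode; FNADDR and CHAIN stand for the 8-byte immediates.

     offset  bytes            instruction
     0       49 bb <FNADDR>   movabs $FNADDR, %r11
     10      49 ba <CHAIN>    movabs $CHAIN,  %r10
     20      49 ff e3 90      jmp *%r11 ; trailing nop pads the 4-byte store

   In the 32-bit branch the sequence is instead a 5-byte mov/push of the
   chain (opcode 0xb8, 0xb9 or 0x68 plus imm32) followed by a 5-byte 0xe9
   jmp; when the chain lives on the stack the displacement is biased by one
   byte so the jmp skips the 1-byte push that direct callers execute at the
   nested function's entry.  */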
29232 static bool
29233 ix86_allocate_stack_slots_for_args (void)
29235 /* Naked functions should not allocate stack slots for arguments. */
29236 return !ix86_function_naked (current_function_decl);
29239 static bool
29240 ix86_warn_func_return (tree decl)
29242 /* Naked functions are implemented entirely in assembly, including the
29243 return sequence, so suppress warnings about this. */
29244 return !ix86_function_naked (decl);
29247 /* The following file contains several enumerations and data structures
29248 built from the definitions in i386-builtin-types.def. */
29250 #include "i386-builtin-types.inc"
29252 /* Table for the ix86 builtin non-function types. */
29253 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29255 /* Retrieve an element from the above table, building some of
29256 the types lazily. */
29258 static tree
29259 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29261 unsigned int index;
29262 tree type, itype;
29264 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29266 type = ix86_builtin_type_tab[(int) tcode];
29267 if (type != NULL)
29268 return type;
29270 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29271 if (tcode <= IX86_BT_LAST_VECT)
29273 machine_mode mode;
29275 index = tcode - IX86_BT_LAST_PRIM - 1;
29276 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29277 mode = ix86_builtin_type_vect_mode[index];
29279 type = build_vector_type_for_mode (itype, mode);
29281 else
29283 int quals;
29285 index = tcode - IX86_BT_LAST_VECT - 1;
29286 if (tcode <= IX86_BT_LAST_PTR)
29287 quals = TYPE_UNQUALIFIED;
29288 else
29289 quals = TYPE_QUAL_CONST;
29291 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29292 if (quals != TYPE_UNQUALIFIED)
29293 itype = build_qualified_type (itype, quals);
29295 type = build_pointer_type (itype);
29298 ix86_builtin_type_tab[(int) tcode] = type;
29299 return type;
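/* Hedged usage sketch (not from the original source): the table above acts
   as a lazy cache, so repeated lookups of the same code hand back a single
   shared tree.  IX86_BT_V4SF is assumed to be the enumerator generated for
   the 4 x float vector type in i386-builtin-types.inc.  */
#if 0
  tree v4sf = ix86_get_builtin_type (IX86_BT_V4SF);		/* built on first use */
  gcc_assert (v4sf == ix86_get_builtin_type (IX86_BT_V4SF));	/* cached thereafter */
#endif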
29302 /* Table for the ix86 builtin function types. */
29303 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29305 /* Retrieve an element from the above table, building some of
29306 the types lazily. */
29308 static tree
29309 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29311 tree type;
29313 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29315 type = ix86_builtin_func_type_tab[(int) tcode];
29316 if (type != NULL)
29317 return type;
29319 if (tcode <= IX86_BT_LAST_FUNC)
29321 unsigned start = ix86_builtin_func_start[(int) tcode];
29322 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29323 tree rtype, atype, args = void_list_node;
29324 unsigned i;
29326 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29327 for (i = after - 1; i > start; --i)
29329 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29330 args = tree_cons (NULL, atype, args);
29333 type = build_function_type (rtype, args);
29335 else
29337 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29338 enum ix86_builtin_func_type icode;
29340 icode = ix86_builtin_func_alias_base[index];
29341 type = ix86_get_builtin_func_type (icode);
29344 ix86_builtin_func_type_tab[(int) tcode] = type;
29345 return type;
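/* Hedged usage sketch (not from the original source): for a simple function
   type code such as V4SF_FTYPE_V4SF_V4SF (used by MULTI_ARG_2_SF below),
   the args table stores the return type first, and the loop above walks the
   argument slots backwards so the resulting TREE_LIST ends in
   void_list_node.  */
#if 0
  tree ftype = ix86_get_builtin_func_type (V4SF_FTYPE_V4SF_V4SF);
  /* ftype is V4SF (*) (V4SF, V4SF), cached for later callers.  */
#endif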
29349 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29350 bdesc_* arrays below should come first, then builtins for each bdesc_*
29351 array in ascending order, so that we can use direct array accesses. */
29352 enum ix86_builtins
29354 IX86_BUILTIN_MASKMOVQ,
29355 IX86_BUILTIN_LDMXCSR,
29356 IX86_BUILTIN_STMXCSR,
29357 IX86_BUILTIN_MASKMOVDQU,
29358 IX86_BUILTIN_PSLLDQ128,
29359 IX86_BUILTIN_CLFLUSH,
29360 IX86_BUILTIN_MONITOR,
29361 IX86_BUILTIN_MWAIT,
29362 IX86_BUILTIN_CLZERO,
29363 IX86_BUILTIN_VEC_INIT_V2SI,
29364 IX86_BUILTIN_VEC_INIT_V4HI,
29365 IX86_BUILTIN_VEC_INIT_V8QI,
29366 IX86_BUILTIN_VEC_EXT_V2DF,
29367 IX86_BUILTIN_VEC_EXT_V2DI,
29368 IX86_BUILTIN_VEC_EXT_V4SF,
29369 IX86_BUILTIN_VEC_EXT_V4SI,
29370 IX86_BUILTIN_VEC_EXT_V8HI,
29371 IX86_BUILTIN_VEC_EXT_V2SI,
29372 IX86_BUILTIN_VEC_EXT_V4HI,
29373 IX86_BUILTIN_VEC_EXT_V16QI,
29374 IX86_BUILTIN_VEC_SET_V2DI,
29375 IX86_BUILTIN_VEC_SET_V4SF,
29376 IX86_BUILTIN_VEC_SET_V4SI,
29377 IX86_BUILTIN_VEC_SET_V8HI,
29378 IX86_BUILTIN_VEC_SET_V4HI,
29379 IX86_BUILTIN_VEC_SET_V16QI,
29380 IX86_BUILTIN_GATHERSIV2DF,
29381 IX86_BUILTIN_GATHERSIV4DF,
29382 IX86_BUILTIN_GATHERDIV2DF,
29383 IX86_BUILTIN_GATHERDIV4DF,
29384 IX86_BUILTIN_GATHERSIV4SF,
29385 IX86_BUILTIN_GATHERSIV8SF,
29386 IX86_BUILTIN_GATHERDIV4SF,
29387 IX86_BUILTIN_GATHERDIV8SF,
29388 IX86_BUILTIN_GATHERSIV2DI,
29389 IX86_BUILTIN_GATHERSIV4DI,
29390 IX86_BUILTIN_GATHERDIV2DI,
29391 IX86_BUILTIN_GATHERDIV4DI,
29392 IX86_BUILTIN_GATHERSIV4SI,
29393 IX86_BUILTIN_GATHERSIV8SI,
29394 IX86_BUILTIN_GATHERDIV4SI,
29395 IX86_BUILTIN_GATHERDIV8SI,
29396 IX86_BUILTIN_VFMSUBSD3_MASK3,
29397 IX86_BUILTIN_VFMSUBSS3_MASK3,
29398 IX86_BUILTIN_GATHER3SIV8SF,
29399 IX86_BUILTIN_GATHER3SIV4SF,
29400 IX86_BUILTIN_GATHER3SIV4DF,
29401 IX86_BUILTIN_GATHER3SIV2DF,
29402 IX86_BUILTIN_GATHER3DIV8SF,
29403 IX86_BUILTIN_GATHER3DIV4SF,
29404 IX86_BUILTIN_GATHER3DIV4DF,
29405 IX86_BUILTIN_GATHER3DIV2DF,
29406 IX86_BUILTIN_GATHER3SIV8SI,
29407 IX86_BUILTIN_GATHER3SIV4SI,
29408 IX86_BUILTIN_GATHER3SIV4DI,
29409 IX86_BUILTIN_GATHER3SIV2DI,
29410 IX86_BUILTIN_GATHER3DIV8SI,
29411 IX86_BUILTIN_GATHER3DIV4SI,
29412 IX86_BUILTIN_GATHER3DIV4DI,
29413 IX86_BUILTIN_GATHER3DIV2DI,
29414 IX86_BUILTIN_SCATTERSIV8SF,
29415 IX86_BUILTIN_SCATTERSIV4SF,
29416 IX86_BUILTIN_SCATTERSIV4DF,
29417 IX86_BUILTIN_SCATTERSIV2DF,
29418 IX86_BUILTIN_SCATTERDIV8SF,
29419 IX86_BUILTIN_SCATTERDIV4SF,
29420 IX86_BUILTIN_SCATTERDIV4DF,
29421 IX86_BUILTIN_SCATTERDIV2DF,
29422 IX86_BUILTIN_SCATTERSIV8SI,
29423 IX86_BUILTIN_SCATTERSIV4SI,
29424 IX86_BUILTIN_SCATTERSIV4DI,
29425 IX86_BUILTIN_SCATTERSIV2DI,
29426 IX86_BUILTIN_SCATTERDIV8SI,
29427 IX86_BUILTIN_SCATTERDIV4SI,
29428 IX86_BUILTIN_SCATTERDIV4DI,
29429 IX86_BUILTIN_SCATTERDIV2DI,
29430 /* Alternate 4 and 8 element gather/scatter for the vectorizer
29431 where all operands are 32-byte or 64-byte wide respectively. */
29432 IX86_BUILTIN_GATHERALTSIV4DF,
29433 IX86_BUILTIN_GATHERALTDIV8SF,
29434 IX86_BUILTIN_GATHERALTSIV4DI,
29435 IX86_BUILTIN_GATHERALTDIV8SI,
29436 IX86_BUILTIN_GATHER3ALTDIV16SF,
29437 IX86_BUILTIN_GATHER3ALTDIV16SI,
29438 IX86_BUILTIN_GATHER3ALTSIV4DF,
29439 IX86_BUILTIN_GATHER3ALTDIV8SF,
29440 IX86_BUILTIN_GATHER3ALTSIV4DI,
29441 IX86_BUILTIN_GATHER3ALTDIV8SI,
29442 IX86_BUILTIN_GATHER3ALTSIV8DF,
29443 IX86_BUILTIN_GATHER3ALTSIV8DI,
29444 IX86_BUILTIN_GATHER3DIV16SF,
29445 IX86_BUILTIN_GATHER3DIV16SI,
29446 IX86_BUILTIN_GATHER3DIV8DF,
29447 IX86_BUILTIN_GATHER3DIV8DI,
29448 IX86_BUILTIN_GATHER3SIV16SF,
29449 IX86_BUILTIN_GATHER3SIV16SI,
29450 IX86_BUILTIN_GATHER3SIV8DF,
29451 IX86_BUILTIN_GATHER3SIV8DI,
29452 IX86_BUILTIN_SCATTERALTSIV8DF,
29453 IX86_BUILTIN_SCATTERALTDIV16SF,
29454 IX86_BUILTIN_SCATTERALTSIV8DI,
29455 IX86_BUILTIN_SCATTERALTDIV16SI,
29456 IX86_BUILTIN_SCATTERDIV16SF,
29457 IX86_BUILTIN_SCATTERDIV16SI,
29458 IX86_BUILTIN_SCATTERDIV8DF,
29459 IX86_BUILTIN_SCATTERDIV8DI,
29460 IX86_BUILTIN_SCATTERSIV16SF,
29461 IX86_BUILTIN_SCATTERSIV16SI,
29462 IX86_BUILTIN_SCATTERSIV8DF,
29463 IX86_BUILTIN_SCATTERSIV8DI,
29464 IX86_BUILTIN_GATHERPFQPD,
29465 IX86_BUILTIN_GATHERPFDPS,
29466 IX86_BUILTIN_GATHERPFDPD,
29467 IX86_BUILTIN_GATHERPFQPS,
29468 IX86_BUILTIN_SCATTERPFDPD,
29469 IX86_BUILTIN_SCATTERPFDPS,
29470 IX86_BUILTIN_SCATTERPFQPD,
29471 IX86_BUILTIN_SCATTERPFQPS,
29472 IX86_BUILTIN_CLWB,
29473 IX86_BUILTIN_CLFLUSHOPT,
29474 IX86_BUILTIN_INFQ,
29475 IX86_BUILTIN_HUGE_VALQ,
29476 IX86_BUILTIN_NANQ,
29477 IX86_BUILTIN_NANSQ,
29478 IX86_BUILTIN_XABORT,
29479 IX86_BUILTIN_ADDCARRYX32,
29480 IX86_BUILTIN_ADDCARRYX64,
29481 IX86_BUILTIN_SBB32,
29482 IX86_BUILTIN_SBB64,
29483 IX86_BUILTIN_RDRAND16_STEP,
29484 IX86_BUILTIN_RDRAND32_STEP,
29485 IX86_BUILTIN_RDRAND64_STEP,
29486 IX86_BUILTIN_RDSEED16_STEP,
29487 IX86_BUILTIN_RDSEED32_STEP,
29488 IX86_BUILTIN_RDSEED64_STEP,
29489 IX86_BUILTIN_MONITORX,
29490 IX86_BUILTIN_MWAITX,
29491 IX86_BUILTIN_CFSTRING,
29492 IX86_BUILTIN_CPU_INIT,
29493 IX86_BUILTIN_CPU_IS,
29494 IX86_BUILTIN_CPU_SUPPORTS,
29495 IX86_BUILTIN_READ_FLAGS,
29496 IX86_BUILTIN_WRITE_FLAGS,
29498 /* All the remaining builtins are tracked in bdesc_* arrays in
29499 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29500 this point. */
29501 #define BDESC(mask, icode, name, code, comparison, flag) \
29502 code,
29503 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29504 code, \
29505 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29506 #define BDESC_END(kind, next_kind)
29508 #include "i386-builtin.def"
29510 #undef BDESC
29511 #undef BDESC_FIRST
29512 #undef BDESC_END
29514 IX86_BUILTIN_MAX,
29516 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29518 /* Now just the aliases for bdesc_* start/end. */
29519 #define BDESC(mask, icode, name, code, comparison, flag)
29520 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29521 #define BDESC_END(kind, next_kind) \
29522 IX86_BUILTIN__BDESC_##kind##_LAST \
29523 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29525 #include "i386-builtin.def"
29527 #undef BDESC
29528 #undef BDESC_FIRST
29529 #undef BDESC_END
29531 /* Just to make sure there is no comma after the last enumerator. */
29532 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
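/* Illustrative expansion (not part of the original source): a hypothetical
   i386-builtin.def entry
     BDESC_FIRST (args, ARGS, OPTION_MASK_ISA_SSE, CODE_FOR_foo,
		  "__builtin_ia32_foo", IX86_BUILTIN_FOO, UNKNOWN, 0)
   contributes, via the first BDESC/BDESC_FIRST definitions above,
     IX86_BUILTIN_FOO,
     IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_FOO,
   while each plain BDESC entry adds just its `code' enumerator, and the
   second include pass turns every BDESC_END into the matching *_LAST
   alias.  */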
29535 /* Table for the ix86 builtin decls. */
29536 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29538 /* Table of all of the builtin functions that are possible with different ISAs
29539 but are waiting to be built until a function is declared to use that
29540 ISA. */
29541 struct builtin_isa {
29542 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29543 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29544 const char *name; /* function name */
29545 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29546 unsigned char const_p:1; /* true if the declaration is constant */
29547 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29548 bool leaf_p; /* true if the declaration has leaf attribute */
29549 bool nothrow_p; /* true if the declaration has nothrow attribute */
29550 bool set_and_not_built_p;
29553 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29555 /* Bits that can still enable any inclusion of a builtin. */
29556 static HOST_WIDE_INT deferred_isa_values = 0;
29557 static HOST_WIDE_INT deferred_isa_values2 = 0;
29559 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
29560 of which isa_flags to use in the ix86_builtins_isa array. Stores the
29561 function decl in the ix86_builtins array. Returns the function decl or
29562 NULL_TREE, if the builtin was not added.
29564 If the front end has a special hook for builtin functions, delay adding
29565 builtin functions that aren't in the current ISA until the ISA is changed
29566 with function specific optimization. Doing so can save about 300K for the
29567 default compiler. When the builtin is expanded, check at that time whether
29568 it is valid.
29570 If the front end doesn't have a special hook, record all builtins, even if
29571 they aren't in the current ISA, in case the user uses
29572 function specific options for a different ISA, so that we don't get scope
29573 errors if a builtin is added in the middle of a function scope. */
29575 static inline tree
29576 def_builtin (HOST_WIDE_INT mask, const char *name,
29577 enum ix86_builtin_func_type tcode,
29578 enum ix86_builtins code)
29580 tree decl = NULL_TREE;
29582 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29584 ix86_builtins_isa[(int) code].isa = mask;
29586 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
29587 where any set bit means the built-in is enabled, this bit must be *and-ed*
29588 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
29589 means that *both* cpuid bits must be set for the built-in to be available.
29590 Handle this here. */
29591 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29592 mask &= ~OPTION_MASK_ISA_AVX512VL;
29594 mask &= ~OPTION_MASK_ISA_64BIT;
29595 if (mask == 0
29596 || (mask & ix86_isa_flags) != 0
29597 || (lang_hooks.builtin_function
29598 == lang_hooks.builtin_function_ext_scope))
29601 tree type = ix86_get_builtin_func_type (tcode);
29602 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29603 NULL, NULL_TREE);
29604 ix86_builtins[(int) code] = decl;
29605 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29607 else
29609 /* Just a MASK where set_and_not_built_p == true can potentially
29610 include a builtin. */
29611 deferred_isa_values |= mask;
29612 ix86_builtins[(int) code] = NULL_TREE;
29613 ix86_builtins_isa[(int) code].tcode = tcode;
29614 ix86_builtins_isa[(int) code].name = name;
29615 ix86_builtins_isa[(int) code].leaf_p = false;
29616 ix86_builtins_isa[(int) code].nothrow_p = false;
29617 ix86_builtins_isa[(int) code].const_p = false;
29618 ix86_builtins_isa[(int) code].pure_p = false;
29619 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29623 return decl;
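/* Hedged usage sketch (not from the original source): the two outcomes of
   def_builtin.  The clflush call mirrors one made later in this file; with
   -msse2 in ix86_isa_flags the decl is created immediately, otherwise only
   the ix86_builtins_isa bookkeeping is filled in and the decl is built
   later by ix86_add_new_builtins.  Per the AVX512VL comment above, a mask
   such as OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL requires both
   bits to be set before the builtin is considered enabled.  */
#if 0
  tree d = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
			VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
  if (d == NULL_TREE)
    {
      /* Deferred: set_and_not_built_p is now true and the mask was added
	 to deferred_isa_values for a later ISA switch.  */
    }
#endif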
29626 /* Like def_builtin, but also marks the function decl "const". */
29628 static inline tree
29629 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29630 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29632 tree decl = def_builtin (mask, name, tcode, code);
29633 if (decl)
29634 TREE_READONLY (decl) = 1;
29635 else
29636 ix86_builtins_isa[(int) code].const_p = true;
29638 return decl;
29641 /* Like def_builtin, but also marks the function decl "pure". */
29643 static inline tree
29644 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29645 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29647 tree decl = def_builtin (mask, name, tcode, code);
29648 if (decl)
29649 DECL_PURE_P (decl) = 1;
29650 else
29651 ix86_builtins_isa[(int) code].pure_p = true;
29653 return decl;
29656 /* Like def_builtin, but for additional isa2 flags. */
29658 static inline tree
29659 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29660 enum ix86_builtin_func_type tcode,
29661 enum ix86_builtins code)
29663 tree decl = NULL_TREE;
29665 ix86_builtins_isa[(int) code].isa2 = mask;
29667 if (mask == 0
29668 || (mask & ix86_isa_flags2) != 0
29669 || (lang_hooks.builtin_function
29670 == lang_hooks.builtin_function_ext_scope))
29673 tree type = ix86_get_builtin_func_type (tcode);
29674 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29675 NULL, NULL_TREE);
29676 ix86_builtins[(int) code] = decl;
29677 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29679 else
29681 /* Just a MASK where set_and_not_built_p == true can potentially
29682 include a builtin. */
29683 deferred_isa_values2 |= mask;
29684 ix86_builtins[(int) code] = NULL_TREE;
29685 ix86_builtins_isa[(int) code].tcode = tcode;
29686 ix86_builtins_isa[(int) code].name = name;
29687 ix86_builtins_isa[(int) code].leaf_p = false;
29688 ix86_builtins_isa[(int) code].nothrow_p = false;
29689 ix86_builtins_isa[(int) code].const_p = false;
29690 ix86_builtins_isa[(int) code].pure_p = false;
29691 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29694 return decl;
29697 /* Like def_builtin, but also marks the function decl "const". */
29699 static inline tree
29700 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29701 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29703 tree decl = def_builtin2 (mask, name, tcode, code);
29704 if (decl)
29705 TREE_READONLY (decl) = 1;
29706 else
29707 ix86_builtins_isa[(int) code].const_p = true;
29709 return decl;
29712 /* Like def_builtin, but also marks the function decl "pure". */
29714 static inline tree
29715 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29716 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29718 tree decl = def_builtin2 (mask, name, tcode, code);
29719 if (decl)
29720 DECL_PURE_P (decl) = 1;
29721 else
29722 ix86_builtins_isa[(int) code].pure_p = true;
29724 return decl;
29727 /* Add any new builtin functions for a given ISA that may not have been
29728 declared. This saves a bit of space compared to adding all of the
29729 declarations to the tree, even if we didn't use them. */
29731 static void
29732 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29734 if ((isa & deferred_isa_values) == 0
29735 && (isa2 & deferred_isa_values2) == 0)
29736 return;
29738 /* Bits in ISA value can be removed from potential isa values. */
29739 deferred_isa_values &= ~isa;
29740 deferred_isa_values2 &= ~isa2;
29742 int i;
29743 tree saved_current_target_pragma = current_target_pragma;
29744 current_target_pragma = NULL_TREE;
29746 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29748 if (((ix86_builtins_isa[i].isa & isa) != 0
29749 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29750 && ix86_builtins_isa[i].set_and_not_built_p)
29752 tree decl, type;
29754 /* Don't define the builtin again. */
29755 ix86_builtins_isa[i].set_and_not_built_p = false;
29757 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29758 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29759 type, i, BUILT_IN_MD, NULL,
29760 NULL_TREE);
29762 ix86_builtins[i] = decl;
29763 if (ix86_builtins_isa[i].const_p)
29764 TREE_READONLY (decl) = 1;
29765 if (ix86_builtins_isa[i].pure_p)
29766 DECL_PURE_P (decl) = 1;
29767 if (ix86_builtins_isa[i].leaf_p)
29768 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29769 NULL_TREE);
29770 if (ix86_builtins_isa[i].nothrow_p)
29771 TREE_NOTHROW (decl) = 1;
29775 current_target_pragma = saved_current_target_pragma;
29778 /* Bits for builtin_description.flag. */
29780 /* Set when we don't support the comparison natively, and should
29781 swap_comparison in order to support it. */
29782 #define BUILTIN_DESC_SWAP_OPERANDS 1
29784 struct builtin_description
29786 const HOST_WIDE_INT mask;
29787 const enum insn_code icode;
29788 const char *const name;
29789 const enum ix86_builtins code;
29790 const enum rtx_code comparison;
29791 const int flag;
29794 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29795 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29796 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29797 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29798 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29799 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29800 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29801 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29802 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29803 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29804 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29805 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29806 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29807 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29808 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29809 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29810 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29811 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29812 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29813 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29814 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29815 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29816 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29817 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29818 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29819 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
29820 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
29821 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
29822 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
29823 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
29824 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
29825 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
29826 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
29827 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
29828 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
29829 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
29830 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
29831 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
29832 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
29833 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
29834 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
29835 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
29836 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
29837 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
29838 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
29839 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
29840 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
29841 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
29842 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
29843 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
29844 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
29845 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
29847 #define BDESC(mask, icode, name, code, comparison, flag) \
29848 { mask, icode, name, code, comparison, flag },
29849 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29850 static const struct builtin_description bdesc_##kind[] = \
29852 BDESC (mask, icode, name, code, comparison, flag)
29853 #define BDESC_END(kind, next_kind) \
29856 #include "i386-builtin.def"
29858 #undef BDESC
29859 #undef BDESC_FIRST
29860 #undef BDESC_END
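/* Illustrative expansion (not part of the original source): with the
   redefined BDESC macros above, the same hypothetical .def entry from the
   enum sketch now opens and populates a lookup table,
     static const struct builtin_description bdesc_args[] =
     {
       { OPTION_MASK_ISA_SSE, CODE_FOR_foo, "__builtin_ia32_foo",
	 IX86_BUILTIN_FOO, UNKNOWN, 0 },
       ...
   with each following BDESC adding one initializer and BDESC_END closing
   the table before the next kind begins.  */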
29862 /* TM vector builtins. */
29864 /* Reuse the existing x86-specific `struct builtin_description' because
29865 we're lazy. Add casts to make them fit. */
29866 static const struct builtin_description bdesc_tm[] =
29868 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29869 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29870 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29871 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29872 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29873 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29874 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29876 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29877 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29878 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29879 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29880 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29881 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29882 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29884 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29885 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29886 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29887 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29888 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29889 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29890 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29892 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29893 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29894 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29897 /* Initialize the transactional memory vector load/store builtins. */
29899 static void
29900 ix86_init_tm_builtins (void)
29902 enum ix86_builtin_func_type ftype;
29903 const struct builtin_description *d;
29904 size_t i;
29905 tree decl;
29906 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29907 tree attrs_log, attrs_type_log;
29909 if (!flag_tm)
29910 return;
29912 /* If there are no builtins defined, we must be compiling in a
29913 language without trans-mem support. */
29914 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29915 return;
29917 /* Use whatever attributes a normal TM load has. */
29918 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29919 attrs_load = DECL_ATTRIBUTES (decl);
29920 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29921 /* Use whatever attributes a normal TM store has. */
29922 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29923 attrs_store = DECL_ATTRIBUTES (decl);
29924 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29925 /* Use whatever attributes a normal TM log has. */
29926 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29927 attrs_log = DECL_ATTRIBUTES (decl);
29928 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29930 for (i = 0, d = bdesc_tm;
29931 i < ARRAY_SIZE (bdesc_tm);
29932 i++, d++)
29934 if ((d->mask & ix86_isa_flags) != 0
29935 || (lang_hooks.builtin_function
29936 == lang_hooks.builtin_function_ext_scope))
29938 tree type, attrs, attrs_type;
29939 enum built_in_function code = (enum built_in_function) d->code;
29941 ftype = (enum ix86_builtin_func_type) d->flag;
29942 type = ix86_get_builtin_func_type (ftype);
29944 if (BUILTIN_TM_LOAD_P (code))
29946 attrs = attrs_load;
29947 attrs_type = attrs_type_load;
29949 else if (BUILTIN_TM_STORE_P (code))
29951 attrs = attrs_store;
29952 attrs_type = attrs_type_store;
29954 else
29956 attrs = attrs_log;
29957 attrs_type = attrs_type_log;
29959 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29960 /* The builtin without the prefix for
29961 calling it directly. */
29962 d->name + strlen ("__builtin_"),
29963 attrs);
29964 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29965 set the TYPE_ATTRIBUTES. */
29966 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29968 set_builtin_decl (code, decl, false);
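/* Worked example (not from the original source): the bdesc_tm entry
   "__builtin__ITM_WM64" above is registered by this loop so that it is also
   directly callable as "_ITM_WM64" (d->name + strlen ("__builtin_")), and it
   inherits the decl and type attributes of the generic BUILT_IN_TM_STORE_1
   builtin captured before the loop.  */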
29973 /* Macros for verification of enum ix86_builtins order. */
29974 #define BDESC_VERIFY(x, y, z) \
29975 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
29976 #define BDESC_VERIFYS(x, y, z) \
29977 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
29979 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
29980 IX86_BUILTIN__BDESC_COMI_LAST, 1);
29981 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
29982 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
29983 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
29984 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
29985 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
29986 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
29987 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
29988 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
29989 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
29990 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
29991 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
29992 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
29993 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
29994 IX86_BUILTIN__BDESC_MPX_LAST, 1);
29995 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
29996 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
29997 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
29998 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
29999 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30000 IX86_BUILTIN__BDESC_CET_LAST, 1);
30001 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30002 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30004 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30005 in the current target ISA, to allow the user to compile particular modules
30006 with target-specific options that differ from the command-line
30007 options. */
30008 static void
30009 ix86_init_mmx_sse_builtins (void)
30011 const struct builtin_description * d;
30012 enum ix86_builtin_func_type ftype;
30013 size_t i;
30015 /* Add all special builtins with variable number of operands. */
30016 for (i = 0, d = bdesc_special_args;
30017 i < ARRAY_SIZE (bdesc_special_args);
30018 i++, d++)
30020 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30021 if (d->name == 0)
30022 continue;
30024 ftype = (enum ix86_builtin_func_type) d->flag;
30025 def_builtin (d->mask, d->name, ftype, d->code);
30027 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30028 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30029 ARRAY_SIZE (bdesc_special_args) - 1);
30031 /* Add all builtins with variable number of operands. */
30032 for (i = 0, d = bdesc_args;
30033 i < ARRAY_SIZE (bdesc_args);
30034 i++, d++)
30036 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30037 if (d->name == 0)
30038 continue;
30040 ftype = (enum ix86_builtin_func_type) d->flag;
30041 def_builtin_const (d->mask, d->name, ftype, d->code);
30043 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30044 IX86_BUILTIN__BDESC_ARGS_FIRST,
30045 ARRAY_SIZE (bdesc_args) - 1);
30047 /* Add all builtins with variable number of operands. */
30048 for (i = 0, d = bdesc_args2;
30049 i < ARRAY_SIZE (bdesc_args2);
30050 i++, d++)
30052 if (d->name == 0)
30053 continue;
30055 ftype = (enum ix86_builtin_func_type) d->flag;
30056 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30059 /* Add all builtins with rounding. */
30060 for (i = 0, d = bdesc_round_args;
30061 i < ARRAY_SIZE (bdesc_round_args);
30062 i++, d++)
30064 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30065 if (d->name == 0)
30066 continue;
30068 ftype = (enum ix86_builtin_func_type) d->flag;
30069 def_builtin_const (d->mask, d->name, ftype, d->code);
30071 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30072 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30073 ARRAY_SIZE (bdesc_round_args) - 1);
30075 /* pcmpestr[im] insns. */
30076 for (i = 0, d = bdesc_pcmpestr;
30077 i < ARRAY_SIZE (bdesc_pcmpestr);
30078 i++, d++)
30080 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30081 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30082 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30083 else
30084 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30085 def_builtin_const (d->mask, d->name, ftype, d->code);
30087 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30088 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30089 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30091 /* pcmpistr[im] insns. */
30092 for (i = 0, d = bdesc_pcmpistr;
30093 i < ARRAY_SIZE (bdesc_pcmpistr);
30094 i++, d++)
30096 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30097 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30098 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30099 else
30100 ftype = INT_FTYPE_V16QI_V16QI_INT;
30101 def_builtin_const (d->mask, d->name, ftype, d->code);
30103 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30104 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30105 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30107 /* comi/ucomi insns. */
30108 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30110 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30111 if (d->mask == OPTION_MASK_ISA_SSE2)
30112 ftype = INT_FTYPE_V2DF_V2DF;
30113 else
30114 ftype = INT_FTYPE_V4SF_V4SF;
30115 def_builtin_const (d->mask, d->name, ftype, d->code);
30117 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30118 IX86_BUILTIN__BDESC_COMI_FIRST,
30119 ARRAY_SIZE (bdesc_comi) - 1);
30121 /* SSE */
30122 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30123 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30124 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30125 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30127 /* SSE or 3DNow!A */
30128 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30129 /* As it uses V4HImode, we have to require -mmmx too. */
30130 | OPTION_MASK_ISA_MMX,
30131 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30132 IX86_BUILTIN_MASKMOVQ);
30134 /* SSE2 */
30135 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30136 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30138 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30139 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30140 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30141 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30143 /* SSE3. */
30144 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30145 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30146 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30147 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30149 /* AES */
30150 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30151 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30152 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30153 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30154 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30155 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30156 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30157 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30158 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30159 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30160 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30161 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30163 /* PCLMUL */
30164 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30165 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30167 /* RDRND */
30168 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30169 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30170 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30171 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30172 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30173 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30174 IX86_BUILTIN_RDRAND64_STEP);
30176 /* AVX2 */
30177 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30178 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30179 IX86_BUILTIN_GATHERSIV2DF);
30181 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30182 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30183 IX86_BUILTIN_GATHERSIV4DF);
30185 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30186 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30187 IX86_BUILTIN_GATHERDIV2DF);
30189 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30190 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30191 IX86_BUILTIN_GATHERDIV4DF);
30193 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30194 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30195 IX86_BUILTIN_GATHERSIV4SF);
30197 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30198 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30199 IX86_BUILTIN_GATHERSIV8SF);
30201 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30202 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30203 IX86_BUILTIN_GATHERDIV4SF);
30205 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30206 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30207 IX86_BUILTIN_GATHERDIV8SF);
30209 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30210 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30211 IX86_BUILTIN_GATHERSIV2DI);
30213 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30214 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30215 IX86_BUILTIN_GATHERSIV4DI);
30217 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30218 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30219 IX86_BUILTIN_GATHERDIV2DI);
30221 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30222 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30223 IX86_BUILTIN_GATHERDIV4DI);
30225 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30226 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30227 IX86_BUILTIN_GATHERSIV4SI);
30229 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30230 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30231 IX86_BUILTIN_GATHERSIV8SI);
30233 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30234 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30235 IX86_BUILTIN_GATHERDIV4SI);
30237 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30238 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30239 IX86_BUILTIN_GATHERDIV8SI);
30241 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30242 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30243 IX86_BUILTIN_GATHERALTSIV4DF);
30245 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30246 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30247 IX86_BUILTIN_GATHERALTDIV8SF);
30249 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30250 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30251 IX86_BUILTIN_GATHERALTSIV4DI);
30253 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30254 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30255 IX86_BUILTIN_GATHERALTDIV8SI);
30257 /* AVX512F */
30258 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30259 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30260 IX86_BUILTIN_GATHER3SIV16SF);
30262 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30263 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30264 IX86_BUILTIN_GATHER3SIV8DF);
30266 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30267 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30268 IX86_BUILTIN_GATHER3DIV16SF);
30270 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30271 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30272 IX86_BUILTIN_GATHER3DIV8DF);
30274 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30275 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30276 IX86_BUILTIN_GATHER3SIV16SI);
30278 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30279 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30280 IX86_BUILTIN_GATHER3SIV8DI);
30282 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30283 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30284 IX86_BUILTIN_GATHER3DIV16SI);
30286 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30287 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30288 IX86_BUILTIN_GATHER3DIV8DI);
30290 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30291 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30292 IX86_BUILTIN_GATHER3ALTSIV8DF);
30294 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30295 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30296 IX86_BUILTIN_GATHER3ALTDIV16SF);
30298 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30299 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30300 IX86_BUILTIN_GATHER3ALTSIV8DI);
30302 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30303 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30304 IX86_BUILTIN_GATHER3ALTDIV16SI);
30306 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30307 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30308 IX86_BUILTIN_SCATTERSIV16SF);
30310 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30311 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30312 IX86_BUILTIN_SCATTERSIV8DF);
30314 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30315 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30316 IX86_BUILTIN_SCATTERDIV16SF);
30318 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30319 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30320 IX86_BUILTIN_SCATTERDIV8DF);
30322 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30323 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30324 IX86_BUILTIN_SCATTERSIV16SI);
30326 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30327 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30328 IX86_BUILTIN_SCATTERSIV8DI);
30330 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30331 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30332 IX86_BUILTIN_SCATTERDIV16SI);
30334 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30335 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30336 IX86_BUILTIN_SCATTERDIV8DI);
30338 /* AVX512VL */
30339 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30340 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30341 IX86_BUILTIN_GATHER3SIV2DF);
30343 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30344 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30345 IX86_BUILTIN_GATHER3SIV4DF);
30347 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30348 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30349 IX86_BUILTIN_GATHER3DIV2DF);
30351 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30352 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30353 IX86_BUILTIN_GATHER3DIV4DF);
30355 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30356 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30357 IX86_BUILTIN_GATHER3SIV4SF);
30359 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30360 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30361 IX86_BUILTIN_GATHER3SIV8SF);
30363 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30364 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30365 IX86_BUILTIN_GATHER3DIV4SF);
30367 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30368 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30369 IX86_BUILTIN_GATHER3DIV8SF);
30371 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30372 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30373 IX86_BUILTIN_GATHER3SIV2DI);
30375 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30376 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30377 IX86_BUILTIN_GATHER3SIV4DI);
30379 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30380 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30381 IX86_BUILTIN_GATHER3DIV2DI);
30383 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30384 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30385 IX86_BUILTIN_GATHER3DIV4DI);
30387 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30388 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30389 IX86_BUILTIN_GATHER3SIV4SI);
30391 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30392 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30393 IX86_BUILTIN_GATHER3SIV8SI);
30395 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30396 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30397 IX86_BUILTIN_GATHER3DIV4SI);
30399 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30400 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30401 IX86_BUILTIN_GATHER3DIV8SI);
30403 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30404 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30405 IX86_BUILTIN_GATHER3ALTSIV4DF);
30407 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30408 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30409 IX86_BUILTIN_GATHER3ALTDIV8SF);
30411 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30412 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30413 IX86_BUILTIN_GATHER3ALTSIV4DI);
30415 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30416 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30417 IX86_BUILTIN_GATHER3ALTDIV8SI);
30419 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30420 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30421 IX86_BUILTIN_SCATTERSIV8SF);
30423 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30424 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30425 IX86_BUILTIN_SCATTERSIV4SF);
30427 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30428 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30429 IX86_BUILTIN_SCATTERSIV4DF);
30431 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30432 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30433 IX86_BUILTIN_SCATTERSIV2DF);
30435 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30436 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30437 IX86_BUILTIN_SCATTERDIV8SF);
30439 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30440 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30441 IX86_BUILTIN_SCATTERDIV4SF);
30443 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30444 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30445 IX86_BUILTIN_SCATTERDIV4DF);
30447 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30448 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30449 IX86_BUILTIN_SCATTERDIV2DF);
30451 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30452 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30453 IX86_BUILTIN_SCATTERSIV8SI);
30455 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30456 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30457 IX86_BUILTIN_SCATTERSIV4SI);
30459 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30460 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30461 IX86_BUILTIN_SCATTERSIV4DI);
30463 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30464 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30465 IX86_BUILTIN_SCATTERSIV2DI);
30467 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30468 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30469 IX86_BUILTIN_SCATTERDIV8SI);
30471 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30472 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30473 IX86_BUILTIN_SCATTERDIV4SI);
30475 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30476 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30477 IX86_BUILTIN_SCATTERDIV4DI);
30479 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30480 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30481 IX86_BUILTIN_SCATTERDIV2DI);
30482 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30483 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30484 IX86_BUILTIN_SCATTERALTSIV8DF);
30486 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30487 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30488 IX86_BUILTIN_SCATTERALTDIV16SF);
30490 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30491 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30492 IX86_BUILTIN_SCATTERALTSIV8DI);
30494 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30495 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30496 IX86_BUILTIN_SCATTERALTDIV16SI);
30498 /* AVX512PF */
30499 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30500 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30501 IX86_BUILTIN_GATHERPFDPD);
30502 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30503 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30504 IX86_BUILTIN_GATHERPFDPS);
30505 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30506 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30507 IX86_BUILTIN_GATHERPFQPD);
30508 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30509 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30510 IX86_BUILTIN_GATHERPFQPS);
30511 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30512 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30513 IX86_BUILTIN_SCATTERPFDPD);
30514 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30515 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30516 IX86_BUILTIN_SCATTERPFDPS);
30517 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30518 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30519 IX86_BUILTIN_SCATTERPFQPD);
30520 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30521 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30522 IX86_BUILTIN_SCATTERPFQPS);
30524 /* SHA */
30525 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30526 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30527 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30528 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30529 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30530 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30531 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30532 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30533 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30534 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30535 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30536 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30537 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30538 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30540 /* RTM. */
30541 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30542 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30544 /* MMX access to the vec_init patterns. */
30545 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30546 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30548 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30549 V4HI_FTYPE_HI_HI_HI_HI,
30550 IX86_BUILTIN_VEC_INIT_V4HI);
30552 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30553 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30554 IX86_BUILTIN_VEC_INIT_V8QI);
30556 /* Access to the vec_extract patterns. */
30557 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30558 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30559 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30560 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30561 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30562 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30563 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30564 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30565 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30566 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30568 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30569 /* As it uses V4HImode, we have to require -mmmx too. */
30570 | OPTION_MASK_ISA_MMX,
30571 "__builtin_ia32_vec_ext_v4hi",
30572 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30574 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30575 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30577 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30578 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30580 /* Access to the vec_set patterns. */
30581 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30582 "__builtin_ia32_vec_set_v2di",
30583 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30585 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30586 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30588 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30589 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30591 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30592 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30594 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30595 /* As it uses V4HImode, we have to require -mmmx too. */
30596 | OPTION_MASK_ISA_MMX,
30597 "__builtin_ia32_vec_set_v4hi",
30598 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30600 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30601 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30603 /* RDSEED */
30604 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30605 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30606 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30607 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30608 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30609 "__builtin_ia32_rdseed_di_step",
30610 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
30612 /* ADCX */
30613 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30614 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30615 def_builtin (OPTION_MASK_ISA_64BIT,
30616 "__builtin_ia32_addcarryx_u64",
30617 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30618 IX86_BUILTIN_ADDCARRYX64);
30620 /* SBB */
30621 def_builtin (0, "__builtin_ia32_sbb_u32",
30622 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30623 def_builtin (OPTION_MASK_ISA_64BIT,
30624 "__builtin_ia32_sbb_u64",
30625 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30626 IX86_BUILTIN_SBB64);
30628 /* Read/write FLAGS. */
30629 def_builtin (0, "__builtin_ia32_readeflags_u32",
30630 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30631 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30632 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30633 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30634 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30635 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30636 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
30638 /* CLFLUSHOPT. */
30639 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30640 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30642 /* CLWB. */
30643 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30644 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30646 /* MONITORX and MWAITX. */
30647 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30648 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30649 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30650 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30652 /* CLZERO. */
30653 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30654 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30656 /* Add FMA4 multi-arg argument instructions */
30657 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30659 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30660 if (d->name == 0)
30661 continue;
30663 ftype = (enum ix86_builtin_func_type) d->flag;
30664 def_builtin_const (d->mask, d->name, ftype, d->code);
30666 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30667 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30668 ARRAY_SIZE (bdesc_multi_arg) - 1);
30670 /* Add CET intrinsics. */
30671 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30673 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30674 if (d->name == 0)
30675 continue;
30677 ftype = (enum ix86_builtin_func_type) d->flag;
30678 def_builtin2 (d->mask, d->name, ftype, d->code);
30680 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30681 IX86_BUILTIN__BDESC_CET_FIRST,
30682 ARRAY_SIZE (bdesc_cet) - 1);
30684 for (i = 0, d = bdesc_cet_rdssp;
30685 i < ARRAY_SIZE (bdesc_cet_rdssp);
30686 i++, d++)
30688 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30689 if (d->name == 0)
30690 continue;
30692 ftype = (enum ix86_builtin_func_type) d->flag;
30693 def_builtin2 (d->mask, d->name, ftype, d->code);
30695 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30696 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30697 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30700 static void
30701 ix86_init_mpx_builtins ()
30703 const struct builtin_description * d;
30704 enum ix86_builtin_func_type ftype;
30705 tree decl;
30706 size_t i;
30708 for (i = 0, d = bdesc_mpx;
30709 i < ARRAY_SIZE (bdesc_mpx);
30710 i++, d++)
30712 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30713 if (d->name == 0)
30714 continue;
30716 ftype = (enum ix86_builtin_func_type) d->flag;
30717 decl = def_builtin (d->mask, d->name, ftype, d->code);
30719 /* Without the leaf and nothrow flags, MPX builtin calls
30720 may be followed by abnormal edges when setjmp is
30721 present in the function. Since there can be many MPX
30722 builtin calls, this creates lots of useless edges and
30723 enormous PHI nodes. To avoid this we mark MPX
30724 builtins as leaf and nothrow. */
30725 if (decl)
30727 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30728 NULL_TREE);
30729 TREE_NOTHROW (decl) = 1;
30731 else
30733 ix86_builtins_isa[(int)d->code].leaf_p = true;
30734 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30737 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30738 IX86_BUILTIN__BDESC_MPX_FIRST,
30739 ARRAY_SIZE (bdesc_mpx) - 1);
30741 for (i = 0, d = bdesc_mpx_const;
30742 i < ARRAY_SIZE (bdesc_mpx_const);
30743 i++, d++)
30745 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30746 if (d->name == 0)
30747 continue;
30749 ftype = (enum ix86_builtin_func_type) d->flag;
30750 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
30752 if (decl)
30754 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30755 NULL_TREE);
30756 TREE_NOTHROW (decl) = 1;
30758 else
30760 ix86_builtins_isa[(int)d->code].leaf_p = true;
30761 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30764 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30765 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30766 ARRAY_SIZE (bdesc_mpx_const) - 1);
30768 #undef BDESC_VERIFY
30769 #undef BDESC_VERIFYS
30771 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
30772 to return a pointer to VERSION_DECL if the outcome of the expression
30773 formed by PREDICATE_CHAIN is true. This function will be called during
30774 version dispatch to decide which function version to execute. It returns
30775 the basic block at the end, to which more conditions can be added. */
30777 static basic_block
30778 add_condition_to_bb (tree function_decl, tree version_decl,
30779 tree predicate_chain, basic_block new_bb)
30781 gimple *return_stmt;
30782 tree convert_expr, result_var;
30783 gimple *convert_stmt;
30784 gimple *call_cond_stmt;
30785 gimple *if_else_stmt;
30787 basic_block bb1, bb2, bb3;
30788 edge e12, e23;
30790 tree cond_var, and_expr_var = NULL_TREE;
30791 gimple_seq gseq;
30793 tree predicate_decl, predicate_arg;
30795 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
30797 gcc_assert (new_bb != NULL);
30798 gseq = bb_seq (new_bb);
30801 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
30802 build_fold_addr_expr (version_decl));
30803 result_var = create_tmp_var (ptr_type_node);
30804 convert_stmt = gimple_build_assign (result_var, convert_expr);
30805 return_stmt = gimple_build_return (result_var);
30807 if (predicate_chain == NULL_TREE)
30809 gimple_seq_add_stmt (&gseq, convert_stmt);
30810 gimple_seq_add_stmt (&gseq, return_stmt);
30811 set_bb_seq (new_bb, gseq);
30812 gimple_set_bb (convert_stmt, new_bb);
30813 gimple_set_bb (return_stmt, new_bb);
30814 pop_cfun ();
30815 return new_bb;
30818 while (predicate_chain != NULL)
30820 cond_var = create_tmp_var (integer_type_node);
30821 predicate_decl = TREE_PURPOSE (predicate_chain);
30822 predicate_arg = TREE_VALUE (predicate_chain);
30823 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
30824 gimple_call_set_lhs (call_cond_stmt, cond_var);
30826 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
30827 gimple_set_bb (call_cond_stmt, new_bb);
30828 gimple_seq_add_stmt (&gseq, call_cond_stmt);
30830 predicate_chain = TREE_CHAIN (predicate_chain);
30832 if (and_expr_var == NULL)
30833 and_expr_var = cond_var;
30834 else
30836 gimple *assign_stmt;
30837 /* Use MIN_EXPR to check whether any of the integers is zero:
30838 and_expr_var = min_expr <cond_var, and_expr_var>. */
30839 assign_stmt = gimple_build_assign (and_expr_var,
30840 build2 (MIN_EXPR, integer_type_node,
30841 cond_var, and_expr_var));
30843 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
30844 gimple_set_bb (assign_stmt, new_bb);
30845 gimple_seq_add_stmt (&gseq, assign_stmt);
30849 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
30850 integer_zero_node,
30851 NULL_TREE, NULL_TREE);
30852 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
30853 gimple_set_bb (if_else_stmt, new_bb);
30854 gimple_seq_add_stmt (&gseq, if_else_stmt);
30856 gimple_seq_add_stmt (&gseq, convert_stmt);
30857 gimple_seq_add_stmt (&gseq, return_stmt);
30858 set_bb_seq (new_bb, gseq);
30860 bb1 = new_bb;
30861 e12 = split_block (bb1, if_else_stmt);
30862 bb2 = e12->dest;
30863 e12->flags &= ~EDGE_FALLTHRU;
30864 e12->flags |= EDGE_TRUE_VALUE;
30866 e23 = split_block (bb2, return_stmt);
30868 gimple_set_bb (convert_stmt, bb2);
30869 gimple_set_bb (return_stmt, bb2);
30871 bb3 = e23->dest;
30872 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
30874 remove_edge (e23);
30875 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
30877 pop_cfun ();
30879 return bb3;
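/* A sketch of the control flow this function generates: for a predicate
   chain with two entries (the names below are placeholders), the GIMPLE
   built above corresponds roughly to

     cond1 = __builtin_cpu_is ("haswell");
     cond2 = __builtin_cpu_supports ("avx2");
     and_expr = MIN (cond1, cond2);      // zero iff any predicate failed
     if (and_expr > 0)
       return (void *) foo_haswell_avx2;
     // otherwise fall through to the condition block for the next
     // version, returned as bb3 above.
   */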
30882 /* This parses the attribute arguments to target in DECL and determines
30883 the right builtin to use to match the platform specification.
30884 It returns the priority value for this version decl. If PREDICATE_LIST
30885 is not NULL, it stores the list of cpu features that need to be checked
30886 before dispatching this function. */
30888 static unsigned int
30889 get_builtin_code_for_version (tree decl, tree *predicate_list)
30891 tree attrs;
30892 struct cl_target_option cur_target;
30893 tree target_node;
30894 struct cl_target_option *new_target;
30895 const char *arg_str = NULL;
30896 const char *attrs_str = NULL;
30897 char *tok_str = NULL;
30898 char *token;
30900 /* Priority of i386 features, greater value is higher priority. This is
30901 used to decide the order in which function dispatch must happen. For
30902 instance, a version specialized for SSE4.2 should be checked for dispatch
30903 before a version for SSE3, as SSE4.2 implies SSE3. */
30904 enum feature_priority
30906 P_ZERO = 0,
30907 P_MMX,
30908 P_SSE,
30909 P_SSE2,
30910 P_SSE3,
30911 P_SSSE3,
30912 P_PROC_SSSE3,
30913 P_SSE4_A,
30914 P_PROC_SSE4_A,
30915 P_SSE4_1,
30916 P_SSE4_2,
30917 P_PROC_SSE4_2,
30918 P_POPCNT,
30919 P_AES,
30920 P_PCLMUL,
30921 P_AVX,
30922 P_PROC_AVX,
30923 P_BMI,
30924 P_PROC_BMI,
30925 P_FMA4,
30926 P_XOP,
30927 P_PROC_XOP,
30928 P_FMA,
30929 P_PROC_FMA,
30930 P_BMI2,
30931 P_AVX2,
30932 P_PROC_AVX2,
30933 P_AVX512F,
30934 P_PROC_AVX512F
30937 enum feature_priority priority = P_ZERO;
30939 /* These are the target attribute strings for which a dispatcher is
30940 available, from fold_builtin_cpu. */
30942 static struct _feature_list
30944 const char *const name;
30945 const enum feature_priority priority;
30947 const feature_list[] =
30949 {"mmx", P_MMX},
30950 {"sse", P_SSE},
30951 {"sse2", P_SSE2},
30952 {"sse3", P_SSE3},
30953 {"sse4a", P_SSE4_A},
30954 {"ssse3", P_SSSE3},
30955 {"sse4.1", P_SSE4_1},
30956 {"sse4.2", P_SSE4_2},
30957 {"popcnt", P_POPCNT},
30958 {"aes", P_AES},
30959 {"pclmul", P_PCLMUL},
30960 {"avx", P_AVX},
30961 {"bmi", P_BMI},
30962 {"fma4", P_FMA4},
30963 {"xop", P_XOP},
30964 {"fma", P_FMA},
30965 {"bmi2", P_BMI2},
30966 {"avx2", P_AVX2},
30967 {"avx512f", P_AVX512F}
30971 static unsigned int NUM_FEATURES
30972 = sizeof (feature_list) / sizeof (struct _feature_list);
30974 unsigned int i;
30976 tree predicate_chain = NULL_TREE;
30977 tree predicate_decl, predicate_arg;
30979 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30980 gcc_assert (attrs != NULL);
30982 attrs = TREE_VALUE (TREE_VALUE (attrs));
30984 gcc_assert (TREE_CODE (attrs) == STRING_CST);
30985 attrs_str = TREE_STRING_POINTER (attrs);
30987 /* Return priority zero for default function. */
30988 if (strcmp (attrs_str, "default") == 0)
30989 return 0;
30991 /* Handle arch= if specified. For priority, set it to be 1 more than
30992 the best instruction set the processor can handle. For instance, if
30993 there is a version for atom and a version for ssse3 (the highest ISA
30994 priority for atom), the atom version must be checked for dispatch
30995 before the ssse3 version. */
30996 if (strstr (attrs_str, "arch=") != NULL)
30998 cl_target_option_save (&cur_target, &global_options);
30999 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31000 &global_options_set);
31002 gcc_assert (target_node);
31003 new_target = TREE_TARGET_OPTION (target_node);
31004 gcc_assert (new_target);
31006 if (new_target->arch_specified && new_target->arch > 0)
31008 switch (new_target->arch)
31010 case PROCESSOR_CORE2:
31011 arg_str = "core2";
31012 priority = P_PROC_SSSE3;
31013 break;
31014 case PROCESSOR_NEHALEM:
31015 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31017 arg_str = "westmere";
31018 priority = P_AES;
31020 else
31022 /* We translate "arch=corei7" and "arch=nehalem" to
31023 "corei7" so that it will be mapped to M_INTEL_COREI7
31024 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31025 arg_str = "corei7";
31026 priority = P_PROC_SSE4_2;
31028 break;
31029 case PROCESSOR_SANDYBRIDGE:
31030 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31031 arg_str = "ivybridge";
31032 else
31033 arg_str = "sandybridge";
31034 priority = P_PROC_AVX;
31035 break;
31036 case PROCESSOR_HASWELL:
31037 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31038 arg_str = "skylake-avx512";
31039 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31040 arg_str = "skylake";
31041 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31042 arg_str = "broadwell";
31043 else
31044 arg_str = "haswell";
31045 priority = P_PROC_AVX2;
31046 break;
31047 case PROCESSOR_BONNELL:
31048 arg_str = "bonnell";
31049 priority = P_PROC_SSSE3;
31050 break;
31051 case PROCESSOR_KNL:
31052 arg_str = "knl";
31053 priority = P_PROC_AVX512F;
31054 break;
31055 case PROCESSOR_KNM:
31056 arg_str = "knm";
31057 priority = P_PROC_AVX512F;
31058 break;
31059 case PROCESSOR_SILVERMONT:
31060 arg_str = "silvermont";
31061 priority = P_PROC_SSE4_2;
31062 break;
31063 case PROCESSOR_AMDFAM10:
31064 arg_str = "amdfam10h";
31065 priority = P_PROC_SSE4_A;
31066 break;
31067 case PROCESSOR_BTVER1:
31068 arg_str = "btver1";
31069 priority = P_PROC_SSE4_A;
31070 break;
31071 case PROCESSOR_BTVER2:
31072 arg_str = "btver2";
31073 priority = P_PROC_BMI;
31074 break;
31075 case PROCESSOR_BDVER1:
31076 arg_str = "bdver1";
31077 priority = P_PROC_XOP;
31078 break;
31079 case PROCESSOR_BDVER2:
31080 arg_str = "bdver2";
31081 priority = P_PROC_FMA;
31082 break;
31083 case PROCESSOR_BDVER3:
31084 arg_str = "bdver3";
31085 priority = P_PROC_FMA;
31086 break;
31087 case PROCESSOR_BDVER4:
31088 arg_str = "bdver4";
31089 priority = P_PROC_AVX2;
31090 break;
31091 case PROCESSOR_ZNVER1:
31092 arg_str = "znver1";
31093 priority = P_PROC_AVX2;
31094 break;
31098 cl_target_option_restore (&global_options, &cur_target);
31100 if (predicate_list && arg_str == NULL)
31102 error_at (DECL_SOURCE_LOCATION (decl),
31103 "No dispatcher found for the versioning attributes");
31104 return 0;
31107 if (predicate_list)
31109 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31110 /* For a C string literal the length includes the trailing NULL. */
31111 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31112 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31113 predicate_chain);
31117 /* Process feature name. */
31118 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31119 strcpy (tok_str, attrs_str);
31120 token = strtok (tok_str, ",");
31121 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31123 while (token != NULL)
31125 /* Do not process "arch=" */
31126 if (strncmp (token, "arch=", 5) == 0)
31128 token = strtok (NULL, ",");
31129 continue;
31131 for (i = 0; i < NUM_FEATURES; ++i)
31133 if (strcmp (token, feature_list[i].name) == 0)
31135 if (predicate_list)
31137 predicate_arg = build_string_literal (
31138 strlen (feature_list[i].name) + 1,
31139 feature_list[i].name);
31140 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31141 predicate_chain);
31143 /* Find the maximum priority feature. */
31144 if (feature_list[i].priority > priority)
31145 priority = feature_list[i].priority;
31147 break;
31150 if (predicate_list && i == NUM_FEATURES)
31152 error_at (DECL_SOURCE_LOCATION (decl),
31153 "No dispatcher found for %s", token);
31154 return 0;
31156 token = strtok (NULL, ",");
31158 free (tok_str);
31160 if (predicate_list && predicate_chain == NULL_TREE)
31162 error_at (DECL_SOURCE_LOCATION (decl),
31163 "No dispatcher found for the versioning attributes : %s",
31164 attrs_str);
31165 return 0;
31167 else if (predicate_list)
31169 predicate_chain = nreverse (predicate_chain);
31170 *predicate_list = predicate_chain;
31173 return priority;
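/* A sketch of the priorities this computes: given the (hypothetical)
   versions

     __attribute__ ((target ("default")))      int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2")))       int foo (void) { return 1; }
     __attribute__ ((target ("arch=haswell"))) int foo (void) { return 2; }

   the arch=haswell version gets priority P_PROC_AVX2 and the sse4.2
   version gets P_SSE4_2, so the haswell version is tested first at
   dispatch time; the default version returns priority zero and is always
   dispatched last.  */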
31176 /* This compares the priority of target features in function DECL1
31177 and DECL2. It returns positive value if DECL1 is higher priority,
31178 negative value if DECL2 is higher priority and 0 if they are the
31179 same. */
31181 static int
31182 ix86_compare_version_priority (tree decl1, tree decl2)
31184 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31185 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31187 return (int)priority1 - (int)priority2;
31190 /* V1 and V2 point to function versions with different priorities
31191 based on the target ISA. This function compares their priorities. */
31193 static int
31194 feature_compare (const void *v1, const void *v2)
31196 typedef struct _function_version_info
31198 tree version_decl;
31199 tree predicate_chain;
31200 unsigned int dispatch_priority;
31201 } function_version_info;
31203 const function_version_info c1 = *(const function_version_info *)v1;
31204 const function_version_info c2 = *(const function_version_info *)v2;
31205 return (c2.dispatch_priority - c1.dispatch_priority);
31208 /* This function generates the dispatch function for
31209 multi-versioned functions. DISPATCH_DECL is the function which will
31210 contain the dispatch logic. FNDECLS holds the function choices for
31211 dispatch and is passed as a vector. EMPTY_BB is the basic block pointer
31212 in DISPATCH_DECL in which the dispatch code is generated. */
31214 static int
31215 dispatch_function_versions (tree dispatch_decl,
31216 void *fndecls_p,
31217 basic_block *empty_bb)
31219 tree default_decl;
31220 gimple *ifunc_cpu_init_stmt;
31221 gimple_seq gseq;
31222 int ix;
31223 tree ele;
31224 vec<tree> *fndecls;
31225 unsigned int num_versions = 0;
31226 unsigned int actual_versions = 0;
31227 unsigned int i;
31229 struct _function_version_info
31231 tree version_decl;
31232 tree predicate_chain;
31233 unsigned int dispatch_priority;
31234 }*function_version_info;
31236 gcc_assert (dispatch_decl != NULL
31237 && fndecls_p != NULL
31238 && empty_bb != NULL);
31240 /* fndecls_p is actually a vector. */
31241 fndecls = static_cast<vec<tree> *> (fndecls_p);
31243 /* At least one more version other than the default. */
31244 num_versions = fndecls->length ();
31245 gcc_assert (num_versions >= 2);
31247 function_version_info = (struct _function_version_info *)
31248 XNEWVEC (struct _function_version_info, (num_versions - 1));
31250 /* The first version in the vector is the default decl. */
31251 default_decl = (*fndecls)[0];
31253 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31255 gseq = bb_seq (*empty_bb);
31256 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31257 constructors, so explicitly call __builtin_cpu_init here. */
31258 ifunc_cpu_init_stmt = gimple_build_call_vec (
31259 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31260 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31261 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31262 set_bb_seq (*empty_bb, gseq);
31264 pop_cfun ();
31267 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31269 tree version_decl = ele;
31270 tree predicate_chain = NULL_TREE;
31271 unsigned int priority;
31272 /* Get attribute string, parse it and find the right predicate decl.
31273 The predicate function could be a lengthy combination of many
31274 features, like arch-type and various isa-variants. */
31275 priority = get_builtin_code_for_version (version_decl,
31276 &predicate_chain);
31278 if (predicate_chain == NULL_TREE)
31279 continue;
31281 function_version_info [actual_versions].version_decl = version_decl;
31282 function_version_info [actual_versions].predicate_chain
31283 = predicate_chain;
31284 function_version_info [actual_versions].dispatch_priority = priority;
31285 actual_versions++;
31288 /* Sort the versions according to descending order of dispatch priority. The
31289 priority is based on the ISA. This is not a perfect solution. There
31290 could still be ambiguity. If more than one function version is suitable
31291 to execute, which one should be dispatched? In the future, allow the user
31292 to specify a dispatch priority next to the version. */
31293 qsort (function_version_info, actual_versions,
31294 sizeof (struct _function_version_info), feature_compare);
31296 for (i = 0; i < actual_versions; ++i)
31297 *empty_bb = add_condition_to_bb (dispatch_decl,
31298 function_version_info[i].version_decl,
31299 function_version_info[i].predicate_chain,
31300 *empty_bb);
31302 /* Dispatch the default version at the end. */
31303 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31304 NULL, *empty_bb);
31306 free (function_version_info);
31307 return 0;
31310 /* This function changes the assembler name for functions that are
31311 versions. If DECL is a function version and has a "target"
31312 attribute, it appends the attribute string to its assembler name. */
31314 static tree
31315 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31317 tree version_attr;
31318 const char *orig_name, *version_string;
31319 char *attr_str, *assembler_name;
31321 if (DECL_DECLARED_INLINE_P (decl)
31322 && lookup_attribute ("gnu_inline",
31323 DECL_ATTRIBUTES (decl)))
31324 error_at (DECL_SOURCE_LOCATION (decl),
31325 "Function versions cannot be marked as gnu_inline,"
31326 " bodies have to be generated");
31328 if (DECL_VIRTUAL_P (decl)
31329 || DECL_VINDEX (decl))
31330 sorry ("Virtual function multiversioning not supported");
31332 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31334 /* The target attribute string cannot be NULL. */
31335 gcc_assert (version_attr != NULL_TREE);
31337 orig_name = IDENTIFIER_POINTER (id);
31338 version_string
31339 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31341 if (strcmp (version_string, "default") == 0)
31342 return id;
31344 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31345 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31347 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31349 /* Allow assembler name to be modified if already set. */
31350 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31351 SET_DECL_RTL (decl, NULL);
31353 tree ret = get_identifier (assembler_name);
31354 XDELETEVEC (attr_str);
31355 XDELETEVEC (assembler_name);
31356 return ret;
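/* A sketch of the resulting mangling: for

     __attribute__ ((target ("avx2"))) int foo (void);

   the assembler name "foo" becomes "foo.avx2", while the version marked
   target("default") keeps its original assembler name.  With several
   features the suffix is the sorted attribute string produced by
   sorted_attr_string, so the exact spelling depends on that helper.  */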
31360 static tree
31361 ix86_mangle_decl_assembler_name (tree decl, tree id)
31363 /* For function version, add the target suffix to the assembler name. */
31364 if (TREE_CODE (decl) == FUNCTION_DECL
31365 && DECL_FUNCTION_VERSIONED (decl))
31366 id = ix86_mangle_function_version_assembler_name (decl, id);
31367 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31368 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31369 #endif
31371 return id;
31374 /* Make a dispatcher declaration for the multi-versioned function DECL.
31375 Calls to DECL function will be replaced with calls to the dispatcher
31376 by the front-end. Returns the decl of the dispatcher function. */
31378 static tree
31379 ix86_get_function_versions_dispatcher (void *decl)
31381 tree fn = (tree) decl;
31382 struct cgraph_node *node = NULL;
31383 struct cgraph_node *default_node = NULL;
31384 struct cgraph_function_version_info *node_v = NULL;
31385 struct cgraph_function_version_info *first_v = NULL;
31387 tree dispatch_decl = NULL;
31389 struct cgraph_function_version_info *default_version_info = NULL;
31391 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31393 node = cgraph_node::get (fn);
31394 gcc_assert (node != NULL);
31396 node_v = node->function_version ();
31397 gcc_assert (node_v != NULL);
31399 if (node_v->dispatcher_resolver != NULL)
31400 return node_v->dispatcher_resolver;
31402 /* Find the default version and make it the first node. */
31403 first_v = node_v;
31404 /* Go to the beginning of the chain. */
31405 while (first_v->prev != NULL)
31406 first_v = first_v->prev;
31407 default_version_info = first_v;
31408 while (default_version_info != NULL)
31410 if (is_function_default_version
31411 (default_version_info->this_node->decl))
31412 break;
31413 default_version_info = default_version_info->next;
31416 /* If there is no default node, just return NULL. */
31417 if (default_version_info == NULL)
31418 return NULL;
31420 /* Make default info the first node. */
31421 if (first_v != default_version_info)
31423 default_version_info->prev->next = default_version_info->next;
31424 if (default_version_info->next)
31425 default_version_info->next->prev = default_version_info->prev;
31426 first_v->prev = default_version_info;
31427 default_version_info->next = first_v;
31428 default_version_info->prev = NULL;
31431 default_node = default_version_info->this_node;
31433 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31434 if (targetm.has_ifunc_p ())
31436 struct cgraph_function_version_info *it_v = NULL;
31437 struct cgraph_node *dispatcher_node = NULL;
31438 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31440 /* Right now, the dispatching is done via ifunc. */
31441 dispatch_decl = make_dispatcher_decl (default_node->decl);
31443 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31444 gcc_assert (dispatcher_node != NULL);
31445 dispatcher_node->dispatcher_function = 1;
31446 dispatcher_version_info
31447 = dispatcher_node->insert_new_function_version ();
31448 dispatcher_version_info->next = default_version_info;
31449 dispatcher_node->definition = 1;
31451 /* Set the dispatcher for all the versions. */
31452 it_v = default_version_info;
31453 while (it_v != NULL)
31455 it_v->dispatcher_resolver = dispatch_decl;
31456 it_v = it_v->next;
31459 else
31460 #endif
31462 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31463 "multiversioning needs ifunc which is not supported "
31464 "on this target");
31467 return dispatch_decl;
31470 /* Make the resolver function decl to dispatch the versions of
31471 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
31472 ifunc alias that will point to the created resolver. Create an
31473 empty basic block in the resolver and store the pointer in
31474 EMPTY_BB. Return the decl of the resolver function. */
31476 static tree
31477 make_resolver_func (const tree default_decl,
31478 const tree ifunc_alias_decl,
31479 basic_block *empty_bb)
31481 char *resolver_name;
31482 tree decl, type, decl_name, t;
31484 /* IFUNCs have to be globally visible. So, if the default_decl is
31485 not, then the name of the IFUNC should be made unique. */
31486 if (TREE_PUBLIC (default_decl) == 0)
31488 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31489 symtab->change_decl_assembler_name (ifunc_alias_decl,
31490 get_identifier (ifunc_name));
31491 XDELETEVEC (ifunc_name);
31494 resolver_name = make_unique_name (default_decl, "resolver", false);
31496 /* The resolver function should return a (void *). */
31497 type = build_function_type_list (ptr_type_node, NULL_TREE);
31499 decl = build_fn_decl (resolver_name, type);
31500 decl_name = get_identifier (resolver_name);
31501 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31503 DECL_NAME (decl) = decl_name;
31504 TREE_USED (decl) = 1;
31505 DECL_ARTIFICIAL (decl) = 1;
31506 DECL_IGNORED_P (decl) = 1;
31507 TREE_PUBLIC (decl) = 0;
31508 DECL_UNINLINABLE (decl) = 1;
31510 /* Resolver is not external, body is generated. */
31511 DECL_EXTERNAL (decl) = 0;
31512 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31514 DECL_CONTEXT (decl) = NULL_TREE;
31515 DECL_INITIAL (decl) = make_node (BLOCK);
31516 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31518 if (DECL_COMDAT_GROUP (default_decl)
31519 || TREE_PUBLIC (default_decl))
31521 /* In this case, each translation unit with a call to this
31522 versioned function will put out a resolver. Ensure it
31523 is comdat to keep just one copy. */
31524 DECL_COMDAT (decl) = 1;
31525 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31527 /* Build result decl and add to function_decl. */
31528 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31529 DECL_ARTIFICIAL (t) = 1;
31530 DECL_IGNORED_P (t) = 1;
31531 DECL_RESULT (decl) = t;
31533 gimplify_function_tree (decl);
31534 push_cfun (DECL_STRUCT_FUNCTION (decl));
31535 *empty_bb = init_lowered_empty_function (decl, false,
31536 profile_count::uninitialized ());
31538 cgraph_node::add_new_function (decl, true);
31539 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31541 pop_cfun ();
31543 gcc_assert (ifunc_alias_decl != NULL);
31544 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31545 DECL_ATTRIBUTES (ifunc_alias_decl)
31546 = make_attribute ("ifunc", resolver_name,
31547 DECL_ATTRIBUTES (ifunc_alias_decl));
31549 /* Create the alias for dispatch to resolver here. */
31550 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31551 XDELETEVEC (resolver_name);
31552 return decl;
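/* A sketch of what this produces: for a versioned function foo, the
   effect is roughly what a user would get from writing (names are
   placeholders)

     static void *foo_resolver (void);   // body generated later
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   i.e. an IFUNC alias whose resolver returns the address of the selected
   version; the resolver body itself is filled in by
   dispatch_function_versions.  */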
31555 /* Generate the dispatching code body to dispatch multi-versioned function
31556 DECL. The target hook is called to process the "target" attributes and
31557 provide the code to dispatch the right function at run-time. NODE points
31558 to the dispatcher decl whose body will be created. */
31560 static tree
31561 ix86_generate_version_dispatcher_body (void *node_p)
31563 tree resolver_decl;
31564 basic_block empty_bb;
31565 tree default_ver_decl;
31566 struct cgraph_node *versn;
31567 struct cgraph_node *node;
31569 struct cgraph_function_version_info *node_version_info = NULL;
31570 struct cgraph_function_version_info *versn_info = NULL;
31572 node = (cgraph_node *)node_p;
31574 node_version_info = node->function_version ();
31575 gcc_assert (node->dispatcher_function
31576 && node_version_info != NULL);
31578 if (node_version_info->dispatcher_resolver)
31579 return node_version_info->dispatcher_resolver;
31581 /* The first version in the chain corresponds to the default version. */
31582 default_ver_decl = node_version_info->next->this_node->decl;
31584 /* node is going to be an alias, so remove the finalized bit. */
31585 node->definition = false;
31587 resolver_decl = make_resolver_func (default_ver_decl,
31588 node->decl, &empty_bb);
31590 node_version_info->dispatcher_resolver = resolver_decl;
31592 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31594 auto_vec<tree, 2> fn_ver_vec;
31596 for (versn_info = node_version_info->next; versn_info;
31597 versn_info = versn_info->next)
31599 versn = versn_info->this_node;
31600 /* Check for virtual functions here again, as by this time it should
31601 have been determined if this function needs a vtable index or
31602 not. This happens for methods in derived classes that override
31603 virtual methods in base classes but are not explicitly marked as
31604 virtual. */
31605 if (DECL_VINDEX (versn->decl))
31606 sorry ("Virtual function multiversioning not supported");
31608 fn_ver_vec.safe_push (versn->decl);
31611 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31612 cgraph_edge::rebuild_edges ();
31613 pop_cfun ();
31614 return resolver_decl;
31616 /* This builds the processor_model struct type defined in
31617 libgcc/config/i386/cpuinfo.c. */
31619 static tree
31620 build_processor_model_struct (void)
31622 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31623 "__cpu_features"};
31624 tree field = NULL_TREE, field_chain = NULL_TREE;
31625 int i;
31626 tree type = make_node (RECORD_TYPE);
31628 /* The first 3 fields are unsigned int. */
31629 for (i = 0; i < 3; ++i)
31631 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31632 get_identifier (field_name[i]), unsigned_type_node);
31633 if (field_chain != NULL_TREE)
31634 DECL_CHAIN (field) = field_chain;
31635 field_chain = field;
31638 /* The last field is an array of unsigned integers of size one. */
31639 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31640 get_identifier (field_name[3]),
31641 build_array_type (unsigned_type_node,
31642 build_index_type (size_one_node)));
31643 if (field_chain != NULL_TREE)
31644 DECL_CHAIN (field) = field_chain;
31645 field_chain = field;
31647 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31648 return type;
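/* A sketch of the C-level layout this record is meant to mirror (it must
   stay in sync with libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];   // feature bit-mask words
     };
   */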
31651 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31653 static tree
31654 make_var_decl (tree type, const char *name)
31656 tree new_decl;
31658 new_decl = build_decl (UNKNOWN_LOCATION,
31659 VAR_DECL,
31660 get_identifier(name),
31661 type);
31663 DECL_EXTERNAL (new_decl) = 1;
31664 TREE_STATIC (new_decl) = 1;
31665 TREE_PUBLIC (new_decl) = 1;
31666 DECL_INITIAL (new_decl) = 0;
31667 DECL_ARTIFICIAL (new_decl) = 0;
31668 DECL_PRESERVE_P (new_decl) = 1;
31670 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31671 assemble_variable (new_decl, 0, 0, 0);
31673 return new_decl;
31676 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31677 into an integer defined in libgcc/config/i386/cpuinfo.c */
31679 static tree
31680 fold_builtin_cpu (tree fndecl, tree *args)
31682 unsigned int i;
31683 enum ix86_builtins fn_code = (enum ix86_builtins)
31684 DECL_FUNCTION_CODE (fndecl);
31685 tree param_string_cst = NULL;
31687 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31688 enum processor_features
31690 F_CMOV = 0,
31691 F_MMX,
31692 F_POPCNT,
31693 F_SSE,
31694 F_SSE2,
31695 F_SSE3,
31696 F_SSSE3,
31697 F_SSE4_1,
31698 F_SSE4_2,
31699 F_AVX,
31700 F_AVX2,
31701 F_SSE4_A,
31702 F_FMA4,
31703 F_XOP,
31704 F_FMA,
31705 F_AVX512F,
31706 F_BMI,
31707 F_BMI2,
31708 F_AES,
31709 F_PCLMUL,
31710 F_AVX512VL,
31711 F_AVX512BW,
31712 F_AVX512DQ,
31713 F_AVX512CD,
31714 F_AVX512ER,
31715 F_AVX512PF,
31716 F_AVX512VBMI,
31717 F_AVX512IFMA,
31718 F_AVX5124VNNIW,
31719 F_AVX5124FMAPS,
31720 F_AVX512VPOPCNTDQ,
31721 F_MAX
31724 /* These are the values for vendor types and cpu types and subtypes
31725 in cpuinfo.c. The corresponding start value must be subtracted
31726 from cpu types and subtypes. */
31727 enum processor_model
31729 M_INTEL = 1,
31730 M_AMD,
31731 M_CPU_TYPE_START,
31732 M_INTEL_BONNELL,
31733 M_INTEL_CORE2,
31734 M_INTEL_COREI7,
31735 M_AMDFAM10H,
31736 M_AMDFAM15H,
31737 M_INTEL_SILVERMONT,
31738 M_INTEL_KNL,
31739 M_AMD_BTVER1,
31740 M_AMD_BTVER2,
31741 M_AMDFAM17H,
31742 M_INTEL_KNM,
31743 M_CPU_SUBTYPE_START,
31744 M_INTEL_COREI7_NEHALEM,
31745 M_INTEL_COREI7_WESTMERE,
31746 M_INTEL_COREI7_SANDYBRIDGE,
31747 M_AMDFAM10H_BARCELONA,
31748 M_AMDFAM10H_SHANGHAI,
31749 M_AMDFAM10H_ISTANBUL,
31750 M_AMDFAM15H_BDVER1,
31751 M_AMDFAM15H_BDVER2,
31752 M_AMDFAM15H_BDVER3,
31753 M_AMDFAM15H_BDVER4,
31754 M_AMDFAM17H_ZNVER1,
31755 M_INTEL_COREI7_IVYBRIDGE,
31756 M_INTEL_COREI7_HASWELL,
31757 M_INTEL_COREI7_BROADWELL,
31758 M_INTEL_COREI7_SKYLAKE,
31759 M_INTEL_COREI7_SKYLAKE_AVX512
31762 static struct _arch_names_table
31764 const char *const name;
31765 const enum processor_model model;
31767 const arch_names_table[] =
31769 {"amd", M_AMD},
31770 {"intel", M_INTEL},
31771 {"atom", M_INTEL_BONNELL},
31772 {"slm", M_INTEL_SILVERMONT},
31773 {"core2", M_INTEL_CORE2},
31774 {"corei7", M_INTEL_COREI7},
31775 {"nehalem", M_INTEL_COREI7_NEHALEM},
31776 {"westmere", M_INTEL_COREI7_WESTMERE},
31777 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
31778 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
31779 {"haswell", M_INTEL_COREI7_HASWELL},
31780 {"broadwell", M_INTEL_COREI7_BROADWELL},
31781 {"skylake", M_INTEL_COREI7_SKYLAKE},
31782 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
31783 {"bonnell", M_INTEL_BONNELL},
31784 {"silvermont", M_INTEL_SILVERMONT},
31785 {"knl", M_INTEL_KNL},
31786 {"knm", M_INTEL_KNM},
31787 {"amdfam10h", M_AMDFAM10H},
31788 {"barcelona", M_AMDFAM10H_BARCELONA},
31789 {"shanghai", M_AMDFAM10H_SHANGHAI},
31790 {"istanbul", M_AMDFAM10H_ISTANBUL},
31791 {"btver1", M_AMD_BTVER1},
31792 {"amdfam15h", M_AMDFAM15H},
31793 {"bdver1", M_AMDFAM15H_BDVER1},
31794 {"bdver2", M_AMDFAM15H_BDVER2},
31795 {"bdver3", M_AMDFAM15H_BDVER3},
31796 {"bdver4", M_AMDFAM15H_BDVER4},
31797 {"btver2", M_AMD_BTVER2},
31798 {"amdfam17h", M_AMDFAM17H},
31799 {"znver1", M_AMDFAM17H_ZNVER1},
31802 static struct _isa_names_table
31804 const char *const name;
31805 const enum processor_features feature;
31807 const isa_names_table[] =
31809 {"cmov", F_CMOV},
31810 {"mmx", F_MMX},
31811 {"popcnt", F_POPCNT},
31812 {"sse", F_SSE},
31813 {"sse2", F_SSE2},
31814 {"sse3", F_SSE3},
31815 {"ssse3", F_SSSE3},
31816 {"sse4a", F_SSE4_A},
31817 {"sse4.1", F_SSE4_1},
31818 {"sse4.2", F_SSE4_2},
31819 {"avx", F_AVX},
31820 {"fma4", F_FMA4},
31821 {"xop", F_XOP},
31822 {"fma", F_FMA},
31823 {"avx2", F_AVX2},
31824 {"avx512f", F_AVX512F},
31825 {"bmi", F_BMI},
31826 {"bmi2", F_BMI2},
31827 {"aes", F_AES},
31828 {"pclmul", F_PCLMUL},
31829 {"avx512vl",F_AVX512VL},
31830 {"avx512bw",F_AVX512BW},
31831 {"avx512dq",F_AVX512DQ},
31832 {"avx512cd",F_AVX512CD},
31833 {"avx512er",F_AVX512ER},
31834 {"avx512pf",F_AVX512PF},
31835 {"avx512vbmi",F_AVX512VBMI},
31836 {"avx512ifma",F_AVX512IFMA},
31837 {"avx5124vnniw",F_AVX5124VNNIW},
31838 {"avx5124fmaps",F_AVX5124FMAPS},
31839 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
31842 tree __processor_model_type = build_processor_model_struct ();
31843 tree __cpu_model_var = make_var_decl (__processor_model_type,
31844 "__cpu_model");
31847 varpool_node::add (__cpu_model_var);
31849 gcc_assert ((args != NULL) && (*args != NULL));
31851 param_string_cst = *args;
31852 while (param_string_cst
31853 && TREE_CODE (param_string_cst) != STRING_CST)
31855 /* *args must be an expr that can contain other EXPRs leading to a
31856 STRING_CST. */
31857 if (!EXPR_P (param_string_cst))
31859 error ("Parameter to builtin must be a string constant or literal");
31860 return integer_zero_node;
31862 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
31865 gcc_assert (param_string_cst);
31867 if (fn_code == IX86_BUILTIN_CPU_IS)
31869 tree ref;
31870 tree field;
31871 tree final;
31873 unsigned int field_val = 0;
31874 unsigned int NUM_ARCH_NAMES
31875 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
31877 for (i = 0; i < NUM_ARCH_NAMES; i++)
31878 if (strcmp (arch_names_table[i].name,
31879 TREE_STRING_POINTER (param_string_cst)) == 0)
31880 break;
31882 if (i == NUM_ARCH_NAMES)
31884 error ("Parameter to builtin not valid: %s",
31885 TREE_STRING_POINTER (param_string_cst));
31886 return integer_zero_node;
31889 field = TYPE_FIELDS (__processor_model_type);
31890 field_val = arch_names_table[i].model;
31892 /* CPU types are stored in the next field. */
31893 if (field_val > M_CPU_TYPE_START
31894 && field_val < M_CPU_SUBTYPE_START)
31896 field = DECL_CHAIN (field);
31897 field_val -= M_CPU_TYPE_START;
31900 /* CPU subtypes are stored in the next field. */
31901 if (field_val > M_CPU_SUBTYPE_START)
31903 field = DECL_CHAIN ( DECL_CHAIN (field));
31904 field_val -= M_CPU_SUBTYPE_START;
31907 /* Get the appropriate field in __cpu_model. */
31908 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31909 field, NULL_TREE);
31911 /* Check the value. */
31912 final = build2 (EQ_EXPR, unsigned_type_node, ref,
31913 build_int_cstu (unsigned_type_node, field_val));
31914 return build1 (CONVERT_EXPR, integer_type_node, final);
31916 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31918 tree ref;
31919 tree array_elt;
31920 tree field;
31921 tree final;
31923 unsigned int field_val = 0;
31924 unsigned int NUM_ISA_NAMES
31925 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
31927 for (i = 0; i < NUM_ISA_NAMES; i++)
31928 if (strcmp (isa_names_table[i].name,
31929 TREE_STRING_POINTER (param_string_cst)) == 0)
31930 break;
31932 if (i == NUM_ISA_NAMES)
31934 error ("Parameter to builtin not valid: %s",
31935 TREE_STRING_POINTER (param_string_cst));
31936 return integer_zero_node;
31939 field = TYPE_FIELDS (__processor_model_type);
31940 /* Get the last field, which is __cpu_features. */
31941 while (DECL_CHAIN (field))
31942 field = DECL_CHAIN (field);
31944 /* Get the appropriate field: __cpu_model.__cpu_features */
31945 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31946 field, NULL_TREE);
31948 /* Access the 0th element of __cpu_features array. */
31949 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
31950 integer_zero_node, NULL_TREE, NULL_TREE);
31952 field_val = (1 << isa_names_table[i].feature);
31953 /* Return __cpu_model.__cpu_features[0] & field_val */
31954 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
31955 build_int_cstu (unsigned_type_node, field_val));
31956 return build1 (CONVERT_EXPR, integer_type_node, final);
31958 gcc_unreachable ();
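/* A sketch of the folding performed above (illustrative, using the enum
   values defined in this function):

     __builtin_cpu_is ("amd")         ->  __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2")  ->  __cpu_model.__cpu_features[0]
                                            & (1 << F_AVX2)

   so the user-visible calls reduce to loads and compares against the
   __cpu_model variable initialized by libgcc.  */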
31961 static tree
31962 ix86_fold_builtin (tree fndecl, int n_args,
31963 tree *args, bool ignore ATTRIBUTE_UNUSED)
31965 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31967 enum ix86_builtins fn_code = (enum ix86_builtins)
31968 DECL_FUNCTION_CODE (fndecl);
31969 switch (fn_code)
31971 case IX86_BUILTIN_CPU_IS:
31972 case IX86_BUILTIN_CPU_SUPPORTS:
31973 gcc_assert (n_args == 1);
31974 return fold_builtin_cpu (fndecl, args);
31976 case IX86_BUILTIN_NANQ:
31977 case IX86_BUILTIN_NANSQ:
31979 tree type = TREE_TYPE (TREE_TYPE (fndecl));
31980 const char *str = c_getstr (*args);
31981 int quiet = fn_code == IX86_BUILTIN_NANQ;
31982 REAL_VALUE_TYPE real;
31984 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
31985 return build_real (type, real);
31986 return NULL_TREE;
31989 case IX86_BUILTIN_INFQ:
31990 case IX86_BUILTIN_HUGE_VALQ:
31992 tree type = TREE_TYPE (TREE_TYPE (fndecl));
31993 REAL_VALUE_TYPE inf;
31994 real_inf (&inf);
31995 return build_real (type, inf);
31998 case IX86_BUILTIN_TZCNT16:
31999 case IX86_BUILTIN_CTZS:
32000 case IX86_BUILTIN_TZCNT32:
32001 case IX86_BUILTIN_TZCNT64:
32002 gcc_assert (n_args == 1);
32003 if (TREE_CODE (args[0]) == INTEGER_CST)
32005 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32006 tree arg = args[0];
32007 if (fn_code == IX86_BUILTIN_TZCNT16
32008 || fn_code == IX86_BUILTIN_CTZS)
32009 arg = fold_convert (short_unsigned_type_node, arg);
32010 if (integer_zerop (arg))
32011 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32012 else
32013 return fold_const_call (CFN_CTZ, type, arg);
32015 break;
32017 case IX86_BUILTIN_LZCNT16:
32018 case IX86_BUILTIN_CLZS:
32019 case IX86_BUILTIN_LZCNT32:
32020 case IX86_BUILTIN_LZCNT64:
32021 gcc_assert (n_args == 1);
32022 if (TREE_CODE (args[0]) == INTEGER_CST)
32024 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32025 tree arg = args[0];
32026 if (fn_code == IX86_BUILTIN_LZCNT16
32027 || fn_code == IX86_BUILTIN_CLZS)
32028 arg = fold_convert (short_unsigned_type_node, arg);
32029 if (integer_zerop (arg))
32030 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32031 else
32032 return fold_const_call (CFN_CLZ, type, arg);
32034 break;
32036 case IX86_BUILTIN_BEXTR32:
32037 case IX86_BUILTIN_BEXTR64:
32038 case IX86_BUILTIN_BEXTRI32:
32039 case IX86_BUILTIN_BEXTRI64:
32040 gcc_assert (n_args == 2);
32041 if (tree_fits_uhwi_p (args[1]))
32043 unsigned HOST_WIDE_INT res = 0;
32044 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32045 unsigned int start = tree_to_uhwi (args[1]);
32046 unsigned int len = (start & 0xff00) >> 8;
32047 start &= 0xff;
32048 if (start >= prec || len == 0)
32049 res = 0;
32050 else if (!tree_fits_uhwi_p (args[0]))
32051 break;
32052 else
32053 res = tree_to_uhwi (args[0]) >> start;
32054 if (len > prec)
32055 len = prec;
32056 if (len < HOST_BITS_PER_WIDE_INT)
32057 res &= (HOST_WIDE_INT_1U << len) - 1;
32058 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32060 break;
32062 case IX86_BUILTIN_BZHI32:
32063 case IX86_BUILTIN_BZHI64:
32064 gcc_assert (n_args == 2);
32065 if (tree_fits_uhwi_p (args[1]))
32067 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32068 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32069 return args[0];
32070 if (!tree_fits_uhwi_p (args[0]))
32071 break;
32072 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32073 res &= ~(HOST_WIDE_INT_M1U << idx);
32074 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32076 break;
32078 case IX86_BUILTIN_PDEP32:
32079 case IX86_BUILTIN_PDEP64:
32080 gcc_assert (n_args == 2);
32081 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32083 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32084 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32085 unsigned HOST_WIDE_INT res = 0;
32086 unsigned HOST_WIDE_INT m, k = 1;
32087 for (m = 1; m; m <<= 1)
32088 if ((mask & m) != 0)
32090 if ((src & k) != 0)
32091 res |= m;
32092 k <<= 1;
32094 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32096 break;
32098 case IX86_BUILTIN_PEXT32:
32099 case IX86_BUILTIN_PEXT64:
32100 gcc_assert (n_args == 2);
32101 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32103 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32104 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32105 unsigned HOST_WIDE_INT res = 0;
32106 unsigned HOST_WIDE_INT m, k = 1;
32107 for (m = 1; m; m <<= 1)
32108 if ((mask & m) != 0)
32110 if ((src & m) != 0)
32111 res |= k;
32112 k <<= 1;
32114 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32116 break;
32118 default:
32119 break;
32123 #ifdef SUBTARGET_FOLD_BUILTIN
32124 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32125 #endif
32127 return NULL_TREE;
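/* A sketch of the constant folding done above: with constant arguments
   these builtins are evaluated entirely at compile time, e.g.

     the IX86_BUILTIN_TZCNT32 builtin on 0              ->  32
     the IX86_BUILTIN_BZHI32 builtin on (0xffffffff, 8) ->  0xff
     the IX86_BUILTIN_PEXT32 builtin on (0xabcd, 0xf0f0) -> 0xac

   (The user-level spellings of these builtins are defined elsewhere;
   only the builtin codes are named here.)  */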
32130 /* Fold a MD builtin (use ix86_fold_builtin for folding into
32131 constant) in GIMPLE. */
32133 bool
32134 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32136 gimple *stmt = gsi_stmt (*gsi);
32137 tree fndecl = gimple_call_fndecl (stmt);
32138 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32139 int n_args = gimple_call_num_args (stmt);
32140 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32141 tree decl = NULL_TREE;
32142 tree arg0, arg1;
32144 switch (fn_code)
32146 case IX86_BUILTIN_TZCNT32:
32147 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32148 goto fold_tzcnt_lzcnt;
32150 case IX86_BUILTIN_TZCNT64:
32151 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32152 goto fold_tzcnt_lzcnt;
32154 case IX86_BUILTIN_LZCNT32:
32155 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32156 goto fold_tzcnt_lzcnt;
32158 case IX86_BUILTIN_LZCNT64:
32159 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32160 goto fold_tzcnt_lzcnt;
32162 fold_tzcnt_lzcnt:
32163 gcc_assert (n_args == 1);
32164 arg0 = gimple_call_arg (stmt, 0);
32165 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32167 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32168 /* If arg0 is provably non-zero, optimize into the generic
32169 __builtin_c[tl]z{,ll} functions, which the middle-end handles
32170 better. */
32171 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32172 return false;
32174 location_t loc = gimple_location (stmt);
32175 gimple *g = gimple_build_call (decl, 1, arg0);
32176 gimple_set_location (g, loc);
32177 tree lhs = make_ssa_name (integer_type_node);
32178 gimple_call_set_lhs (g, lhs);
32179 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32180 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32181 gimple_set_location (g, loc);
32182 gsi_replace (gsi, g, false);
32183 return true;
32185 break;
32187 case IX86_BUILTIN_BZHI32:
32188 case IX86_BUILTIN_BZHI64:
32189 gcc_assert (n_args == 2);
32190 arg1 = gimple_call_arg (stmt, 1);
32191 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32193 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32194 arg0 = gimple_call_arg (stmt, 0);
32195 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32196 break;
32197 location_t loc = gimple_location (stmt);
32198 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32199 gimple_set_location (g, loc);
32200 gsi_replace (gsi, g, false);
32201 return true;
32203 break;
32205 case IX86_BUILTIN_PDEP32:
32206 case IX86_BUILTIN_PDEP64:
32207 case IX86_BUILTIN_PEXT32:
32208 case IX86_BUILTIN_PEXT64:
32209 gcc_assert (n_args == 2);
32210 arg1 = gimple_call_arg (stmt, 1);
32211 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32213 location_t loc = gimple_location (stmt);
32214 arg0 = gimple_call_arg (stmt, 0);
32215 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32216 gimple_set_location (g, loc);
32217 gsi_replace (gsi, g, false);
32218 return true;
32220 break;
32222 default:
32223 break;
32226 return false;
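/* A sketch of the GIMPLE rewrite above: when the tzcnt/lzcnt argument is
   known to be nonzero, e.g.

     if (x != 0)
       n = <tzcnt builtin> (x);      // IX86_BUILTIN_TZCNT32

   the call is replaced by the generic __builtin_ctz (x) (plus a cast of
   the result), which the middle end optimizes far more aggressively.  */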
32229 /* Make builtins to detect cpu type and features supported. NAME is
32230 the builtin name, CODE is the builtin code, and FTYPE is the function
32231 type of the builtin. */
32233 static void
32234 make_cpu_type_builtin (const char* name, int code,
32235 enum ix86_builtin_func_type ftype, bool is_const)
32237 tree decl;
32238 tree type;
32240 type = ix86_get_builtin_func_type (ftype);
32241 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32242 NULL, NULL_TREE);
32243 gcc_assert (decl != NULL_TREE);
32244 ix86_builtins[(int) code] = decl;
32245 TREE_READONLY (decl) = is_const;
32248 /* Make builtins to get CPU type and features supported. The created
32249 builtins are:
32251 __builtin_cpu_init (), to detect cpu type and features,
32252 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32253 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
32256 static void
32257 ix86_init_platform_type_builtins (void)
32259 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32260 INT_FTYPE_VOID, false);
32261 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32262 INT_FTYPE_PCCHAR, true);
32263 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32264 INT_FTYPE_PCCHAR, true);
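/* A sketch of typical user code for the builtins created above (the
   use_* helpers are placeholders):

     void
     pick_kernel (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
         use_avx2_kernel ();
       else
         use_generic_kernel ();
     }
   */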
32267 /* Internal method for ix86_init_builtins. */
32269 static void
32270 ix86_init_builtins_va_builtins_abi (void)
32272 tree ms_va_ref, sysv_va_ref;
32273 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32274 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32275 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32276 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32278 if (!TARGET_64BIT)
32279 return;
32280 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32281 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32282 ms_va_ref = build_reference_type (ms_va_list_type_node);
32283 sysv_va_ref =
32284 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32286 fnvoid_va_end_ms =
32287 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32288 fnvoid_va_start_ms =
32289 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32290 fnvoid_va_end_sysv =
32291 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32292 fnvoid_va_start_sysv =
32293 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32294 NULL_TREE);
32295 fnvoid_va_copy_ms =
32296 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32297 NULL_TREE);
32298 fnvoid_va_copy_sysv =
32299 build_function_type_list (void_type_node, sysv_va_ref,
32300 sysv_va_ref, NULL_TREE);
32302 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32303 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32304 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32305 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32306 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32307 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32308 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32309 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32310 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32311 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32312 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32313 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
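/* A sketch of how the builtins registered above surface to users (the
   __builtin_ms_va_list type and __builtin_va_arg spelling are assumed
   from the usual va_list machinery; only va_start/va_end/va_copy are
   added here):

     __attribute__ ((ms_abi)) int
     sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
   */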
32316 static void
32317 ix86_init_builtin_types (void)
32319 tree float80_type_node, const_string_type_node;
32321 /* The __float80 type. */
32322 float80_type_node = long_double_type_node;
32323 if (TYPE_MODE (float80_type_node) != XFmode)
32325 if (float64x_type_node != NULL_TREE
32326 && TYPE_MODE (float64x_type_node) == XFmode)
32327 float80_type_node = float64x_type_node;
32328 else
32330 /* The __float80 type. */
32331 float80_type_node = make_node (REAL_TYPE);
32333 TYPE_PRECISION (float80_type_node) = 80;
32334 layout_type (float80_type_node);
32337 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32339 /* The __float128 type. The node has already been created as
32340 _Float128, so we only need to register the __float128 name for
32341 it. */
32342 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32344 const_string_type_node
32345 = build_pointer_type (build_qualified_type
32346 (char_type_node, TYPE_QUAL_CONST));
32348 /* This macro is built by i386-builtin-types.awk. */
32349 DEFINE_BUILTIN_PRIMITIVE_TYPES;
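/* A sketch of the user-visible types registered above (the 'Q' literal
   suffix is a target extension and an assumption of this sketch):

     __float80  e = 1.0L;   // XFmode extended precision
     __float128 q = 1.0Q;   // TFmode, the same node as _Float128
   */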
32352 static void
32353 ix86_init_builtins (void)
32355 tree ftype, decl;
32357 ix86_init_builtin_types ();
32359 /* Builtins to get CPU type and features. */
32360 ix86_init_platform_type_builtins ();
32362 /* TFmode support builtins. */
32363 def_builtin_const (0, "__builtin_infq",
32364 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32365 def_builtin_const (0, "__builtin_huge_valq",
32366 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32368 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32369 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32370 BUILT_IN_MD, "nanq", NULL_TREE);
32371 TREE_READONLY (decl) = 1;
32372 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32374 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32375 BUILT_IN_MD, "nansq", NULL_TREE);
32376 TREE_READONLY (decl) = 1;
32377 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32379 /* We will expand them to a normal call if SSE isn't available since
32380 they are used by libgcc. */
32381 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32382 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32383 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32384 TREE_READONLY (decl) = 1;
32385 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32387 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32388 decl = add_builtin_function ("__builtin_copysignq", ftype,
32389 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32390 "__copysigntf3", NULL_TREE);
32391 TREE_READONLY (decl) = 1;
32392 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32394 ix86_init_tm_builtins ();
32395 ix86_init_mmx_sse_builtins ();
32396 ix86_init_mpx_builtins ();
32398 if (TARGET_LP64)
32399 ix86_init_builtins_va_builtins_abi ();
32401 #ifdef SUBTARGET_INIT_BUILTINS
32402 SUBTARGET_INIT_BUILTINS;
32403 #endif
32406 /* Return the ix86 builtin for CODE. */
32408 static tree
32409 ix86_builtin_decl (unsigned code, bool)
32411 if (code >= IX86_BUILTIN_MAX)
32412 return error_mark_node;
32414 return ix86_builtins[code];
32417 /* Errors in the source file can cause expand_expr to return const0_rtx
32418 where we expect a vector. To avoid crashing, use one of the vector
32419 clear instructions. */
32420 static rtx
32421 safe_vector_operand (rtx x, machine_mode mode)
32423 if (x == const0_rtx)
32424 x = CONST0_RTX (mode);
32425 return x;
32428 /* Fixup modeless constants to fit required mode. */
32429 static rtx
32430 fixup_modeless_constant (rtx x, machine_mode mode)
32432 if (GET_MODE (x) == VOIDmode)
32433 x = convert_to_mode (mode, x, 1);
32434 return x;
32437 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32439 static rtx
32440 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32442 rtx pat;
32443 tree arg0 = CALL_EXPR_ARG (exp, 0);
32444 tree arg1 = CALL_EXPR_ARG (exp, 1);
32445 rtx op0 = expand_normal (arg0);
32446 rtx op1 = expand_normal (arg1);
32447 machine_mode tmode = insn_data[icode].operand[0].mode;
32448 machine_mode mode0 = insn_data[icode].operand[1].mode;
32449 machine_mode mode1 = insn_data[icode].operand[2].mode;
32451 if (VECTOR_MODE_P (mode0))
32452 op0 = safe_vector_operand (op0, mode0);
32453 if (VECTOR_MODE_P (mode1))
32454 op1 = safe_vector_operand (op1, mode1);
32456 if (optimize || !target
32457 || GET_MODE (target) != tmode
32458 || !insn_data[icode].operand[0].predicate (target, tmode))
32459 target = gen_reg_rtx (tmode);
32461 if (GET_MODE (op1) == SImode && mode1 == TImode)
32463 rtx x = gen_reg_rtx (V4SImode);
32464 emit_insn (gen_sse2_loadd (x, op1));
32465 op1 = gen_lowpart (TImode, x);
32468 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32469 op0 = copy_to_mode_reg (mode0, op0);
32470 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32471 op1 = copy_to_mode_reg (mode1, op1);
32473 pat = GEN_FCN (icode) (target, op0, op1);
32474 if (! pat)
32475 return 0;
32477 emit_insn (pat);
32479 return target;
32482 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32484 static rtx
32485 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32486 enum ix86_builtin_func_type m_type,
32487 enum rtx_code sub_code)
32489 rtx pat;
32490 int i;
32491 int nargs;
32492 bool comparison_p = false;
32493 bool tf_p = false;
32494 bool last_arg_constant = false;
32495 int num_memory = 0;
32496 struct {
32497 rtx op;
32498 machine_mode mode;
32499 } args[4];
32501 machine_mode tmode = insn_data[icode].operand[0].mode;
32503 switch (m_type)
32505 case MULTI_ARG_4_DF2_DI_I:
32506 case MULTI_ARG_4_DF2_DI_I1:
32507 case MULTI_ARG_4_SF2_SI_I:
32508 case MULTI_ARG_4_SF2_SI_I1:
32509 nargs = 4;
32510 last_arg_constant = true;
32511 break;
32513 case MULTI_ARG_3_SF:
32514 case MULTI_ARG_3_DF:
32515 case MULTI_ARG_3_SF2:
32516 case MULTI_ARG_3_DF2:
32517 case MULTI_ARG_3_DI:
32518 case MULTI_ARG_3_SI:
32519 case MULTI_ARG_3_SI_DI:
32520 case MULTI_ARG_3_HI:
32521 case MULTI_ARG_3_HI_SI:
32522 case MULTI_ARG_3_QI:
32523 case MULTI_ARG_3_DI2:
32524 case MULTI_ARG_3_SI2:
32525 case MULTI_ARG_3_HI2:
32526 case MULTI_ARG_3_QI2:
32527 nargs = 3;
32528 break;
32530 case MULTI_ARG_2_SF:
32531 case MULTI_ARG_2_DF:
32532 case MULTI_ARG_2_DI:
32533 case MULTI_ARG_2_SI:
32534 case MULTI_ARG_2_HI:
32535 case MULTI_ARG_2_QI:
32536 nargs = 2;
32537 break;
32539 case MULTI_ARG_2_DI_IMM:
32540 case MULTI_ARG_2_SI_IMM:
32541 case MULTI_ARG_2_HI_IMM:
32542 case MULTI_ARG_2_QI_IMM:
32543 nargs = 2;
32544 last_arg_constant = true;
32545 break;
32547 case MULTI_ARG_1_SF:
32548 case MULTI_ARG_1_DF:
32549 case MULTI_ARG_1_SF2:
32550 case MULTI_ARG_1_DF2:
32551 case MULTI_ARG_1_DI:
32552 case MULTI_ARG_1_SI:
32553 case MULTI_ARG_1_HI:
32554 case MULTI_ARG_1_QI:
32555 case MULTI_ARG_1_SI_DI:
32556 case MULTI_ARG_1_HI_DI:
32557 case MULTI_ARG_1_HI_SI:
32558 case MULTI_ARG_1_QI_DI:
32559 case MULTI_ARG_1_QI_SI:
32560 case MULTI_ARG_1_QI_HI:
32561 nargs = 1;
32562 break;
32564 case MULTI_ARG_2_DI_CMP:
32565 case MULTI_ARG_2_SI_CMP:
32566 case MULTI_ARG_2_HI_CMP:
32567 case MULTI_ARG_2_QI_CMP:
32568 nargs = 2;
32569 comparison_p = true;
32570 break;
32572 case MULTI_ARG_2_SF_TF:
32573 case MULTI_ARG_2_DF_TF:
32574 case MULTI_ARG_2_DI_TF:
32575 case MULTI_ARG_2_SI_TF:
32576 case MULTI_ARG_2_HI_TF:
32577 case MULTI_ARG_2_QI_TF:
32578 nargs = 2;
32579 tf_p = true;
32580 break;
32582 default:
32583 gcc_unreachable ();
32586 if (optimize || !target
32587 || GET_MODE (target) != tmode
32588 || !insn_data[icode].operand[0].predicate (target, tmode))
32589 target = gen_reg_rtx (tmode);
32590 else if (memory_operand (target, tmode))
32591 num_memory++;
32593 gcc_assert (nargs <= 4);
32595 for (i = 0; i < nargs; i++)
32597 tree arg = CALL_EXPR_ARG (exp, i);
32598 rtx op = expand_normal (arg);
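/* For the comparison forms the pattern takes an extra comparison-code
   operand between the target and the arguments, so the operand indices
   below are shifted by one. */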
32599 int adjust = (comparison_p) ? 1 : 0;
32600 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32602 if (last_arg_constant && i == nargs - 1)
32604 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32606 enum insn_code new_icode = icode;
32607 switch (icode)
32609 case CODE_FOR_xop_vpermil2v2df3:
32610 case CODE_FOR_xop_vpermil2v4sf3:
32611 case CODE_FOR_xop_vpermil2v4df3:
32612 case CODE_FOR_xop_vpermil2v8sf3:
32613 error ("the last argument must be a 2-bit immediate");
32614 return gen_reg_rtx (tmode);
32615 case CODE_FOR_xop_rotlv2di3:
32616 new_icode = CODE_FOR_rotlv2di3;
32617 goto xop_rotl;
32618 case CODE_FOR_xop_rotlv4si3:
32619 new_icode = CODE_FOR_rotlv4si3;
32620 goto xop_rotl;
32621 case CODE_FOR_xop_rotlv8hi3:
32622 new_icode = CODE_FOR_rotlv8hi3;
32623 goto xop_rotl;
32624 case CODE_FOR_xop_rotlv16qi3:
32625 new_icode = CODE_FOR_rotlv16qi3;
32626 xop_rotl:
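/* A constant rotate count is reduced modulo the element width; a
   non-constant count is handed to the generic rotate pattern instead,
   which accepts a register count. */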
32627 if (CONST_INT_P (op))
32629 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32630 op = GEN_INT (INTVAL (op) & mask);
32631 gcc_checking_assert
32632 (insn_data[icode].operand[i + 1].predicate (op, mode));
32634 else
32636 gcc_checking_assert
32637 (nargs == 2
32638 && insn_data[new_icode].operand[0].mode == tmode
32639 && insn_data[new_icode].operand[1].mode == tmode
32640 && insn_data[new_icode].operand[2].mode == mode
32641 && insn_data[new_icode].operand[0].predicate
32642 == insn_data[icode].operand[0].predicate
32643 && insn_data[new_icode].operand[1].predicate
32644 == insn_data[icode].operand[1].predicate);
32645 icode = new_icode;
32646 goto non_constant;
32648 break;
32649 default:
32650 gcc_unreachable ();
32654 else
32656 non_constant:
32657 if (VECTOR_MODE_P (mode))
32658 op = safe_vector_operand (op, mode);
32660 /* If we aren't optimizing, only allow one memory operand to be
32661 generated. */
32662 if (memory_operand (op, mode))
32663 num_memory++;
32665 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32667 if (optimize
32668 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32669 || num_memory > 1)
32670 op = force_reg (mode, op);
32673 args[i].op = op;
32674 args[i].mode = mode;
32677 switch (nargs)
32679 case 1:
32680 pat = GEN_FCN (icode) (target, args[0].op);
32681 break;
32683 case 2:
32684 if (tf_p)
32685 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32686 GEN_INT ((int)sub_code));
32687 else if (! comparison_p)
32688 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32689 else
32691 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32692 args[0].op,
32693 args[1].op);
32695 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32697 break;
32699 case 3:
32700 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32701 break;
32703 case 4:
32704 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32705 break;
32707 default:
32708 gcc_unreachable ();
32711 if (! pat)
32712 return 0;
32714 emit_insn (pat);
32715 return target;
32718 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32719 insns with vec_merge. */
32721 static rtx
32722 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32723 rtx target)
32725 rtx pat;
32726 tree arg0 = CALL_EXPR_ARG (exp, 0);
32727 rtx op1, op0 = expand_normal (arg0);
32728 machine_mode tmode = insn_data[icode].operand[0].mode;
32729 machine_mode mode0 = insn_data[icode].operand[1].mode;
32731 if (optimize || !target
32732 || GET_MODE (target) != tmode
32733 || !insn_data[icode].operand[0].predicate (target, tmode))
32734 target = gen_reg_rtx (tmode);
32736 if (VECTOR_MODE_P (mode0))
32737 op0 = safe_vector_operand (op0, mode0);
32739 if ((optimize && !register_operand (op0, mode0))
32740 || !insn_data[icode].operand[1].predicate (op0, mode0))
32741 op0 = copy_to_mode_reg (mode0, op0);
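/* The vec_merge unop patterns take the source operand twice; the second
   copy supplies the vector elements that the scalar operation leaves
   untouched. */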
32743 op1 = op0;
32744 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32745 op1 = copy_to_mode_reg (mode0, op1);
32747 pat = GEN_FCN (icode) (target, op0, op1);
32748 if (! pat)
32749 return 0;
32750 emit_insn (pat);
32751 return target;
32754 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32756 static rtx
32757 ix86_expand_sse_compare (const struct builtin_description *d,
32758 tree exp, rtx target, bool swap)
32760 rtx pat;
32761 tree arg0 = CALL_EXPR_ARG (exp, 0);
32762 tree arg1 = CALL_EXPR_ARG (exp, 1);
32763 rtx op0 = expand_normal (arg0);
32764 rtx op1 = expand_normal (arg1);
32765 rtx op2;
32766 machine_mode tmode = insn_data[d->icode].operand[0].mode;
32767 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
32768 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
32769 enum rtx_code comparison = d->comparison;
32771 if (VECTOR_MODE_P (mode0))
32772 op0 = safe_vector_operand (op0, mode0);
32773 if (VECTOR_MODE_P (mode1))
32774 op1 = safe_vector_operand (op1, mode1);
32776 /* Swap operands if we have a comparison that isn't available in
32777 hardware. */
32778 if (swap)
32779 std::swap (op0, op1);
32781 if (optimize || !target
32782 || GET_MODE (target) != tmode
32783 || !insn_data[d->icode].operand[0].predicate (target, tmode))
32784 target = gen_reg_rtx (tmode);
32786 if ((optimize && !register_operand (op0, mode0))
32787 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
32788 op0 = copy_to_mode_reg (mode0, op0);
32789 if ((optimize && !register_operand (op1, mode1))
32790 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
32791 op1 = copy_to_mode_reg (mode1, op1);
32793 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
32794 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
32795 if (! pat)
32796 return 0;
32797 emit_insn (pat);
32798 return target;
32801 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
32803 static rtx
32804 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
32805 rtx target)
32807 rtx pat;
32808 tree arg0 = CALL_EXPR_ARG (exp, 0);
32809 tree arg1 = CALL_EXPR_ARG (exp, 1);
32810 rtx op0 = expand_normal (arg0);
32811 rtx op1 = expand_normal (arg1);
32812 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
32813 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
32814 enum rtx_code comparison = d->comparison;
32816 if (VECTOR_MODE_P (mode0))
32817 op0 = safe_vector_operand (op0, mode0);
32818 if (VECTOR_MODE_P (mode1))
32819 op1 = safe_vector_operand (op1, mode1);
32821 /* Swap operands if we have a comparison that isn't available in
32822 hardware. */
32823 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
32824 std::swap (op0, op1);
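/* The comi instruction leaves its result in the flags register;
   materialize that result as a QImode value in the low byte of a zeroed
   SImode pseudo. */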
32826 target = gen_reg_rtx (SImode);
32827 emit_move_insn (target, const0_rtx);
32828 target = gen_rtx_SUBREG (QImode, target, 0);
32830 if ((optimize && !register_operand (op0, mode0))
32831 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
32832 op0 = copy_to_mode_reg (mode0, op0);
32833 if ((optimize && !register_operand (op1, mode1))
32834 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
32835 op1 = copy_to_mode_reg (mode1, op1);
32837 pat = GEN_FCN (d->icode) (op0, op1);
32838 if (! pat)
32839 return 0;
32840 emit_insn (pat);
32841 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
32842 gen_rtx_fmt_ee (comparison, QImode,
32843 SET_DEST (pat),
32844 const0_rtx)));
32846 return SUBREG_REG (target);
32849 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
32851 static rtx
32852 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
32853 rtx target)
32855 rtx pat;
32856 tree arg0 = CALL_EXPR_ARG (exp, 0);
32857 rtx op1, op0 = expand_normal (arg0);
32858 machine_mode tmode = insn_data[d->icode].operand[0].mode;
32859 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
32861 if (optimize || target == 0
32862 || GET_MODE (target) != tmode
32863 || !insn_data[d->icode].operand[0].predicate (target, tmode))
32864 target = gen_reg_rtx (tmode);
32866 if (VECTOR_MODE_P (mode0))
32867 op0 = safe_vector_operand (op0, mode0);
32869 if ((optimize && !register_operand (op0, mode0))
32870 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
32871 op0 = copy_to_mode_reg (mode0, op0);
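/* For the round builtins the last immediate operand (the rounding mode)
   is stored in the comparison field of the builtin description. */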
32873 op1 = GEN_INT (d->comparison);
32875 pat = GEN_FCN (d->icode) (target, op0, op1);
32876 if (! pat)
32877 return 0;
32878 emit_insn (pat);
32879 return target;
32882 static rtx
32883 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
32884 tree exp, rtx target)
32886 rtx pat;
32887 tree arg0 = CALL_EXPR_ARG (exp, 0);
32888 tree arg1 = CALL_EXPR_ARG (exp, 1);
32889 rtx op0 = expand_normal (arg0);
32890 rtx op1 = expand_normal (arg1);
32891 rtx op2;
32892 machine_mode tmode = insn_data[d->icode].operand[0].mode;
32893 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
32894 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
32896 if (optimize || target == 0
32897 || GET_MODE (target) != tmode
32898 || !insn_data[d->icode].operand[0].predicate (target, tmode))
32899 target = gen_reg_rtx (tmode);
32901 op0 = safe_vector_operand (op0, mode0);
32902 op1 = safe_vector_operand (op1, mode1);
32904 if ((optimize && !register_operand (op0, mode0))
32905 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
32906 op0 = copy_to_mode_reg (mode0, op0);
32907 if ((optimize && !register_operand (op1, mode1))
32908 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
32909 op1 = copy_to_mode_reg (mode1, op1);
32911 op2 = GEN_INT (d->comparison);
32913 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
32914 if (! pat)
32915 return 0;
32916 emit_insn (pat);
32917 return target;
32920 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
32922 static rtx
32923 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
32924 rtx target)
32926 rtx pat;
32927 tree arg0 = CALL_EXPR_ARG (exp, 0);
32928 tree arg1 = CALL_EXPR_ARG (exp, 1);
32929 rtx op0 = expand_normal (arg0);
32930 rtx op1 = expand_normal (arg1);
32931 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
32932 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
32933 enum rtx_code comparison = d->comparison;
32935 if (VECTOR_MODE_P (mode0))
32936 op0 = safe_vector_operand (op0, mode0);
32937 if (VECTOR_MODE_P (mode1))
32938 op1 = safe_vector_operand (op1, mode1);
32940 target = gen_reg_rtx (SImode);
32941 emit_move_insn (target, const0_rtx);
32942 target = gen_rtx_SUBREG (QImode, target, 0);
32944 if ((optimize && !register_operand (op0, mode0))
32945 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
32946 op0 = copy_to_mode_reg (mode0, op0);
32947 if ((optimize && !register_operand (op1, mode1))
32948 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
32949 op1 = copy_to_mode_reg (mode1, op1);
32951 pat = GEN_FCN (d->icode) (op0, op1);
32952 if (! pat)
32953 return 0;
32954 emit_insn (pat);
32955 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
32956 gen_rtx_fmt_ee (comparison, QImode,
32957 SET_DEST (pat),
32958 const0_rtx)));
32960 return SUBREG_REG (target);
32963 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
32965 static rtx
32966 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
32967 tree exp, rtx target)
32969 rtx pat;
32970 tree arg0 = CALL_EXPR_ARG (exp, 0);
32971 tree arg1 = CALL_EXPR_ARG (exp, 1);
32972 tree arg2 = CALL_EXPR_ARG (exp, 2);
32973 tree arg3 = CALL_EXPR_ARG (exp, 3);
32974 tree arg4 = CALL_EXPR_ARG (exp, 4);
32975 rtx scratch0, scratch1;
32976 rtx op0 = expand_normal (arg0);
32977 rtx op1 = expand_normal (arg1);
32978 rtx op2 = expand_normal (arg2);
32979 rtx op3 = expand_normal (arg3);
32980 rtx op4 = expand_normal (arg4);
32981 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
32983 tmode0 = insn_data[d->icode].operand[0].mode;
32984 tmode1 = insn_data[d->icode].operand[1].mode;
32985 modev2 = insn_data[d->icode].operand[2].mode;
32986 modei3 = insn_data[d->icode].operand[3].mode;
32987 modev4 = insn_data[d->icode].operand[4].mode;
32988 modei5 = insn_data[d->icode].operand[5].mode;
32989 modeimm = insn_data[d->icode].operand[6].mode;
32991 if (VECTOR_MODE_P (modev2))
32992 op0 = safe_vector_operand (op0, modev2);
32993 if (VECTOR_MODE_P (modev4))
32994 op2 = safe_vector_operand (op2, modev4);
32996 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
32997 op0 = copy_to_mode_reg (modev2, op0);
32998 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
32999 op1 = copy_to_mode_reg (modei3, op1);
33000 if ((optimize && !register_operand (op2, modev4))
33001 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33002 op2 = copy_to_mode_reg (modev4, op2);
33003 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33004 op3 = copy_to_mode_reg (modei5, op3);
33006 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33008 error ("the fifth argument must be an 8-bit immediate");
33009 return const0_rtx;
33012 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33014 if (optimize || !target
33015 || GET_MODE (target) != tmode0
33016 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33017 target = gen_reg_rtx (tmode0);
33019 scratch1 = gen_reg_rtx (tmode1);
33021 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33023 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33025 if (optimize || !target
33026 || GET_MODE (target) != tmode1
33027 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33028 target = gen_reg_rtx (tmode1);
33030 scratch0 = gen_reg_rtx (tmode0);
33032 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33034 else
33036 gcc_assert (d->flag);
33038 scratch0 = gen_reg_rtx (tmode0);
33039 scratch1 = gen_reg_rtx (tmode1);
33041 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33044 if (! pat)
33045 return 0;
33047 emit_insn (pat);
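/* For the flag-returning variants, d->flag gives the mode of the flags
   register to test; the requested flag is extracted as a QImode value
   below. */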
33049 if (d->flag)
33051 target = gen_reg_rtx (SImode);
33052 emit_move_insn (target, const0_rtx);
33053 target = gen_rtx_SUBREG (QImode, target, 0);
33055 emit_insn
33056 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33057 gen_rtx_fmt_ee (EQ, QImode,
33058 gen_rtx_REG ((machine_mode) d->flag,
33059 FLAGS_REG),
33060 const0_rtx)));
33061 return SUBREG_REG (target);
33063 else
33064 return target;
33068 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33070 static rtx
33071 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33072 tree exp, rtx target)
33074 rtx pat;
33075 tree arg0 = CALL_EXPR_ARG (exp, 0);
33076 tree arg1 = CALL_EXPR_ARG (exp, 1);
33077 tree arg2 = CALL_EXPR_ARG (exp, 2);
33078 rtx scratch0, scratch1;
33079 rtx op0 = expand_normal (arg0);
33080 rtx op1 = expand_normal (arg1);
33081 rtx op2 = expand_normal (arg2);
33082 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33084 tmode0 = insn_data[d->icode].operand[0].mode;
33085 tmode1 = insn_data[d->icode].operand[1].mode;
33086 modev2 = insn_data[d->icode].operand[2].mode;
33087 modev3 = insn_data[d->icode].operand[3].mode;
33088 modeimm = insn_data[d->icode].operand[4].mode;
33090 if (VECTOR_MODE_P (modev2))
33091 op0 = safe_vector_operand (op0, modev2);
33092 if (VECTOR_MODE_P (modev3))
33093 op1 = safe_vector_operand (op1, modev3);
33095 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33096 op0 = copy_to_mode_reg (modev2, op0);
33097 if ((optimize && !register_operand (op1, modev3))
33098 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33099 op1 = copy_to_mode_reg (modev3, op1);
33101 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33103 error ("the third argument must be an 8-bit immediate");
33104 return const0_rtx;
33107 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33109 if (optimize || !target
33110 || GET_MODE (target) != tmode0
33111 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33112 target = gen_reg_rtx (tmode0);
33114 scratch1 = gen_reg_rtx (tmode1);
33116 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33118 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33120 if (optimize || !target
33121 || GET_MODE (target) != tmode1
33122 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33123 target = gen_reg_rtx (tmode1);
33125 scratch0 = gen_reg_rtx (tmode0);
33127 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33129 else
33131 gcc_assert (d->flag);
33133 scratch0 = gen_reg_rtx (tmode0);
33134 scratch1 = gen_reg_rtx (tmode1);
33136 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33139 if (! pat)
33140 return 0;
33142 emit_insn (pat);
33144 if (d->flag)
33146 target = gen_reg_rtx (SImode);
33147 emit_move_insn (target, const0_rtx);
33148 target = gen_rtx_SUBREG (QImode, target, 0);
33150 emit_insn
33151 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33152 gen_rtx_fmt_ee (EQ, QImode,
33153 gen_rtx_REG ((machine_mode) d->flag,
33154 FLAGS_REG),
33155 const0_rtx)));
33156 return SUBREG_REG (target);
33158 else
33159 return target;
33162 /* Subroutine of ix86_expand_builtin to take care of insns with
33163 a variable number of operands. */
33165 static rtx
33166 ix86_expand_args_builtin (const struct builtin_description *d,
33167 tree exp, rtx target)
33169 rtx pat, real_target;
33170 unsigned int i, nargs;
33171 unsigned int nargs_constant = 0;
33172 unsigned int mask_pos = 0;
33173 int num_memory = 0;
33174 struct
33176 rtx op;
33177 machine_mode mode;
33178 } args[6];
33179 bool second_arg_count = false;
33180 enum insn_code icode = d->icode;
33181 const struct insn_data_d *insn_p = &insn_data[icode];
33182 machine_mode tmode = insn_p->operand[0].mode;
33183 machine_mode rmode = VOIDmode;
33184 bool swap = false;
33185 enum rtx_code comparison = d->comparison;
33187 switch ((enum ix86_builtin_func_type) d->flag)
33189 case V2DF_FTYPE_V2DF_ROUND:
33190 case V4DF_FTYPE_V4DF_ROUND:
33191 case V8DF_FTYPE_V8DF_ROUND:
33192 case V4SF_FTYPE_V4SF_ROUND:
33193 case V8SF_FTYPE_V8SF_ROUND:
33194 case V16SF_FTYPE_V16SF_ROUND:
33195 case V4SI_FTYPE_V4SF_ROUND:
33196 case V8SI_FTYPE_V8SF_ROUND:
33197 case V16SI_FTYPE_V16SF_ROUND:
33198 return ix86_expand_sse_round (d, exp, target);
33199 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33200 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33201 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33202 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33203 case INT_FTYPE_V8SF_V8SF_PTEST:
33204 case INT_FTYPE_V4DI_V4DI_PTEST:
33205 case INT_FTYPE_V4DF_V4DF_PTEST:
33206 case INT_FTYPE_V4SF_V4SF_PTEST:
33207 case INT_FTYPE_V2DI_V2DI_PTEST:
33208 case INT_FTYPE_V2DF_V2DF_PTEST:
33209 return ix86_expand_sse_ptest (d, exp, target);
33210 case FLOAT128_FTYPE_FLOAT128:
33211 case FLOAT_FTYPE_FLOAT:
33212 case INT_FTYPE_INT:
33213 case UINT_FTYPE_UINT:
33214 case UINT16_FTYPE_UINT16:
33215 case UINT64_FTYPE_INT:
33216 case UINT64_FTYPE_UINT64:
33217 case INT64_FTYPE_INT64:
33218 case INT64_FTYPE_V4SF:
33219 case INT64_FTYPE_V2DF:
33220 case INT_FTYPE_V16QI:
33221 case INT_FTYPE_V8QI:
33222 case INT_FTYPE_V8SF:
33223 case INT_FTYPE_V4DF:
33224 case INT_FTYPE_V4SF:
33225 case INT_FTYPE_V2DF:
33226 case INT_FTYPE_V32QI:
33227 case V16QI_FTYPE_V16QI:
33228 case V8SI_FTYPE_V8SF:
33229 case V8SI_FTYPE_V4SI:
33230 case V8HI_FTYPE_V8HI:
33231 case V8HI_FTYPE_V16QI:
33232 case V8QI_FTYPE_V8QI:
33233 case V8SF_FTYPE_V8SF:
33234 case V8SF_FTYPE_V8SI:
33235 case V8SF_FTYPE_V4SF:
33236 case V8SF_FTYPE_V8HI:
33237 case V4SI_FTYPE_V4SI:
33238 case V4SI_FTYPE_V16QI:
33239 case V4SI_FTYPE_V4SF:
33240 case V4SI_FTYPE_V8SI:
33241 case V4SI_FTYPE_V8HI:
33242 case V4SI_FTYPE_V4DF:
33243 case V4SI_FTYPE_V2DF:
33244 case V4HI_FTYPE_V4HI:
33245 case V4DF_FTYPE_V4DF:
33246 case V4DF_FTYPE_V4SI:
33247 case V4DF_FTYPE_V4SF:
33248 case V4DF_FTYPE_V2DF:
33249 case V4SF_FTYPE_V4SF:
33250 case V4SF_FTYPE_V4SI:
33251 case V4SF_FTYPE_V8SF:
33252 case V4SF_FTYPE_V4DF:
33253 case V4SF_FTYPE_V8HI:
33254 case V4SF_FTYPE_V2DF:
33255 case V2DI_FTYPE_V2DI:
33256 case V2DI_FTYPE_V16QI:
33257 case V2DI_FTYPE_V8HI:
33258 case V2DI_FTYPE_V4SI:
33259 case V2DF_FTYPE_V2DF:
33260 case V2DF_FTYPE_V4SI:
33261 case V2DF_FTYPE_V4DF:
33262 case V2DF_FTYPE_V4SF:
33263 case V2DF_FTYPE_V2SI:
33264 case V2SI_FTYPE_V2SI:
33265 case V2SI_FTYPE_V4SF:
33266 case V2SI_FTYPE_V2SF:
33267 case V2SI_FTYPE_V2DF:
33268 case V2SF_FTYPE_V2SF:
33269 case V2SF_FTYPE_V2SI:
33270 case V32QI_FTYPE_V32QI:
33271 case V32QI_FTYPE_V16QI:
33272 case V16HI_FTYPE_V16HI:
33273 case V16HI_FTYPE_V8HI:
33274 case V8SI_FTYPE_V8SI:
33275 case V16HI_FTYPE_V16QI:
33276 case V8SI_FTYPE_V16QI:
33277 case V4DI_FTYPE_V16QI:
33278 case V8SI_FTYPE_V8HI:
33279 case V4DI_FTYPE_V8HI:
33280 case V4DI_FTYPE_V4SI:
33281 case V4DI_FTYPE_V2DI:
33282 case UQI_FTYPE_UQI:
33283 case UHI_FTYPE_UHI:
33284 case USI_FTYPE_USI:
33285 case USI_FTYPE_UQI:
33286 case USI_FTYPE_UHI:
33287 case UDI_FTYPE_UDI:
33288 case UHI_FTYPE_V16QI:
33289 case USI_FTYPE_V32QI:
33290 case UDI_FTYPE_V64QI:
33291 case V16QI_FTYPE_UHI:
33292 case V32QI_FTYPE_USI:
33293 case V64QI_FTYPE_UDI:
33294 case V8HI_FTYPE_UQI:
33295 case V16HI_FTYPE_UHI:
33296 case V32HI_FTYPE_USI:
33297 case V4SI_FTYPE_UQI:
33298 case V8SI_FTYPE_UQI:
33299 case V4SI_FTYPE_UHI:
33300 case V8SI_FTYPE_UHI:
33301 case UQI_FTYPE_V8HI:
33302 case UHI_FTYPE_V16HI:
33303 case USI_FTYPE_V32HI:
33304 case UQI_FTYPE_V4SI:
33305 case UQI_FTYPE_V8SI:
33306 case UHI_FTYPE_V16SI:
33307 case UQI_FTYPE_V2DI:
33308 case UQI_FTYPE_V4DI:
33309 case UQI_FTYPE_V8DI:
33310 case V16SI_FTYPE_UHI:
33311 case V2DI_FTYPE_UQI:
33312 case V4DI_FTYPE_UQI:
33313 case V16SI_FTYPE_INT:
33314 case V16SF_FTYPE_V8SF:
33315 case V16SI_FTYPE_V8SI:
33316 case V16SF_FTYPE_V4SF:
33317 case V16SI_FTYPE_V4SI:
33318 case V16SI_FTYPE_V16SF:
33319 case V16SI_FTYPE_V16SI:
33320 case V16SF_FTYPE_V16SF:
33321 case V8DI_FTYPE_UQI:
33322 case V8DI_FTYPE_V8DI:
33323 case V8DF_FTYPE_V4DF:
33324 case V8DF_FTYPE_V2DF:
33325 case V8DF_FTYPE_V8DF:
33326 nargs = 1;
33327 break;
33328 case V4SF_FTYPE_V4SF_VEC_MERGE:
33329 case V2DF_FTYPE_V2DF_VEC_MERGE:
33330 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33331 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33332 case V16QI_FTYPE_V16QI_V16QI:
33333 case V16QI_FTYPE_V8HI_V8HI:
33334 case V16SF_FTYPE_V16SF_V16SF:
33335 case V8QI_FTYPE_V8QI_V8QI:
33336 case V8QI_FTYPE_V4HI_V4HI:
33337 case V8HI_FTYPE_V8HI_V8HI:
33338 case V8HI_FTYPE_V16QI_V16QI:
33339 case V8HI_FTYPE_V4SI_V4SI:
33340 case V8SF_FTYPE_V8SF_V8SF:
33341 case V8SF_FTYPE_V8SF_V8SI:
33342 case V8DF_FTYPE_V8DF_V8DF:
33343 case V4SI_FTYPE_V4SI_V4SI:
33344 case V4SI_FTYPE_V8HI_V8HI:
33345 case V4SI_FTYPE_V2DF_V2DF:
33346 case V4HI_FTYPE_V4HI_V4HI:
33347 case V4HI_FTYPE_V8QI_V8QI:
33348 case V4HI_FTYPE_V2SI_V2SI:
33349 case V4DF_FTYPE_V4DF_V4DF:
33350 case V4DF_FTYPE_V4DF_V4DI:
33351 case V4SF_FTYPE_V4SF_V4SF:
33352 case V4SF_FTYPE_V4SF_V4SI:
33353 case V4SF_FTYPE_V4SF_V2SI:
33354 case V4SF_FTYPE_V4SF_V2DF:
33355 case V4SF_FTYPE_V4SF_UINT:
33356 case V4SF_FTYPE_V4SF_DI:
33357 case V4SF_FTYPE_V4SF_SI:
33358 case V2DI_FTYPE_V2DI_V2DI:
33359 case V2DI_FTYPE_V16QI_V16QI:
33360 case V2DI_FTYPE_V4SI_V4SI:
33361 case V2DI_FTYPE_V2DI_V16QI:
33362 case V2SI_FTYPE_V2SI_V2SI:
33363 case V2SI_FTYPE_V4HI_V4HI:
33364 case V2SI_FTYPE_V2SF_V2SF:
33365 case V2DF_FTYPE_V2DF_V2DF:
33366 case V2DF_FTYPE_V2DF_V4SF:
33367 case V2DF_FTYPE_V2DF_V2DI:
33368 case V2DF_FTYPE_V2DF_DI:
33369 case V2DF_FTYPE_V2DF_SI:
33370 case V2DF_FTYPE_V2DF_UINT:
33371 case V2SF_FTYPE_V2SF_V2SF:
33372 case V1DI_FTYPE_V1DI_V1DI:
33373 case V1DI_FTYPE_V8QI_V8QI:
33374 case V1DI_FTYPE_V2SI_V2SI:
33375 case V32QI_FTYPE_V16HI_V16HI:
33376 case V16HI_FTYPE_V8SI_V8SI:
33377 case V32QI_FTYPE_V32QI_V32QI:
33378 case V16HI_FTYPE_V32QI_V32QI:
33379 case V16HI_FTYPE_V16HI_V16HI:
33380 case V8SI_FTYPE_V4DF_V4DF:
33381 case V8SI_FTYPE_V8SI_V8SI:
33382 case V8SI_FTYPE_V16HI_V16HI:
33383 case V4DI_FTYPE_V4DI_V4DI:
33384 case V4DI_FTYPE_V8SI_V8SI:
33385 case V8DI_FTYPE_V64QI_V64QI:
33386 if (comparison == UNKNOWN)
33387 return ix86_expand_binop_builtin (icode, exp, target);
33388 nargs = 2;
33389 break;
33390 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33391 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33392 gcc_assert (comparison != UNKNOWN);
33393 nargs = 2;
33394 swap = true;
33395 break;
33396 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33397 case V16HI_FTYPE_V16HI_SI_COUNT:
33398 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33399 case V8SI_FTYPE_V8SI_SI_COUNT:
33400 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33401 case V4DI_FTYPE_V4DI_INT_COUNT:
33402 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33403 case V8HI_FTYPE_V8HI_SI_COUNT:
33404 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33405 case V4SI_FTYPE_V4SI_SI_COUNT:
33406 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33407 case V4HI_FTYPE_V4HI_SI_COUNT:
33408 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33409 case V2DI_FTYPE_V2DI_SI_COUNT:
33410 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33411 case V2SI_FTYPE_V2SI_SI_COUNT:
33412 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33413 case V1DI_FTYPE_V1DI_SI_COUNT:
33414 nargs = 2;
33415 second_arg_count = true;
33416 break;
33417 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33418 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33419 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33420 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33421 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33422 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33423 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33424 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33425 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33426 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33427 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33428 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33429 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33430 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33431 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33432 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33433 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33434 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33435 nargs = 4;
33436 second_arg_count = true;
33437 break;
33438 case UINT64_FTYPE_UINT64_UINT64:
33439 case UINT_FTYPE_UINT_UINT:
33440 case UINT_FTYPE_UINT_USHORT:
33441 case UINT_FTYPE_UINT_UCHAR:
33442 case UINT16_FTYPE_UINT16_INT:
33443 case UINT8_FTYPE_UINT8_INT:
33444 case UQI_FTYPE_UQI_UQI:
33445 case UHI_FTYPE_UHI_UHI:
33446 case USI_FTYPE_USI_USI:
33447 case UDI_FTYPE_UDI_UDI:
33448 case V16SI_FTYPE_V8DF_V8DF:
33449 nargs = 2;
33450 break;
33451 case V2DI_FTYPE_V2DI_INT_CONVERT:
33452 nargs = 2;
33453 rmode = V1TImode;
33454 nargs_constant = 1;
33455 break;
33456 case V4DI_FTYPE_V4DI_INT_CONVERT:
33457 nargs = 2;
33458 rmode = V2TImode;
33459 nargs_constant = 1;
33460 break;
33461 case V8DI_FTYPE_V8DI_INT_CONVERT:
33462 nargs = 2;
33463 rmode = V4TImode;
33464 nargs_constant = 1;
33465 break;
33466 case V8HI_FTYPE_V8HI_INT:
33467 case V8HI_FTYPE_V8SF_INT:
33468 case V16HI_FTYPE_V16SF_INT:
33469 case V8HI_FTYPE_V4SF_INT:
33470 case V8SF_FTYPE_V8SF_INT:
33471 case V4SF_FTYPE_V16SF_INT:
33472 case V16SF_FTYPE_V16SF_INT:
33473 case V4SI_FTYPE_V4SI_INT:
33474 case V4SI_FTYPE_V8SI_INT:
33475 case V4HI_FTYPE_V4HI_INT:
33476 case V4DF_FTYPE_V4DF_INT:
33477 case V4DF_FTYPE_V8DF_INT:
33478 case V4SF_FTYPE_V4SF_INT:
33479 case V4SF_FTYPE_V8SF_INT:
33480 case V2DI_FTYPE_V2DI_INT:
33481 case V2DF_FTYPE_V2DF_INT:
33482 case V2DF_FTYPE_V4DF_INT:
33483 case V16HI_FTYPE_V16HI_INT:
33484 case V8SI_FTYPE_V8SI_INT:
33485 case V16SI_FTYPE_V16SI_INT:
33486 case V4SI_FTYPE_V16SI_INT:
33487 case V4DI_FTYPE_V4DI_INT:
33488 case V2DI_FTYPE_V4DI_INT:
33489 case V4DI_FTYPE_V8DI_INT:
33490 case QI_FTYPE_V4SF_INT:
33491 case QI_FTYPE_V2DF_INT:
33492 case UQI_FTYPE_UQI_UQI_CONST:
33493 case UHI_FTYPE_UHI_UQI:
33494 case USI_FTYPE_USI_UQI:
33495 case UDI_FTYPE_UDI_UQI:
33496 nargs = 2;
33497 nargs_constant = 1;
33498 break;
33499 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33500 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33501 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33502 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33503 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33504 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33505 case UHI_FTYPE_V16SI_V16SI_UHI:
33506 case UQI_FTYPE_V8DI_V8DI_UQI:
33507 case V16HI_FTYPE_V16SI_V16HI_UHI:
33508 case V16QI_FTYPE_V16SI_V16QI_UHI:
33509 case V16QI_FTYPE_V8DI_V16QI_UQI:
33510 case V16SF_FTYPE_V16SF_V16SF_UHI:
33511 case V16SF_FTYPE_V4SF_V16SF_UHI:
33512 case V16SI_FTYPE_SI_V16SI_UHI:
33513 case V16SI_FTYPE_V16HI_V16SI_UHI:
33514 case V16SI_FTYPE_V16QI_V16SI_UHI:
33515 case V8SF_FTYPE_V4SF_V8SF_UQI:
33516 case V4DF_FTYPE_V2DF_V4DF_UQI:
33517 case V8SI_FTYPE_V4SI_V8SI_UQI:
33518 case V8SI_FTYPE_SI_V8SI_UQI:
33519 case V4SI_FTYPE_V4SI_V4SI_UQI:
33520 case V4SI_FTYPE_SI_V4SI_UQI:
33521 case V4DI_FTYPE_V2DI_V4DI_UQI:
33522 case V4DI_FTYPE_DI_V4DI_UQI:
33523 case V2DI_FTYPE_V2DI_V2DI_UQI:
33524 case V2DI_FTYPE_DI_V2DI_UQI:
33525 case V64QI_FTYPE_V64QI_V64QI_UDI:
33526 case V64QI_FTYPE_V16QI_V64QI_UDI:
33527 case V64QI_FTYPE_QI_V64QI_UDI:
33528 case V32QI_FTYPE_V32QI_V32QI_USI:
33529 case V32QI_FTYPE_V16QI_V32QI_USI:
33530 case V32QI_FTYPE_QI_V32QI_USI:
33531 case V16QI_FTYPE_V16QI_V16QI_UHI:
33532 case V16QI_FTYPE_QI_V16QI_UHI:
33533 case V32HI_FTYPE_V8HI_V32HI_USI:
33534 case V32HI_FTYPE_HI_V32HI_USI:
33535 case V16HI_FTYPE_V8HI_V16HI_UHI:
33536 case V16HI_FTYPE_HI_V16HI_UHI:
33537 case V8HI_FTYPE_V8HI_V8HI_UQI:
33538 case V8HI_FTYPE_HI_V8HI_UQI:
33539 case V8SF_FTYPE_V8HI_V8SF_UQI:
33540 case V4SF_FTYPE_V8HI_V4SF_UQI:
33541 case V8SI_FTYPE_V8SF_V8SI_UQI:
33542 case V4SI_FTYPE_V4SF_V4SI_UQI:
33543 case V4DI_FTYPE_V4SF_V4DI_UQI:
33544 case V2DI_FTYPE_V4SF_V2DI_UQI:
33545 case V4SF_FTYPE_V4DI_V4SF_UQI:
33546 case V4SF_FTYPE_V2DI_V4SF_UQI:
33547 case V4DF_FTYPE_V4DI_V4DF_UQI:
33548 case V2DF_FTYPE_V2DI_V2DF_UQI:
33549 case V16QI_FTYPE_V8HI_V16QI_UQI:
33550 case V16QI_FTYPE_V16HI_V16QI_UHI:
33551 case V16QI_FTYPE_V4SI_V16QI_UQI:
33552 case V16QI_FTYPE_V8SI_V16QI_UQI:
33553 case V8HI_FTYPE_V4SI_V8HI_UQI:
33554 case V8HI_FTYPE_V8SI_V8HI_UQI:
33555 case V16QI_FTYPE_V2DI_V16QI_UQI:
33556 case V16QI_FTYPE_V4DI_V16QI_UQI:
33557 case V8HI_FTYPE_V2DI_V8HI_UQI:
33558 case V8HI_FTYPE_V4DI_V8HI_UQI:
33559 case V4SI_FTYPE_V2DI_V4SI_UQI:
33560 case V4SI_FTYPE_V4DI_V4SI_UQI:
33561 case V32QI_FTYPE_V32HI_V32QI_USI:
33562 case UHI_FTYPE_V16QI_V16QI_UHI:
33563 case USI_FTYPE_V32QI_V32QI_USI:
33564 case UDI_FTYPE_V64QI_V64QI_UDI:
33565 case UQI_FTYPE_V8HI_V8HI_UQI:
33566 case UHI_FTYPE_V16HI_V16HI_UHI:
33567 case USI_FTYPE_V32HI_V32HI_USI:
33568 case UQI_FTYPE_V4SI_V4SI_UQI:
33569 case UQI_FTYPE_V8SI_V8SI_UQI:
33570 case UQI_FTYPE_V2DI_V2DI_UQI:
33571 case UQI_FTYPE_V4DI_V4DI_UQI:
33572 case V4SF_FTYPE_V2DF_V4SF_UQI:
33573 case V4SF_FTYPE_V4DF_V4SF_UQI:
33574 case V16SI_FTYPE_V16SI_V16SI_UHI:
33575 case V16SI_FTYPE_V4SI_V16SI_UHI:
33576 case V2DI_FTYPE_V4SI_V2DI_UQI:
33577 case V2DI_FTYPE_V8HI_V2DI_UQI:
33578 case V2DI_FTYPE_V16QI_V2DI_UQI:
33579 case V4DI_FTYPE_V4DI_V4DI_UQI:
33580 case V4DI_FTYPE_V4SI_V4DI_UQI:
33581 case V4DI_FTYPE_V8HI_V4DI_UQI:
33582 case V4DI_FTYPE_V16QI_V4DI_UQI:
33583 case V4DI_FTYPE_V4DF_V4DI_UQI:
33584 case V2DI_FTYPE_V2DF_V2DI_UQI:
33585 case V4SI_FTYPE_V4DF_V4SI_UQI:
33586 case V4SI_FTYPE_V2DF_V4SI_UQI:
33587 case V4SI_FTYPE_V8HI_V4SI_UQI:
33588 case V4SI_FTYPE_V16QI_V4SI_UQI:
33589 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33590 case V8DF_FTYPE_V2DF_V8DF_UQI:
33591 case V8DF_FTYPE_V4DF_V8DF_UQI:
33592 case V8DF_FTYPE_V8DF_V8DF_UQI:
33593 case V8SF_FTYPE_V8SF_V8SF_UQI:
33594 case V8SF_FTYPE_V8SI_V8SF_UQI:
33595 case V4DF_FTYPE_V4DF_V4DF_UQI:
33596 case V4SF_FTYPE_V4SF_V4SF_UQI:
33597 case V2DF_FTYPE_V2DF_V2DF_UQI:
33598 case V2DF_FTYPE_V4SF_V2DF_UQI:
33599 case V2DF_FTYPE_V4SI_V2DF_UQI:
33600 case V4SF_FTYPE_V4SI_V4SF_UQI:
33601 case V4DF_FTYPE_V4SF_V4DF_UQI:
33602 case V4DF_FTYPE_V4SI_V4DF_UQI:
33603 case V8SI_FTYPE_V8SI_V8SI_UQI:
33604 case V8SI_FTYPE_V8HI_V8SI_UQI:
33605 case V8SI_FTYPE_V16QI_V8SI_UQI:
33606 case V8DF_FTYPE_V8SI_V8DF_UQI:
33607 case V8DI_FTYPE_DI_V8DI_UQI:
33608 case V16SF_FTYPE_V8SF_V16SF_UHI:
33609 case V16SI_FTYPE_V8SI_V16SI_UHI:
33610 case V16HI_FTYPE_V16HI_V16HI_UHI:
33611 case V8HI_FTYPE_V16QI_V8HI_UQI:
33612 case V16HI_FTYPE_V16QI_V16HI_UHI:
33613 case V32HI_FTYPE_V32HI_V32HI_USI:
33614 case V32HI_FTYPE_V32QI_V32HI_USI:
33615 case V8DI_FTYPE_V16QI_V8DI_UQI:
33616 case V8DI_FTYPE_V2DI_V8DI_UQI:
33617 case V8DI_FTYPE_V4DI_V8DI_UQI:
33618 case V8DI_FTYPE_V8DI_V8DI_UQI:
33619 case V8DI_FTYPE_V8HI_V8DI_UQI:
33620 case V8DI_FTYPE_V8SI_V8DI_UQI:
33621 case V8HI_FTYPE_V8DI_V8HI_UQI:
33622 case V8SI_FTYPE_V8DI_V8SI_UQI:
33623 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33624 nargs = 3;
33625 break;
33626 case V32QI_FTYPE_V32QI_V32QI_INT:
33627 case V16HI_FTYPE_V16HI_V16HI_INT:
33628 case V16QI_FTYPE_V16QI_V16QI_INT:
33629 case V4DI_FTYPE_V4DI_V4DI_INT:
33630 case V8HI_FTYPE_V8HI_V8HI_INT:
33631 case V8SI_FTYPE_V8SI_V8SI_INT:
33632 case V8SI_FTYPE_V8SI_V4SI_INT:
33633 case V8SF_FTYPE_V8SF_V8SF_INT:
33634 case V8SF_FTYPE_V8SF_V4SF_INT:
33635 case V4SI_FTYPE_V4SI_V4SI_INT:
33636 case V4DF_FTYPE_V4DF_V4DF_INT:
33637 case V16SF_FTYPE_V16SF_V16SF_INT:
33638 case V16SF_FTYPE_V16SF_V4SF_INT:
33639 case V16SI_FTYPE_V16SI_V4SI_INT:
33640 case V4DF_FTYPE_V4DF_V2DF_INT:
33641 case V4SF_FTYPE_V4SF_V4SF_INT:
33642 case V2DI_FTYPE_V2DI_V2DI_INT:
33643 case V4DI_FTYPE_V4DI_V2DI_INT:
33644 case V2DF_FTYPE_V2DF_V2DF_INT:
33645 case UQI_FTYPE_V8DI_V8UDI_INT:
33646 case UQI_FTYPE_V8DF_V8DF_INT:
33647 case UQI_FTYPE_V2DF_V2DF_INT:
33648 case UQI_FTYPE_V4SF_V4SF_INT:
33649 case UHI_FTYPE_V16SI_V16SI_INT:
33650 case UHI_FTYPE_V16SF_V16SF_INT:
33651 nargs = 3;
33652 nargs_constant = 1;
33653 break;
33654 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33655 nargs = 3;
33656 rmode = V4DImode;
33657 nargs_constant = 1;
33658 break;
33659 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33660 nargs = 3;
33661 rmode = V2DImode;
33662 nargs_constant = 1;
33663 break;
33664 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33665 nargs = 3;
33666 rmode = DImode;
33667 nargs_constant = 1;
33668 break;
33669 case V2DI_FTYPE_V2DI_UINT_UINT:
33670 nargs = 3;
33671 nargs_constant = 2;
33672 break;
33673 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33674 nargs = 3;
33675 rmode = V8DImode;
33676 nargs_constant = 1;
33677 break;
33678 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33679 nargs = 5;
33680 rmode = V8DImode;
33681 mask_pos = 2;
33682 nargs_constant = 1;
33683 break;
33684 case QI_FTYPE_V8DF_INT_UQI:
33685 case QI_FTYPE_V4DF_INT_UQI:
33686 case QI_FTYPE_V2DF_INT_UQI:
33687 case HI_FTYPE_V16SF_INT_UHI:
33688 case QI_FTYPE_V8SF_INT_UQI:
33689 case QI_FTYPE_V4SF_INT_UQI:
33690 nargs = 3;
33691 mask_pos = 1;
33692 nargs_constant = 1;
33693 break;
33694 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33695 nargs = 5;
33696 rmode = V4DImode;
33697 mask_pos = 2;
33698 nargs_constant = 1;
33699 break;
33700 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33701 nargs = 5;
33702 rmode = V2DImode;
33703 mask_pos = 2;
33704 nargs_constant = 1;
33705 break;
33706 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33707 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33708 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33709 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33710 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33711 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33712 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33713 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33714 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33715 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33716 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33717 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33718 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33719 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33720 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33721 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33722 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33723 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33724 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33725 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33726 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33727 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33728 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33729 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33730 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33731 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33732 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33733 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33734 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33735 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33736 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33737 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33738 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33739 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33740 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33741 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33742 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33743 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33744 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33745 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33746 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33747 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33748 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
33749 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
33750 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
33751 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
33752 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
33753 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
33754 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
33755 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
33756 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
33757 nargs = 4;
33758 break;
33759 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
33760 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
33761 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
33762 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
33763 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
33764 nargs = 4;
33765 nargs_constant = 1;
33766 break;
33767 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
33768 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
33769 case QI_FTYPE_V4DF_V4DF_INT_UQI:
33770 case QI_FTYPE_V8SF_V8SF_INT_UQI:
33771 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
33772 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
33773 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
33774 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
33775 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
33776 case USI_FTYPE_V32QI_V32QI_INT_USI:
33777 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
33778 case USI_FTYPE_V32HI_V32HI_INT_USI:
33779 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
33780 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
33781 nargs = 4;
33782 mask_pos = 1;
33783 nargs_constant = 1;
33784 break;
33785 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
33786 nargs = 4;
33787 nargs_constant = 2;
33788 break;
33789 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
33790 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
33791 nargs = 4;
33792 break;
33793 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
33794 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
33795 mask_pos = 1;
33796 nargs = 4;
33797 nargs_constant = 1;
33798 break;
33799 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
33800 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
33801 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
33802 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
33803 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
33804 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
33805 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
33806 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
33807 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
33808 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
33809 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
33810 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
33811 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
33812 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
33813 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
33814 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
33815 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
33816 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
33817 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
33818 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
33819 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
33820 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
33821 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
33822 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
33823 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
33824 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
33825 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
33826 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
33827 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
33828 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
33829 nargs = 4;
33830 mask_pos = 2;
33831 nargs_constant = 1;
33832 break;
33833 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
33834 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
33835 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
33836 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
33837 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
33838 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
33839 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
33840 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
33841 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
33842 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
33843 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
33844 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
33845 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
33846 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
33847 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
33848 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
33849 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
33850 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
33851 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
33852 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
33853 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
33854 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
33855 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
33856 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
33857 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
33858 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
33859 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
33860 nargs = 5;
33861 mask_pos = 2;
33862 nargs_constant = 1;
33863 break;
33864 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
33865 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
33866 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
33867 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
33868 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
33869 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
33870 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
33871 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
33872 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
33873 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
33874 nargs = 5;
33875 mask_pos = 1;
33876 nargs_constant = 1;
33877 break;
33879 default:
33880 gcc_unreachable ();
33883 gcc_assert (nargs <= ARRAY_SIZE (args));
33885 if (comparison != UNKNOWN)
33887 gcc_assert (nargs == 2);
33888 return ix86_expand_sse_compare (d, exp, target, swap);
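/* For the *_CONVERT function types the builtin's result mode (rmode)
   differs from the insn's output mode (tmode): expand into a tmode
   pseudo and hand back an rmode lowpart of it. */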
33891 if (rmode == VOIDmode || rmode == tmode)
33893 if (optimize
33894 || target == 0
33895 || GET_MODE (target) != tmode
33896 || !insn_p->operand[0].predicate (target, tmode))
33897 target = gen_reg_rtx (tmode);
33898 else if (memory_operand (target, tmode))
33899 num_memory++;
33900 real_target = target;
33902 else
33904 real_target = gen_reg_rtx (tmode);
33905 target = lowpart_subreg (rmode, real_target, tmode);
33908 for (i = 0; i < nargs; i++)
33910 tree arg = CALL_EXPR_ARG (exp, i);
33911 rtx op = expand_normal (arg);
33912 machine_mode mode = insn_p->operand[i + 1].mode;
33913 bool match = insn_p->operand[i + 1].predicate (op, mode);
33915 if (second_arg_count && i == 1)
33917 /* SIMD shift insns take either an 8-bit immediate or a
33918 register as the count, but the builtin functions take an
33919 int. If the count doesn't match, put it in a register.
33920 The instructions use a 64-bit count; if op is only
33921 32-bit, zero-extend it, since negative shift counts are
33922 undefined behavior and zero-extension is more
33923 efficient. */
33924 if (!match)
33926 if (SCALAR_INT_MODE_P (GET_MODE (op)))
33927 op = convert_modes (mode, GET_MODE (op), op, 1);
33928 else
33929 op = lowpart_subreg (mode, op, GET_MODE (op));
33930 if (!insn_p->operand[i + 1].predicate (op, mode))
33931 op = copy_to_reg (op);
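/* The trailing nargs_constant arguments must be immediates; when
   mask_pos is set they sit just before the trailing mask operands. */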
33934 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
33935 || (!mask_pos && (nargs - i) <= nargs_constant))
33937 if (!match)
33938 switch (icode)
33940 case CODE_FOR_avx_vinsertf128v4di:
33941 case CODE_FOR_avx_vextractf128v4di:
33942 error ("the last argument must be a 1-bit immediate");
33943 return const0_rtx;
33945 case CODE_FOR_avx512f_cmpv8di3_mask:
33946 case CODE_FOR_avx512f_cmpv16si3_mask:
33947 case CODE_FOR_avx512f_ucmpv8di3_mask:
33948 case CODE_FOR_avx512f_ucmpv16si3_mask:
33949 case CODE_FOR_avx512vl_cmpv4di3_mask:
33950 case CODE_FOR_avx512vl_cmpv8si3_mask:
33951 case CODE_FOR_avx512vl_ucmpv4di3_mask:
33952 case CODE_FOR_avx512vl_ucmpv8si3_mask:
33953 case CODE_FOR_avx512vl_cmpv2di3_mask:
33954 case CODE_FOR_avx512vl_cmpv4si3_mask:
33955 case CODE_FOR_avx512vl_ucmpv2di3_mask:
33956 case CODE_FOR_avx512vl_ucmpv4si3_mask:
33957 error ("the last argument must be a 3-bit immediate");
33958 return const0_rtx;
33960 case CODE_FOR_sse4_1_roundsd:
33961 case CODE_FOR_sse4_1_roundss:
33963 case CODE_FOR_sse4_1_roundpd:
33964 case CODE_FOR_sse4_1_roundps:
33965 case CODE_FOR_avx_roundpd256:
33966 case CODE_FOR_avx_roundps256:
33968 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
33969 case CODE_FOR_sse4_1_roundps_sfix:
33970 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
33971 case CODE_FOR_avx_roundps_sfix256:
33973 case CODE_FOR_sse4_1_blendps:
33974 case CODE_FOR_avx_blendpd256:
33975 case CODE_FOR_avx_vpermilv4df:
33976 case CODE_FOR_avx_vpermilv4df_mask:
33977 case CODE_FOR_avx512f_getmantv8df_mask:
33978 case CODE_FOR_avx512f_getmantv16sf_mask:
33979 case CODE_FOR_avx512vl_getmantv8sf_mask:
33980 case CODE_FOR_avx512vl_getmantv4df_mask:
33981 case CODE_FOR_avx512vl_getmantv4sf_mask:
33982 case CODE_FOR_avx512vl_getmantv2df_mask:
33983 case CODE_FOR_avx512dq_rangepv8df_mask_round:
33984 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
33985 case CODE_FOR_avx512dq_rangepv4df_mask:
33986 case CODE_FOR_avx512dq_rangepv8sf_mask:
33987 case CODE_FOR_avx512dq_rangepv2df_mask:
33988 case CODE_FOR_avx512dq_rangepv4sf_mask:
33989 case CODE_FOR_avx_shufpd256_mask:
33990 error ("the last argument must be a 4-bit immediate");
33991 return const0_rtx;
33993 case CODE_FOR_sha1rnds4:
33994 case CODE_FOR_sse4_1_blendpd:
33995 case CODE_FOR_avx_vpermilv2df:
33996 case CODE_FOR_avx_vpermilv2df_mask:
33997 case CODE_FOR_xop_vpermil2v2df3:
33998 case CODE_FOR_xop_vpermil2v4sf3:
33999 case CODE_FOR_xop_vpermil2v4df3:
34000 case CODE_FOR_xop_vpermil2v8sf3:
34001 case CODE_FOR_avx512f_vinsertf32x4_mask:
34002 case CODE_FOR_avx512f_vinserti32x4_mask:
34003 case CODE_FOR_avx512f_vextractf32x4_mask:
34004 case CODE_FOR_avx512f_vextracti32x4_mask:
34005 case CODE_FOR_sse2_shufpd:
34006 case CODE_FOR_sse2_shufpd_mask:
34007 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34008 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34009 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34010 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34011 error ("the last argument must be a 2-bit immediate");
34012 return const0_rtx;
34014 case CODE_FOR_avx_vextractf128v4df:
34015 case CODE_FOR_avx_vextractf128v8sf:
34016 case CODE_FOR_avx_vextractf128v8si:
34017 case CODE_FOR_avx_vinsertf128v4df:
34018 case CODE_FOR_avx_vinsertf128v8sf:
34019 case CODE_FOR_avx_vinsertf128v8si:
34020 case CODE_FOR_avx512f_vinsertf64x4_mask:
34021 case CODE_FOR_avx512f_vinserti64x4_mask:
34022 case CODE_FOR_avx512f_vextractf64x4_mask:
34023 case CODE_FOR_avx512f_vextracti64x4_mask:
34024 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34025 case CODE_FOR_avx512dq_vinserti32x8_mask:
34026 case CODE_FOR_avx512vl_vinsertv4df:
34027 case CODE_FOR_avx512vl_vinsertv4di:
34028 case CODE_FOR_avx512vl_vinsertv8sf:
34029 case CODE_FOR_avx512vl_vinsertv8si:
34030 error ("the last argument must be a 1-bit immediate");
34031 return const0_rtx;
34033 case CODE_FOR_avx_vmcmpv2df3:
34034 case CODE_FOR_avx_vmcmpv4sf3:
34035 case CODE_FOR_avx_cmpv2df3:
34036 case CODE_FOR_avx_cmpv4sf3:
34037 case CODE_FOR_avx_cmpv4df3:
34038 case CODE_FOR_avx_cmpv8sf3:
34039 case CODE_FOR_avx512f_cmpv8df3_mask:
34040 case CODE_FOR_avx512f_cmpv16sf3_mask:
34041 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34042 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34043 error ("the last argument must be a 5-bit immediate");
34044 return const0_rtx;
34046 default:
34047 switch (nargs_constant)
34049 case 2:
34050 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
34051 || (!mask_pos && (nargs - i) == nargs_constant))
34053 error ("the next to last argument must be an 8-bit immediate");
34054 break;
34056 /* FALLTHRU */
34057 case 1:
34058 error ("the last argument must be an 8-bit immediate");
34059 break;
34060 default:
34061 gcc_unreachable ();
34063 return const0_rtx;
34066 else
34068 if (VECTOR_MODE_P (mode))
34069 op = safe_vector_operand (op, mode);
34071 /* If we aren't optimizing, only allow one memory operand to
34072 be generated. */
34073 if (memory_operand (op, mode))
34074 num_memory++;
34076 op = fixup_modeless_constant (op, mode);
34078 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34080 if (optimize || !match || num_memory > 1)
34081 op = copy_to_mode_reg (mode, op);
34083 else
34085 op = copy_to_reg (op);
34086 op = lowpart_subreg (mode, op, GET_MODE (op));
34090 args[i].op = op;
34091 args[i].mode = mode;
34094 switch (nargs)
34096 case 1:
34097 pat = GEN_FCN (icode) (real_target, args[0].op);
34098 break;
34099 case 2:
34100 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34101 break;
34102 case 3:
34103 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34104 args[2].op);
34105 break;
34106 case 4:
34107 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34108 args[2].op, args[3].op);
34109 break;
34110 case 5:
34111 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34112 args[2].op, args[3].op, args[4].op);
34113 break;
34114 case 6:
34115 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34116 args[2].op, args[3].op, args[4].op,
34117 args[5].op);
34118 break;
34119 default:
34120 gcc_unreachable ();
34123 if (! pat)
34124 return 0;
34126 emit_insn (pat);
34127 return target;
34130 /* Transform a pattern of the following layout:
34131 (set A
34132 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34134 into:
34135 (set A B) */
34137 static rtx
34138 ix86_erase_embedded_rounding (rtx pat)
34140 if (GET_CODE (pat) == INSN)
34141 pat = PATTERN (pat);
34143 gcc_assert (GET_CODE (pat) == SET);
34144 rtx src = SET_SRC (pat);
34145 gcc_assert (XVECLEN (src, 0) == 2);
34146 rtx p0 = XVECEXP (src, 0, 0);
34147 gcc_assert (GET_CODE (src) == UNSPEC
34148 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34149 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34150 return res;
34153 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34154 with rounding. */
34155 static rtx
34156 ix86_expand_sse_comi_round (const struct builtin_description *d,
34157 tree exp, rtx target)
34159 rtx pat, set_dst;
34160 tree arg0 = CALL_EXPR_ARG (exp, 0);
34161 tree arg1 = CALL_EXPR_ARG (exp, 1);
34162 tree arg2 = CALL_EXPR_ARG (exp, 2);
34163 tree arg3 = CALL_EXPR_ARG (exp, 3);
34164 rtx op0 = expand_normal (arg0);
34165 rtx op1 = expand_normal (arg1);
34166 rtx op2 = expand_normal (arg2);
34167 rtx op3 = expand_normal (arg3);
34168 enum insn_code icode = d->icode;
34169 const struct insn_data_d *insn_p = &insn_data[icode];
34170 machine_mode mode0 = insn_p->operand[0].mode;
34171 machine_mode mode1 = insn_p->operand[1].mode;
34172 enum rtx_code comparison = UNEQ;
34173 bool need_ucomi = false;
34175 /* See avxintrin.h for values. */
34176 enum rtx_code comi_comparisons[32] =
34178 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34179 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34180 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34182 bool need_ucomi_values[32] =
34184 true, false, false, true, true, false, false, true,
34185 true, false, false, true, true, false, false, true,
34186 false, true, true, false, false, true, true, false,
34187 false, true, true, false, false, true, true, false
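/* need_ucomi_values marks the predicates that must be expanded with the
   unordered (ucomi) compare patterns rather than the ordinary comi ones. */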
34190 if (!CONST_INT_P (op2))
34192 error ("the third argument must be a comparison constant");
34193 return const0_rtx;
34195 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34197 error ("incorrect comparison mode");
34198 return const0_rtx;
34201 if (!insn_p->operand[2].predicate (op3, SImode))
34203 error ("incorrect rounding operand");
34204 return const0_rtx;
34207 comparison = comi_comparisons[INTVAL (op2)];
34208 need_ucomi = need_ucomi_values[INTVAL (op2)];
34210 if (VECTOR_MODE_P (mode0))
34211 op0 = safe_vector_operand (op0, mode0);
34212 if (VECTOR_MODE_P (mode1))
34213 op1 = safe_vector_operand (op1, mode1);
34215 target = gen_reg_rtx (SImode);
34216 emit_move_insn (target, const0_rtx);
34217 target = gen_rtx_SUBREG (QImode, target, 0);
34219 if ((optimize && !register_operand (op0, mode0))
34220 || !insn_p->operand[0].predicate (op0, mode0))
34221 op0 = copy_to_mode_reg (mode0, op0);
34222 if ((optimize && !register_operand (op1, mode1))
34223 || !insn_p->operand[1].predicate (op1, mode1))
34224 op1 = copy_to_mode_reg (mode1, op1);
34226 if (need_ucomi)
34227 icode = icode == CODE_FOR_sse_comi_round
34228 ? CODE_FOR_sse_ucomi_round
34229 : CODE_FOR_sse2_ucomi_round;
34231 pat = GEN_FCN (icode) (op0, op1, op3);
34232 if (! pat)
34233 return 0;
34235 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34236 if (INTVAL (op3) == NO_ROUND)
34238 pat = ix86_erase_embedded_rounding (pat);
34239 if (! pat)
34240 return 0;
34242 set_dst = SET_DEST (pat);
34244 else
34246 gcc_assert (GET_CODE (pat) == SET);
34247 set_dst = SET_DEST (pat);
34250 emit_insn (pat);
34251 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34252 gen_rtx_fmt_ee (comparison, QImode,
34253 set_dst,
34254 const0_rtx)));
34256 return SUBREG_REG (target);
34259 static rtx
34260 ix86_expand_round_builtin (const struct builtin_description *d,
34261 tree exp, rtx target)
34263 rtx pat;
34264 unsigned int i, nargs;
34265 struct
34267 rtx op;
34268 machine_mode mode;
34269 } args[6];
34270 enum insn_code icode = d->icode;
34271 const struct insn_data_d *insn_p = &insn_data[icode];
34272 machine_mode tmode = insn_p->operand[0].mode;
34273 unsigned int nargs_constant = 0;
34274 unsigned int redundant_embed_rnd = 0;
34276 switch ((enum ix86_builtin_func_type) d->flag)
34278 case UINT64_FTYPE_V2DF_INT:
34279 case UINT64_FTYPE_V4SF_INT:
34280 case UINT_FTYPE_V2DF_INT:
34281 case UINT_FTYPE_V4SF_INT:
34282 case INT64_FTYPE_V2DF_INT:
34283 case INT64_FTYPE_V4SF_INT:
34284 case INT_FTYPE_V2DF_INT:
34285 case INT_FTYPE_V4SF_INT:
34286 nargs = 2;
34287 break;
34288 case V4SF_FTYPE_V4SF_UINT_INT:
34289 case V4SF_FTYPE_V4SF_UINT64_INT:
34290 case V2DF_FTYPE_V2DF_UINT64_INT:
34291 case V4SF_FTYPE_V4SF_INT_INT:
34292 case V4SF_FTYPE_V4SF_INT64_INT:
34293 case V2DF_FTYPE_V2DF_INT64_INT:
34294 case V4SF_FTYPE_V4SF_V4SF_INT:
34295 case V2DF_FTYPE_V2DF_V2DF_INT:
34296 case V4SF_FTYPE_V4SF_V2DF_INT:
34297 case V2DF_FTYPE_V2DF_V4SF_INT:
34298 nargs = 3;
34299 break;
34300 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34301 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34302 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34303 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34304 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34305 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34306 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34307 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34308 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34309 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34310 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34311 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34312 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34313 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34314 nargs = 4;
34315 break;
34316 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34317 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34318 nargs_constant = 2;
34319 nargs = 4;
34320 break;
34321 case INT_FTYPE_V4SF_V4SF_INT_INT:
34322 case INT_FTYPE_V2DF_V2DF_INT_INT:
34323 return ix86_expand_sse_comi_round (d, exp, target);
34324 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34325 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34326 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34327 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34328 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34329 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34330 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34331 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34332 nargs = 5;
34333 break;
34334 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34335 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34336 nargs_constant = 4;
34337 nargs = 5;
34338 break;
34339 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34340 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34341 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34342 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34343 nargs_constant = 3;
34344 nargs = 5;
34345 break;
34346 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34347 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34348 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34349 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34350 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34351 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34352 nargs = 6;
34353 nargs_constant = 4;
34354 break;
34355 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34356 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34357 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34358 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34359 nargs = 6;
34360 nargs_constant = 3;
34361 break;
34362 default:
34363 gcc_unreachable ();
34365 gcc_assert (nargs <= ARRAY_SIZE (args));
34367 if (optimize
34368 || target == 0
34369 || GET_MODE (target) != tmode
34370 || !insn_p->operand[0].predicate (target, tmode))
34371 target = gen_reg_rtx (tmode);
34373 for (i = 0; i < nargs; i++)
34375 tree arg = CALL_EXPR_ARG (exp, i);
34376 rtx op = expand_normal (arg);
34377 machine_mode mode = insn_p->operand[i + 1].mode;
34378 bool match = insn_p->operand[i + 1].predicate (op, mode);
34380 if (i == nargs - nargs_constant)
34382 if (!match)
34384 switch (icode)
34386 case CODE_FOR_avx512f_getmantv8df_mask_round:
34387 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34388 case CODE_FOR_avx512f_vgetmantv2df_round:
34389 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34390 case CODE_FOR_avx512f_vgetmantv4sf_round:
34391 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34392 error ("the immediate argument must be a 4-bit immediate");
34393 return const0_rtx;
34394 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34395 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34396 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34397 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34398 error ("the immediate argument must be a 5-bit immediate");
34399 return const0_rtx;
34400 default:
34401 error ("the immediate argument must be an 8-bit immediate");
34402 return const0_rtx;
34406 else if (i == nargs - 1)
34408 if (!insn_p->operand[nargs].predicate (op, SImode))
34410 error ("incorrect rounding operand");
34411 return const0_rtx;
34414 /* If there is no rounding, use the normal version of the pattern. */
34415 if (INTVAL (op) == NO_ROUND)
34416 redundant_embed_rnd = 1;
34418 else
34420 if (VECTOR_MODE_P (mode))
34421 op = safe_vector_operand (op, mode);
34423 op = fixup_modeless_constant (op, mode);
34425 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34427 if (optimize || !match)
34428 op = copy_to_mode_reg (mode, op);
34430 else
34432 op = copy_to_reg (op);
34433 op = lowpart_subreg (mode, op, GET_MODE (op));
34437 args[i].op = op;
34438 args[i].mode = mode;
34441 switch (nargs)
34443 case 1:
34444 pat = GEN_FCN (icode) (target, args[0].op);
34445 break;
34446 case 2:
34447 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34448 break;
34449 case 3:
34450 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34451 args[2].op);
34452 break;
34453 case 4:
34454 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34455 args[2].op, args[3].op);
34456 break;
34457 case 5:
34458 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34459 args[2].op, args[3].op, args[4].op);
34460 break;
34461 case 6:
34462 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34463 args[2].op, args[3].op, args[4].op,
34464 args[5].op);
34465 break;
34466 default:
34467 gcc_unreachable ();
34470 if (!pat)
34471 return 0;
34473 if (redundant_embed_rnd)
34474 pat = ix86_erase_embedded_rounding (pat);
34476 emit_insn (pat);
34477 return target;
34480 /* Subroutine of ix86_expand_builtin to take care of special insns
34481 with a variable number of operands. */
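/* For builtins classified as "store" the first call argument supplies the
   destination (a MEM built from a pointer, or a register); no separate
   target is allocated and 0 is returned.  For "load" builtins a target
   register is created as usual.  The MEMORY variable gives the index of the
   operand that must be wrapped in a MEM.  */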
34483 static rtx
34484 ix86_expand_special_args_builtin (const struct builtin_description *d,
34485 tree exp, rtx target)
34487 tree arg;
34488 rtx pat, op;
34489 unsigned int i, nargs, arg_adjust, memory;
34490 bool aligned_mem = false;
34491 struct
34493 rtx op;
34494 machine_mode mode;
34495 } args[3];
34496 enum insn_code icode = d->icode;
34497 bool last_arg_constant = false;
34498 const struct insn_data_d *insn_p = &insn_data[icode];
34499 machine_mode tmode = insn_p->operand[0].mode;
34500 enum { load, store } klass;
34502 switch ((enum ix86_builtin_func_type) d->flag)
34504 case VOID_FTYPE_VOID:
34505 emit_insn (GEN_FCN (icode) (target));
34506 return 0;
34507 case VOID_FTYPE_UINT64:
34508 case VOID_FTYPE_UNSIGNED:
34509 nargs = 0;
34510 klass = store;
34511 memory = 0;
34512 break;
34514 case INT_FTYPE_VOID:
34515 case USHORT_FTYPE_VOID:
34516 case UINT64_FTYPE_VOID:
34517 case UNSIGNED_FTYPE_VOID:
34518 nargs = 0;
34519 klass = load;
34520 memory = 0;
34521 break;
34522 case UINT64_FTYPE_PUNSIGNED:
34523 case V2DI_FTYPE_PV2DI:
34524 case V4DI_FTYPE_PV4DI:
34525 case V32QI_FTYPE_PCCHAR:
34526 case V16QI_FTYPE_PCCHAR:
34527 case V8SF_FTYPE_PCV4SF:
34528 case V8SF_FTYPE_PCFLOAT:
34529 case V4SF_FTYPE_PCFLOAT:
34530 case V4DF_FTYPE_PCV2DF:
34531 case V4DF_FTYPE_PCDOUBLE:
34532 case V2DF_FTYPE_PCDOUBLE:
34533 case VOID_FTYPE_PVOID:
34534 case V8DI_FTYPE_PV8DI:
34535 nargs = 1;
34536 klass = load;
34537 memory = 0;
34538 switch (icode)
34540 case CODE_FOR_sse4_1_movntdqa:
34541 case CODE_FOR_avx2_movntdqa:
34542 case CODE_FOR_avx512f_movntdqa:
34543 aligned_mem = true;
34544 break;
34545 default:
34546 break;
34548 break;
34549 case VOID_FTYPE_PV2SF_V4SF:
34550 case VOID_FTYPE_PV8DI_V8DI:
34551 case VOID_FTYPE_PV4DI_V4DI:
34552 case VOID_FTYPE_PV2DI_V2DI:
34553 case VOID_FTYPE_PCHAR_V32QI:
34554 case VOID_FTYPE_PCHAR_V16QI:
34555 case VOID_FTYPE_PFLOAT_V16SF:
34556 case VOID_FTYPE_PFLOAT_V8SF:
34557 case VOID_FTYPE_PFLOAT_V4SF:
34558 case VOID_FTYPE_PDOUBLE_V8DF:
34559 case VOID_FTYPE_PDOUBLE_V4DF:
34560 case VOID_FTYPE_PDOUBLE_V2DF:
34561 case VOID_FTYPE_PLONGLONG_LONGLONG:
34562 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34563 case VOID_FTYPE_PINT_INT:
34564 nargs = 1;
34565 klass = store;
34566 /* Reserve memory operand for target. */
34567 memory = ARRAY_SIZE (args);
34568 switch (icode)
34570 /* These builtins and instructions require the memory
34571 to be properly aligned. */
34572 case CODE_FOR_avx_movntv4di:
34573 case CODE_FOR_sse2_movntv2di:
34574 case CODE_FOR_avx_movntv8sf:
34575 case CODE_FOR_sse_movntv4sf:
34576 case CODE_FOR_sse4a_vmmovntv4sf:
34577 case CODE_FOR_avx_movntv4df:
34578 case CODE_FOR_sse2_movntv2df:
34579 case CODE_FOR_sse4a_vmmovntv2df:
34580 case CODE_FOR_sse2_movntidi:
34581 case CODE_FOR_sse_movntq:
34582 case CODE_FOR_sse2_movntisi:
34583 case CODE_FOR_avx512f_movntv16sf:
34584 case CODE_FOR_avx512f_movntv8df:
34585 case CODE_FOR_avx512f_movntv8di:
34586 aligned_mem = true;
34587 break;
34588 default:
34589 break;
34591 break;
34592 case V4SF_FTYPE_V4SF_PCV2SF:
34593 case V2DF_FTYPE_V2DF_PCDOUBLE:
34594 nargs = 2;
34595 klass = load;
34596 memory = 1;
34597 break;
34598 case V8SF_FTYPE_PCV8SF_V8SI:
34599 case V4DF_FTYPE_PCV4DF_V4DI:
34600 case V4SF_FTYPE_PCV4SF_V4SI:
34601 case V2DF_FTYPE_PCV2DF_V2DI:
34602 case V8SI_FTYPE_PCV8SI_V8SI:
34603 case V4DI_FTYPE_PCV4DI_V4DI:
34604 case V4SI_FTYPE_PCV4SI_V4SI:
34605 case V2DI_FTYPE_PCV2DI_V2DI:
34606 case VOID_FTYPE_INT_INT64:
34607 nargs = 2;
34608 klass = load;
34609 memory = 0;
34610 break;
34611 case VOID_FTYPE_PV8DF_V8DF_UQI:
34612 case VOID_FTYPE_PV4DF_V4DF_UQI:
34613 case VOID_FTYPE_PV2DF_V2DF_UQI:
34614 case VOID_FTYPE_PV16SF_V16SF_UHI:
34615 case VOID_FTYPE_PV8SF_V8SF_UQI:
34616 case VOID_FTYPE_PV4SF_V4SF_UQI:
34617 case VOID_FTYPE_PV8DI_V8DI_UQI:
34618 case VOID_FTYPE_PV4DI_V4DI_UQI:
34619 case VOID_FTYPE_PV2DI_V2DI_UQI:
34620 case VOID_FTYPE_PV16SI_V16SI_UHI:
34621 case VOID_FTYPE_PV8SI_V8SI_UQI:
34622 case VOID_FTYPE_PV4SI_V4SI_UQI:
34623 switch (icode)
34625 /* These builtins and instructions require the memory
34626 to be properly aligned. */
34627 case CODE_FOR_avx512f_storev16sf_mask:
34628 case CODE_FOR_avx512f_storev16si_mask:
34629 case CODE_FOR_avx512f_storev8df_mask:
34630 case CODE_FOR_avx512f_storev8di_mask:
34631 case CODE_FOR_avx512vl_storev8sf_mask:
34632 case CODE_FOR_avx512vl_storev8si_mask:
34633 case CODE_FOR_avx512vl_storev4df_mask:
34634 case CODE_FOR_avx512vl_storev4di_mask:
34635 case CODE_FOR_avx512vl_storev4sf_mask:
34636 case CODE_FOR_avx512vl_storev4si_mask:
34637 case CODE_FOR_avx512vl_storev2df_mask:
34638 case CODE_FOR_avx512vl_storev2di_mask:
34639 aligned_mem = true;
34640 break;
34641 default:
34642 break;
34644 /* FALLTHRU */
34645 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34646 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34647 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34648 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34649 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34650 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34651 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34652 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34653 case VOID_FTYPE_PV8SI_V8DI_UQI:
34654 case VOID_FTYPE_PV8HI_V8DI_UQI:
34655 case VOID_FTYPE_PV16HI_V16SI_UHI:
34656 case VOID_FTYPE_PV16QI_V8DI_UQI:
34657 case VOID_FTYPE_PV16QI_V16SI_UHI:
34658 case VOID_FTYPE_PV4SI_V4DI_UQI:
34659 case VOID_FTYPE_PV4SI_V2DI_UQI:
34660 case VOID_FTYPE_PV8HI_V4DI_UQI:
34661 case VOID_FTYPE_PV8HI_V2DI_UQI:
34662 case VOID_FTYPE_PV8HI_V8SI_UQI:
34663 case VOID_FTYPE_PV8HI_V4SI_UQI:
34664 case VOID_FTYPE_PV16QI_V4DI_UQI:
34665 case VOID_FTYPE_PV16QI_V2DI_UQI:
34666 case VOID_FTYPE_PV16QI_V8SI_UQI:
34667 case VOID_FTYPE_PV16QI_V4SI_UQI:
34668 case VOID_FTYPE_PCHAR_V64QI_UDI:
34669 case VOID_FTYPE_PCHAR_V32QI_USI:
34670 case VOID_FTYPE_PCHAR_V16QI_UHI:
34671 case VOID_FTYPE_PSHORT_V32HI_USI:
34672 case VOID_FTYPE_PSHORT_V16HI_UHI:
34673 case VOID_FTYPE_PSHORT_V8HI_UQI:
34674 case VOID_FTYPE_PINT_V16SI_UHI:
34675 case VOID_FTYPE_PINT_V8SI_UQI:
34676 case VOID_FTYPE_PINT_V4SI_UQI:
34677 case VOID_FTYPE_PINT64_V8DI_UQI:
34678 case VOID_FTYPE_PINT64_V4DI_UQI:
34679 case VOID_FTYPE_PINT64_V2DI_UQI:
34680 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34681 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34682 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34683 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34684 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34685 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34686 case VOID_FTYPE_PV32QI_V32HI_USI:
34687 case VOID_FTYPE_PV16QI_V16HI_UHI:
34688 case VOID_FTYPE_PV8QI_V8HI_UQI:
34689 nargs = 2;
34690 klass = store;
34691 /* Reserve memory operand for target. */
34692 memory = ARRAY_SIZE (args);
34693 break;
34694 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34695 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34696 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34697 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34698 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34699 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34700 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34701 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34702 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34703 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34704 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34705 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34706 switch (icode)
34708 /* These builtins and instructions require the memory
34709 to be properly aligned. */
34710 case CODE_FOR_avx512f_loadv16sf_mask:
34711 case CODE_FOR_avx512f_loadv16si_mask:
34712 case CODE_FOR_avx512f_loadv8df_mask:
34713 case CODE_FOR_avx512f_loadv8di_mask:
34714 case CODE_FOR_avx512vl_loadv8sf_mask:
34715 case CODE_FOR_avx512vl_loadv8si_mask:
34716 case CODE_FOR_avx512vl_loadv4df_mask:
34717 case CODE_FOR_avx512vl_loadv4di_mask:
34718 case CODE_FOR_avx512vl_loadv4sf_mask:
34719 case CODE_FOR_avx512vl_loadv4si_mask:
34720 case CODE_FOR_avx512vl_loadv2df_mask:
34721 case CODE_FOR_avx512vl_loadv2di_mask:
34722 case CODE_FOR_avx512bw_loadv64qi_mask:
34723 case CODE_FOR_avx512vl_loadv32qi_mask:
34724 case CODE_FOR_avx512vl_loadv16qi_mask:
34725 case CODE_FOR_avx512bw_loadv32hi_mask:
34726 case CODE_FOR_avx512vl_loadv16hi_mask:
34727 case CODE_FOR_avx512vl_loadv8hi_mask:
34728 aligned_mem = true;
34729 break;
34730 default:
34731 break;
34733 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
34734 case V32QI_FTYPE_PCCHAR_V32QI_USI:
34735 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
34736 case V32HI_FTYPE_PCSHORT_V32HI_USI:
34737 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
34738 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
34739 case V16SI_FTYPE_PCINT_V16SI_UHI:
34740 case V8SI_FTYPE_PCINT_V8SI_UQI:
34741 case V4SI_FTYPE_PCINT_V4SI_UQI:
34742 case V8DI_FTYPE_PCINT64_V8DI_UQI:
34743 case V4DI_FTYPE_PCINT64_V4DI_UQI:
34744 case V2DI_FTYPE_PCINT64_V2DI_UQI:
34745 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
34746 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
34747 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
34748 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
34749 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
34750 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
34751 nargs = 3;
34752 klass = load;
34753 memory = 0;
34754 break;
34755 case VOID_FTYPE_UINT_UINT_UINT:
34756 case VOID_FTYPE_UINT64_UINT_UINT:
34757 case UCHAR_FTYPE_UINT_UINT_UINT:
34758 case UCHAR_FTYPE_UINT64_UINT_UINT:
34759 nargs = 3;
34760 klass = load;
34761 memory = ARRAY_SIZE (args);
34762 last_arg_constant = true;
34763 break;
34764 default:
34765 gcc_unreachable ();
34768 gcc_assert (nargs <= ARRAY_SIZE (args));
34770 if (klass == store)
34772 arg = CALL_EXPR_ARG (exp, 0);
34773 op = expand_normal (arg);
34774 gcc_assert (target == 0);
34775 if (memory)
34777 op = ix86_zero_extend_to_Pmode (op);
34778 target = gen_rtx_MEM (tmode, op);
34779 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
34780 on it. Try to improve it using get_pointer_alignment,
34781 and if the special builtin is one that requires strict
34782 mode alignment, also from its GET_MODE_ALIGNMENT.
34783 Failure to do so could lead to ix86_legitimate_combined_insn
34784 rejecting all changes to such insns. */
34785 unsigned int align = get_pointer_alignment (arg);
34786 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
34787 align = GET_MODE_ALIGNMENT (tmode);
34788 if (MEM_ALIGN (target) < align)
34789 set_mem_align (target, align);
34791 else
34792 target = force_reg (tmode, op);
34793 arg_adjust = 1;
34795 else
34797 arg_adjust = 0;
34798 if (optimize
34799 || target == 0
34800 || !register_operand (target, tmode)
34801 || GET_MODE (target) != tmode)
34802 target = gen_reg_rtx (tmode);
34805 for (i = 0; i < nargs; i++)
34807 machine_mode mode = insn_p->operand[i + 1].mode;
34808 bool match;
34810 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
34811 op = expand_normal (arg);
34812 match = insn_p->operand[i + 1].predicate (op, mode);
34814 if (last_arg_constant && (i + 1) == nargs)
34816 if (!match)
34818 if (icode == CODE_FOR_lwp_lwpvalsi3
34819 || icode == CODE_FOR_lwp_lwpinssi3
34820 || icode == CODE_FOR_lwp_lwpvaldi3
34821 || icode == CODE_FOR_lwp_lwpinsdi3)
34822 error ("the last argument must be a 32-bit immediate");
34823 else
34824 error ("the last argument must be an 8-bit immediate");
34825 return const0_rtx;
34828 else
34830 if (i == memory)
34832 /* This must be the memory operand. */
34833 op = ix86_zero_extend_to_Pmode (op);
34834 op = gen_rtx_MEM (mode, op);
34835 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
34836 on it. Try to improve it using get_pointer_alignment,
34837 and if the special builtin is one that requires strict
34838 mode alignment, also from its GET_MODE_ALIGNMENT.
34839 Failure to do so could lead to ix86_legitimate_combined_insn
34840 rejecting all changes to such insns. */
34841 unsigned int align = get_pointer_alignment (arg);
34842 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
34843 align = GET_MODE_ALIGNMENT (mode);
34844 if (MEM_ALIGN (op) < align)
34845 set_mem_align (op, align);
34847 else
34849 /* This must be a register. */
34850 if (VECTOR_MODE_P (mode))
34851 op = safe_vector_operand (op, mode);
34853 op = fixup_modeless_constant (op, mode);
34855 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34856 op = copy_to_mode_reg (mode, op);
34857 else
34859 op = copy_to_reg (op);
34860 op = lowpart_subreg (mode, op, GET_MODE (op));
34865 args[i].op = op;
34866 args[i].mode = mode;
34869 switch (nargs)
34871 case 0:
34872 pat = GEN_FCN (icode) (target);
34873 break;
34874 case 1:
34875 pat = GEN_FCN (icode) (target, args[0].op);
34876 break;
34877 case 2:
34878 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34879 break;
34880 case 3:
34881 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34882 break;
34883 default:
34884 gcc_unreachable ();
34887 if (! pat)
34888 return 0;
34889 emit_insn (pat);
34890 return klass == store ? 0 : target;
34893 /* Return the integer constant in ARG. Constrain it to be in the range
34894 of the subparts of VEC_TYPE; issue an error if not. */
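/* For example, a V4SF vector has four subparts, so the selector must be a
   constant in the range 0..3; anything else triggers the error below and 0
   is returned.  */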
34896 static int
34897 get_element_number (tree vec_type, tree arg)
34899 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
34901 if (!tree_fits_uhwi_p (arg)
34902 || (elt = tree_to_uhwi (arg), elt > max))
34904 error ("selector must be an integer constant in the range 0..%wi", max);
34905 return 0;
34908 return elt;
34911 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
34912 ix86_expand_vector_init. We DO have language-level syntax for this, in
34913 the form of (type){ init-list }. Except that since we can't place emms
34914 instructions from inside the compiler, we can't allow the use of MMX
34915 registers unless the user explicitly asks for it. So we do *not* define
34916 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
34917 we have builtins invoked by mmintrin.h that give us license to emit
34918 these sorts of instructions. */
34920 static rtx
34921 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
34923 machine_mode tmode = TYPE_MODE (type);
34924 machine_mode inner_mode = GET_MODE_INNER (tmode);
34925 int i, n_elt = GET_MODE_NUNITS (tmode);
34926 rtvec v = rtvec_alloc (n_elt);
34928 gcc_assert (VECTOR_MODE_P (tmode));
34929 gcc_assert (call_expr_nargs (exp) == n_elt);
34931 for (i = 0; i < n_elt; ++i)
34933 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
34934 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
34937 if (!target || !register_operand (target, tmode))
34938 target = gen_reg_rtx (tmode);
34940 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
34941 return target;
34944 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
34945 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
34946 had a language-level syntax for referencing vector elements. */
34948 static rtx
34949 ix86_expand_vec_ext_builtin (tree exp, rtx target)
34951 machine_mode tmode, mode0;
34952 tree arg0, arg1;
34953 int elt;
34954 rtx op0;
34956 arg0 = CALL_EXPR_ARG (exp, 0);
34957 arg1 = CALL_EXPR_ARG (exp, 1);
34959 op0 = expand_normal (arg0);
34960 elt = get_element_number (TREE_TYPE (arg0), arg1);
34962 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
34963 mode0 = TYPE_MODE (TREE_TYPE (arg0));
34964 gcc_assert (VECTOR_MODE_P (mode0));
34966 op0 = force_reg (mode0, op0);
34968 if (optimize || !target || !register_operand (target, tmode))
34969 target = gen_reg_rtx (tmode);
34971 ix86_expand_vector_extract (true, target, op0, elt);
34973 return target;
34976 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
34977 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
34978 a language-level syntax for referencing vector elements. */
34980 static rtx
34981 ix86_expand_vec_set_builtin (tree exp)
34983 machine_mode tmode, mode1;
34984 tree arg0, arg1, arg2;
34985 int elt;
34986 rtx op0, op1, target;
34988 arg0 = CALL_EXPR_ARG (exp, 0);
34989 arg1 = CALL_EXPR_ARG (exp, 1);
34990 arg2 = CALL_EXPR_ARG (exp, 2);
34992 tmode = TYPE_MODE (TREE_TYPE (arg0));
34993 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
34994 gcc_assert (VECTOR_MODE_P (tmode));
34996 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
34997 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
34998 elt = get_element_number (TREE_TYPE (arg0), arg2);
35000 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35001 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35003 op0 = force_reg (tmode, op0);
35004 op1 = force_reg (mode1, op1);
35006 /* OP0 is the source of these builtin functions and shouldn't be
35007 modified. Create a copy, use it and return it as target. */
35008 target = gen_reg_rtx (tmode);
35009 emit_move_insn (target, op0);
35010 ix86_expand_vector_set (true, target, op1, elt);
35012 return target;
35015 /* Emit conditional move of SRC to DST with condition
35016 OP1 CODE OP2. */
35017 static void
35018 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35020 rtx t;
35022 if (TARGET_CMOVE)
35024 t = ix86_expand_compare (code, op1, op2);
35025 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35026 src, dst)));
35028 else
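/* Without CMOV, branch around the move: the jump uses the reversed
   condition, so DST keeps its old value whenever OP1 CODE OP2 does not
   hold.  */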
35030 rtx_code_label *nomove = gen_label_rtx ();
35031 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35032 const0_rtx, GET_MODE (op1), 1, nomove);
35033 emit_move_insn (dst, src);
35034 emit_label (nomove);
35038 /* Choose max of DST and SRC and put it in DST. */
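/* This is an unsigned maximum (a cmove on LTU).  It is used below both for
   lower bounds and for upper bounds, which MPX keeps in 1's complement form
   so that max works for both.  */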
35039 static void
35040 ix86_emit_move_max (rtx dst, rtx src)
35042 ix86_emit_cmove (dst, src, LTU, dst, src);
35045 /* Expand an expression EXP that calls a built-in function,
35046 with result going to TARGET if that's convenient
35047 (and in mode MODE if that's convenient).
35048 SUBTARGET may be used as the target for computing one of EXP's operands.
35049 IGNORE is nonzero if the value is to be ignored. */
35051 static rtx
35052 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35053 machine_mode mode, int ignore)
35055 size_t i;
35056 enum insn_code icode;
35057 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35058 tree arg0, arg1, arg2, arg3, arg4;
35059 rtx op0, op1, op2, op3, op4, pat, insn;
35060 machine_mode mode0, mode1, mode2, mode3, mode4;
35061 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35063 /* For CPU builtins that can be folded, fold first and expand the fold. */
35064 switch (fcode)
35066 case IX86_BUILTIN_CPU_INIT:
35068 /* Make it call __cpu_indicator_init in libgcc. */
35069 tree call_expr, fndecl, type;
35070 type = build_function_type_list (integer_type_node, NULL_TREE);
35071 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35072 call_expr = build_call_expr (fndecl, 0);
35073 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35075 case IX86_BUILTIN_CPU_IS:
35076 case IX86_BUILTIN_CPU_SUPPORTS:
35078 tree arg0 = CALL_EXPR_ARG (exp, 0);
35079 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35080 gcc_assert (fold_expr != NULL_TREE);
35081 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35085 /* Determine whether the builtin function is available under the current ISA.
35086 Originally the builtin was not created if it wasn't applicable to the
35087 current ISA based on the command line switches. With function specific
35088 options, we need to check in the context of the function making the call
35089 whether it is supported. Treat AVX512VL and MMX specially. For other flags,
35090 if isa includes more than one ISA bit, treat those as requiring any
35091 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
35092 ISAs. Likewise for MMX, require both MMX and the non-MMX ISAs.
35093 Similarly for 64BIT, but we shouldn't be building such builtins
35094 at all; -m64 is a whole-TU option. */
35095 if (((ix86_builtins_isa[fcode].isa
35096 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35097 | OPTION_MASK_ISA_64BIT))
35098 && !(ix86_builtins_isa[fcode].isa
35099 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35100 | OPTION_MASK_ISA_64BIT)
35101 & ix86_isa_flags))
35102 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35103 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35104 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35105 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35106 || (ix86_builtins_isa[fcode].isa2
35107 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35109 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35110 ix86_builtins_isa[fcode].isa2, 0, 0,
35111 NULL, NULL, (enum fpmath_unit) 0,
35112 false);
35113 if (!opts)
35114 error ("%qE needs unknown isa option", fndecl);
35115 else
35117 gcc_assert (opts != NULL);
35118 error ("%qE needs isa option %s", fndecl, opts);
35119 free (opts);
35121 return expand_call (exp, target, ignore);
35124 switch (fcode)
35126 case IX86_BUILTIN_BNDMK:
35127 if (!target
35128 || GET_MODE (target) != BNDmode
35129 || !register_operand (target, BNDmode))
35130 target = gen_reg_rtx (BNDmode);
35132 arg0 = CALL_EXPR_ARG (exp, 0);
35133 arg1 = CALL_EXPR_ARG (exp, 1);
35135 op0 = expand_normal (arg0);
35136 op1 = expand_normal (arg1);
35138 if (!register_operand (op0, Pmode))
35139 op0 = ix86_zero_extend_to_Pmode (op0);
35140 if (!register_operand (op1, Pmode))
35141 op1 = ix86_zero_extend_to_Pmode (op1);
35143 /* Builtin arg1 is the size of the block, but instruction op1 should
35144 be (size - 1). */
35145 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35146 NULL_RTX, 1, OPTAB_DIRECT);
35148 emit_insn (BNDmode == BND64mode
35149 ? gen_bnd64_mk (target, op0, op1)
35150 : gen_bnd32_mk (target, op0, op1));
35151 return target;
35153 case IX86_BUILTIN_BNDSTX:
35154 arg0 = CALL_EXPR_ARG (exp, 0);
35155 arg1 = CALL_EXPR_ARG (exp, 1);
35156 arg2 = CALL_EXPR_ARG (exp, 2);
35158 op0 = expand_normal (arg0);
35159 op1 = expand_normal (arg1);
35160 op2 = expand_normal (arg2);
35162 if (!register_operand (op0, Pmode))
35163 op0 = ix86_zero_extend_to_Pmode (op0);
35164 if (!register_operand (op1, BNDmode))
35165 op1 = copy_to_mode_reg (BNDmode, op1);
35166 if (!register_operand (op2, Pmode))
35167 op2 = ix86_zero_extend_to_Pmode (op2);
35169 emit_insn (BNDmode == BND64mode
35170 ? gen_bnd64_stx (op2, op0, op1)
35171 : gen_bnd32_stx (op2, op0, op1));
35172 return 0;
35174 case IX86_BUILTIN_BNDLDX:
35175 if (!target
35176 || GET_MODE (target) != BNDmode
35177 || !register_operand (target, BNDmode))
35178 target = gen_reg_rtx (BNDmode);
35180 arg0 = CALL_EXPR_ARG (exp, 0);
35181 arg1 = CALL_EXPR_ARG (exp, 1);
35183 op0 = expand_normal (arg0);
35184 op1 = expand_normal (arg1);
35186 if (!register_operand (op0, Pmode))
35187 op0 = ix86_zero_extend_to_Pmode (op0);
35188 if (!register_operand (op1, Pmode))
35189 op1 = ix86_zero_extend_to_Pmode (op1);
35191 emit_insn (BNDmode == BND64mode
35192 ? gen_bnd64_ldx (target, op0, op1)
35193 : gen_bnd32_ldx (target, op0, op1));
35194 return target;
35196 case IX86_BUILTIN_BNDCL:
35197 arg0 = CALL_EXPR_ARG (exp, 0);
35198 arg1 = CALL_EXPR_ARG (exp, 1);
35200 op0 = expand_normal (arg0);
35201 op1 = expand_normal (arg1);
35203 if (!register_operand (op0, Pmode))
35204 op0 = ix86_zero_extend_to_Pmode (op0);
35205 if (!register_operand (op1, BNDmode))
35206 op1 = copy_to_mode_reg (BNDmode, op1);
35208 emit_insn (BNDmode == BND64mode
35209 ? gen_bnd64_cl (op1, op0)
35210 : gen_bnd32_cl (op1, op0));
35211 return 0;
35213 case IX86_BUILTIN_BNDCU:
35214 arg0 = CALL_EXPR_ARG (exp, 0);
35215 arg1 = CALL_EXPR_ARG (exp, 1);
35217 op0 = expand_normal (arg0);
35218 op1 = expand_normal (arg1);
35220 if (!register_operand (op0, Pmode))
35221 op0 = ix86_zero_extend_to_Pmode (op0);
35222 if (!register_operand (op1, BNDmode))
35223 op1 = copy_to_mode_reg (BNDmode, op1);
35225 emit_insn (BNDmode == BND64mode
35226 ? gen_bnd64_cu (op1, op0)
35227 : gen_bnd32_cu (op1, op0));
35228 return 0;
35230 case IX86_BUILTIN_BNDRET:
35231 arg0 = CALL_EXPR_ARG (exp, 0);
35232 target = chkp_get_rtl_bounds (arg0);
35234 /* If no bounds were specified for the returned value,
35235 then use INIT bounds. This usually happens when
35236 some built-in function is expanded. */
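/* Note on the values below: bndmk takes (size - 1) as its second operand,
   so the (0, -1) pair produces the INIT bounds: lower bound 0 and an upper
   bound spanning the whole address space.  */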
35237 if (!target)
35239 rtx t1 = gen_reg_rtx (Pmode);
35240 rtx t2 = gen_reg_rtx (Pmode);
35241 target = gen_reg_rtx (BNDmode);
35242 emit_move_insn (t1, const0_rtx);
35243 emit_move_insn (t2, constm1_rtx);
35244 emit_insn (BNDmode == BND64mode
35245 ? gen_bnd64_mk (target, t1, t2)
35246 : gen_bnd32_mk (target, t1, t2));
35249 gcc_assert (target && REG_P (target));
35250 return target;
35252 case IX86_BUILTIN_BNDNARROW:
35254 rtx m1, m1h1, m1h2, lb, ub, t1;
35256 /* Return value and lb. */
35257 arg0 = CALL_EXPR_ARG (exp, 0);
35258 /* Bounds. */
35259 arg1 = CALL_EXPR_ARG (exp, 1);
35260 /* Size. */
35261 arg2 = CALL_EXPR_ARG (exp, 2);
35263 lb = expand_normal (arg0);
35264 op1 = expand_normal (arg1);
35265 op2 = expand_normal (arg2);
35267 /* Size was passed, but we need to use (size - 1), as for bndmk. */
35268 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35269 NULL_RTX, 1, OPTAB_DIRECT);
35271 /* Add LB to size and invert to get UB. */
35272 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35273 op2, 1, OPTAB_DIRECT);
35274 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35276 if (!register_operand (lb, Pmode))
35277 lb = ix86_zero_extend_to_Pmode (lb);
35278 if (!register_operand (ub, Pmode))
35279 ub = ix86_zero_extend_to_Pmode (ub);
35281 /* We need to move bounds to memory before any computations. */
35282 if (MEM_P (op1))
35283 m1 = op1;
35284 else
35286 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35287 emit_move_insn (m1, op1);
35290 /* Generate mem expression to be used for access to LB and UB. */
35291 m1h1 = adjust_address (m1, Pmode, 0);
35292 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35294 t1 = gen_reg_rtx (Pmode);
35296 /* Compute LB. */
35297 emit_move_insn (t1, m1h1);
35298 ix86_emit_move_max (t1, lb);
35299 emit_move_insn (m1h1, t1);
35301 /* Compute UB. UB is stored in 1's complement form. Therefore
35302 we also use max here. */
35303 emit_move_insn (t1, m1h2);
35304 ix86_emit_move_max (t1, ub);
35305 emit_move_insn (m1h2, t1);
35307 op2 = gen_reg_rtx (BNDmode);
35308 emit_move_insn (op2, m1);
35310 return chkp_join_splitted_slot (lb, op2);
35313 case IX86_BUILTIN_BNDINT:
35315 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35317 if (!target
35318 || GET_MODE (target) != BNDmode
35319 || !register_operand (target, BNDmode))
35320 target = gen_reg_rtx (BNDmode);
35322 arg0 = CALL_EXPR_ARG (exp, 0);
35323 arg1 = CALL_EXPR_ARG (exp, 1);
35325 op0 = expand_normal (arg0);
35326 op1 = expand_normal (arg1);
35328 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35329 rh1 = adjust_address (res, Pmode, 0);
35330 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35332 /* Put first bounds to temporaries. */
35333 lb1 = gen_reg_rtx (Pmode);
35334 ub1 = gen_reg_rtx (Pmode);
35335 if (MEM_P (op0))
35337 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35338 emit_move_insn (ub1, adjust_address (op0, Pmode,
35339 GET_MODE_SIZE (Pmode)));
35341 else
35343 emit_move_insn (res, op0);
35344 emit_move_insn (lb1, rh1);
35345 emit_move_insn (ub1, rh2);
35348 /* Put second bounds to temporaries. */
35349 lb2 = gen_reg_rtx (Pmode);
35350 ub2 = gen_reg_rtx (Pmode);
35351 if (MEM_P (op1))
35353 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35354 emit_move_insn (ub2, adjust_address (op1, Pmode,
35355 GET_MODE_SIZE (Pmode)));
35357 else
35359 emit_move_insn (res, op1);
35360 emit_move_insn (lb2, rh1);
35361 emit_move_insn (ub2, rh2);
35364 /* Compute LB. */
35365 ix86_emit_move_max (lb1, lb2);
35366 emit_move_insn (rh1, lb1);
35368 /* Compute UB. UB is stored in 1's complement form. Therefore
35369 we also use max here. */
35370 ix86_emit_move_max (ub1, ub2);
35371 emit_move_insn (rh2, ub1);
35373 emit_move_insn (target, res);
35375 return target;
35378 case IX86_BUILTIN_SIZEOF:
35380 tree name;
35381 rtx symbol;
35383 if (!target
35384 || GET_MODE (target) != Pmode
35385 || !register_operand (target, Pmode))
35386 target = gen_reg_rtx (Pmode);
35388 arg0 = CALL_EXPR_ARG (exp, 0);
35389 gcc_assert (VAR_P (arg0));
35391 name = DECL_ASSEMBLER_NAME (arg0);
35392 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35394 emit_insn (Pmode == SImode
35395 ? gen_move_size_reloc_si (target, symbol)
35396 : gen_move_size_reloc_di (target, symbol));
35398 return target;
35401 case IX86_BUILTIN_BNDLOWER:
35403 rtx mem, hmem;
35405 if (!target
35406 || GET_MODE (target) != Pmode
35407 || !register_operand (target, Pmode))
35408 target = gen_reg_rtx (Pmode);
35410 arg0 = CALL_EXPR_ARG (exp, 0);
35411 op0 = expand_normal (arg0);
35413 /* We need to move bounds to memory first. */
35414 if (MEM_P (op0))
35415 mem = op0;
35416 else
35418 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35419 emit_move_insn (mem, op0);
35422 /* Generate mem expression to access LB and load it. */
35423 hmem = adjust_address (mem, Pmode, 0);
35424 emit_move_insn (target, hmem);
35426 return target;
35429 case IX86_BUILTIN_BNDUPPER:
35431 rtx mem, hmem, res;
35433 if (!target
35434 || GET_MODE (target) != Pmode
35435 || !register_operand (target, Pmode))
35436 target = gen_reg_rtx (Pmode);
35438 arg0 = CALL_EXPR_ARG (exp, 0);
35439 op0 = expand_normal (arg0);
35441 /* We need to move bounds to memory first. */
35442 if (MEM_P (op0))
35443 mem = op0;
35444 else
35446 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35447 emit_move_insn (mem, op0);
35450 /* Generate mem expression to access UB. */
35451 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35453 /* We need to invert all bits of UB. */
35454 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35456 if (res != target)
35457 emit_move_insn (target, res);
35459 return target;
35462 case IX86_BUILTIN_MASKMOVQ:
35463 case IX86_BUILTIN_MASKMOVDQU:
35464 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35465 ? CODE_FOR_mmx_maskmovq
35466 : CODE_FOR_sse2_maskmovdqu);
35467 /* Note the arg order is different from the operand order. */
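/* I.e. the pointer is the last call argument but becomes operand 0 of the
   insn, the destination MEM.  */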
35468 arg1 = CALL_EXPR_ARG (exp, 0);
35469 arg2 = CALL_EXPR_ARG (exp, 1);
35470 arg0 = CALL_EXPR_ARG (exp, 2);
35471 op0 = expand_normal (arg0);
35472 op1 = expand_normal (arg1);
35473 op2 = expand_normal (arg2);
35474 mode0 = insn_data[icode].operand[0].mode;
35475 mode1 = insn_data[icode].operand[1].mode;
35476 mode2 = insn_data[icode].operand[2].mode;
35478 op0 = ix86_zero_extend_to_Pmode (op0);
35479 op0 = gen_rtx_MEM (mode1, op0);
35481 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35482 op0 = copy_to_mode_reg (mode0, op0);
35483 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35484 op1 = copy_to_mode_reg (mode1, op1);
35485 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35486 op2 = copy_to_mode_reg (mode2, op2);
35487 pat = GEN_FCN (icode) (op0, op1, op2);
35488 if (! pat)
35489 return 0;
35490 emit_insn (pat);
35491 return 0;
35493 case IX86_BUILTIN_LDMXCSR:
35494 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35495 target = assign_386_stack_local (SImode, SLOT_TEMP);
35496 emit_move_insn (target, op0);
35497 emit_insn (gen_sse_ldmxcsr (target));
35498 return 0;
35500 case IX86_BUILTIN_STMXCSR:
35501 target = assign_386_stack_local (SImode, SLOT_TEMP);
35502 emit_insn (gen_sse_stmxcsr (target));
35503 return copy_to_mode_reg (SImode, target);
35505 case IX86_BUILTIN_CLFLUSH:
35506 arg0 = CALL_EXPR_ARG (exp, 0);
35507 op0 = expand_normal (arg0);
35508 icode = CODE_FOR_sse2_clflush;
35509 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35510 op0 = ix86_zero_extend_to_Pmode (op0);
35512 emit_insn (gen_sse2_clflush (op0));
35513 return 0;
35515 case IX86_BUILTIN_CLWB:
35516 arg0 = CALL_EXPR_ARG (exp, 0);
35517 op0 = expand_normal (arg0);
35518 icode = CODE_FOR_clwb;
35519 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35520 op0 = ix86_zero_extend_to_Pmode (op0);
35522 emit_insn (gen_clwb (op0));
35523 return 0;
35525 case IX86_BUILTIN_CLFLUSHOPT:
35526 arg0 = CALL_EXPR_ARG (exp, 0);
35527 op0 = expand_normal (arg0);
35528 icode = CODE_FOR_clflushopt;
35529 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35530 op0 = ix86_zero_extend_to_Pmode (op0);
35532 emit_insn (gen_clflushopt (op0));
35533 return 0;
35535 case IX86_BUILTIN_MONITOR:
35536 case IX86_BUILTIN_MONITORX:
35537 arg0 = CALL_EXPR_ARG (exp, 0);
35538 arg1 = CALL_EXPR_ARG (exp, 1);
35539 arg2 = CALL_EXPR_ARG (exp, 2);
35540 op0 = expand_normal (arg0);
35541 op1 = expand_normal (arg1);
35542 op2 = expand_normal (arg2);
35543 if (!REG_P (op0))
35544 op0 = ix86_zero_extend_to_Pmode (op0);
35545 if (!REG_P (op1))
35546 op1 = copy_to_mode_reg (SImode, op1);
35547 if (!REG_P (op2))
35548 op2 = copy_to_mode_reg (SImode, op2);
35550 emit_insn (fcode == IX86_BUILTIN_MONITOR
35551 ? ix86_gen_monitor (op0, op1, op2)
35552 : ix86_gen_monitorx (op0, op1, op2));
35553 return 0;
35555 case IX86_BUILTIN_MWAIT:
35556 arg0 = CALL_EXPR_ARG (exp, 0);
35557 arg1 = CALL_EXPR_ARG (exp, 1);
35558 op0 = expand_normal (arg0);
35559 op1 = expand_normal (arg1);
35560 if (!REG_P (op0))
35561 op0 = copy_to_mode_reg (SImode, op0);
35562 if (!REG_P (op1))
35563 op1 = copy_to_mode_reg (SImode, op1);
35564 emit_insn (gen_sse3_mwait (op0, op1));
35565 return 0;
35567 case IX86_BUILTIN_MWAITX:
35568 arg0 = CALL_EXPR_ARG (exp, 0);
35569 arg1 = CALL_EXPR_ARG (exp, 1);
35570 arg2 = CALL_EXPR_ARG (exp, 2);
35571 op0 = expand_normal (arg0);
35572 op1 = expand_normal (arg1);
35573 op2 = expand_normal (arg2);
35574 if (!REG_P (op0))
35575 op0 = copy_to_mode_reg (SImode, op0);
35576 if (!REG_P (op1))
35577 op1 = copy_to_mode_reg (SImode, op1);
35578 if (!REG_P (op2))
35579 op2 = copy_to_mode_reg (SImode, op2);
35580 emit_insn (gen_mwaitx (op0, op1, op2));
35581 return 0;
35583 case IX86_BUILTIN_CLZERO:
35584 arg0 = CALL_EXPR_ARG (exp, 0);
35585 op0 = expand_normal (arg0);
35586 if (!REG_P (op0))
35587 op0 = ix86_zero_extend_to_Pmode (op0);
35588 emit_insn (ix86_gen_clzero (op0));
35589 return 0;
35591 case IX86_BUILTIN_VEC_INIT_V2SI:
35592 case IX86_BUILTIN_VEC_INIT_V4HI:
35593 case IX86_BUILTIN_VEC_INIT_V8QI:
35594 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35596 case IX86_BUILTIN_VEC_EXT_V2DF:
35597 case IX86_BUILTIN_VEC_EXT_V2DI:
35598 case IX86_BUILTIN_VEC_EXT_V4SF:
35599 case IX86_BUILTIN_VEC_EXT_V4SI:
35600 case IX86_BUILTIN_VEC_EXT_V8HI:
35601 case IX86_BUILTIN_VEC_EXT_V2SI:
35602 case IX86_BUILTIN_VEC_EXT_V4HI:
35603 case IX86_BUILTIN_VEC_EXT_V16QI:
35604 return ix86_expand_vec_ext_builtin (exp, target);
35606 case IX86_BUILTIN_VEC_SET_V2DI:
35607 case IX86_BUILTIN_VEC_SET_V4SF:
35608 case IX86_BUILTIN_VEC_SET_V4SI:
35609 case IX86_BUILTIN_VEC_SET_V8HI:
35610 case IX86_BUILTIN_VEC_SET_V4HI:
35611 case IX86_BUILTIN_VEC_SET_V16QI:
35612 return ix86_expand_vec_set_builtin (exp);
35614 case IX86_BUILTIN_NANQ:
35615 case IX86_BUILTIN_NANSQ:
35616 return expand_call (exp, target, ignore);
35618 case IX86_BUILTIN_RDPMC:
35619 case IX86_BUILTIN_RDTSC:
35620 case IX86_BUILTIN_RDTSCP:
35621 case IX86_BUILTIN_XGETBV:
35623 op0 = gen_reg_rtx (DImode);
35624 op1 = gen_reg_rtx (DImode);
35626 if (fcode == IX86_BUILTIN_RDPMC)
35628 arg0 = CALL_EXPR_ARG (exp, 0);
35629 op2 = expand_normal (arg0);
35630 if (!register_operand (op2, SImode))
35631 op2 = copy_to_mode_reg (SImode, op2);
35633 insn = (TARGET_64BIT
35634 ? gen_rdpmc_rex64 (op0, op1, op2)
35635 : gen_rdpmc (op0, op2));
35636 emit_insn (insn);
35638 else if (fcode == IX86_BUILTIN_XGETBV)
35640 arg0 = CALL_EXPR_ARG (exp, 0);
35641 op2 = expand_normal (arg0);
35642 if (!register_operand (op2, SImode))
35643 op2 = copy_to_mode_reg (SImode, op2);
35645 insn = (TARGET_64BIT
35646 ? gen_xgetbv_rex64 (op0, op1, op2)
35647 : gen_xgetbv (op0, op2));
35648 emit_insn (insn);
35650 else if (fcode == IX86_BUILTIN_RDTSC)
35652 insn = (TARGET_64BIT
35653 ? gen_rdtsc_rex64 (op0, op1)
35654 : gen_rdtsc (op0));
35655 emit_insn (insn);
35657 else
35659 op2 = gen_reg_rtx (SImode);
35661 insn = (TARGET_64BIT
35662 ? gen_rdtscp_rex64 (op0, op1, op2)
35663 : gen_rdtscp (op0, op2));
35664 emit_insn (insn);
35666 arg0 = CALL_EXPR_ARG (exp, 0);
35667 op4 = expand_normal (arg0);
35668 if (!address_operand (op4, VOIDmode))
35670 op4 = convert_memory_address (Pmode, op4);
35671 op4 = copy_addr_to_reg (op4);
35673 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35676 if (target == 0)
35678 /* mode is VOIDmode if __builtin_rd* has been called
35679 without an lhs. */
35680 if (mode == VOIDmode)
35681 return target;
35682 target = gen_reg_rtx (mode);
35685 if (TARGET_64BIT)
35687 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35688 op1, 1, OPTAB_DIRECT);
35689 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35690 op0, 1, OPTAB_DIRECT);
35693 emit_move_insn (target, op0);
35694 return target;
35696 case IX86_BUILTIN_FXSAVE:
35697 case IX86_BUILTIN_FXRSTOR:
35698 case IX86_BUILTIN_FXSAVE64:
35699 case IX86_BUILTIN_FXRSTOR64:
35700 case IX86_BUILTIN_FNSTENV:
35701 case IX86_BUILTIN_FLDENV:
35702 mode0 = BLKmode;
35703 switch (fcode)
35705 case IX86_BUILTIN_FXSAVE:
35706 icode = CODE_FOR_fxsave;
35707 break;
35708 case IX86_BUILTIN_FXRSTOR:
35709 icode = CODE_FOR_fxrstor;
35710 break;
35711 case IX86_BUILTIN_FXSAVE64:
35712 icode = CODE_FOR_fxsave64;
35713 break;
35714 case IX86_BUILTIN_FXRSTOR64:
35715 icode = CODE_FOR_fxrstor64;
35716 break;
35717 case IX86_BUILTIN_FNSTENV:
35718 icode = CODE_FOR_fnstenv;
35719 break;
35720 case IX86_BUILTIN_FLDENV:
35721 icode = CODE_FOR_fldenv;
35722 break;
35723 default:
35724 gcc_unreachable ();
35727 arg0 = CALL_EXPR_ARG (exp, 0);
35728 op0 = expand_normal (arg0);
35730 if (!address_operand (op0, VOIDmode))
35732 op0 = convert_memory_address (Pmode, op0);
35733 op0 = copy_addr_to_reg (op0);
35735 op0 = gen_rtx_MEM (mode0, op0);
35737 pat = GEN_FCN (icode) (op0);
35738 if (pat)
35739 emit_insn (pat);
35740 return 0;
35742 case IX86_BUILTIN_XSETBV:
35743 arg0 = CALL_EXPR_ARG (exp, 0);
35744 arg1 = CALL_EXPR_ARG (exp, 1);
35745 op0 = expand_normal (arg0);
35746 op1 = expand_normal (arg1);
35748 if (!REG_P (op0))
35749 op0 = copy_to_mode_reg (SImode, op0);
35751 if (TARGET_64BIT)
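/* xsetbv takes the new XCR value in EDX:EAX, so split the 64-bit operand
   into its high and low 32-bit halves.  */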
35753 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
35754 NULL, 1, OPTAB_DIRECT);
35756 op2 = gen_lowpart (SImode, op2);
35757 op1 = gen_lowpart (SImode, op1);
35758 if (!REG_P (op1))
35759 op1 = copy_to_mode_reg (SImode, op1);
35760 if (!REG_P (op2))
35761 op2 = copy_to_mode_reg (SImode, op2);
35762 icode = CODE_FOR_xsetbv_rex64;
35763 pat = GEN_FCN (icode) (op0, op1, op2);
35765 else
35767 if (!REG_P (op1))
35768 op1 = copy_to_mode_reg (DImode, op1);
35769 icode = CODE_FOR_xsetbv;
35770 pat = GEN_FCN (icode) (op0, op1);
35772 if (pat)
35773 emit_insn (pat);
35774 return 0;
35776 case IX86_BUILTIN_XSAVE:
35777 case IX86_BUILTIN_XRSTOR:
35778 case IX86_BUILTIN_XSAVE64:
35779 case IX86_BUILTIN_XRSTOR64:
35780 case IX86_BUILTIN_XSAVEOPT:
35781 case IX86_BUILTIN_XSAVEOPT64:
35782 case IX86_BUILTIN_XSAVES:
35783 case IX86_BUILTIN_XRSTORS:
35784 case IX86_BUILTIN_XSAVES64:
35785 case IX86_BUILTIN_XRSTORS64:
35786 case IX86_BUILTIN_XSAVEC:
35787 case IX86_BUILTIN_XSAVEC64:
35788 arg0 = CALL_EXPR_ARG (exp, 0);
35789 arg1 = CALL_EXPR_ARG (exp, 1);
35790 op0 = expand_normal (arg0);
35791 op1 = expand_normal (arg1);
35793 if (!address_operand (op0, VOIDmode))
35795 op0 = convert_memory_address (Pmode, op0);
35796 op0 = copy_addr_to_reg (op0);
35798 op0 = gen_rtx_MEM (BLKmode, op0);
35800 op1 = force_reg (DImode, op1);
35802 if (TARGET_64BIT)
35804 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
35805 NULL, 1, OPTAB_DIRECT);
35806 switch (fcode)
35808 case IX86_BUILTIN_XSAVE:
35809 icode = CODE_FOR_xsave_rex64;
35810 break;
35811 case IX86_BUILTIN_XRSTOR:
35812 icode = CODE_FOR_xrstor_rex64;
35813 break;
35814 case IX86_BUILTIN_XSAVE64:
35815 icode = CODE_FOR_xsave64;
35816 break;
35817 case IX86_BUILTIN_XRSTOR64:
35818 icode = CODE_FOR_xrstor64;
35819 break;
35820 case IX86_BUILTIN_XSAVEOPT:
35821 icode = CODE_FOR_xsaveopt_rex64;
35822 break;
35823 case IX86_BUILTIN_XSAVEOPT64:
35824 icode = CODE_FOR_xsaveopt64;
35825 break;
35826 case IX86_BUILTIN_XSAVES:
35827 icode = CODE_FOR_xsaves_rex64;
35828 break;
35829 case IX86_BUILTIN_XRSTORS:
35830 icode = CODE_FOR_xrstors_rex64;
35831 break;
35832 case IX86_BUILTIN_XSAVES64:
35833 icode = CODE_FOR_xsaves64;
35834 break;
35835 case IX86_BUILTIN_XRSTORS64:
35836 icode = CODE_FOR_xrstors64;
35837 break;
35838 case IX86_BUILTIN_XSAVEC:
35839 icode = CODE_FOR_xsavec_rex64;
35840 break;
35841 case IX86_BUILTIN_XSAVEC64:
35842 icode = CODE_FOR_xsavec64;
35843 break;
35844 default:
35845 gcc_unreachable ();
35848 op2 = gen_lowpart (SImode, op2);
35849 op1 = gen_lowpart (SImode, op1);
35850 pat = GEN_FCN (icode) (op0, op1, op2);
35852 else
35854 switch (fcode)
35856 case IX86_BUILTIN_XSAVE:
35857 icode = CODE_FOR_xsave;
35858 break;
35859 case IX86_BUILTIN_XRSTOR:
35860 icode = CODE_FOR_xrstor;
35861 break;
35862 case IX86_BUILTIN_XSAVEOPT:
35863 icode = CODE_FOR_xsaveopt;
35864 break;
35865 case IX86_BUILTIN_XSAVES:
35866 icode = CODE_FOR_xsaves;
35867 break;
35868 case IX86_BUILTIN_XRSTORS:
35869 icode = CODE_FOR_xrstors;
35870 break;
35871 case IX86_BUILTIN_XSAVEC:
35872 icode = CODE_FOR_xsavec;
35873 break;
35874 default:
35875 gcc_unreachable ();
35877 pat = GEN_FCN (icode) (op0, op1);
35880 if (pat)
35881 emit_insn (pat);
35882 return 0;
35884 case IX86_BUILTIN_LLWPCB:
35885 arg0 = CALL_EXPR_ARG (exp, 0);
35886 op0 = expand_normal (arg0);
35887 icode = CODE_FOR_lwp_llwpcb;
35888 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35889 op0 = ix86_zero_extend_to_Pmode (op0);
35890 emit_insn (gen_lwp_llwpcb (op0));
35891 return 0;
35893 case IX86_BUILTIN_SLWPCB:
35894 icode = CODE_FOR_lwp_slwpcb;
35895 if (!target
35896 || !insn_data[icode].operand[0].predicate (target, Pmode))
35897 target = gen_reg_rtx (Pmode);
35898 emit_insn (gen_lwp_slwpcb (target));
35899 return target;
35901 case IX86_BUILTIN_BEXTRI32:
35902 case IX86_BUILTIN_BEXTRI64:
35903 arg0 = CALL_EXPR_ARG (exp, 0);
35904 arg1 = CALL_EXPR_ARG (exp, 1);
35905 op0 = expand_normal (arg0);
35906 op1 = expand_normal (arg1);
35907 icode = (fcode == IX86_BUILTIN_BEXTRI32
35908 ? CODE_FOR_tbm_bextri_si
35909 : CODE_FOR_tbm_bextri_di);
35910 if (!CONST_INT_P (op1))
35912 error ("last argument must be an immediate");
35913 return const0_rtx;
35915 else
35917 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
35918 unsigned char lsb_index = INTVAL (op1) & 0xFF;
35919 op1 = GEN_INT (length);
35920 op2 = GEN_INT (lsb_index);
35921 pat = GEN_FCN (icode) (target, op0, op1, op2);
35922 if (pat)
35923 emit_insn (pat);
35924 return target;
35927 case IX86_BUILTIN_RDRAND16_STEP:
35928 icode = CODE_FOR_rdrandhi_1;
35929 mode0 = HImode;
35930 goto rdrand_step;
35932 case IX86_BUILTIN_RDRAND32_STEP:
35933 icode = CODE_FOR_rdrandsi_1;
35934 mode0 = SImode;
35935 goto rdrand_step;
35937 case IX86_BUILTIN_RDRAND64_STEP:
35938 icode = CODE_FOR_rdranddi_1;
35939 mode0 = DImode;
35941 rdrand_step:
35942 arg0 = CALL_EXPR_ARG (exp, 0);
35943 op1 = expand_normal (arg0);
35944 if (!address_operand (op1, VOIDmode))
35946 op1 = convert_memory_address (Pmode, op1);
35947 op1 = copy_addr_to_reg (op1);
35950 op0 = gen_reg_rtx (mode0);
35951 emit_insn (GEN_FCN (icode) (op0));
35953 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
35955 op1 = gen_reg_rtx (SImode);
35956 emit_move_insn (op1, CONST1_RTX (SImode));
35958 /* Emit SImode conditional move. */
35959 if (mode0 == HImode)
35961 if (TARGET_ZERO_EXTEND_WITH_AND
35962 && optimize_function_for_speed_p (cfun))
35964 op2 = force_reg (SImode, const0_rtx);
35966 emit_insn (gen_movstricthi
35967 (gen_lowpart (HImode, op2), op0));
35969 else
35971 op2 = gen_reg_rtx (SImode);
35973 emit_insn (gen_zero_extendhisi2 (op2, op0));
35976 else if (mode0 == SImode)
35977 op2 = op0;
35978 else
35979 op2 = gen_rtx_SUBREG (SImode, op0, 0);
35981 if (target == 0
35982 || !register_operand (target, SImode))
35983 target = gen_reg_rtx (SImode);
35985 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
35986 const0_rtx);
35987 emit_insn (gen_rtx_SET (target,
35988 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
35989 return target;
35991 case IX86_BUILTIN_RDSEED16_STEP:
35992 icode = CODE_FOR_rdseedhi_1;
35993 mode0 = HImode;
35994 goto rdseed_step;
35996 case IX86_BUILTIN_RDSEED32_STEP:
35997 icode = CODE_FOR_rdseedsi_1;
35998 mode0 = SImode;
35999 goto rdseed_step;
36001 case IX86_BUILTIN_RDSEED64_STEP:
36002 icode = CODE_FOR_rdseeddi_1;
36003 mode0 = DImode;
36005 rdseed_step:
36006 arg0 = CALL_EXPR_ARG (exp, 0);
36007 op1 = expand_normal (arg0);
36008 if (!address_operand (op1, VOIDmode))
36010 op1 = convert_memory_address (Pmode, op1);
36011 op1 = copy_addr_to_reg (op1);
36014 op0 = gen_reg_rtx (mode0);
36015 emit_insn (GEN_FCN (icode) (op0));
36017 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36019 op2 = gen_reg_rtx (QImode);
36021 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36022 const0_rtx);
36023 emit_insn (gen_rtx_SET (op2, pat));
36025 if (target == 0
36026 || !register_operand (target, SImode))
36027 target = gen_reg_rtx (SImode);
36029 emit_insn (gen_zero_extendqisi2 (target, op2));
36030 return target;
36032 case IX86_BUILTIN_SBB32:
36033 icode = CODE_FOR_subborrowsi;
36034 mode0 = SImode;
36035 goto handlecarry;
36037 case IX86_BUILTIN_SBB64:
36038 icode = CODE_FOR_subborrowdi;
36039 mode0 = DImode;
36040 goto handlecarry;
36042 case IX86_BUILTIN_ADDCARRYX32:
36043 icode = CODE_FOR_addcarrysi;
36044 mode0 = SImode;
36045 goto handlecarry;
36047 case IX86_BUILTIN_ADDCARRYX64:
36048 icode = CODE_FOR_addcarrydi;
36049 mode0 = DImode;
36051 handlecarry:
36052 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36053 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36054 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36055 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36057 op1 = expand_normal (arg0);
36058 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36060 op2 = expand_normal (arg1);
36061 if (!register_operand (op2, mode0))
36062 op2 = copy_to_mode_reg (mode0, op2);
36064 op3 = expand_normal (arg2);
36065 if (!register_operand (op3, mode0))
36066 op3 = copy_to_mode_reg (mode0, op3);
36068 op4 = expand_normal (arg3);
36069 if (!address_operand (op4, VOIDmode))
36071 op4 = convert_memory_address (Pmode, op4);
36072 op4 = copy_addr_to_reg (op4);
36075 /* Generate CF from input operand. */
36076 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
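/* Adding -1 (0xff) to the QImode carry-in sets CF exactly when the
   carry-in is nonzero; that flag is then consumed by the adc/sbb pattern
   generated below.  */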
36078 /* Generate instruction that consumes CF. */
36079 op0 = gen_reg_rtx (mode0);
36081 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36082 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
36083 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
36085 /* Return current CF value. */
36086 if (target == 0)
36087 target = gen_reg_rtx (QImode);
36089 PUT_MODE (pat, QImode);
36090 emit_insn (gen_rtx_SET (target, pat));
36092 /* Store the result. */
36093 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36095 return target;
36097 case IX86_BUILTIN_READ_FLAGS:
36098 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36100 if (optimize
36101 || target == NULL_RTX
36102 || !nonimmediate_operand (target, word_mode)
36103 || GET_MODE (target) != word_mode)
36104 target = gen_reg_rtx (word_mode);
36106 emit_insn (gen_pop (target));
36107 return target;
36109 case IX86_BUILTIN_WRITE_FLAGS:
36111 arg0 = CALL_EXPR_ARG (exp, 0);
36112 op0 = expand_normal (arg0);
36113 if (!general_no_elim_operand (op0, word_mode))
36114 op0 = copy_to_mode_reg (word_mode, op0);
36116 emit_insn (gen_push (op0));
36117 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36118 return 0;
36120 case IX86_BUILTIN_KTESTC8:
36121 icode = CODE_FOR_ktestqi;
36122 mode3 = CCCmode;
36123 goto kortest;
36125 case IX86_BUILTIN_KTESTZ8:
36126 icode = CODE_FOR_ktestqi;
36127 mode3 = CCZmode;
36128 goto kortest;
36130 case IX86_BUILTIN_KTESTC16:
36131 icode = CODE_FOR_ktesthi;
36132 mode3 = CCCmode;
36133 goto kortest;
36135 case IX86_BUILTIN_KTESTZ16:
36136 icode = CODE_FOR_ktesthi;
36137 mode3 = CCZmode;
36138 goto kortest;
36140 case IX86_BUILTIN_KTESTC32:
36141 icode = CODE_FOR_ktestsi;
36142 mode3 = CCCmode;
36143 goto kortest;
36145 case IX86_BUILTIN_KTESTZ32:
36146 icode = CODE_FOR_ktestsi;
36147 mode3 = CCZmode;
36148 goto kortest;
36150 case IX86_BUILTIN_KTESTC64:
36151 icode = CODE_FOR_ktestdi;
36152 mode3 = CCCmode;
36153 goto kortest;
36155 case IX86_BUILTIN_KTESTZ64:
36156 icode = CODE_FOR_ktestdi;
36157 mode3 = CCZmode;
36158 goto kortest;
36160 case IX86_BUILTIN_KORTESTC8:
36161 icode = CODE_FOR_kortestqi;
36162 mode3 = CCCmode;
36163 goto kortest;
36165 case IX86_BUILTIN_KORTESTZ8:
36166 icode = CODE_FOR_kortestqi;
36167 mode3 = CCZmode;
36168 goto kortest;
36170 case IX86_BUILTIN_KORTESTC16:
36171 icode = CODE_FOR_kortesthi;
36172 mode3 = CCCmode;
36173 goto kortest;
36175 case IX86_BUILTIN_KORTESTZ16:
36176 icode = CODE_FOR_kortesthi;
36177 mode3 = CCZmode;
36178 goto kortest;
36180 case IX86_BUILTIN_KORTESTC32:
36181 icode = CODE_FOR_kortestsi;
36182 mode3 = CCCmode;
36183 goto kortest;
36185 case IX86_BUILTIN_KORTESTZ32:
36186 icode = CODE_FOR_kortestsi;
36187 mode3 = CCZmode;
36188 goto kortest;
36190 case IX86_BUILTIN_KORTESTC64:
36191 icode = CODE_FOR_kortestdi;
36192 mode3 = CCCmode;
36193 goto kortest;
36195 case IX86_BUILTIN_KORTESTZ64:
36196 icode = CODE_FOR_kortestdi;
36197 mode3 = CCZmode;
36199 kortest:
36200 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36201 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36202 op0 = expand_normal (arg0);
36203 op1 = expand_normal (arg1);
36205 mode0 = insn_data[icode].operand[0].mode;
36206 mode1 = insn_data[icode].operand[1].mode;
36208 if (GET_MODE (op0) != VOIDmode)
36209 op0 = force_reg (GET_MODE (op0), op0);
36211 op0 = gen_lowpart (mode0, op0);
36213 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36214 op0 = copy_to_mode_reg (mode0, op0);
36216 if (GET_MODE (op1) != VOIDmode)
36217 op1 = force_reg (GET_MODE (op1), op1);
36219 op1 = gen_lowpart (mode1, op1);
36221 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36222 op1 = copy_to_mode_reg (mode1, op1);
36224 target = gen_reg_rtx (QImode);
36226 /* Emit kortest. */
36227 emit_insn (GEN_FCN (icode) (op0, op1));
36228 /* And use setcc to return result from flags. */
36229 ix86_expand_setcc (target, EQ,
36230 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36231 return target;
36233 case IX86_BUILTIN_GATHERSIV2DF:
36234 icode = CODE_FOR_avx2_gathersiv2df;
36235 goto gather_gen;
36236 case IX86_BUILTIN_GATHERSIV4DF:
36237 icode = CODE_FOR_avx2_gathersiv4df;
36238 goto gather_gen;
36239 case IX86_BUILTIN_GATHERDIV2DF:
36240 icode = CODE_FOR_avx2_gatherdiv2df;
36241 goto gather_gen;
36242 case IX86_BUILTIN_GATHERDIV4DF:
36243 icode = CODE_FOR_avx2_gatherdiv4df;
36244 goto gather_gen;
36245 case IX86_BUILTIN_GATHERSIV4SF:
36246 icode = CODE_FOR_avx2_gathersiv4sf;
36247 goto gather_gen;
36248 case IX86_BUILTIN_GATHERSIV8SF:
36249 icode = CODE_FOR_avx2_gathersiv8sf;
36250 goto gather_gen;
36251 case IX86_BUILTIN_GATHERDIV4SF:
36252 icode = CODE_FOR_avx2_gatherdiv4sf;
36253 goto gather_gen;
36254 case IX86_BUILTIN_GATHERDIV8SF:
36255 icode = CODE_FOR_avx2_gatherdiv8sf;
36256 goto gather_gen;
36257 case IX86_BUILTIN_GATHERSIV2DI:
36258 icode = CODE_FOR_avx2_gathersiv2di;
36259 goto gather_gen;
36260 case IX86_BUILTIN_GATHERSIV4DI:
36261 icode = CODE_FOR_avx2_gathersiv4di;
36262 goto gather_gen;
36263 case IX86_BUILTIN_GATHERDIV2DI:
36264 icode = CODE_FOR_avx2_gatherdiv2di;
36265 goto gather_gen;
36266 case IX86_BUILTIN_GATHERDIV4DI:
36267 icode = CODE_FOR_avx2_gatherdiv4di;
36268 goto gather_gen;
36269 case IX86_BUILTIN_GATHERSIV4SI:
36270 icode = CODE_FOR_avx2_gathersiv4si;
36271 goto gather_gen;
36272 case IX86_BUILTIN_GATHERSIV8SI:
36273 icode = CODE_FOR_avx2_gathersiv8si;
36274 goto gather_gen;
36275 case IX86_BUILTIN_GATHERDIV4SI:
36276 icode = CODE_FOR_avx2_gatherdiv4si;
36277 goto gather_gen;
36278 case IX86_BUILTIN_GATHERDIV8SI:
36279 icode = CODE_FOR_avx2_gatherdiv8si;
36280 goto gather_gen;
36281 case IX86_BUILTIN_GATHERALTSIV4DF:
36282 icode = CODE_FOR_avx2_gathersiv4df;
36283 goto gather_gen;
36284 case IX86_BUILTIN_GATHERALTDIV8SF:
36285 icode = CODE_FOR_avx2_gatherdiv8sf;
36286 goto gather_gen;
36287 case IX86_BUILTIN_GATHERALTSIV4DI:
36288 icode = CODE_FOR_avx2_gathersiv4di;
36289 goto gather_gen;
36290 case IX86_BUILTIN_GATHERALTDIV8SI:
36291 icode = CODE_FOR_avx2_gatherdiv8si;
36292 goto gather_gen;
36293 case IX86_BUILTIN_GATHER3SIV16SF:
36294 icode = CODE_FOR_avx512f_gathersiv16sf;
36295 goto gather_gen;
36296 case IX86_BUILTIN_GATHER3SIV8DF:
36297 icode = CODE_FOR_avx512f_gathersiv8df;
36298 goto gather_gen;
36299 case IX86_BUILTIN_GATHER3DIV16SF:
36300 icode = CODE_FOR_avx512f_gatherdiv16sf;
36301 goto gather_gen;
36302 case IX86_BUILTIN_GATHER3DIV8DF:
36303 icode = CODE_FOR_avx512f_gatherdiv8df;
36304 goto gather_gen;
36305 case IX86_BUILTIN_GATHER3SIV16SI:
36306 icode = CODE_FOR_avx512f_gathersiv16si;
36307 goto gather_gen;
36308 case IX86_BUILTIN_GATHER3SIV8DI:
36309 icode = CODE_FOR_avx512f_gathersiv8di;
36310 goto gather_gen;
36311 case IX86_BUILTIN_GATHER3DIV16SI:
36312 icode = CODE_FOR_avx512f_gatherdiv16si;
36313 goto gather_gen;
36314 case IX86_BUILTIN_GATHER3DIV8DI:
36315 icode = CODE_FOR_avx512f_gatherdiv8di;
36316 goto gather_gen;
36317 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36318 icode = CODE_FOR_avx512f_gathersiv8df;
36319 goto gather_gen;
36320 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36321 icode = CODE_FOR_avx512f_gatherdiv16sf;
36322 goto gather_gen;
36323 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36324 icode = CODE_FOR_avx512f_gathersiv8di;
36325 goto gather_gen;
36326 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36327 icode = CODE_FOR_avx512f_gatherdiv16si;
36328 goto gather_gen;
36329 case IX86_BUILTIN_GATHER3SIV2DF:
36330 icode = CODE_FOR_avx512vl_gathersiv2df;
36331 goto gather_gen;
36332 case IX86_BUILTIN_GATHER3SIV4DF:
36333 icode = CODE_FOR_avx512vl_gathersiv4df;
36334 goto gather_gen;
36335 case IX86_BUILTIN_GATHER3DIV2DF:
36336 icode = CODE_FOR_avx512vl_gatherdiv2df;
36337 goto gather_gen;
36338 case IX86_BUILTIN_GATHER3DIV4DF:
36339 icode = CODE_FOR_avx512vl_gatherdiv4df;
36340 goto gather_gen;
36341 case IX86_BUILTIN_GATHER3SIV4SF:
36342 icode = CODE_FOR_avx512vl_gathersiv4sf;
36343 goto gather_gen;
36344 case IX86_BUILTIN_GATHER3SIV8SF:
36345 icode = CODE_FOR_avx512vl_gathersiv8sf;
36346 goto gather_gen;
36347 case IX86_BUILTIN_GATHER3DIV4SF:
36348 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36349 goto gather_gen;
36350 case IX86_BUILTIN_GATHER3DIV8SF:
36351 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36352 goto gather_gen;
36353 case IX86_BUILTIN_GATHER3SIV2DI:
36354 icode = CODE_FOR_avx512vl_gathersiv2di;
36355 goto gather_gen;
36356 case IX86_BUILTIN_GATHER3SIV4DI:
36357 icode = CODE_FOR_avx512vl_gathersiv4di;
36358 goto gather_gen;
36359 case IX86_BUILTIN_GATHER3DIV2DI:
36360 icode = CODE_FOR_avx512vl_gatherdiv2di;
36361 goto gather_gen;
36362 case IX86_BUILTIN_GATHER3DIV4DI:
36363 icode = CODE_FOR_avx512vl_gatherdiv4di;
36364 goto gather_gen;
36365 case IX86_BUILTIN_GATHER3SIV4SI:
36366 icode = CODE_FOR_avx512vl_gathersiv4si;
36367 goto gather_gen;
36368 case IX86_BUILTIN_GATHER3SIV8SI:
36369 icode = CODE_FOR_avx512vl_gathersiv8si;
36370 goto gather_gen;
36371 case IX86_BUILTIN_GATHER3DIV4SI:
36372 icode = CODE_FOR_avx512vl_gatherdiv4si;
36373 goto gather_gen;
36374 case IX86_BUILTIN_GATHER3DIV8SI:
36375 icode = CODE_FOR_avx512vl_gatherdiv8si;
36376 goto gather_gen;
36377 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36378 icode = CODE_FOR_avx512vl_gathersiv4df;
36379 goto gather_gen;
36380 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36381 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36382 goto gather_gen;
36383 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36384 icode = CODE_FOR_avx512vl_gathersiv4di;
36385 goto gather_gen;
36386 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36387 icode = CODE_FOR_avx512vl_gatherdiv8si;
36388 goto gather_gen;
36389 case IX86_BUILTIN_SCATTERSIV16SF:
36390 icode = CODE_FOR_avx512f_scattersiv16sf;
36391 goto scatter_gen;
36392 case IX86_BUILTIN_SCATTERSIV8DF:
36393 icode = CODE_FOR_avx512f_scattersiv8df;
36394 goto scatter_gen;
36395 case IX86_BUILTIN_SCATTERDIV16SF:
36396 icode = CODE_FOR_avx512f_scatterdiv16sf;
36397 goto scatter_gen;
36398 case IX86_BUILTIN_SCATTERDIV8DF:
36399 icode = CODE_FOR_avx512f_scatterdiv8df;
36400 goto scatter_gen;
36401 case IX86_BUILTIN_SCATTERSIV16SI:
36402 icode = CODE_FOR_avx512f_scattersiv16si;
36403 goto scatter_gen;
36404 case IX86_BUILTIN_SCATTERSIV8DI:
36405 icode = CODE_FOR_avx512f_scattersiv8di;
36406 goto scatter_gen;
36407 case IX86_BUILTIN_SCATTERDIV16SI:
36408 icode = CODE_FOR_avx512f_scatterdiv16si;
36409 goto scatter_gen;
36410 case IX86_BUILTIN_SCATTERDIV8DI:
36411 icode = CODE_FOR_avx512f_scatterdiv8di;
36412 goto scatter_gen;
36413 case IX86_BUILTIN_SCATTERSIV8SF:
36414 icode = CODE_FOR_avx512vl_scattersiv8sf;
36415 goto scatter_gen;
36416 case IX86_BUILTIN_SCATTERSIV4SF:
36417 icode = CODE_FOR_avx512vl_scattersiv4sf;
36418 goto scatter_gen;
36419 case IX86_BUILTIN_SCATTERSIV4DF:
36420 icode = CODE_FOR_avx512vl_scattersiv4df;
36421 goto scatter_gen;
36422 case IX86_BUILTIN_SCATTERSIV2DF:
36423 icode = CODE_FOR_avx512vl_scattersiv2df;
36424 goto scatter_gen;
36425 case IX86_BUILTIN_SCATTERDIV8SF:
36426 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36427 goto scatter_gen;
36428 case IX86_BUILTIN_SCATTERDIV4SF:
36429 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36430 goto scatter_gen;
36431 case IX86_BUILTIN_SCATTERDIV4DF:
36432 icode = CODE_FOR_avx512vl_scatterdiv4df;
36433 goto scatter_gen;
36434 case IX86_BUILTIN_SCATTERDIV2DF:
36435 icode = CODE_FOR_avx512vl_scatterdiv2df;
36436 goto scatter_gen;
36437 case IX86_BUILTIN_SCATTERSIV8SI:
36438 icode = CODE_FOR_avx512vl_scattersiv8si;
36439 goto scatter_gen;
36440 case IX86_BUILTIN_SCATTERSIV4SI:
36441 icode = CODE_FOR_avx512vl_scattersiv4si;
36442 goto scatter_gen;
36443 case IX86_BUILTIN_SCATTERSIV4DI:
36444 icode = CODE_FOR_avx512vl_scattersiv4di;
36445 goto scatter_gen;
36446 case IX86_BUILTIN_SCATTERSIV2DI:
36447 icode = CODE_FOR_avx512vl_scattersiv2di;
36448 goto scatter_gen;
36449 case IX86_BUILTIN_SCATTERDIV8SI:
36450 icode = CODE_FOR_avx512vl_scatterdiv8si;
36451 goto scatter_gen;
36452 case IX86_BUILTIN_SCATTERDIV4SI:
36453 icode = CODE_FOR_avx512vl_scatterdiv4si;
36454 goto scatter_gen;
36455 case IX86_BUILTIN_SCATTERDIV4DI:
36456 icode = CODE_FOR_avx512vl_scatterdiv4di;
36457 goto scatter_gen;
36458 case IX86_BUILTIN_SCATTERDIV2DI:
36459 icode = CODE_FOR_avx512vl_scatterdiv2di;
36460 goto scatter_gen;
36461 case IX86_BUILTIN_GATHERPFDPD:
36462 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36463 goto vec_prefetch_gen;
36464 case IX86_BUILTIN_SCATTERALTSIV8DF:
36465 icode = CODE_FOR_avx512f_scattersiv8df;
36466 goto scatter_gen;
36467 case IX86_BUILTIN_SCATTERALTDIV16SF:
36468 icode = CODE_FOR_avx512f_scatterdiv16sf;
36469 goto scatter_gen;
36470 case IX86_BUILTIN_SCATTERALTSIV8DI:
36471 icode = CODE_FOR_avx512f_scattersiv8di;
36472 goto scatter_gen;
36473 case IX86_BUILTIN_SCATTERALTDIV16SI:
36474 icode = CODE_FOR_avx512f_scatterdiv16si;
36475 goto scatter_gen;
36476 case IX86_BUILTIN_GATHERPFDPS:
36477 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36478 goto vec_prefetch_gen;
36479 case IX86_BUILTIN_GATHERPFQPD:
36480 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36481 goto vec_prefetch_gen;
36482 case IX86_BUILTIN_GATHERPFQPS:
36483 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36484 goto vec_prefetch_gen;
36485 case IX86_BUILTIN_SCATTERPFDPD:
36486 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36487 goto vec_prefetch_gen;
36488 case IX86_BUILTIN_SCATTERPFDPS:
36489 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36490 goto vec_prefetch_gen;
36491 case IX86_BUILTIN_SCATTERPFQPD:
36492 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36493 goto vec_prefetch_gen;
36494 case IX86_BUILTIN_SCATTERPFQPS:
36495 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36496 goto vec_prefetch_gen;
36498 gather_gen:
36499 rtx half;
36500 rtx (*gen) (rtx, rtx);
36502 arg0 = CALL_EXPR_ARG (exp, 0);
36503 arg1 = CALL_EXPR_ARG (exp, 1);
36504 arg2 = CALL_EXPR_ARG (exp, 2);
36505 arg3 = CALL_EXPR_ARG (exp, 3);
36506 arg4 = CALL_EXPR_ARG (exp, 4);
36507 op0 = expand_normal (arg0);
36508 op1 = expand_normal (arg1);
36509 op2 = expand_normal (arg2);
36510 op3 = expand_normal (arg3);
36511 op4 = expand_normal (arg4);
36512 /* Note the arg order is different from the operand order. */
36513 mode0 = insn_data[icode].operand[1].mode;
36514 mode2 = insn_data[icode].operand[3].mode;
36515 mode3 = insn_data[icode].operand[4].mode;
36516 mode4 = insn_data[icode].operand[5].mode;
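/* For the gather patterns operand 0 is the destination; operands 1 to 5
   take arg0 (the source), arg1 (the base address), arg2 (the index
   vector), arg3 (the mask) and arg4 (the scale) respectively.  */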
36518 if (target == NULL_RTX
36519 || GET_MODE (target) != insn_data[icode].operand[0].mode
36520 || !insn_data[icode].operand[0].predicate (target,
36521 GET_MODE (target)))
36522 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36523 else
36524 subtarget = target;
36526 switch (fcode)
36528 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36529 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36530 half = gen_reg_rtx (V8SImode);
36531 if (!nonimmediate_operand (op2, V16SImode))
36532 op2 = copy_to_mode_reg (V16SImode, op2);
36533 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36534 op2 = half;
36535 break;
36536 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36537 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36538 case IX86_BUILTIN_GATHERALTSIV4DF:
36539 case IX86_BUILTIN_GATHERALTSIV4DI:
36540 half = gen_reg_rtx (V4SImode);
36541 if (!nonimmediate_operand (op2, V8SImode))
36542 op2 = copy_to_mode_reg (V8SImode, op2);
36543 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36544 op2 = half;
36545 break;
36546 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36547 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36548 half = gen_reg_rtx (mode0);
36549 if (mode0 == V8SFmode)
36550 gen = gen_vec_extract_lo_v16sf;
36551 else
36552 gen = gen_vec_extract_lo_v16si;
36553 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36554 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36555 emit_insn (gen (half, op0));
36556 op0 = half;
36557 if (GET_MODE (op3) != VOIDmode)
36559 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36560 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36561 emit_insn (gen (half, op3));
36562 op3 = half;
36564 break;
36565 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36566 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36567 case IX86_BUILTIN_GATHERALTDIV8SF:
36568 case IX86_BUILTIN_GATHERALTDIV8SI:
36569 half = gen_reg_rtx (mode0);
36570 if (mode0 == V4SFmode)
36571 gen = gen_vec_extract_lo_v8sf;
36572 else
36573 gen = gen_vec_extract_lo_v8si;
36574 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36575 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36576 emit_insn (gen (half, op0));
36577 op0 = half;
36578 if (GET_MODE (op3) != VOIDmode)
36580 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36581 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36582 emit_insn (gen (half, op3));
36583 op3 = half;
36585 break;
36586 default:
36587 break;
36590 /* Force the address into a register so that the memory operand
36591 uses only a base register. We don't want to do this for the
36592 memory operands of other builtin functions. */
36593 op1 = ix86_zero_extend_to_Pmode (op1);
36595 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36596 op0 = copy_to_mode_reg (mode0, op0);
36597 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36598 op1 = copy_to_mode_reg (Pmode, op1);
36599 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36600 op2 = copy_to_mode_reg (mode2, op2);
36602 op3 = fixup_modeless_constant (op3, mode3);
36604 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36606 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36607 op3 = copy_to_mode_reg (mode3, op3);
36609 else
36611 op3 = copy_to_reg (op3);
36612 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36614 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36616 error ("the last argument must be scale 1, 2, 4, 8");
36617 return const0_rtx;
36620 /* Optimize. If mask is known to have all high bits set,
36621 replace op0 with pc_rtx to signal that the instruction
36622 overwrites the whole destination and doesn't use its
36623 previous contents. */
36624 if (optimize)
36626 if (TREE_CODE (arg3) == INTEGER_CST)
36628 if (integer_all_onesp (arg3))
36629 op0 = pc_rtx;
36631 else if (TREE_CODE (arg3) == VECTOR_CST)
36633 unsigned int negative = 0;
36634 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36636 tree cst = VECTOR_CST_ELT (arg3, i);
36637 if (TREE_CODE (cst) == INTEGER_CST
36638 && tree_int_cst_sign_bit (cst))
36639 negative++;
36640 else if (TREE_CODE (cst) == REAL_CST
36641 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36642 negative++;
36644 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36645 op0 = pc_rtx;
36647 else if (TREE_CODE (arg3) == SSA_NAME
36648 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36650 /* Recognize also when mask is like:
36651 __v2df src = _mm_setzero_pd ();
36652 __v2df mask = _mm_cmpeq_pd (src, src);
36654 __v8sf src = _mm256_setzero_ps ();
36655 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36656 as that is a cheaper way to load all ones into
36657 a register than having to load a constant from
36658 memory. */
36659 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36660 if (is_gimple_call (def_stmt))
36662 tree fndecl = gimple_call_fndecl (def_stmt);
36663 if (fndecl
36664 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36665 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36667 case IX86_BUILTIN_CMPPD:
36668 case IX86_BUILTIN_CMPPS:
36669 case IX86_BUILTIN_CMPPD256:
36670 case IX86_BUILTIN_CMPPS256:
36671 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36672 break;
36673 /* FALLTHRU */
36674 case IX86_BUILTIN_CMPEQPD:
36675 case IX86_BUILTIN_CMPEQPS:
36676 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36677 && initializer_zerop (gimple_call_arg (def_stmt,
36678 1)))
36679 op0 = pc_rtx;
36680 break;
36681 default:
36682 break;
36688 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
36689 if (! pat)
36690 return const0_rtx;
36691 emit_insn (pat);
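/* For the *div* gathers where the DImode index vector has fewer elements
   than the destination mode, only the low half of SUBTARGET is valid;
   extract it into TARGET.  */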
36693 switch (fcode)
36695 case IX86_BUILTIN_GATHER3DIV16SF:
36696 if (target == NULL_RTX)
36697 target = gen_reg_rtx (V8SFmode);
36698 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
36699 break;
36700 case IX86_BUILTIN_GATHER3DIV16SI:
36701 if (target == NULL_RTX)
36702 target = gen_reg_rtx (V8SImode);
36703 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
36704 break;
36705 case IX86_BUILTIN_GATHER3DIV8SF:
36706 case IX86_BUILTIN_GATHERDIV8SF:
36707 if (target == NULL_RTX)
36708 target = gen_reg_rtx (V4SFmode);
36709 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
36710 break;
36711 case IX86_BUILTIN_GATHER3DIV8SI:
36712 case IX86_BUILTIN_GATHERDIV8SI:
36713 if (target == NULL_RTX)
36714 target = gen_reg_rtx (V4SImode);
36715 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
36716 break;
36717 default:
36718 target = subtarget;
36719 break;
36721 return target;
36723 scatter_gen:
36724 arg0 = CALL_EXPR_ARG (exp, 0);
36725 arg1 = CALL_EXPR_ARG (exp, 1);
36726 arg2 = CALL_EXPR_ARG (exp, 2);
36727 arg3 = CALL_EXPR_ARG (exp, 3);
36728 arg4 = CALL_EXPR_ARG (exp, 4);
36729 op0 = expand_normal (arg0);
36730 op1 = expand_normal (arg1);
36731 op2 = expand_normal (arg2);
36732 op3 = expand_normal (arg3);
36733 op4 = expand_normal (arg4);
36734 mode1 = insn_data[icode].operand[1].mode;
36735 mode2 = insn_data[icode].operand[2].mode;
36736 mode3 = insn_data[icode].operand[3].mode;
36737 mode4 = insn_data[icode].operand[4].mode;
36739 /* Scatter instruction stores operand op3 to memory with
36740 indices from op2 and scale from op4 under writemask op1.
36741 If the index operand op2 has more elements than the source
36742 operand op3, only its low half needs to be used, and vice versa. */
36743 switch (fcode)
36745 case IX86_BUILTIN_SCATTERALTSIV8DF:
36746 case IX86_BUILTIN_SCATTERALTSIV8DI:
36747 half = gen_reg_rtx (V8SImode);
36748 if (!nonimmediate_operand (op2, V16SImode))
36749 op2 = copy_to_mode_reg (V16SImode, op2);
36750 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36751 op2 = half;
36752 break;
36753 case IX86_BUILTIN_SCATTERALTDIV16SF:
36754 case IX86_BUILTIN_SCATTERALTDIV16SI:
36755 half = gen_reg_rtx (mode3);
36756 if (mode3 == V8SFmode)
36757 gen = gen_vec_extract_lo_v16sf;
36758 else
36759 gen = gen_vec_extract_lo_v16si;
36760 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36761 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36762 emit_insn (gen (half, op3));
36763 op3 = half;
36764 break;
36765 default:
36766 break;
36769 /* Force the address into a register so that the memory operand
36770 uses only a base register. We don't want to do this for the
36771 memory operands of other builtin functions. */
36772 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
36774 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36775 op0 = copy_to_mode_reg (Pmode, op0);
36777 op1 = fixup_modeless_constant (op1, mode1);
36779 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
36781 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36782 op1 = copy_to_mode_reg (mode1, op1);
36784 else
36786 op1 = copy_to_reg (op1);
36787 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
36790 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36791 op2 = copy_to_mode_reg (mode2, op2);
36793 if (!insn_data[icode].operand[3].predicate (op3, mode3))
36794 op3 = copy_to_mode_reg (mode3, op3);
36796 if (!insn_data[icode].operand[4].predicate (op4, mode4))
36798 error ("the last argument must be scale 1, 2, 4, 8");
36799 return const0_rtx;
36802 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
36803 if (! pat)
36804 return const0_rtx;
36806 emit_insn (pat);
36807 return 0;
36809 vec_prefetch_gen:
36810 arg0 = CALL_EXPR_ARG (exp, 0);
36811 arg1 = CALL_EXPR_ARG (exp, 1);
36812 arg2 = CALL_EXPR_ARG (exp, 2);
36813 arg3 = CALL_EXPR_ARG (exp, 3);
36814 arg4 = CALL_EXPR_ARG (exp, 4);
36815 op0 = expand_normal (arg0);
36816 op1 = expand_normal (arg1);
36817 op2 = expand_normal (arg2);
36818 op3 = expand_normal (arg3);
36819 op4 = expand_normal (arg4);
36820 mode0 = insn_data[icode].operand[0].mode;
36821 mode1 = insn_data[icode].operand[1].mode;
36822 mode3 = insn_data[icode].operand[3].mode;
36823 mode4 = insn_data[icode].operand[4].mode;
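/* For the prefetch patterns the arguments map directly onto the operands:
   arg0 is the mask, arg1 the index vector, arg2 the base address,
   arg3 the scale and arg4 the hint.  */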
36825 op0 = fixup_modeless_constant (op0, mode0);
36827 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
36829 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36830 op0 = copy_to_mode_reg (mode0, op0);
36832 else
36834 op0 = copy_to_reg (op0);
36835 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
36838 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36839 op1 = copy_to_mode_reg (mode1, op1);
36841 /* Force the address into a register so that the memory operand
36842 uses only a base register. We don't want to do this for the
36843 memory operands of other builtin functions. */
36844 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
36846 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
36847 op2 = copy_to_mode_reg (Pmode, op2);
36849 if (!insn_data[icode].operand[3].predicate (op3, mode3))
36851 error ("the forth argument must be scale 1, 2, 4, 8");
36852 return const0_rtx;
36855 if (!insn_data[icode].operand[4].predicate (op4, mode4))
36857 error ("incorrect hint operand");
36858 return const0_rtx;
36861 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
36862 if (! pat)
36863 return const0_rtx;
36865 emit_insn (pat);
36867 return 0;
36869 case IX86_BUILTIN_XABORT:
36870 icode = CODE_FOR_xabort;
36871 arg0 = CALL_EXPR_ARG (exp, 0);
36872 op0 = expand_normal (arg0);
36873 mode0 = insn_data[icode].operand[0].mode;
36874 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36876 error ("the xabort's argument must be an 8-bit immediate");
36877 return const0_rtx;
36879 emit_insn (gen_xabort (op0));
36880 return 0;
36882 case IX86_BUILTIN_RSTORSSP:
36883 case IX86_BUILTIN_CLRSSBSY:
36884 arg0 = CALL_EXPR_ARG (exp, 0);
36885 op0 = expand_normal (arg0);
36886 icode = (fcode == IX86_BUILTIN_RSTORSSP
36887 ? CODE_FOR_rstorssp
36888 : CODE_FOR_clrssbsy);
36889 if (!address_operand (op0, VOIDmode))
36891 op1 = convert_memory_address (Pmode, op0);
36892 op0 = copy_addr_to_reg (op1);
36894 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
36895 return 0;
36897 case IX86_BUILTIN_WRSSD:
36898 case IX86_BUILTIN_WRSSQ:
36899 case IX86_BUILTIN_WRUSSD:
36900 case IX86_BUILTIN_WRUSSQ:
36901 arg0 = CALL_EXPR_ARG (exp, 0);
36902 op0 = expand_normal (arg0);
36903 arg1 = CALL_EXPR_ARG (exp, 1);
36904 op1 = expand_normal (arg1);
36905 switch (fcode)
36907 case IX86_BUILTIN_WRSSD:
36908 icode = CODE_FOR_wrsssi;
36909 mode = SImode;
36910 break;
36911 case IX86_BUILTIN_WRSSQ:
36912 icode = CODE_FOR_wrssdi;
36913 mode = DImode;
36914 break;
36915 case IX86_BUILTIN_WRUSSD:
36916 icode = CODE_FOR_wrusssi;
36917 mode = SImode;
36918 break;
36919 case IX86_BUILTIN_WRUSSQ:
36920 icode = CODE_FOR_wrussdi;
36921 mode = DImode;
36922 break;
36924 op0 = force_reg (mode, op0);
36925 if (!address_operand (op1, VOIDmode))
36927 op2 = convert_memory_address (Pmode, op1);
36928 op1 = copy_addr_to_reg (op2);
36930 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
36931 return 0;
36933 default:
36934 break;
36937 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
36938 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
36940 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
36941 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
36942 target);
36945 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
36946 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
36948 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
36949 switch (fcode)
36951 case IX86_BUILTIN_FABSQ:
36952 case IX86_BUILTIN_COPYSIGNQ:
36953 if (!TARGET_SSE)
36954 /* Emit a normal call if SSE isn't available. */
36955 return expand_call (exp, target, ignore);
36956 /* FALLTHRU */
36957 default:
36958 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
36962 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
36963 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
36965 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
36966 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
36967 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
36968 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
36969 int masked = 1;
36970 machine_mode mode, wide_mode, nar_mode;
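/* Default to the masked packed single precision 4FMA variant; the switch
   below overrides these for the 4VNNIW, negated and scalar forms.  */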
36972 nar_mode = V4SFmode;
36973 mode = V16SFmode;
36974 wide_mode = V64SFmode;
36975 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
36976 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
36978 switch (fcode)
36980 case IX86_BUILTIN_4FMAPS:
36981 fcn = gen_avx5124fmaddps_4fmaddps;
36982 masked = 0;
36983 goto v4fma_expand;
36985 case IX86_BUILTIN_4DPWSSD:
36986 nar_mode = V4SImode;
36987 mode = V16SImode;
36988 wide_mode = V64SImode;
36989 fcn = gen_avx5124vnniw_vp4dpwssd;
36990 masked = 0;
36991 goto v4fma_expand;
36993 case IX86_BUILTIN_4DPWSSDS:
36994 nar_mode = V4SImode;
36995 mode = V16SImode;
36996 wide_mode = V64SImode;
36997 fcn = gen_avx5124vnniw_vp4dpwssds;
36998 masked = 0;
36999 goto v4fma_expand;
37001 case IX86_BUILTIN_4FNMAPS:
37002 fcn = gen_avx5124fmaddps_4fnmaddps;
37003 masked = 0;
37004 goto v4fma_expand;
37006 case IX86_BUILTIN_4FNMAPS_MASK:
37007 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37008 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37009 goto v4fma_expand;
37011 case IX86_BUILTIN_4DPWSSD_MASK:
37012 nar_mode = V4SImode;
37013 mode = V16SImode;
37014 wide_mode = V64SImode;
37015 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37016 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37017 goto v4fma_expand;
37019 case IX86_BUILTIN_4DPWSSDS_MASK:
37020 nar_mode = V4SImode;
37021 mode = V16SImode;
37022 wide_mode = V64SImode;
37023 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37024 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37025 goto v4fma_expand;
37027 case IX86_BUILTIN_4FMAPS_MASK:
37029 tree args[4];
37030 rtx ops[4];
37031 rtx wide_reg;
37032 rtx accum;
37033 rtx addr;
37034 rtx mem;
37036 v4fma_expand:
37037 wide_reg = gen_reg_rtx (wide_mode);
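/* Copy the four vector source operands into consecutive 64-byte slices
   of the wide register expected by the 4FMA/4VNNIW patterns.  */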
37038 for (i = 0; i < 4; i++)
37040 args[i] = CALL_EXPR_ARG (exp, i);
37041 ops[i] = expand_normal (args[i]);
37043 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37044 ops[i]);
37047 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37048 accum = force_reg (mode, accum);
37050 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37051 addr = force_reg (Pmode, addr);
37053 mem = gen_rtx_MEM (nar_mode, addr);
37055 target = gen_reg_rtx (mode);
37057 emit_move_insn (target, accum);
37059 if (! masked)
37060 emit_insn (fcn (target, accum, wide_reg, mem));
37061 else
37063 rtx merge, mask;
37064 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37066 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37068 if (CONST_INT_P (mask))
37069 mask = fixup_modeless_constant (mask, HImode);
37071 mask = force_reg (HImode, mask);
37073 if (GET_MODE (mask) != HImode)
37074 mask = gen_rtx_SUBREG (HImode, mask, 0);
37076 /* If merge is 0 then we're about to emit z-masked variant. */
37077 if (const0_operand (merge, mode))
37078 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37079 /* If merge is the same as accum then emit merge-masked variant. */
37080 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37082 merge = force_reg (mode, merge);
37083 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37085 /* Merge with something unknown might happen if we z-mask w/ -O0. */
37086 else
37088 target = gen_reg_rtx (mode);
37089 emit_move_insn (target, merge);
37090 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37093 return target;
37096 case IX86_BUILTIN_4FNMASS:
37097 fcn = gen_avx5124fmaddps_4fnmaddss;
37098 masked = 0;
37099 goto s4fma_expand;
37101 case IX86_BUILTIN_4FMASS:
37102 fcn = gen_avx5124fmaddps_4fmaddss;
37103 masked = 0;
37104 goto s4fma_expand;
37106 case IX86_BUILTIN_4FNMASS_MASK:
37107 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37108 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37109 goto s4fma_expand;
37111 case IX86_BUILTIN_4FMASS_MASK:
37113 tree args[4];
37114 rtx ops[4];
37115 rtx wide_reg;
37116 rtx accum;
37117 rtx addr;
37118 rtx mem;
37120 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37121 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37123 s4fma_expand:
37124 mode = V4SFmode;
37125 wide_reg = gen_reg_rtx (V64SFmode);
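/* For the scalar forms only the low SFmode element of each source operand
   is used; place it in the corresponding 64-byte slice of the wide
   register.  */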
37126 for (i = 0; i < 4; i++)
37128 rtx tmp;
37129 args[i] = CALL_EXPR_ARG (exp, i);
37130 ops[i] = expand_normal (args[i]);
37132 tmp = gen_reg_rtx (SFmode);
37133 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37135 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37136 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37139 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37140 accum = force_reg (V4SFmode, accum);
37142 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37143 addr = force_reg (Pmode, addr);
37145 mem = gen_rtx_MEM (V4SFmode, addr);
37147 target = gen_reg_rtx (V4SFmode);
37149 emit_move_insn (target, accum);
37151 if (! masked)
37152 emit_insn (fcn (target, accum, wide_reg, mem));
37153 else
37155 rtx merge, mask;
37156 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37158 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37160 if (CONST_INT_P (mask))
37161 mask = fixup_modeless_constant (mask, QImode);
37163 mask = force_reg (QImode, mask);
37165 if (GET_MODE (mask) != QImode)
37166 mask = gen_rtx_SUBREG (QImode, mask, 0);
37168 /* If merge is 0 then we're about to emit z-masked variant. */
37169 if (const0_operand (merge, mode))
37170 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37171 /* If merge is the same as accum then emit merge-masked
37172 variant. */
37173 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37175 merge = force_reg (mode, merge);
37176 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37178 /* Merge with something unknown might happen if we z-mask
37179 w/ -O0. */
37180 else
37182 target = gen_reg_rtx (mode);
37183 emit_move_insn (target, merge);
37184 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37187 return target;
37189 case IX86_BUILTIN_RDPID:
37190 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37191 target);
37192 default:
37193 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37197 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37198 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37200 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37201 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37204 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37205 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37207 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37208 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37211 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37212 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37214 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37215 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37218 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37219 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37221 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37222 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37225 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37226 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37228 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37229 const struct builtin_description *d = bdesc_multi_arg + i;
37230 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37231 (enum ix86_builtin_func_type)
37232 d->flag, d->comparison);
37235 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37236 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37238 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37239 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37240 target);
37243 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37244 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37246 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37247 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37248 target);
37251 gcc_unreachable ();
37254 /* Return the target-specific builtin with code CODE if
37255 current_function_decl has visibility on this builtin, which is checked
37256 using the ISA flags. Return NULL_TREE otherwise. */
37258 static tree ix86_get_builtin (enum ix86_builtins code)
37260 struct cl_target_option *opts;
37261 tree target_tree = NULL_TREE;
37263 /* Determine the isa flags of current_function_decl. */
37265 if (current_function_decl)
37266 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37268 if (target_tree == NULL)
37269 target_tree = target_option_default_node;
37271 opts = TREE_TARGET_OPTION (target_tree);
37273 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37274 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37275 return ix86_builtin_decl (code, true);
37276 else
37277 return NULL_TREE;
37280 /* Return the function decl for the target-specific builtin
37281 for the given MPX builtin passed in FCODE. */
37282 static tree
37283 ix86_builtin_mpx_function (unsigned fcode)
37285 switch (fcode)
37287 case BUILT_IN_CHKP_BNDMK:
37288 return ix86_builtins[IX86_BUILTIN_BNDMK];
37290 case BUILT_IN_CHKP_BNDSTX:
37291 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37293 case BUILT_IN_CHKP_BNDLDX:
37294 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37296 case BUILT_IN_CHKP_BNDCL:
37297 return ix86_builtins[IX86_BUILTIN_BNDCL];
37299 case BUILT_IN_CHKP_BNDCU:
37300 return ix86_builtins[IX86_BUILTIN_BNDCU];
37302 case BUILT_IN_CHKP_BNDRET:
37303 return ix86_builtins[IX86_BUILTIN_BNDRET];
37305 case BUILT_IN_CHKP_INTERSECT:
37306 return ix86_builtins[IX86_BUILTIN_BNDINT];
37308 case BUILT_IN_CHKP_NARROW:
37309 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37311 case BUILT_IN_CHKP_SIZEOF:
37312 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37314 case BUILT_IN_CHKP_EXTRACT_LOWER:
37315 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37317 case BUILT_IN_CHKP_EXTRACT_UPPER:
37318 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37320 default:
37321 return NULL_TREE;
37324 gcc_unreachable ();
37327 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37329 Return an address to be used to load/store bounds for the
37330 pointer passed in SLOT.
37332 SLOT_NO is an integer constant holding the number of a
37333 target-dependent special slot to be used in case SLOT is not a memory.
37335 SPECIAL_BASE is a pointer to be used as the base of fake addresses
37336 to access special slots in the Bounds Table. SPECIAL_BASE[-1],
37337 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37339 static rtx
37340 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37342 rtx addr = NULL;
37344 /* A NULL slot means we pass bounds for a pointer that is not
37345 passed to the function at all. A register slot means the
37346 pointer is passed in a register. In both cases bounds are
37347 passed via the Bounds Table. Since we do not have the actual
37348 pointer stored in memory, we have to use fake addresses to access
37349 the Bounds Table. We start with (special_base - sizeof (void*)) and
37350 decrease this address by the pointer size to get addresses for other slots. */
37351 if (!slot || REG_P (slot))
37353 gcc_assert (CONST_INT_P (slot_no));
37354 addr = plus_constant (Pmode, special_base,
37355 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37357 /* If the pointer is passed in memory then its address is used to
37358 access the Bounds Table. */
37359 else if (MEM_P (slot))
37361 addr = XEXP (slot, 0);
37362 if (!register_operand (addr, Pmode))
37363 addr = copy_addr_to_reg (addr);
37365 else
37366 gcc_unreachable ();
37368 return addr;
37371 /* Expand pass uses this hook to load bounds for function parameter
37372 PTR passed in SLOT in case its bounds are not passed in a register.
37374 If SLOT is a memory, then bounds are loaded as for a regular pointer
37375 loaded from memory. PTR may be NULL in case SLOT is a memory;
37376 in that case the value of PTR (if required) may be loaded from SLOT.
37378 If SLOT is NULL or a register then SLOT_NO is an integer constant
37379 holding number of the target dependent special slot which should be
37380 used to obtain bounds.
37382 Return loaded bounds. */
37384 static rtx
37385 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37387 rtx reg = gen_reg_rtx (BNDmode);
37388 rtx addr;
37390 /* Get address to be used to access Bounds Table. Special slots start
37391 at the location of return address of the current function. */
37392 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37394 /* Load pointer value from a memory if we don't have it. */
37395 if (!ptr)
37397 gcc_assert (MEM_P (slot));
37398 ptr = copy_addr_to_reg (slot);
37401 if (!register_operand (ptr, Pmode))
37402 ptr = ix86_zero_extend_to_Pmode (ptr);
37404 emit_insn (BNDmode == BND64mode
37405 ? gen_bnd64_ldx (reg, addr, ptr)
37406 : gen_bnd32_ldx (reg, addr, ptr));
37408 return reg;
37411 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37412 passed in SLOT in case BOUNDS are not passed in a register.
37414 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
37415 stored in memory. PTR may be NULL in case SLOT is a memory;
37416 in that case the value of PTR (if required) may be loaded from SLOT.
37418 If SLOT is NULL or a register then SLOT_NO is an integer constant
37419 holding number of the target dependent special slot which should be
37420 used to store BOUNDS. */
37422 static void
37423 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37425 rtx addr;
37427 /* Get address to be used to access Bounds Table. Special slots start
37428 at the location of return address of a called function. */
37429 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37431 /* Load pointer value from a memory if we don't have it. */
37432 if (!ptr)
37434 gcc_assert (MEM_P (slot));
37435 ptr = copy_addr_to_reg (slot);
37438 if (!register_operand (ptr, Pmode))
37439 ptr = ix86_zero_extend_to_Pmode (ptr);
37441 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37442 if (!register_operand (bounds, BNDmode))
37443 bounds = copy_to_mode_reg (BNDmode, bounds);
37445 emit_insn (BNDmode == BND64mode
37446 ? gen_bnd64_stx (addr, ptr, bounds)
37447 : gen_bnd32_stx (addr, ptr, bounds));
37450 /* Load and return bounds returned by function in SLOT. */
37452 static rtx
37453 ix86_load_returned_bounds (rtx slot)
37455 rtx res;
37457 gcc_assert (REG_P (slot));
37458 res = gen_reg_rtx (BNDmode);
37459 emit_move_insn (res, slot);
37461 return res;
37464 /* Store BOUNDS returned by function into SLOT. */
37466 static void
37467 ix86_store_returned_bounds (rtx slot, rtx bounds)
37469 gcc_assert (REG_P (slot));
37470 emit_move_insn (slot, bounds);
37473 /* Return a function decl for a vectorized version of the combined function
37474 with combined_fn code FN and result vector type TYPE_OUT, or NULL_TREE
37475 if it is not available. */
37477 static tree
37478 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37479 tree type_in)
37481 machine_mode in_mode, out_mode;
37482 int in_n, out_n;
37484 if (TREE_CODE (type_out) != VECTOR_TYPE
37485 || TREE_CODE (type_in) != VECTOR_TYPE)
37486 return NULL_TREE;
37488 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37489 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37490 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37491 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37493 switch (fn)
37495 CASE_CFN_EXP2:
37496 if (out_mode == SFmode && in_mode == SFmode)
37498 if (out_n == 16 && in_n == 16)
37499 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37501 break;
37503 CASE_CFN_IFLOOR:
37504 CASE_CFN_LFLOOR:
37505 CASE_CFN_LLFLOOR:
37506 /* The round insn does not trap on denormals. */
37507 if (flag_trapping_math || !TARGET_SSE4_1)
37508 break;
37510 if (out_mode == SImode && in_mode == DFmode)
37512 if (out_n == 4 && in_n == 2)
37513 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37514 else if (out_n == 8 && in_n == 4)
37515 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37516 else if (out_n == 16 && in_n == 8)
37517 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37519 if (out_mode == SImode && in_mode == SFmode)
37521 if (out_n == 4 && in_n == 4)
37522 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37523 else if (out_n == 8 && in_n == 8)
37524 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37525 else if (out_n == 16 && in_n == 16)
37526 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37528 break;
37530 CASE_CFN_ICEIL:
37531 CASE_CFN_LCEIL:
37532 CASE_CFN_LLCEIL:
37533 /* The round insn does not trap on denormals. */
37534 if (flag_trapping_math || !TARGET_SSE4_1)
37535 break;
37537 if (out_mode == SImode && in_mode == DFmode)
37539 if (out_n == 4 && in_n == 2)
37540 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37541 else if (out_n == 8 && in_n == 4)
37542 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37543 else if (out_n == 16 && in_n == 8)
37544 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37546 if (out_mode == SImode && in_mode == SFmode)
37548 if (out_n == 4 && in_n == 4)
37549 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37550 else if (out_n == 8 && in_n == 8)
37551 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37552 else if (out_n == 16 && in_n == 16)
37553 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37555 break;
37557 CASE_CFN_IRINT:
37558 CASE_CFN_LRINT:
37559 CASE_CFN_LLRINT:
37560 if (out_mode == SImode && in_mode == DFmode)
37562 if (out_n == 4 && in_n == 2)
37563 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37564 else if (out_n == 8 && in_n == 4)
37565 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37566 else if (out_n == 16 && in_n == 8)
37567 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37569 if (out_mode == SImode && in_mode == SFmode)
37571 if (out_n == 4 && in_n == 4)
37572 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37573 else if (out_n == 8 && in_n == 8)
37574 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37575 else if (out_n == 16 && in_n == 16)
37576 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37578 break;
37580 CASE_CFN_IROUND:
37581 CASE_CFN_LROUND:
37582 CASE_CFN_LLROUND:
37583 /* The round insn does not trap on denormals. */
37584 if (flag_trapping_math || !TARGET_SSE4_1)
37585 break;
37587 if (out_mode == SImode && in_mode == DFmode)
37589 if (out_n == 4 && in_n == 2)
37590 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37591 else if (out_n == 8 && in_n == 4)
37592 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37593 else if (out_n == 16 && in_n == 8)
37594 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37596 if (out_mode == SImode && in_mode == SFmode)
37598 if (out_n == 4 && in_n == 4)
37599 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37600 else if (out_n == 8 && in_n == 8)
37601 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37602 else if (out_n == 16 && in_n == 16)
37603 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37605 break;
37607 CASE_CFN_FLOOR:
37608 /* The round insn does not trap on denormals. */
37609 if (flag_trapping_math || !TARGET_SSE4_1)
37610 break;
37612 if (out_mode == DFmode && in_mode == DFmode)
37614 if (out_n == 2 && in_n == 2)
37615 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37616 else if (out_n == 4 && in_n == 4)
37617 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37618 else if (out_n == 8 && in_n == 8)
37619 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37621 if (out_mode == SFmode && in_mode == SFmode)
37623 if (out_n == 4 && in_n == 4)
37624 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37625 else if (out_n == 8 && in_n == 8)
37626 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37627 else if (out_n == 16 && in_n == 16)
37628 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37630 break;
37632 CASE_CFN_CEIL:
37633 /* The round insn does not trap on denormals. */
37634 if (flag_trapping_math || !TARGET_SSE4_1)
37635 break;
37637 if (out_mode == DFmode && in_mode == DFmode)
37639 if (out_n == 2 && in_n == 2)
37640 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37641 else if (out_n == 4 && in_n == 4)
37642 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37643 else if (out_n == 8 && in_n == 8)
37644 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37646 if (out_mode == SFmode && in_mode == SFmode)
37648 if (out_n == 4 && in_n == 4)
37649 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37650 else if (out_n == 8 && in_n == 8)
37651 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37652 else if (out_n == 16 && in_n == 16)
37653 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37655 break;
37657 CASE_CFN_TRUNC:
37658 /* The round insn does not trap on denormals. */
37659 if (flag_trapping_math || !TARGET_SSE4_1)
37660 break;
37662 if (out_mode == DFmode && in_mode == DFmode)
37664 if (out_n == 2 && in_n == 2)
37665 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37666 else if (out_n == 4 && in_n == 4)
37667 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37668 else if (out_n == 8 && in_n == 8)
37669 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37671 if (out_mode == SFmode && in_mode == SFmode)
37673 if (out_n == 4 && in_n == 4)
37674 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37675 else if (out_n == 8 && in_n == 8)
37676 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37677 else if (out_n == 16 && in_n == 16)
37678 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
37680 break;
37682 CASE_CFN_RINT:
37683 /* The round insn does not trap on denormals. */
37684 if (flag_trapping_math || !TARGET_SSE4_1)
37685 break;
37687 if (out_mode == DFmode && in_mode == DFmode)
37689 if (out_n == 2 && in_n == 2)
37690 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
37691 else if (out_n == 4 && in_n == 4)
37692 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
37694 if (out_mode == SFmode && in_mode == SFmode)
37696 if (out_n == 4 && in_n == 4)
37697 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
37698 else if (out_n == 8 && in_n == 8)
37699 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
37701 break;
37703 CASE_CFN_FMA:
37704 if (out_mode == DFmode && in_mode == DFmode)
37706 if (out_n == 2 && in_n == 2)
37707 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
37708 if (out_n == 4 && in_n == 4)
37709 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
37711 if (out_mode == SFmode && in_mode == SFmode)
37713 if (out_n == 4 && in_n == 4)
37714 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
37715 if (out_n == 8 && in_n == 8)
37716 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
37718 break;
37720 default:
37721 break;
37724 /* Dispatch to a handler for a vectorization library. */
37725 if (ix86_veclib_handler)
37726 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
37728 return NULL_TREE;
37731 /* Handler for an SVML-style interface to
37732 a library with vectorized intrinsics. */
37734 static tree
37735 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
37737 char name[20];
37738 tree fntype, new_fndecl, args;
37739 unsigned arity;
37740 const char *bname;
37741 machine_mode el_mode, in_mode;
37742 int n, in_n;
37744 /* The SVML is suitable for unsafe math only. */
37745 if (!flag_unsafe_math_optimizations)
37746 return NULL_TREE;
37748 el_mode = TYPE_MODE (TREE_TYPE (type_out));
37749 n = TYPE_VECTOR_SUBPARTS (type_out);
37750 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37751 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37752 if (el_mode != in_mode
37753 || n != in_n)
37754 return NULL_TREE;
37756 switch (fn)
37758 CASE_CFN_EXP:
37759 CASE_CFN_LOG:
37760 CASE_CFN_LOG10:
37761 CASE_CFN_POW:
37762 CASE_CFN_TANH:
37763 CASE_CFN_TAN:
37764 CASE_CFN_ATAN:
37765 CASE_CFN_ATAN2:
37766 CASE_CFN_ATANH:
37767 CASE_CFN_CBRT:
37768 CASE_CFN_SINH:
37769 CASE_CFN_SIN:
37770 CASE_CFN_ASINH:
37771 CASE_CFN_ASIN:
37772 CASE_CFN_COSH:
37773 CASE_CFN_COS:
37774 CASE_CFN_ACOSH:
37775 CASE_CFN_ACOS:
37776 if ((el_mode != DFmode || n != 2)
37777 && (el_mode != SFmode || n != 4))
37778 return NULL_TREE;
37779 break;
37781 default:
37782 return NULL_TREE;
37785 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
37786 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
37788 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
37789 strcpy (name, "vmlsLn4");
37790 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
37791 strcpy (name, "vmldLn2");
37792 else if (n == 4)
37794 sprintf (name, "vmls%s", bname+10);
37795 name[strlen (name)-1] = '4';
37797 else
37798 sprintf (name, "vmld%s2", bname+10);
37800 /* Convert to uppercase. */
37801 name[4] &= ~0x20;
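/* E.g. sinf with 4 floats becomes vmlsSin4 and sin with 2 doubles
   becomes vmldSin2.  */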
37803 arity = 0;
37804 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
37805 arity++;
37807 if (arity == 1)
37808 fntype = build_function_type_list (type_out, type_in, NULL);
37809 else
37810 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
37812 /* Build a function declaration for the vectorized function. */
37813 new_fndecl = build_decl (BUILTINS_LOCATION,
37814 FUNCTION_DECL, get_identifier (name), fntype);
37815 TREE_PUBLIC (new_fndecl) = 1;
37816 DECL_EXTERNAL (new_fndecl) = 1;
37817 DECL_IS_NOVOPS (new_fndecl) = 1;
37818 TREE_READONLY (new_fndecl) = 1;
37820 return new_fndecl;
37823 /* Handler for an ACML-style interface to
37824 a library with vectorized intrinsics. */
37826 static tree
37827 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
37829 char name[20] = "__vr.._";
37830 tree fntype, new_fndecl, args;
37831 unsigned arity;
37832 const char *bname;
37833 machine_mode el_mode, in_mode;
37834 int n, in_n;
37836 /* The ACML is 64-bit only and suitable for unsafe math only, as
37837 it does not correctly support parts of IEEE with the required
37838 precision, such as denormals. */
37839 if (!TARGET_64BIT
37840 || !flag_unsafe_math_optimizations)
37841 return NULL_TREE;
37843 el_mode = TYPE_MODE (TREE_TYPE (type_out));
37844 n = TYPE_VECTOR_SUBPARTS (type_out);
37845 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37846 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37847 if (el_mode != in_mode
37848 || n != in_n)
37849 return NULL_TREE;
37851 switch (fn)
37853 CASE_CFN_SIN:
37854 CASE_CFN_COS:
37855 CASE_CFN_EXP:
37856 CASE_CFN_LOG:
37857 CASE_CFN_LOG2:
37858 CASE_CFN_LOG10:
37859 if (el_mode == DFmode && n == 2)
37861 name[4] = 'd';
37862 name[5] = '2';
37864 else if (el_mode == SFmode && n == 4)
37866 name[4] = 's';
37867 name[5] = '4';
37869 else
37870 return NULL_TREE;
37871 break;
37873 default:
37874 return NULL_TREE;
37877 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
37878 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
37879 sprintf (name + 7, "%s", bname+10);
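/* E.g. sin on 2 doubles becomes __vrd2_sin and sinf on 4 floats
   becomes __vrs4_sinf.  */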
37881 arity = 0;
37882 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
37883 arity++;
37885 if (arity == 1)
37886 fntype = build_function_type_list (type_out, type_in, NULL);
37887 else
37888 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
37890 /* Build a function declaration for the vectorized function. */
37891 new_fndecl = build_decl (BUILTINS_LOCATION,
37892 FUNCTION_DECL, get_identifier (name), fntype);
37893 TREE_PUBLIC (new_fndecl) = 1;
37894 DECL_EXTERNAL (new_fndecl) = 1;
37895 DECL_IS_NOVOPS (new_fndecl) = 1;
37896 TREE_READONLY (new_fndecl) = 1;
37898 return new_fndecl;
37901 /* Return a decl of a function that implements a gather load with
37902 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
37903 Return NULL_TREE if it is not available. */
37905 static tree
37906 ix86_vectorize_builtin_gather (const_tree mem_vectype,
37907 const_tree index_type, int scale)
37909 bool si;
37910 enum ix86_builtins code;
37912 if (! TARGET_AVX2)
37913 return NULL_TREE;
37915 if ((TREE_CODE (index_type) != INTEGER_TYPE
37916 && !POINTER_TYPE_P (index_type))
37917 || (TYPE_MODE (index_type) != SImode
37918 && TYPE_MODE (index_type) != DImode))
37919 return NULL_TREE;
37921 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
37922 return NULL_TREE;
37924 /* v*gather* insn sign extends index to pointer mode. */
37925 if (TYPE_PRECISION (index_type) < POINTER_SIZE
37926 && TYPE_UNSIGNED (index_type))
37927 return NULL_TREE;
37929 if (scale <= 0
37930 || scale > 8
37931 || (scale & (scale - 1)) != 0)
37932 return NULL_TREE;
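/* An SImode index selects the *siv* gather builtins, a DImode index
   the *div* ones.  */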
37934 si = TYPE_MODE (index_type) == SImode;
37935 switch (TYPE_MODE (mem_vectype))
37937 case E_V2DFmode:
37938 if (TARGET_AVX512VL)
37939 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
37940 else
37941 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
37942 break;
37943 case E_V4DFmode:
37944 if (TARGET_AVX512VL)
37945 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
37946 else
37947 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
37948 break;
37949 case E_V2DImode:
37950 if (TARGET_AVX512VL)
37951 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
37952 else
37953 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
37954 break;
37955 case E_V4DImode:
37956 if (TARGET_AVX512VL)
37957 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
37958 else
37959 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
37960 break;
37961 case E_V4SFmode:
37962 if (TARGET_AVX512VL)
37963 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
37964 else
37965 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
37966 break;
37967 case E_V8SFmode:
37968 if (TARGET_AVX512VL)
37969 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
37970 else
37971 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
37972 break;
37973 case E_V4SImode:
37974 if (TARGET_AVX512VL)
37975 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
37976 else
37977 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
37978 break;
37979 case E_V8SImode:
37980 if (TARGET_AVX512VL)
37981 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
37982 else
37983 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
37984 break;
37985 case E_V8DFmode:
37986 if (TARGET_AVX512F)
37987 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
37988 else
37989 return NULL_TREE;
37990 break;
37991 case E_V8DImode:
37992 if (TARGET_AVX512F)
37993 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
37994 else
37995 return NULL_TREE;
37996 break;
37997 case E_V16SFmode:
37998 if (TARGET_AVX512F)
37999 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38000 else
38001 return NULL_TREE;
38002 break;
38003 case E_V16SImode:
38004 if (TARGET_AVX512F)
38005 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38006 else
38007 return NULL_TREE;
38008 break;
38009 default:
38010 return NULL_TREE;
38013 return ix86_get_builtin (code);
38016 /* Return a decl of a function that implements a scatter store with
38017 register type VECTYPE, index type INDEX_TYPE and scale SCALE.
38018 Return NULL_TREE if it is not available. */
38020 static tree
38021 ix86_vectorize_builtin_scatter (const_tree vectype,
38022 const_tree index_type, int scale)
38024 bool si;
38025 enum ix86_builtins code;
38027 if (!TARGET_AVX512F)
38028 return NULL_TREE;
38030 if ((TREE_CODE (index_type) != INTEGER_TYPE
38031 && !POINTER_TYPE_P (index_type))
38032 || (TYPE_MODE (index_type) != SImode
38033 && TYPE_MODE (index_type) != DImode))
38034 return NULL_TREE;
38036 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38037 return NULL_TREE;
38039 /* v*scatter* insn sign extends index to pointer mode. */
38040 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38041 && TYPE_UNSIGNED (index_type))
38042 return NULL_TREE;
38044 /* Scale can be 1, 2, 4 or 8. */
38045 if (scale <= 0
38046 || scale > 8
38047 || (scale & (scale - 1)) != 0)
38048 return NULL_TREE;
38050 si = TYPE_MODE (index_type) == SImode;
38051 switch (TYPE_MODE (vectype))
38053 case E_V8DFmode:
38054 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38055 break;
38056 case E_V8DImode:
38057 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38058 break;
38059 case E_V16SFmode:
38060 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38061 break;
38062 case E_V16SImode:
38063 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38064 break;
38065 default:
38066 return NULL_TREE;
38069 return ix86_builtins[code];
38072 /* Return true if it is safe to use the rsqrt optabs to optimize
38073 1.0/sqrt. */
38075 static bool
38076 use_rsqrt_p ()
38078 return (TARGET_SSE_MATH
38079 && flag_finite_math_only
38080 && !flag_trapping_math
38081 && flag_unsafe_math_optimizations);
38084 /* Return a decl for a target-specific builtin that implements the
38085 reciprocal of the function, or NULL_TREE if not available. */
38087 static tree
38088 ix86_builtin_reciprocal (tree fndecl)
38090 switch (DECL_FUNCTION_CODE (fndecl))
38092 /* Vectorized version of sqrt to rsqrt conversion. */
38093 case IX86_BUILTIN_SQRTPS_NR:
38094 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38096 case IX86_BUILTIN_SQRTPS_NR256:
38097 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38099 default:
38100 return NULL_TREE;
38104 /* Helper for avx_vpermilps256_operand et al. This is also used by
38105 the expansion functions to turn the parallel back into a mask.
38106 The return value is 0 for no match and the imm8+1 for a match. */
38109 avx_vpermilp_parallel (rtx par, machine_mode mode)
38111 unsigned i, nelt = GET_MODE_NUNITS (mode);
38112 unsigned mask = 0;
38113 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38115 if (XVECLEN (par, 0) != (int) nelt)
38116 return 0;
38118 /* Validate that all of the elements are constants, and not totally
38119 out of range. Copy the data into an integral array to make the
38120 subsequent checks easier. */
38121 for (i = 0; i < nelt; ++i)
38123 rtx er = XVECEXP (par, 0, i);
38124 unsigned HOST_WIDE_INT ei;
38126 if (!CONST_INT_P (er))
38127 return 0;
38128 ei = INTVAL (er);
38129 if (ei >= nelt)
38130 return 0;
38131 ipar[i] = ei;
38134 switch (mode)
38136 case E_V8DFmode:
38137 /* In the 512-bit DFmode case, we can only move elements within
38138 a 128-bit lane. First fill the second part of the mask,
38139 then fallthru. */
38140 for (i = 4; i < 6; ++i)
38142 if (ipar[i] < 4 || ipar[i] >= 6)
38143 return 0;
38144 mask |= (ipar[i] - 4) << i;
38146 for (i = 6; i < 8; ++i)
38148 if (ipar[i] < 6)
38149 return 0;
38150 mask |= (ipar[i] - 6) << i;
38152 /* FALLTHRU */
38154 case E_V4DFmode:
38155 /* In the 256-bit DFmode case, we can only move elements within
38156 a 128-bit lane. */
38157 for (i = 0; i < 2; ++i)
38159 if (ipar[i] >= 2)
38160 return 0;
38161 mask |= ipar[i] << i;
38163 for (i = 2; i < 4; ++i)
38165 if (ipar[i] < 2)
38166 return 0;
38167 mask |= (ipar[i] - 2) << i;
38169 break;
38171 case E_V16SFmode:
38172 /* In the 512-bit SFmode case, the permutation in the upper 256
38173 bits must mirror the permutation in the lower 256 bits. */
38174 for (i = 0; i < 8; ++i)
38175 if (ipar[i] + 8 != ipar[i + 8])
38176 return 0;
38177 /* FALLTHRU */
38179 case E_V8SFmode:
38180 /* In 256 bit SFmode case, we have full freedom of
38181 movement within the low 128-bit lane, but the high 128-bit
38182 lane must mirror the exact same pattern. */
38183 for (i = 0; i < 4; ++i)
38184 if (ipar[i] + 4 != ipar[i + 4])
38185 return 0;
38186 nelt = 4;
38187 /* FALLTHRU */
38189 case E_V2DFmode:
38190 case E_V4SFmode:
38191 /* In the 128-bit case, we've full freedom in the placement of
38192 the elements from the source operand. */
38193 for (i = 0; i < nelt; ++i)
38194 mask |= ipar[i] << (i * (nelt / 2));
38195 break;
38197 default:
38198 gcc_unreachable ();
38201 /* Make sure success has a non-zero value by adding one. */
38202 return mask + 1;
38205 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38206 the expansion functions to turn the parallel back into a mask.
38207 The return value is 0 for no match and the imm8+1 for a match. */
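/* Worked example (illustrative, not from the original source): for
   V8SFmode, nelt2 is 4, and a parallel selecting (8 9 10 11 0 1 2 3)
   consists of two consecutive runs that both start on a lane boundary.
   The reconstruction loop yields e = 8/4 = 2 for the low half and
   e = 0/4 = 0 for the high half, so mask = 2 | (0 << 4) = 0x02 and the
   function returns 0x03.  */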
38210 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38212 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38213 unsigned mask = 0;
38214 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38216 if (XVECLEN (par, 0) != (int) nelt)
38217 return 0;
38219 /* Validate that all of the elements are constants, and not totally
38220 out of range. Copy the data into an integral array to make the
38221 subsequent checks easier. */
38222 for (i = 0; i < nelt; ++i)
38224 rtx er = XVECEXP (par, 0, i);
38225 unsigned HOST_WIDE_INT ei;
38227 if (!CONST_INT_P (er))
38228 return 0;
38229 ei = INTVAL (er);
38230 if (ei >= 2 * nelt)
38231 return 0;
38232 ipar[i] = ei;
38235 /* Validate that each half of the permute selects consecutive elements. */
38236 for (i = 0; i < nelt2 - 1; ++i)
38237 if (ipar[i] + 1 != ipar[i + 1])
38238 return 0;
38239 for (i = nelt2; i < nelt - 1; ++i)
38240 if (ipar[i] + 1 != ipar[i + 1])
38241 return 0;
38243 /* Reconstruct the mask. */
38244 for (i = 0; i < 2; ++i)
38246 unsigned e = ipar[i * nelt2];
38247 if (e % nelt2)
38248 return 0;
38249 e /= nelt2;
38250 mask |= e << (i * 4);
38253 /* Make sure success has a non-zero value by adding one. */
38254 return mask + 1;
38257 /* Return a register priority for hard reg REGNO. */
38258 static int
38259 ix86_register_priority (int hard_regno)
38261 /* ebp and r13 as a base always want a displacement, and r12 as a
38262 base always wants an index, so discourage their use in an
38263 address. */
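/* Illustrative note (assumption about the underlying instruction
   encodings, not from the original source): with ModRM mod=00, EBP/R13
   as a base is reinterpreted as a displacement-only form, so plain
   (%ebp) or (%r13) has to be encoded with an explicit zero
   displacement, and ESP/R12 as a base always needs a SIB byte.  */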
38264 if (hard_regno == R12_REG || hard_regno == R13_REG)
38265 return 0;
38266 if (hard_regno == BP_REG)
38267 return 1;
38268 /* New x86-64 int registers result in bigger code size. Discourage
38269 them. */
38270 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38271 return 2;
38272 /* New x86-64 SSE registers result in bigger code size. Discourage
38273 them. */
38274 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38275 return 2;
38276 /* Usage of AX register results in smaller code. Prefer it. */
38277 if (hard_regno == AX_REG)
38278 return 4;
38279 return 3;
38282 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38284 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38285 QImode must go into class Q_REGS.
38286 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38287 movdf to do mem-to-mem moves through integer regs. */
38289 static reg_class_t
38290 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38292 machine_mode mode = GET_MODE (x);
38294 /* We're only allowed to return a subclass of CLASS. Many of the
38295 following checks fail for NO_REGS, so eliminate that early. */
38296 if (regclass == NO_REGS)
38297 return NO_REGS;
38299 /* All classes can load zeros. */
38300 if (x == CONST0_RTX (mode))
38301 return regclass;
38303 /* Force constants into memory if we are loading a (nonzero) constant into
38304 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38305 instructions to load from a constant. */
38306 if (CONSTANT_P (x)
38307 && (MAYBE_MMX_CLASS_P (regclass)
38308 || MAYBE_SSE_CLASS_P (regclass)
38309 || MAYBE_MASK_CLASS_P (regclass)))
38310 return NO_REGS;
38312 /* Floating-point constants need more complex checks. */
38313 if (CONST_DOUBLE_P (x))
38315 /* General regs can load everything. */
38316 if (INTEGER_CLASS_P (regclass))
38317 return regclass;
38319 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38320 zero above. We only want to wind up preferring 80387 registers if
38321 we plan on doing computation with them. */
38322 if (IS_STACK_MODE (mode)
38323 && standard_80387_constant_p (x) > 0)
38325 /* Limit class to FP regs. */
38326 if (FLOAT_CLASS_P (regclass))
38327 return FLOAT_REGS;
38328 else if (regclass == FP_TOP_SSE_REGS)
38329 return FP_TOP_REG;
38330 else if (regclass == FP_SECOND_SSE_REGS)
38331 return FP_SECOND_REG;
38334 return NO_REGS;
38337 /* Prefer SSE regs only, if we can use them for math. */
38338 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38339 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38341 /* Generally when we see PLUS here, it's the function invariant
38342 (plus soft-fp const_int), which can only be computed into general
38343 regs. */
38344 if (GET_CODE (x) == PLUS)
38345 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38347 /* QImode constants are easy to load, but non-constant QImode data
38348 must go into Q_REGS. */
38349 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38351 if (Q_CLASS_P (regclass))
38352 return regclass;
38353 else if (reg_class_subset_p (Q_REGS, regclass))
38354 return Q_REGS;
38355 else
38356 return NO_REGS;
38359 return regclass;
38362 /* Discourage putting floating-point values in SSE registers unless
38363 SSE math is being used, and likewise for the 387 registers. */
38364 static reg_class_t
38365 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38367 machine_mode mode = GET_MODE (x);
38369 /* Restrict the output reload class to the register bank that we are doing
38370 math on. If we would like not to return a subset of CLASS, reject this
38371 alternative: if reload cannot do this, it will still use its choice. */
38372 mode = GET_MODE (x);
38373 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38374 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38376 if (IS_STACK_MODE (mode))
38378 if (regclass == FP_TOP_SSE_REGS)
38379 return FP_TOP_REG;
38380 else if (regclass == FP_SECOND_SSE_REGS)
38381 return FP_SECOND_REG;
38382 else
38383 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38386 return regclass;
38389 static reg_class_t
38390 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38391 machine_mode mode, secondary_reload_info *sri)
38393 /* Double-word spills from general registers to non-offsettable memory
38394 references (zero-extended addresses) require special handling. */
38395 if (TARGET_64BIT
38396 && MEM_P (x)
38397 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38398 && INTEGER_CLASS_P (rclass)
38399 && !offsettable_memref_p (x))
38401 sri->icode = (in_p
38402 ? CODE_FOR_reload_noff_load
38403 : CODE_FOR_reload_noff_store);
38404 /* Add the cost of moving address to a temporary. */
38405 sri->extra_cost = 1;
38407 return NO_REGS;
38410 /* QImode spills from non-QI registers require an
38411 intermediate register on 32-bit targets. */
38412 if (mode == QImode
38413 && ((!TARGET_64BIT && !in_p
38414 && INTEGER_CLASS_P (rclass)
38415 && MAYBE_NON_Q_CLASS_P (rclass))
38416 || (!TARGET_AVX512DQ
38417 && MAYBE_MASK_CLASS_P (rclass))))
38419 int regno = true_regnum (x);
38421 /* Return Q_REGS if the operand is in memory. */
38422 if (regno == -1)
38423 return Q_REGS;
38425 return NO_REGS;
38428 /* This condition handles the corner case where an expression involving
38429 pointers gets vectorized. We're trying to use the address of a
38430 stack slot as a vector initializer.
38432 (set (reg:V2DI 74 [ vect_cst_.2 ])
38433 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38435 Eventually frame gets turned into sp+offset like this:
38437 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38438 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38439 (const_int 392 [0x188]))))
38441 That later gets turned into:
38443 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38444 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38445 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38447 We'll have the following reload recorded:
38449 Reload 0: reload_in (DI) =
38450 (plus:DI (reg/f:DI 7 sp)
38451 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38452 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38453 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38454 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38455 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38456 reload_reg_rtx: (reg:V2DI 22 xmm1)
38458 This isn't going to work since SSE instructions can't handle scalar
38459 additions. Returning GENERAL_REGS forces the addition into an integer
38460 register, and reload can handle subsequent reloads without problems. */
38462 if (in_p && GET_CODE (x) == PLUS
38463 && SSE_CLASS_P (rclass)
38464 && SCALAR_INT_MODE_P (mode))
38465 return GENERAL_REGS;
38467 return NO_REGS;
38470 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38472 static bool
38473 ix86_class_likely_spilled_p (reg_class_t rclass)
38475 switch (rclass)
38477 case AREG:
38478 case DREG:
38479 case CREG:
38480 case BREG:
38481 case AD_REGS:
38482 case SIREG:
38483 case DIREG:
38484 case SSE_FIRST_REG:
38485 case FP_TOP_REG:
38486 case FP_SECOND_REG:
38487 case BND_REGS:
38488 return true;
38490 default:
38491 break;
38494 return false;
38497 /* If we are copying between registers from different register sets
38498 (e.g. FP and integer), we may need a memory location.
38500 The function can't work reliably when one of the CLASSES is a class
38501 containing registers from multiple sets. We avoid this by never combining
38502 different sets in a single alternative in the machine description.
38503 Ensure that this constraint holds to avoid unexpected surprises.
38505 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38506 so do not enforce these sanity checks.
38508 To optimize register_move_cost performance, define an inline variant. */
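/* Illustrative note (not from the original source): for example, copying
   a DFmode value between FLOAT_REGS and SSE_REGS has no direct move
   instruction, so this function reports that the copy must go through a
   memory slot.  */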
38510 static inline bool
38511 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38512 reg_class_t class2, int strict)
38514 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38515 return false;
38517 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38518 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38519 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38520 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38521 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38522 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38523 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38524 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38526 gcc_assert (!strict || lra_in_progress);
38527 return true;
38530 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38531 return true;
38533 /* Between mask and general, we have moves no larger than word size. */
38534 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38535 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38536 return true;
38538 /* ??? This is a lie. We do have moves between mmx/general, and for
38539 mmx/sse2. But by saying we need secondary memory we discourage the
38540 register allocator from using the mmx registers unless needed. */
38541 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38542 return true;
38544 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38546 /* SSE1 doesn't have any direct moves from other classes. */
38547 if (!TARGET_SSE2)
38548 return true;
38550 /* If the target says that inter-unit moves are more expensive
38551 than moving through memory, then don't generate them. */
38552 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38553 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38554 return true;
38556 /* Between SSE and general, we have moves no larger than word size. */
38557 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38558 return true;
38561 return false;
38564 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38566 static bool
38567 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38568 reg_class_t class2)
38570 return inline_secondary_memory_needed (mode, class1, class2, true);
38573 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38575 get_secondary_mem widens integral modes to BITS_PER_WORD.
38576 There is no need to emit a full 64-bit move on 64-bit targets
38577 for integral modes that can be moved using a 32-bit move. */
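/* Illustrative note (not from the original source): a QImode or HImode
   value routed through a secondary memory slot is therefore widened to
   SImode, so a 32-bit slot and 32-bit moves are used for both the store
   and the reload.  */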
38579 static machine_mode
38580 ix86_secondary_memory_needed_mode (machine_mode mode)
38582 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38583 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38584 return mode;
38587 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38589 On the 80386, this is the size of MODE in words,
38590 except in the FP regs, where a single reg is always enough. */
38592 static unsigned char
38593 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38595 if (MAYBE_INTEGER_CLASS_P (rclass))
38597 if (mode == XFmode)
38598 return (TARGET_64BIT ? 2 : 3);
38599 else if (mode == XCmode)
38600 return (TARGET_64BIT ? 4 : 6);
38601 else
38602 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38604 else
38606 if (COMPLEX_MODE_P (mode))
38607 return 2;
38608 else
38609 return 1;
38613 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38615 static bool
38616 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38617 reg_class_t regclass)
38619 if (from == to)
38620 return true;
38622 /* x87 registers can't do subreg at all, as all values are reformatted
38623 to extended precision. */
38624 if (MAYBE_FLOAT_CLASS_P (regclass))
38625 return false;
38627 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38629 /* Vector registers do not support QI or HImode loads. If we don't
38630 disallow a change to these modes, reload will assume it's ok to
38631 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38632 the vec_dupv4hi pattern. */
38633 if (GET_MODE_SIZE (from) < 4)
38634 return false;
38637 return true;
38640 /* Return index of MODE in the sse load/store tables. */
38642 static inline int
38643 sse_store_index (machine_mode mode)
38645 switch (GET_MODE_SIZE (mode))
38647 case 4:
38648 return 0;
38649 case 8:
38650 return 1;
38651 case 16:
38652 return 2;
38653 case 32:
38654 return 3;
38655 case 64:
38656 return 4;
38657 default:
38658 return -1;
38662 /* Return the cost of moving data of mode M between a
38663 register and memory. A value of 2 is the default; this cost is
38664 relative to those in `REGISTER_MOVE_COST'.
38666 This function is used extensively by register_move_cost, which is used to
38667 build tables at startup, so make it inline in that case.
38668 When IN is 2, return the maximum of the in and out move costs.
38670 If moving between registers and memory is more expensive than
38671 between two registers, you should define this macro to express the
38672 relative cost.
38674 Also model the increased cost of moving QImode registers in
38675 non-Q_REGS classes. */
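/* Illustrative note (not from the original source): IN selects the
   direction: 0 asks for the store (register to memory) cost, nonzero
   for the load cost, and the special value 2 for the maximum of the
   two.  E.g. for DFmode in FLOAT_REGS with IN == 2 this returns
   MAX (fp_load[1], fp_store[1]).  */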
38677 static inline int
38678 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
38679 int in)
38681 int cost;
38682 if (FLOAT_CLASS_P (regclass))
38684 int index;
38685 switch (mode)
38687 case E_SFmode:
38688 index = 0;
38689 break;
38690 case E_DFmode:
38691 index = 1;
38692 break;
38693 case E_XFmode:
38694 index = 2;
38695 break;
38696 default:
38697 return 100;
38699 if (in == 2)
38700 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
38701 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
38703 if (SSE_CLASS_P (regclass))
38705 int index = sse_store_index (mode);
38706 if (index == -1)
38707 return 100;
38708 if (in == 2)
38709 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
38710 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
38712 if (MMX_CLASS_P (regclass))
38714 int index;
38715 switch (GET_MODE_SIZE (mode))
38717 case 4:
38718 index = 0;
38719 break;
38720 case 8:
38721 index = 1;
38722 break;
38723 default:
38724 return 100;
38726 if (in == 2)
38727 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
38728 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
38730 switch (GET_MODE_SIZE (mode))
38732 case 1:
38733 if (Q_CLASS_P (regclass) || TARGET_64BIT)
38735 if (!in)
38736 return ix86_cost->int_store[0];
38737 if (TARGET_PARTIAL_REG_DEPENDENCY
38738 && optimize_function_for_speed_p (cfun))
38739 cost = ix86_cost->movzbl_load;
38740 else
38741 cost = ix86_cost->int_load[0];
38742 if (in == 2)
38743 return MAX (cost, ix86_cost->int_store[0]);
38744 return cost;
38746 else
38748 if (in == 2)
38749 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
38750 if (in)
38751 return ix86_cost->movzbl_load;
38752 else
38753 return ix86_cost->int_store[0] + 4;
38755 break;
38756 case 2:
38757 if (in == 2)
38758 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
38759 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
38760 default:
38761 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
38762 if (mode == TFmode)
38763 mode = XFmode;
38764 if (in == 2)
38765 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
38766 else if (in)
38767 cost = ix86_cost->int_load[2];
38768 else
38769 cost = ix86_cost->int_store[2];
38770 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
38774 static int
38775 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
38776 bool in)
38778 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
38782 /* Return the cost of moving data from a register in class CLASS1 to
38783 one in class CLASS2.
38785 It is not required that the cost always equal 2 when FROM is the same as TO;
38786 on some machines it is expensive to move between registers if they are not
38787 general registers. */
38789 static int
38790 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
38791 reg_class_t class2_i)
38793 enum reg_class class1 = (enum reg_class) class1_i;
38794 enum reg_class class2 = (enum reg_class) class2_i;
38796 /* In case we require secondary memory, compute cost of the store followed
38797 by load. In order to avoid bad register allocation choices, we need
38798 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
38800 if (inline_secondary_memory_needed (mode, class1, class2, false))
38802 int cost = 1;
38804 cost += inline_memory_move_cost (mode, class1, 2);
38805 cost += inline_memory_move_cost (mode, class2, 2);
38807 /* In case of copying from general_purpose_register we may emit multiple
38808 stores followed by single load causing memory size mismatch stall.
38809 Count this as arbitrarily high cost of 20. */
38810 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
38811 && TARGET_MEMORY_MISMATCH_STALL
38812 && targetm.class_max_nregs (class1, mode)
38813 > targetm.class_max_nregs (class2, mode))
38814 cost += 20;
38816 /* In the case of FP/MMX moves, the registers actually overlap, and we
38817 have to switch modes in order to treat them differently. */
38818 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
38819 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
38820 cost += 20;
38822 return cost;
38825 /* Moves between SSE/MMX and integer unit are expensive. */
38826 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
38827 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38829 /* ??? By keeping returned value relatively high, we limit the number
38830 of moves between integer and MMX/SSE registers for all targets.
38831 Additionally, high value prevents problem with x86_modes_tieable_p(),
38832 where integer modes in MMX/SSE registers are not tieable
38833 because of missing QImode and HImode moves to, from or between
38834 MMX/SSE registers. */
38835 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
38836 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
38838 if (MAYBE_FLOAT_CLASS_P (class1))
38839 return ix86_cost->fp_move;
38840 if (MAYBE_SSE_CLASS_P (class1))
38842 if (GET_MODE_BITSIZE (mode) <= 128)
38843 return ix86_cost->xmm_move;
38844 if (GET_MODE_BITSIZE (mode) <= 256)
38845 return ix86_cost->ymm_move;
38846 return ix86_cost->zmm_move;
38848 if (MAYBE_MMX_CLASS_P (class1))
38849 return ix86_cost->mmx_move;
38850 return 2;
38853 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
38854 words of a value of mode MODE but can be less for certain modes in
38855 special long registers.
38857 Actually there are no two word move instructions for consecutive
38858 registers. And only registers 0-3 may have mov byte instructions
38859 applied to them. */
38861 static unsigned int
38862 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
38864 if (GENERAL_REGNO_P (regno))
38866 if (mode == XFmode)
38867 return TARGET_64BIT ? 2 : 3;
38868 if (mode == XCmode)
38869 return TARGET_64BIT ? 4 : 6;
38870 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38872 if (COMPLEX_MODE_P (mode))
38873 return 2;
38874 if (mode == V64SFmode || mode == V64SImode)
38875 return 4;
38876 return 1;
38879 /* Implement TARGET_HARD_REGNO_MODE_OK. */
38881 static bool
38882 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
38884 /* Flags and only flags can only hold CCmode values. */
38885 if (CC_REGNO_P (regno))
38886 return GET_MODE_CLASS (mode) == MODE_CC;
38887 if (GET_MODE_CLASS (mode) == MODE_CC
38888 || GET_MODE_CLASS (mode) == MODE_RANDOM
38889 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
38890 return false;
38891 if (STACK_REGNO_P (regno))
38892 return VALID_FP_MODE_P (mode);
38893 if (MASK_REGNO_P (regno))
38894 return (VALID_MASK_REG_MODE (mode)
38895 || (TARGET_AVX512BW
38896 && VALID_MASK_AVX512BW_MODE (mode)));
38897 if (BND_REGNO_P (regno))
38898 return VALID_BND_REG_MODE (mode);
38899 if (SSE_REGNO_P (regno))
38901 /* We implement the move patterns for all vector modes into and
38902 out of SSE registers, even when no operation instructions
38903 are available. */
38905 /* For AVX-512 we allow, regardless of regno:
38906 - XI mode
38907 - any of 512-bit wide vector mode
38908 - any scalar mode. */
38909 if (TARGET_AVX512F
38910 && (mode == XImode
38911 || VALID_AVX512F_REG_MODE (mode)
38912 || VALID_AVX512F_SCALAR_MODE (mode)))
38913 return true;
38915 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
38916 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
38917 && MOD4_SSE_REGNO_P (regno)
38918 && mode == V64SFmode)
38919 return true;
38921 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
38922 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
38923 && MOD4_SSE_REGNO_P (regno)
38924 && mode == V64SImode)
38925 return true;
38927 /* TODO check for QI/HI scalars. */
38928 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
38929 if (TARGET_AVX512VL
38930 && (mode == OImode
38931 || mode == TImode
38932 || VALID_AVX256_REG_MODE (mode)
38933 || VALID_AVX512VL_128_REG_MODE (mode)))
38934 return true;
38936 /* xmm16-xmm31 are only available for AVX-512. */
38937 if (EXT_REX_SSE_REGNO_P (regno))
38938 return false;
38940 /* OImode and AVX modes are available only when AVX is enabled. */
38941 return ((TARGET_AVX
38942 && VALID_AVX256_REG_OR_OI_MODE (mode))
38943 || VALID_SSE_REG_MODE (mode)
38944 || VALID_SSE2_REG_MODE (mode)
38945 || VALID_MMX_REG_MODE (mode)
38946 || VALID_MMX_REG_MODE_3DNOW (mode));
38948 if (MMX_REGNO_P (regno))
38950 /* We implement the move patterns for 3DNOW modes even in MMX mode,
38951 so if the register is available at all, then we can move data of
38952 the given mode into or out of it. */
38953 return (VALID_MMX_REG_MODE (mode)
38954 || VALID_MMX_REG_MODE_3DNOW (mode));
38957 if (mode == QImode)
38959 /* Take care for QImode values - they can be in non-QI regs,
38960 but then they do cause partial register stalls. */
38961 if (ANY_QI_REGNO_P (regno))
38962 return true;
38963 if (!TARGET_PARTIAL_REG_STALL)
38964 return true;
38965 /* LRA checks if the hard register is OK for the given mode.
38966 QImode values can live in non-QI regs, so we allow all
38967 registers here. */
38968 if (lra_in_progress)
38969 return true;
38970 return !can_create_pseudo_p ();
38972 /* We handle both integer and floats in the general purpose registers. */
38973 else if (VALID_INT_MODE_P (mode))
38974 return true;
38975 else if (VALID_FP_MODE_P (mode))
38976 return true;
38977 else if (VALID_DFP_MODE_P (mode))
38978 return true;
38979 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
38980 on to use that value in smaller contexts, this can easily force a
38981 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
38982 supporting DImode, allow it. */
38983 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
38984 return true;
38986 return false;
38989 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
38990 saves SSE registers across calls is Win64 (thus no need to check the
38991 current ABI here), and with AVX enabled Win64 only guarantees that
38992 the low 16 bytes are saved. */
38994 static bool
38995 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
38997 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
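/* Illustrative note (assumption about the Win64 ABI, not from the
   original source): xmm6-xmm15 are preserved only up to their low 128
   bits, so e.g. a V4SFmode value in xmm6 survives a call while a
   V8SFmode value in ymm6 is reported as partially clobbered here.  */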
39000 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39001 tieable integer mode. */
39003 static bool
39004 ix86_tieable_integer_mode_p (machine_mode mode)
39006 switch (mode)
39008 case E_HImode:
39009 case E_SImode:
39010 return true;
39012 case E_QImode:
39013 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39015 case E_DImode:
39016 return TARGET_64BIT;
39018 default:
39019 return false;
39023 /* Implement TARGET_MODES_TIEABLE_P.
39025 Return true if MODE1 is accessible in a register that can hold MODE2
39026 without copying. That is, all register classes that can hold MODE2
39027 can also hold MODE1. */
39029 static bool
39030 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39032 if (mode1 == mode2)
39033 return true;
39035 if (ix86_tieable_integer_mode_p (mode1)
39036 && ix86_tieable_integer_mode_p (mode2))
39037 return true;
39039 /* MODE2 being XFmode implies fp stack or general regs, which means we
39040 can tie any smaller floating point modes to it. Note that we do not
39041 tie this with TFmode. */
39042 if (mode2 == XFmode)
39043 return mode1 == SFmode || mode1 == DFmode;
39045 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39046 that we can tie it with SFmode. */
39047 if (mode2 == DFmode)
39048 return mode1 == SFmode;
39050 /* If MODE2 is only appropriate for an SSE register, then tie with
39051 any other mode acceptable to SSE registers. */
39052 if (GET_MODE_SIZE (mode2) == 32
39053 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39054 return (GET_MODE_SIZE (mode1) == 32
39055 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39056 if (GET_MODE_SIZE (mode2) == 16
39057 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39058 return (GET_MODE_SIZE (mode1) == 16
39059 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39061 /* If MODE2 is appropriate for an MMX register, then tie
39062 with any other mode acceptable to MMX registers. */
39063 if (GET_MODE_SIZE (mode2) == 8
39064 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39065 return (GET_MODE_SIZE (mode1) == 8
39066 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39068 return false;
39071 /* Return the cost of moving between two registers of mode MODE. */
39073 static int
39074 ix86_set_reg_reg_cost (machine_mode mode)
39076 unsigned int units = UNITS_PER_WORD;
39078 switch (GET_MODE_CLASS (mode))
39080 default:
39081 break;
39083 case MODE_CC:
39084 units = GET_MODE_SIZE (CCmode);
39085 break;
39087 case MODE_FLOAT:
39088 if ((TARGET_SSE && mode == TFmode)
39089 || (TARGET_80387 && mode == XFmode)
39090 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39091 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39092 units = GET_MODE_SIZE (mode);
39093 break;
39095 case MODE_COMPLEX_FLOAT:
39096 if ((TARGET_SSE && mode == TCmode)
39097 || (TARGET_80387 && mode == XCmode)
39098 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39099 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39100 units = GET_MODE_SIZE (mode);
39101 break;
39103 case MODE_VECTOR_INT:
39104 case MODE_VECTOR_FLOAT:
39105 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39106 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39107 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39108 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39109 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39110 units = GET_MODE_SIZE (mode);
39113 /* Return the cost of moving between two registers of mode MODE,
39114 assuming that the move will be in pieces of at most UNITS bytes. */
39115 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
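/* Illustrative note (not from the original source): with AVX enabled a
   32-byte V8SFmode copy keeps units == 32 and costs one insn, whereas
   without any vector support for the mode units stays at UNITS_PER_WORD
   (8 on 64-bit), giving CEIL (32, 8) = 4 insns.  */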
39118 /* Return cost of vector operation in MODE given that scalar version has
39119 COST. If PARALLEL is true assume that CPU has more than one unit
39120 performing the operation. */
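/* Illustrative note (not from the original source): with PARALLEL false
   the scalar cost is simply multiplied by the number of elements, while
   on a TARGET_AVX128_OPTIMAL tuning a 256-bit operation is charged twice
   the 128-bit cost because it is expected to be split.  */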
39122 static int
39123 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39125 if (!VECTOR_MODE_P (mode))
39126 return cost;
39128 if (!parallel)
39129 return cost * GET_MODE_NUNITS (mode);
39130 if (GET_MODE_BITSIZE (mode) == 128
39131 && TARGET_SSE_SPLIT_REGS)
39132 return cost * 2;
39133 if (GET_MODE_BITSIZE (mode) > 128
39134 && TARGET_AVX128_OPTIMAL)
39135 return cost * GET_MODE_BITSIZE (mode) / 128;
39136 return cost;
39139 /* Compute a (partial) cost for rtx X. Return true if the complete
39140 cost has been computed, and false if subexpressions should be
39141 scanned. In either case, *TOTAL contains the cost result. */
39143 static bool
39144 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39145 int *total, bool speed)
39147 rtx mask;
39148 enum rtx_code code = GET_CODE (x);
39149 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39150 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39151 int src_cost;
39152 machine_mode inner_mode = mode;
39153 if (VECTOR_MODE_P (mode))
39154 inner_mode = GET_MODE_INNER (mode);
39156 switch (code)
39158 case SET:
39159 if (register_operand (SET_DEST (x), VOIDmode)
39160 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39162 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39163 return true;
39166 if (register_operand (SET_SRC (x), VOIDmode))
39167 /* Avoid potentially incorrect high cost from rtx_costs
39168 for non-tieable SUBREGs. */
39169 src_cost = 0;
39170 else
39172 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39174 if (CONSTANT_P (SET_SRC (x)))
39175 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39176 a small value, possibly zero for cheap constants. */
39177 src_cost += COSTS_N_INSNS (1);
39180 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39181 return true;
39183 case CONST_INT:
39184 case CONST:
39185 case LABEL_REF:
39186 case SYMBOL_REF:
39187 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39188 *total = 3;
39189 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39190 *total = 2;
39191 else if (flag_pic && SYMBOLIC_CONST (x)
39192 && !(TARGET_64BIT
39193 && (GET_CODE (x) == LABEL_REF
39194 || (GET_CODE (x) == SYMBOL_REF
39195 && SYMBOL_REF_LOCAL_P (x))))
39196 /* Use 0 cost for CONST to improve its propagation. */
39197 && (TARGET_64BIT || GET_CODE (x) != CONST))
39198 *total = 1;
39199 else
39200 *total = 0;
39201 return true;
39203 case CONST_DOUBLE:
39204 if (IS_STACK_MODE (mode))
39205 switch (standard_80387_constant_p (x))
39207 case -1:
39208 case 0:
39209 break;
39210 case 1: /* 0.0 */
39211 *total = 1;
39212 return true;
39213 default: /* Other constants */
39214 *total = 2;
39215 return true;
39217 /* FALLTHRU */
39219 case CONST_VECTOR:
39220 switch (standard_sse_constant_p (x, mode))
39222 case 0:
39223 break;
39224 case 1: /* 0: xor eliminates false dependency */
39225 *total = 0;
39226 return true;
39227 default: /* -1: cmp contains false dependency */
39228 *total = 1;
39229 return true;
39231 /* FALLTHRU */
39233 case CONST_WIDE_INT:
39234 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39235 it'll probably end up. Add a penalty for size. */
39236 *total = (COSTS_N_INSNS (1)
39237 + (!TARGET_64BIT && flag_pic)
39238 + (GET_MODE_SIZE (mode) <= 4
39239 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39240 return true;
39242 case ZERO_EXTEND:
39243 /* Zero extension is often completely free on x86_64, so make
39244 it as cheap as possible. */
39245 if (TARGET_64BIT && mode == DImode
39246 && GET_MODE (XEXP (x, 0)) == SImode)
39247 *total = 1;
39248 else if (TARGET_ZERO_EXTEND_WITH_AND)
39249 *total = cost->add;
39250 else
39251 *total = cost->movzx;
39252 return false;
39254 case SIGN_EXTEND:
39255 *total = cost->movsx;
39256 return false;
39258 case ASHIFT:
39259 if (SCALAR_INT_MODE_P (mode)
39260 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39261 && CONST_INT_P (XEXP (x, 1)))
39263 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39264 if (value == 1)
39266 *total = cost->add;
39267 return false;
39269 if ((value == 2 || value == 3)
39270 && cost->lea <= cost->shift_const)
39272 *total = cost->lea;
39273 return false;
39276 /* FALLTHRU */
39278 case ROTATE:
39279 case ASHIFTRT:
39280 case LSHIFTRT:
39281 case ROTATERT:
39282 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39284 /* ??? Should be SSE vector operation cost. */
39285 /* At least for published AMD latencies, this really is the same
39286 as the latency for a simple fpu operation like fabs. */
39287 /* V*QImode is emulated with 1-11 insns. */
39288 if (mode == V16QImode || mode == V32QImode)
39290 int count = 11;
39291 if (TARGET_XOP && mode == V16QImode)
39293 /* For XOP we use vpshab, which requires a broadcast of the
39294 value to the variable shift insn. For constants this
39295 means a V16Q const in mem; even when we can perform the
39296 shift with one insn set the cost to prefer paddb. */
39297 if (CONSTANT_P (XEXP (x, 1)))
39299 *total = ix86_vec_cost (mode,
39300 cost->sse_op
39301 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
39302 + (speed ? 2 : COSTS_N_BYTES (16)), true);
39303 return true;
39305 count = 3;
39307 else if (TARGET_SSSE3)
39308 count = 7;
39309 *total = ix86_vec_cost (mode, cost->sse_op * count, true);
39311 else
39312 *total = ix86_vec_cost (mode, cost->sse_op, true);
39314 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39316 if (CONST_INT_P (XEXP (x, 1)))
39318 if (INTVAL (XEXP (x, 1)) > 32)
39319 *total = cost->shift_const + COSTS_N_INSNS (2);
39320 else
39321 *total = cost->shift_const * 2;
39323 else
39325 if (GET_CODE (XEXP (x, 1)) == AND)
39326 *total = cost->shift_var * 2;
39327 else
39328 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
39331 else
39333 if (CONST_INT_P (XEXP (x, 1)))
39334 *total = cost->shift_const;
39335 else if (SUBREG_P (XEXP (x, 1))
39336 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
39338 /* Return the cost after shift-and truncation. */
39339 *total = cost->shift_var;
39340 return true;
39342 else
39343 *total = cost->shift_var;
39345 return false;
39347 case FMA:
39349 rtx sub;
39351 gcc_assert (FLOAT_MODE_P (mode));
39352 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39354 *total = ix86_vec_cost (mode,
39355 mode == SFmode ? cost->fmass : cost->fmasd,
39356 true);
39357 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39359 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39360 sub = XEXP (x, 0);
39361 if (GET_CODE (sub) == NEG)
39362 sub = XEXP (sub, 0);
39363 *total += rtx_cost (sub, mode, FMA, 0, speed);
39365 sub = XEXP (x, 2);
39366 if (GET_CODE (sub) == NEG)
39367 sub = XEXP (sub, 0);
39368 *total += rtx_cost (sub, mode, FMA, 2, speed);
39369 return true;
39372 case MULT:
39373 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39375 *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
39376 return false;
39378 else if (X87_FLOAT_MODE_P (mode))
39380 *total = cost->fmul;
39381 return false;
39383 else if (FLOAT_MODE_P (mode))
39385 *total = ix86_vec_cost (mode,
39386 inner_mode == DFmode
39387 ? cost->mulsd : cost->mulss, true);
39388 return false;
39390 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39392 /* V*QImode is emulated with 7-13 insns. */
39393 if (mode == V16QImode || mode == V32QImode)
39395 int extra = 11;
39396 if (TARGET_XOP && mode == V16QImode)
39397 extra = 5;
39398 else if (TARGET_SSSE3)
39399 extra = 6;
39400 *total = ix86_vec_cost (mode,
39401 cost->mulss * 2 + cost->sse_op * extra,
39402 true);
39404 /* V*DImode is emulated with 5-8 insns. */
39405 else if (mode == V2DImode || mode == V4DImode)
39407 if (TARGET_XOP && mode == V2DImode)
39408 *total = ix86_vec_cost (mode,
39409 cost->mulss * 2 + cost->sse_op * 3,
39410 true);
39411 else
39412 *total = ix86_vec_cost (mode,
39413 cost->mulss * 3 + cost->sse_op * 5,
39414 true);
39416 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39417 insns, including two PMULUDQ. */
39418 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39419 *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39420 true);
39421 else
39422 *total = ix86_vec_cost (mode, cost->mulss, true);
39423 return false;
39425 else
39427 rtx op0 = XEXP (x, 0);
39428 rtx op1 = XEXP (x, 1);
39429 int nbits;
39430 if (CONST_INT_P (XEXP (x, 1)))
39432 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39433 for (nbits = 0; value != 0; value &= value - 1)
39434 nbits++;
39436 else
39437 /* This is arbitrary. */
39438 nbits = 7;
39440 /* Compute costs correctly for widening multiplication. */
39441 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39442 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39443 == GET_MODE_SIZE (mode))
39445 int is_mulwiden = 0;
39446 machine_mode inner_mode = GET_MODE (op0);
39448 if (GET_CODE (op0) == GET_CODE (op1))
39449 is_mulwiden = 1, op1 = XEXP (op1, 0);
39450 else if (CONST_INT_P (op1))
39452 if (GET_CODE (op0) == SIGN_EXTEND)
39453 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39454 == INTVAL (op1);
39455 else
39456 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39459 if (is_mulwiden)
39460 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39463 *total = (cost->mult_init[MODE_INDEX (mode)]
39464 + nbits * cost->mult_bit
39465 + rtx_cost (op0, mode, outer_code, opno, speed)
39466 + rtx_cost (op1, mode, outer_code, opno, speed));
39468 return true;
39471 case DIV:
39472 case UDIV:
39473 case MOD:
39474 case UMOD:
39475 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39476 *total = inner_mode == DFmode ? cost->divsd : cost->divss;
39477 else if (X87_FLOAT_MODE_P (mode))
39478 *total = cost->fdiv;
39479 else if (FLOAT_MODE_P (mode))
39480 *total = ix86_vec_cost (mode,
39481 inner_mode == DFmode ? cost->divsd : cost->divss,
39482 true);
39483 else
39484 *total = cost->divide[MODE_INDEX (mode)];
39485 return false;
39487 case PLUS:
39488 if (GET_MODE_CLASS (mode) == MODE_INT
39489 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39491 if (GET_CODE (XEXP (x, 0)) == PLUS
39492 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39493 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39494 && CONSTANT_P (XEXP (x, 1)))
39496 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39497 if (val == 2 || val == 4 || val == 8)
39499 *total = cost->lea;
39500 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39501 outer_code, opno, speed);
39502 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39503 outer_code, opno, speed);
39504 *total += rtx_cost (XEXP (x, 1), mode,
39505 outer_code, opno, speed);
39506 return true;
39509 else if (GET_CODE (XEXP (x, 0)) == MULT
39510 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39512 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39513 if (val == 2 || val == 4 || val == 8)
39515 *total = cost->lea;
39516 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39517 outer_code, opno, speed);
39518 *total += rtx_cost (XEXP (x, 1), mode,
39519 outer_code, opno, speed);
39520 return true;
39523 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39525 /* Add with carry, ignore the cost of adding a carry flag. */
39526 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39527 *total = cost->add;
39528 else
39530 *total = cost->lea;
39531 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39532 outer_code, opno, speed);
39535 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39536 outer_code, opno, speed);
39537 *total += rtx_cost (XEXP (x, 1), mode,
39538 outer_code, opno, speed);
39539 return true;
39542 /* FALLTHRU */
39544 case MINUS:
39545 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39546 if (GET_MODE_CLASS (mode) == MODE_INT
39547 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39548 && GET_CODE (XEXP (x, 0)) == MINUS
39549 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39551 *total = cost->add;
39552 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39553 outer_code, opno, speed);
39554 *total += rtx_cost (XEXP (x, 1), mode,
39555 outer_code, opno, speed);
39556 return true;
39559 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39561 *total = cost->addss;
39562 return false;
39564 else if (X87_FLOAT_MODE_P (mode))
39566 *total = cost->fadd;
39567 return false;
39569 else if (FLOAT_MODE_P (mode))
39571 *total = ix86_vec_cost (mode, cost->addss, true);
39572 return false;
39574 /* FALLTHRU */
39576 case AND:
39577 case IOR:
39578 case XOR:
39579 if (GET_MODE_CLASS (mode) == MODE_INT
39580 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39582 *total = (cost->add * 2
39583 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39584 << (GET_MODE (XEXP (x, 0)) != DImode))
39585 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39586 << (GET_MODE (XEXP (x, 1)) != DImode)));
39587 return true;
39589 /* FALLTHRU */
39591 case NEG:
39592 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39594 *total = cost->sse_op;
39595 return false;
39597 else if (X87_FLOAT_MODE_P (mode))
39599 *total = cost->fchs;
39600 return false;
39602 else if (FLOAT_MODE_P (mode))
39604 *total = ix86_vec_cost (mode, cost->sse_op, true);
39605 return false;
39607 /* FALLTHRU */
39609 case NOT:
39610 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39611 *total = ix86_vec_cost (mode, cost->sse_op, true);
39612 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39613 *total = cost->add * 2;
39614 else
39615 *total = cost->add;
39616 return false;
39618 case COMPARE:
39619 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39620 && XEXP (XEXP (x, 0), 1) == const1_rtx
39621 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39622 && XEXP (x, 1) == const0_rtx)
39624 /* This kind of construct is implemented using test[bwl].
39625 Treat it as if we had an AND. */
39626 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
39627 *total = (cost->add
39628 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
39629 opno, speed)
39630 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
39631 return true;
39634 /* The embedded comparison operand is completely free. */
39635 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
39636 && XEXP (x, 1) == const0_rtx)
39637 *total = 0;
39639 return false;
39641 case FLOAT_EXTEND:
39642 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
39643 *total = 0;
39644 else
39645 *total = ix86_vec_cost (mode, cost->addss, true);
39646 return false;
39648 case FLOAT_TRUNCATE:
39649 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
39650 *total = cost->fadd;
39651 else
39652 *total = ix86_vec_cost (mode, cost->addss, true);
39653 return false;
39655 case ABS:
39656 /* SSE requires memory load for the constant operand. It may make
39657 sense to account for this. Of course the constant operand may or
39658 may not be reused. */
39659 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39660 *total = cost->sse_op;
39661 else if (X87_FLOAT_MODE_P (mode))
39662 *total = cost->fabs;
39663 else if (FLOAT_MODE_P (mode))
39664 *total = ix86_vec_cost (mode, cost->sse_op, true);
39665 return false;
39667 case SQRT:
39668 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39669 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
39670 else if (X87_FLOAT_MODE_P (mode))
39671 *total = cost->fsqrt;
39672 else if (FLOAT_MODE_P (mode))
39673 *total = ix86_vec_cost (mode,
39674 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
39675 true);
39676 return false;
39678 case UNSPEC:
39679 if (XINT (x, 1) == UNSPEC_TP)
39680 *total = 0;
39681 return false;
39683 case VEC_SELECT:
39684 case VEC_CONCAT:
39685 case VEC_DUPLICATE:
39686 /* ??? Assume all of these vector manipulation patterns are
39687 recognizable. In which case they all pretty much have the
39688 same cost. */
39689 *total = cost->sse_op;
39690 return true;
39691 case VEC_MERGE:
39692 mask = XEXP (x, 2);
39693 /* This is a masked instruction; assume the same cost
39694 as the non-masked variant. */
39695 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
39696 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
39697 else
39698 *total = cost->sse_op;
39699 return true;
39701 default:
39702 return false;
39706 #if TARGET_MACHO
39708 static int current_machopic_label_num;
39710 /* Given a symbol name and its associated stub, write out the
39711 definition of the stub. */
39713 void
39714 machopic_output_stub (FILE *file, const char *symb, const char *stub)
39716 unsigned int length;
39717 char *binder_name, *symbol_name, lazy_ptr_name[32];
39718 int label = ++current_machopic_label_num;
39720 /* For 64-bit we shouldn't get here. */
39721 gcc_assert (!TARGET_64BIT);
39723 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
39724 symb = targetm.strip_name_encoding (symb);
39726 length = strlen (stub);
39727 binder_name = XALLOCAVEC (char, length + 32);
39728 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
39730 length = strlen (symb);
39731 symbol_name = XALLOCAVEC (char, length + 32);
39732 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
39734 sprintf (lazy_ptr_name, "L%d$lz", label);
39736 if (MACHOPIC_ATT_STUB)
39737 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
39738 else if (MACHOPIC_PURE)
39739 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
39740 else
39741 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
39743 fprintf (file, "%s:\n", stub);
39744 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
39746 if (MACHOPIC_ATT_STUB)
39748 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
39750 else if (MACHOPIC_PURE)
39752 /* PIC stub. */
39753 /* 25-byte PIC stub using "CALL get_pc_thunk". */
39754 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
39755 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
39756 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
39757 label, lazy_ptr_name, label);
39758 fprintf (file, "\tjmp\t*%%ecx\n");
39760 else
39761 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
39763 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
39764 it needs no stub-binding-helper. */
39765 if (MACHOPIC_ATT_STUB)
39766 return;
39768 fprintf (file, "%s:\n", binder_name);
39770 if (MACHOPIC_PURE)
39772 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
39773 fprintf (file, "\tpushl\t%%ecx\n");
39775 else
39776 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
39778 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
39780 /* N.B. Keep the correspondence of these
39781 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
39782 old-pic/new-pic/non-pic stubs; altering this will break
39783 compatibility with existing dylibs. */
39784 if (MACHOPIC_PURE)
39786 /* 25-byte PIC stub using "CALL get_pc_thunk". */
39787 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
39789 else
39790 /* 16-byte -mdynamic-no-pic stub. */
39791 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
39793 fprintf (file, "%s:\n", lazy_ptr_name);
39794 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
39795 fprintf (file, ASM_LONG "%s\n", binder_name);
39797 #endif /* TARGET_MACHO */
39799 /* Order the registers for the register allocator. */
39801 void
39802 x86_order_regs_for_local_alloc (void)
39804 int pos = 0;
39805 int i;
39807 /* First allocate the local general purpose registers. */
39808 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
39809 if (GENERAL_REGNO_P (i) && call_used_regs[i])
39810 reg_alloc_order [pos++] = i;
39812 /* Global general purpose registers. */
39813 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
39814 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
39815 reg_alloc_order [pos++] = i;
39817 /* x87 registers come first in case we are doing FP math
39818 using them. */
39819 if (!TARGET_SSE_MATH)
39820 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
39821 reg_alloc_order [pos++] = i;
39823 /* SSE registers. */
39824 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
39825 reg_alloc_order [pos++] = i;
39826 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
39827 reg_alloc_order [pos++] = i;
39829 /* Extended REX SSE registers. */
39830 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
39831 reg_alloc_order [pos++] = i;
39833 /* Mask register. */
39834 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
39835 reg_alloc_order [pos++] = i;
39837 /* MPX bound registers. */
39838 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
39839 reg_alloc_order [pos++] = i;
39841 /* x87 registers. */
39842 if (TARGET_SSE_MATH)
39843 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
39844 reg_alloc_order [pos++] = i;
39846 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
39847 reg_alloc_order [pos++] = i;
39849 /* Initialize the rest of the array, as we do not allocate some registers
39850 at all. */
39851 while (pos < FIRST_PSEUDO_REGISTER)
39852 reg_alloc_order [pos++] = 0;
39855 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
39856 in struct attribute_spec.handler. */
39857 static tree
39858 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
39859 tree args,
39860 int,
39861 bool *no_add_attrs)
39863 if (TREE_CODE (*node) != FUNCTION_TYPE
39864 && TREE_CODE (*node) != METHOD_TYPE
39865 && TREE_CODE (*node) != FIELD_DECL
39866 && TREE_CODE (*node) != TYPE_DECL)
39868 warning (OPT_Wattributes, "%qE attribute only applies to functions",
39869 name);
39870 *no_add_attrs = true;
39871 return NULL_TREE;
39873 if (TARGET_64BIT)
39875 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
39876 name);
39877 *no_add_attrs = true;
39878 return NULL_TREE;
39880 if (is_attribute_p ("callee_pop_aggregate_return", name))
39882 tree cst;
39884 cst = TREE_VALUE (args);
39885 if (TREE_CODE (cst) != INTEGER_CST)
39887 warning (OPT_Wattributes,
39888 "%qE attribute requires an integer constant argument",
39889 name);
39890 *no_add_attrs = true;
39892 else if (compare_tree_int (cst, 0) != 0
39893 && compare_tree_int (cst, 1) != 0)
39895 warning (OPT_Wattributes,
39896 "argument to %qE attribute is neither zero, nor one",
39897 name);
39898 *no_add_attrs = true;
39901 return NULL_TREE;
39904 return NULL_TREE;
39907 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
39908 struct attribute_spec.handler. */
39909 static tree
39910 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
39911 bool *no_add_attrs)
39913 if (TREE_CODE (*node) != FUNCTION_TYPE
39914 && TREE_CODE (*node) != METHOD_TYPE
39915 && TREE_CODE (*node) != FIELD_DECL
39916 && TREE_CODE (*node) != TYPE_DECL)
39918 warning (OPT_Wattributes, "%qE attribute only applies to functions",
39919 name);
39920 *no_add_attrs = true;
39921 return NULL_TREE;
39924 /* Can combine regparm with all attributes but fastcall. */
39925 if (is_attribute_p ("ms_abi", name))
39927 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
39929 error ("ms_abi and sysv_abi attributes are not compatible");
39932 return NULL_TREE;
39934 else if (is_attribute_p ("sysv_abi", name))
39936 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
39938 error ("ms_abi and sysv_abi attributes are not compatible");
39941 return NULL_TREE;
39944 return NULL_TREE;
39947 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
39948 struct attribute_spec.handler. */
39949 static tree
39950 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
39951 bool *no_add_attrs)
39953 tree *type = NULL;
39954 if (DECL_P (*node))
39956 if (TREE_CODE (*node) == TYPE_DECL)
39957 type = &TREE_TYPE (*node);
39959 else
39960 type = node;
39962 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
39964 warning (OPT_Wattributes, "%qE attribute ignored",
39965 name);
39966 *no_add_attrs = true;
39969 else if ((is_attribute_p ("ms_struct", name)
39970 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
39971 || ((is_attribute_p ("gcc_struct", name)
39972 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
39974 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
39975 name);
39976 *no_add_attrs = true;
39979 return NULL_TREE;
39982 static tree
39983 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
39984 bool *no_add_attrs)
39986 if (TREE_CODE (*node) != FUNCTION_DECL)
39988 warning (OPT_Wattributes, "%qE attribute only applies to functions",
39989 name);
39990 *no_add_attrs = true;
39992 return NULL_TREE;
39995 static tree
39996 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
39997 int, bool *)
39999 return NULL_TREE;
40002 static tree
40003 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40005 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40006 but the function type contains args and return type data. */
40007 tree func_type = *node;
40008 tree return_type = TREE_TYPE (func_type);
40010 int nargs = 0;
40011 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40012 while (current_arg_type
40013 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40015 if (nargs == 0)
40017 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40018 error ("interrupt service routine should have a pointer "
40019 "as the first argument");
40021 else if (nargs == 1)
40023 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40024 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40025 error ("interrupt service routine should have unsigned %s"
40026 "int as the second argument",
40027 TARGET_64BIT
40028 ? (TARGET_X32 ? "long long " : "long ")
40029 : "");
40031 nargs++;
40032 current_arg_type = TREE_CHAIN (current_arg_type);
40034 if (!nargs || nargs > 2)
40035 error ("interrupt service routine can only have a pointer argument "
40036 "and an optional integer argument");
40037 if (! VOID_TYPE_P (return_type))
40038 error ("interrupt service routine can't have non-void return value");
40040 return NULL_TREE;
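/* Illustrative note (not from the original source): the checks above
   accept handlers of the shape "void handler (void *frame)" or
   "void handler (void *frame, word_mode_integer error_code)", i.e. a
   pointer first argument, an optional integer second argument whose
   mode is word_mode, and a void return type.  "word_mode_integer" is a
   placeholder name for whichever unsigned integer type has word_mode
   on the target.  */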
40043 static bool
40044 ix86_ms_bitfield_layout_p (const_tree record_type)
40046 return ((TARGET_MS_BITFIELD_LAYOUT
40047 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40048 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40051 /* Returns an expression indicating where the this parameter is
40052 located on entry to the FUNCTION. */
40054 static rtx
40055 x86_this_parameter (tree function)
40057 tree type = TREE_TYPE (function);
40058 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40059 int nregs;
40061 if (TARGET_64BIT)
40063 const int *parm_regs;
40065 if (ix86_function_type_abi (type) == MS_ABI)
40066 parm_regs = x86_64_ms_abi_int_parameter_registers;
40067 else
40068 parm_regs = x86_64_int_parameter_registers;
40069 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40072 nregs = ix86_function_regparm (type, function);
40074 if (nregs > 0 && !stdarg_p (type))
40076 int regno;
40077 unsigned int ccvt = ix86_get_callcvt (type);
40079 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40080 regno = aggr ? DX_REG : CX_REG;
40081 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40083 regno = CX_REG;
40084 if (aggr)
40085 return gen_rtx_MEM (SImode,
40086 plus_constant (Pmode, stack_pointer_rtx, 4));
40088 else
40090 regno = AX_REG;
40091 if (aggr)
40093 regno = DX_REG;
40094 if (nregs == 1)
40095 return gen_rtx_MEM (SImode,
40096 plus_constant (Pmode,
40097 stack_pointer_rtx, 4));
40100 return gen_rtx_REG (SImode, regno);
40103 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40104 aggr ? 8 : 4));
40107 /* Determine whether x86_output_mi_thunk can succeed. */
40109 static bool
40110 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40111 const_tree function)
40113 /* 64-bit can handle anything. */
40114 if (TARGET_64BIT)
40115 return true;
40117 /* For 32-bit, everything's fine if we have one free register. */
40118 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40119 return true;
40121 /* Need a free register for vcall_offset. */
40122 if (vcall_offset)
40123 return false;
40125 /* Need a free register for GOT references. */
40126 if (flag_pic && !targetm.binds_local_p (function))
40127 return false;
40129 /* Otherwise ok. */
40130 return true;
40133 /* Output the assembler code for a thunk function. THUNK_DECL is the
40134 declaration for the thunk function itself, FUNCTION is the decl for
40135 the target function. DELTA is an immediate constant offset to be
40136 added to THIS. If VCALL_OFFSET is nonzero, the word at
40137 *(*this + vcall_offset) should be added to THIS. */
40139 static void
40140 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40141 HOST_WIDE_INT vcall_offset, tree function)
40143 rtx this_param = x86_this_parameter (function);
40144 rtx this_reg, tmp, fnaddr;
40145 unsigned int tmp_regno;
40146 rtx_insn *insn;
40148 if (TARGET_64BIT)
40149 tmp_regno = R10_REG;
40150 else
40152 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40153 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40154 tmp_regno = AX_REG;
40155 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40156 tmp_regno = DX_REG;
40157 else
40158 tmp_regno = CX_REG;
40161 emit_note (NOTE_INSN_PROLOGUE_END);
40163 /* If CET is enabled, insert an ENDBR instruction.  */
40164 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40165 emit_insn (gen_nop_endbr ());
40167 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40168 pull it in now and let DELTA benefit. */
40169 if (REG_P (this_param))
40170 this_reg = this_param;
40171 else if (vcall_offset)
40173 /* Put the this parameter into %eax. */
40174 this_reg = gen_rtx_REG (Pmode, AX_REG);
40175 emit_move_insn (this_reg, this_param);
40177 else
40178 this_reg = NULL_RTX;
40180 /* Adjust the this parameter by a fixed constant. */
40181 if (delta)
40183 rtx delta_rtx = GEN_INT (delta);
40184 rtx delta_dst = this_reg ? this_reg : this_param;
40186 if (TARGET_64BIT)
40188 if (!x86_64_general_operand (delta_rtx, Pmode))
40190 tmp = gen_rtx_REG (Pmode, tmp_regno);
40191 emit_move_insn (tmp, delta_rtx);
40192 delta_rtx = tmp;
40196 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40199 /* Adjust the this parameter by a value stored in the vtable. */
40200 if (vcall_offset)
40202 rtx vcall_addr, vcall_mem, this_mem;
40204 tmp = gen_rtx_REG (Pmode, tmp_regno);
40206 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40207 if (Pmode != ptr_mode)
40208 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40209 emit_move_insn (tmp, this_mem);
40211 /* Adjust the this parameter. */
40212 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40213 if (TARGET_64BIT
40214 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40216 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40217 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40218 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40221 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40222 if (Pmode != ptr_mode)
40223 emit_insn (gen_addsi_1_zext (this_reg,
40224 gen_rtx_REG (ptr_mode,
40225 REGNO (this_reg)),
40226 vcall_mem));
40227 else
40228 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40231 /* If necessary, drop THIS back to its stack slot. */
40232 if (this_reg && this_reg != this_param)
40233 emit_move_insn (this_param, this_reg);
40235 fnaddr = XEXP (DECL_RTL (function), 0);
40236 if (TARGET_64BIT)
40238 if (!flag_pic || targetm.binds_local_p (function)
40239 || TARGET_PECOFF)
40241 else
40243 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40244 tmp = gen_rtx_CONST (Pmode, tmp);
40245 fnaddr = gen_const_mem (Pmode, tmp);
40248 else
40250 if (!flag_pic || targetm.binds_local_p (function))
40252 #if TARGET_MACHO
40253 else if (TARGET_MACHO)
40255 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40256 fnaddr = XEXP (fnaddr, 0);
40258 #endif /* TARGET_MACHO */
40259 else
40261 tmp = gen_rtx_REG (Pmode, CX_REG);
40262 output_set_got (tmp, NULL_RTX);
40264 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40265 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40266 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40267 fnaddr = gen_const_mem (Pmode, fnaddr);
40271 /* Our sibling call patterns do not allow memories, because we have no
40272 predicate that can distinguish between frame and non-frame memory.
40273 For our purposes here, we can get away with (ab)using a jump pattern,
40274 because we're going to do no optimization. */
40275 if (MEM_P (fnaddr))
40277 if (sibcall_insn_operand (fnaddr, word_mode))
40279 fnaddr = XEXP (DECL_RTL (function), 0);
40280 tmp = gen_rtx_MEM (QImode, fnaddr);
40281 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40282 tmp = emit_call_insn (tmp);
40283 SIBLING_CALL_P (tmp) = 1;
40285 else
40286 emit_jump_insn (gen_indirect_jump (fnaddr));
40288 else
40290 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40292 // CM_LARGE_PIC always uses a pseudo PIC register, which is
40293 // uninitialized.  Since FUNCTION is local and calling it
40294 // doesn't go through the PLT, we use scratch register %r11 as
40295 // the PIC register and initialize it here.
40296 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40297 ix86_init_large_pic_reg (tmp_regno);
40298 fnaddr = legitimize_pic_address (fnaddr,
40299 gen_rtx_REG (Pmode, tmp_regno));
40302 if (!sibcall_insn_operand (fnaddr, word_mode))
40304 tmp = gen_rtx_REG (word_mode, tmp_regno);
40305 if (GET_MODE (fnaddr) != word_mode)
40306 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40307 emit_move_insn (tmp, fnaddr);
40308 fnaddr = tmp;
40311 tmp = gen_rtx_MEM (QImode, fnaddr);
40312 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40313 tmp = emit_call_insn (tmp);
40314 SIBLING_CALL_P (tmp) = 1;
40316 emit_barrier ();
40318 /* Emit just enough of rest_of_compilation to get the insns emitted.
40319 Note that use_thunk calls assemble_start_function et al. */
40320 insn = get_insns ();
40321 shorten_branches (insn);
40322 final_start_function (insn, file, 1);
40323 final (insn, file, 1);
40324 final_end_function ();
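/* Emit the assembly directives that must appear at the start of every
   output file: .code16gcc for -m16, the Darwin file prologue, the
   .version and __fltused markers where the configuration requires them,
   and .intel_syntax noprefix for -masm=intel.  */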
40327 static void
40328 x86_file_start (void)
40330 default_file_start ();
40331 if (TARGET_16BIT)
40332 fputs ("\t.code16gcc\n", asm_out_file);
40333 #if TARGET_MACHO
40334 darwin_file_start ();
40335 #endif
40336 if (X86_FILE_START_VERSION_DIRECTIVE)
40337 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40338 if (X86_FILE_START_FLTUSED)
40339 fputs ("\t.global\t__fltused\n", asm_out_file);
40340 if (ix86_asm_dialect == ASM_INTEL)
40341 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
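/* Return the alignment to use for a field of TYPE, given COMPUTED, the
   alignment determined so far.  On 32-bit targets without -malign-double,
   fields of double, complex double and integer modes are capped at
   32-bit alignment; IAMCU has its own rule.  */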
40345 x86_field_alignment (tree type, int computed)
40347 machine_mode mode;
40349 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40350 return computed;
40351 if (TARGET_IAMCU)
40352 return iamcu_alignment (type, computed);
40353 mode = TYPE_MODE (strip_array_types (type));
40354 if (mode == DFmode || mode == DCmode
40355 || GET_MODE_CLASS (mode) == MODE_INT
40356 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40357 return MIN (32, computed);
40358 return computed;
40361 /* Print call to TARGET to FILE. */
40363 static void
40364 x86_print_call_or_nop (FILE *file, const char *target)
40366 if (flag_nop_mcount)
40367 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
40368 else
40369 fprintf (file, "1:\tcall\t%s\n", target);
40372 /* Output assembler code to FILE to increment profiler label # LABELNO
40373 for profiling a function entry. */
40374 void
40375 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40377 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40378 : MCOUNT_NAME);
40379 if (TARGET_64BIT)
40381 #ifndef NO_PROFILE_COUNTERS
40382 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40383 #endif
40385 if (!TARGET_PECOFF && flag_pic)
40386 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40387 else
40388 x86_print_call_or_nop (file, mcount_name);
40390 else if (flag_pic)
40392 #ifndef NO_PROFILE_COUNTERS
40393 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40394 LPREFIX, labelno);
40395 #endif
40396 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40398 else
40400 #ifndef NO_PROFILE_COUNTERS
40401 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40402 LPREFIX, labelno);
40403 #endif
40404 x86_print_call_or_nop (file, mcount_name);
40407 if (flag_record_mcount)
40409 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40410 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40411 fprintf (file, "\t.previous\n");
40415 /* We don't have exact information about the insn sizes, but we may assume
40416 quite safely that we are informed about all 1-byte insns and memory
40417 address sizes. This is enough to eliminate unnecessary padding in
40418 99% of cases. */
40421 ix86_min_insn_size (rtx_insn *insn)
40423 int l = 0, len;
40425 if (!INSN_P (insn) || !active_insn_p (insn))
40426 return 0;
40428 /* Discard alignments we've emitted and jump instructions. */
40429 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40430 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40431 return 0;
40433 /* Important case: calls are always 5 bytes.
40434 It is common to have many calls in a row. */
40435 if (CALL_P (insn)
40436 && symbolic_reference_mentioned_p (PATTERN (insn))
40437 && !SIBLING_CALL_P (insn))
40438 return 5;
40439 len = get_attr_length (insn);
40440 if (len <= 1)
40441 return 1;
40443 /* For normal instructions we rely on get_attr_length being exact,
40444 with a few exceptions. */
40445 if (!JUMP_P (insn))
40447 enum attr_type type = get_attr_type (insn);
40449 switch (type)
40451 case TYPE_MULTI:
40452 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40453 || asm_noperands (PATTERN (insn)) >= 0)
40454 return 0;
40455 break;
40456 case TYPE_OTHER:
40457 case TYPE_FCMP:
40458 break;
40459 default:
40460 /* Otherwise trust get_attr_length. */
40461 return len;
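/* For asm statements and other insns we cannot measure exactly, estimate
   one opcode byte plus the size of the memory address (at least 4 bytes
   when a symbolic reference is mentioned), or 2 bytes when the insn has
   no address at all.  */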
40464 l = get_attr_length_address (insn);
40465 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40466 l = 4;
40468 if (l)
40469 return 1+l;
40470 else
40471 return 2;
40474 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40476 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
40477 window. */
40479 static void
40480 ix86_avoid_jump_mispredicts (void)
40482 rtx_insn *insn, *start = get_insns ();
40483 int nbytes = 0, njumps = 0;
40484 bool isjump = false;
40486 /* Look for all minimal intervals of instructions containing 4 jumps.
40487 The intervals are bounded by START and INSN. NBYTES is the total
40488 size of instructions in the interval including INSN and not including
40489 START.  When NBYTES is smaller than 16 bytes, it is possible
40490 that the ends of START and INSN end up in the same 16-byte page.
40492 The smallest offset in the page at which INSN can start is the case where
40493 START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
40494 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
40496 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
40497 have to, since control transfer to its label(s) can be performed through other
40498 means, and we estimate the minimum length of all asm stmts as 0. */
40499 for (insn = start; insn; insn = NEXT_INSN (insn))
40501 int min_size;
40503 if (LABEL_P (insn))
40505 int align = label_to_alignment (insn);
40506 int max_skip = label_to_max_skip (insn);
40508 if (max_skip > 15)
40509 max_skip = 15;
40510 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40511 already in the current 16-byte page, because otherwise
40512 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40513 bytes to reach a 16-byte boundary. */
40514 if (align <= 0
40515 || (align <= 3 && max_skip != (1 << align) - 1))
40516 max_skip = 0;
40517 if (dump_file)
40518 fprintf (dump_file, "Label %i with max_skip %i\n",
40519 INSN_UID (insn), max_skip);
40520 if (max_skip)
40522 while (nbytes + max_skip >= 16)
40524 start = NEXT_INSN (start);
40525 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40526 || CALL_P (start))
40527 njumps--, isjump = true;
40528 else
40529 isjump = false;
40530 nbytes -= ix86_min_insn_size (start);
40533 continue;
40536 min_size = ix86_min_insn_size (insn);
40537 nbytes += min_size;
40538 if (dump_file)
40539 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40540 INSN_UID (insn), min_size);
40541 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40542 || CALL_P (insn))
40543 njumps++;
40544 else
40545 continue;
40547 while (njumps > 3)
40549 start = NEXT_INSN (start);
40550 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40551 || CALL_P (start))
40552 njumps--, isjump = true;
40553 else
40554 isjump = false;
40555 nbytes -= ix86_min_insn_size (start);
40557 gcc_assert (njumps >= 0);
40558 if (dump_file)
40559 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40560 INSN_UID (start), INSN_UID (insn), nbytes);
40562 if (njumps == 3 && isjump && nbytes < 16)
40564 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40566 if (dump_file)
40567 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40568 INSN_UID (insn), padsize);
40569 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40573 #endif
40575 /* AMD Athlon works faster
40576 when RET is not the destination of a conditional jump or directly preceded
40577 by another jump instruction. We avoid the penalty by inserting a NOP just
40578 before the RET instruction in such cases. */
40579 static void
40580 ix86_pad_returns (void)
40582 edge e;
40583 edge_iterator ei;
40585 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40587 basic_block bb = e->src;
40588 rtx_insn *ret = BB_END (bb);
40589 rtx_insn *prev;
40590 bool replace = false;
40592 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40593 || optimize_bb_for_size_p (bb))
40594 continue;
40595 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40596 if (active_insn_p (prev) || LABEL_P (prev))
40597 break;
40598 if (prev && LABEL_P (prev))
40600 edge e;
40601 edge_iterator ei;
40603 FOR_EACH_EDGE (e, ei, bb->preds)
40604 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40605 && !(e->flags & EDGE_FALLTHRU))
40607 replace = true;
40608 break;
40611 if (!replace)
40613 prev = prev_active_insn (ret);
40614 if (prev
40615 && ((JUMP_P (prev) && any_condjump_p (prev))
40616 || CALL_P (prev)))
40617 replace = true;
40618 /* Empty functions get branch mispredict even when
40619 the jump destination is not visible to us. */
40620 if (!prev && !optimize_function_for_size_p (cfun))
40621 replace = true;
40623 if (replace)
40625 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
40626 delete_insn (ret);
40631 /* Count the minimum number of instructions in BB. Return 4 if the
40632 number of instructions >= 4. */
40634 static int
40635 ix86_count_insn_bb (basic_block bb)
40637 rtx_insn *insn;
40638 int insn_count = 0;
40640 /* Count number of instructions in this block. Return 4 if the number
40641 of instructions >= 4. */
40642 FOR_BB_INSNS (bb, insn)
40644 /* This only happens in exit blocks. */
40645 if (JUMP_P (insn)
40646 && ANY_RETURN_P (PATTERN (insn)))
40647 break;
40649 if (NONDEBUG_INSN_P (insn)
40650 && GET_CODE (PATTERN (insn)) != USE
40651 && GET_CODE (PATTERN (insn)) != CLOBBER)
40653 insn_count++;
40654 if (insn_count >= 4)
40655 return insn_count;
40659 return insn_count;
40663 /* Count the minimum number of instructions in code path in BB.
40664 Return 4 if the number of instructions >= 4. */
40666 static int
40667 ix86_count_insn (basic_block bb)
40669 edge e;
40670 edge_iterator ei;
40671 int min_prev_count;
40673 /* Only bother counting instructions along paths with no
40674 more than 2 basic blocks between entry and exit. Given
40675 that BB has an edge to exit, determine if a predecessor
40676 of BB has an edge from entry. If so, compute the number
40677 of instructions in the predecessor block. If there
40678 happen to be multiple such blocks, compute the minimum. */
40679 min_prev_count = 4;
40680 FOR_EACH_EDGE (e, ei, bb->preds)
40682 edge prev_e;
40683 edge_iterator prev_ei;
40685 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
40687 min_prev_count = 0;
40688 break;
40690 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
40692 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
40694 int count = ix86_count_insn_bb (e->src);
40695 if (count < min_prev_count)
40696 min_prev_count = count;
40697 break;
40702 if (min_prev_count < 4)
40703 min_prev_count += ix86_count_insn_bb (bb);
40705 return min_prev_count;
40708 /* Pad short function to 4 instructions. */
40710 static void
40711 ix86_pad_short_function (void)
40713 edge e;
40714 edge_iterator ei;
40716 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40718 rtx_insn *ret = BB_END (e->src);
40719 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
40721 int insn_count = ix86_count_insn (e->src);
40723 /* Pad short function. */
40724 if (insn_count < 4)
40726 rtx_insn *insn = ret;
40728 /* Find epilogue. */
40729 while (insn
40730 && (!NOTE_P (insn)
40731 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
40732 insn = PREV_INSN (insn);
40734 if (!insn)
40735 insn = ret;
40737 /* Two NOPs count as one instruction. */
40738 insn_count = 2 * (4 - insn_count);
40739 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
40745 /* Fix up a Windows system unwinder issue. If an EH region falls through into
40746 the epilogue, the Windows system unwinder will apply epilogue logic and
40747 produce incorrect offsets. This can be avoided by adding a nop between
40748 the last insn that can throw and the first insn of the epilogue. */
40750 static void
40751 ix86_seh_fixup_eh_fallthru (void)
40753 edge e;
40754 edge_iterator ei;
40756 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40758 rtx_insn *insn, *next;
40760 /* Find the beginning of the epilogue. */
40761 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
40762 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
40763 break;
40764 if (insn == NULL)
40765 continue;
40767 /* We only care about preceding insns that can throw. */
40768 insn = prev_active_insn (insn);
40769 if (insn == NULL || !can_throw_internal (insn))
40770 continue;
40772 /* Do not separate calls from their debug information. */
40773 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
40774 if (NOTE_P (next)
40775 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
40776 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
40777 insn = next;
40778 else
40779 break;
40781 emit_insn_after (gen_nops (const1_rtx), insn);
40785 /* Given a register number BASE, the lowest of a group of registers, update
40786 regsets IN and OUT with the registers that should be avoided in input
40787 and output operands respectively when trying to avoid generating a modr/m
40788 byte for -fmitigate-rop. */
40790 static void
40791 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
40793 SET_HARD_REG_BIT (out, base);
40794 SET_HARD_REG_BIT (out, base + 1);
40795 SET_HARD_REG_BIT (in, base + 2);
40796 SET_HARD_REG_BIT (in, base + 3);
40799 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
40800 that certain encodings of modr/m bytes do not occur. */
40801 static void
40802 ix86_mitigate_rop (void)
40804 HARD_REG_SET input_risky;
40805 HARD_REG_SET output_risky;
40806 HARD_REG_SET inout_risky;
40808 CLEAR_HARD_REG_SET (output_risky);
40809 CLEAR_HARD_REG_SET (input_risky);
40810 SET_HARD_REG_BIT (output_risky, AX_REG);
40811 SET_HARD_REG_BIT (output_risky, CX_REG);
40812 SET_HARD_REG_BIT (input_risky, BX_REG);
40813 SET_HARD_REG_BIT (input_risky, DX_REG);
40814 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
40815 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
40816 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
40817 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
40818 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
40819 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
40820 COPY_HARD_REG_SET (inout_risky, input_risky);
40821 IOR_HARD_REG_SET (inout_risky, output_risky);
40823 df_note_add_problem ();
40824 /* Fix up what stack-regs did. */
40825 df_insn_rescan_all ();
40826 df_analyze ();
40828 regrename_init (true);
40829 regrename_analyze (NULL);
40831 auto_vec<du_head_p> cands;
40833 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
40835 if (!NONDEBUG_INSN_P (insn))
40836 continue;
40838 if (GET_CODE (PATTERN (insn)) == USE
40839 || GET_CODE (PATTERN (insn)) == CLOBBER)
40840 continue;
40842 extract_insn (insn);
40844 int opno0, opno1;
40845 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
40846 recog_data.n_operands, &opno0,
40847 &opno1);
40849 if (!ix86_rop_should_change_byte_p (modrm))
40850 continue;
40852 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
40854 /* This happens when regrename has to fail a block. */
40855 if (!info->op_info)
40856 continue;
40858 if (info->op_info[opno0].n_chains != 0)
40860 gcc_assert (info->op_info[opno0].n_chains == 1);
40861 du_head_p op0c;
40862 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
40863 if (op0c->target_data_1 + op0c->target_data_2 == 0
40864 && !op0c->cannot_rename)
40865 cands.safe_push (op0c);
40867 op0c->target_data_1++;
40869 if (info->op_info[opno1].n_chains != 0)
40871 gcc_assert (info->op_info[opno1].n_chains == 1);
40872 du_head_p op1c;
40873 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
40874 if (op1c->target_data_1 + op1c->target_data_2 == 0
40875 && !op1c->cannot_rename)
40876 cands.safe_push (op1c);
40878 op1c->target_data_2++;
40882 int i;
40883 du_head_p head;
40884 FOR_EACH_VEC_ELT (cands, i, head)
40886 int old_reg, best_reg;
40887 HARD_REG_SET unavailable;
40889 CLEAR_HARD_REG_SET (unavailable);
40890 if (head->target_data_1)
40891 IOR_HARD_REG_SET (unavailable, output_risky);
40892 if (head->target_data_2)
40893 IOR_HARD_REG_SET (unavailable, input_risky);
40895 int n_uses;
40896 reg_class superclass = regrename_find_superclass (head, &n_uses,
40897 &unavailable);
40898 old_reg = head->regno;
40899 best_reg = find_rename_reg (head, superclass, &unavailable,
40900 old_reg, false);
40901 bool ok = regrename_do_replace (head, best_reg);
40902 gcc_assert (ok);
40903 if (dump_file)
40904 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
40905 reg_names[best_reg], reg_class_names[superclass]);
40909 regrename_finish ();
40911 df_analyze ();
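/* Second pass: for insns whose modr/m byte is still risky after renaming,
   walk each block backwards tracking liveness and copy the offending
   input operand, just before the insn, into a call-clobbered register
   that is dead at that point and safe to encode.  */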
40913 basic_block bb;
40914 regset_head live;
40916 INIT_REG_SET (&live);
40918 FOR_EACH_BB_FN (bb, cfun)
40920 rtx_insn *insn;
40922 COPY_REG_SET (&live, DF_LR_OUT (bb));
40923 df_simulate_initialize_backwards (bb, &live);
40925 FOR_BB_INSNS_REVERSE (bb, insn)
40927 if (!NONDEBUG_INSN_P (insn))
40928 continue;
40930 df_simulate_one_insn_backwards (bb, insn, &live);
40932 if (GET_CODE (PATTERN (insn)) == USE
40933 || GET_CODE (PATTERN (insn)) == CLOBBER)
40934 continue;
40936 extract_insn (insn);
40937 constrain_operands_cached (insn, reload_completed);
40938 int opno0, opno1;
40939 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
40940 recog_data.n_operands, &opno0,
40941 &opno1);
40942 if (modrm < 0
40943 || !ix86_rop_should_change_byte_p (modrm)
40944 || opno0 == opno1)
40945 continue;
40947 rtx oldreg = recog_data.operand[opno1];
40948 preprocess_constraints (insn);
40949 const operand_alternative *alt = which_op_alt ();
40951 int i;
40952 for (i = 0; i < recog_data.n_operands; i++)
40953 if (i != opno1
40954 && alt[i].earlyclobber
40955 && reg_overlap_mentioned_p (recog_data.operand[i],
40956 oldreg))
40957 break;
40959 if (i < recog_data.n_operands)
40960 continue;
40962 if (dump_file)
40963 fprintf (dump_file,
40964 "attempting to fix modrm byte in insn %d:"
40965 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
40966 reg_class_names[alt[opno1].cl]);
40968 HARD_REG_SET unavailable;
40969 REG_SET_TO_HARD_REG_SET (unavailable, &live);
40970 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
40971 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
40972 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
40973 IOR_HARD_REG_SET (unavailable, output_risky);
40974 IOR_COMPL_HARD_REG_SET (unavailable,
40975 reg_class_contents[alt[opno1].cl]);
40977 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40978 if (!TEST_HARD_REG_BIT (unavailable, i))
40979 break;
40980 if (i == FIRST_PSEUDO_REGISTER)
40982 if (dump_file)
40983 fprintf (dump_file, ", none available\n");
40984 continue;
40986 if (dump_file)
40987 fprintf (dump_file, " -> %d\n", i);
40988 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
40989 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
40990 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
40995 /* Implement machine specific optimizations. We implement padding of returns
40996 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
40997 static void
40998 ix86_reorg (void)
41000 /* We are freeing block_for_insn in the toplev to keep compatibility
41001 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41002 compute_bb_for_insn ();
41004 if (flag_mitigate_rop)
41005 ix86_mitigate_rop ();
41007 if (TARGET_SEH && current_function_has_exception_handlers ())
41008 ix86_seh_fixup_eh_fallthru ();
41010 if (optimize && optimize_function_for_speed_p (cfun))
41012 if (TARGET_PAD_SHORT_FUNCTION)
41013 ix86_pad_short_function ();
41014 else if (TARGET_PAD_RETURNS)
41015 ix86_pad_returns ();
41016 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41017 if (TARGET_FOUR_JUMP_LIMIT)
41018 ix86_avoid_jump_mispredicts ();
41019 #endif
41023 /* Return nonzero when QImode register that must be represented via REX prefix
41024 is used. */
41025 bool
41026 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41028 int i;
41029 extract_insn_cached (insn);
41030 for (i = 0; i < recog_data.n_operands; i++)
41031 if (GENERAL_REG_P (recog_data.operand[i])
41032 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41033 return true;
41034 return false;
41037 /* Return true when INSN mentions register that must be encoded using REX
41038 prefix. */
41039 bool
41040 x86_extended_reg_mentioned_p (rtx insn)
41042 subrtx_iterator::array_type array;
41043 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41045 const_rtx x = *iter;
41046 if (REG_P (x)
41047 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41048 return true;
41050 return false;
41053 /* If profitable, negate (without causing overflow) integer constant
41054 of mode MODE at location LOC. Return true in this case. */
41055 bool
41056 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41058 HOST_WIDE_INT val;
41060 if (!CONST_INT_P (*loc))
41061 return false;
41063 switch (mode)
41065 case E_DImode:
41066 /* DImode x86_64 constants must fit in 32 bits. */
41067 gcc_assert (x86_64_immediate_operand (*loc, mode));
41069 mode = SImode;
41070 break;
41072 case E_SImode:
41073 case E_HImode:
41074 case E_QImode:
41075 break;
41077 default:
41078 gcc_unreachable ();
41081 /* Avoid overflows. */
41082 if (mode_signbit_p (mode, *loc))
41083 return false;
41085 val = INTVAL (*loc);
41087 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41088 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
41089 if ((val < 0 && val != -128)
41090 || val == 128)
41092 *loc = GEN_INT (-val);
41093 return true;
41096 return false;
41099 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41100 optabs would emit if we didn't have TFmode patterns. */
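/* For inputs with the top bit set a signed conversion would be wrong;
   instead we convert (x >> 1) | (x & 1), which halves the value while
   folding the low bit back in so the final rounding is still correct,
   and then double the result.  */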
41102 void
41103 x86_emit_floatuns (rtx operands[2])
41105 rtx_code_label *neglab, *donelab;
41106 rtx i0, i1, f0, in, out;
41107 machine_mode mode, inmode;
41109 inmode = GET_MODE (operands[1]);
41110 gcc_assert (inmode == SImode || inmode == DImode);
41112 out = operands[0];
41113 in = force_reg (inmode, operands[1]);
41114 mode = GET_MODE (out);
41115 neglab = gen_label_rtx ();
41116 donelab = gen_label_rtx ();
41117 f0 = gen_reg_rtx (mode);
41119 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41121 expand_float (out, in, 0);
41123 emit_jump_insn (gen_jump (donelab));
41124 emit_barrier ();
41126 emit_label (neglab);
41128 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41129 1, OPTAB_DIRECT);
41130 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41131 1, OPTAB_DIRECT);
41132 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41134 expand_float (f0, i0, 0);
41136 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41138 emit_label (donelab);
41141 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41142 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41143 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41144 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41146 /* Get a vector mode of the same size as the original but with elements
41147 twice as wide. This is only guaranteed to apply to integral vectors. */
41149 static inline machine_mode
41150 get_mode_wider_vector (machine_mode o)
41152 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41153 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41154 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41155 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41156 return n;
41159 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41160 fill target with val via vec_duplicate. */
41162 static bool
41163 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41165 bool ok;
41166 rtx_insn *insn;
41167 rtx dup;
41169 /* First attempt to recognize VAL as-is. */
41170 dup = gen_rtx_VEC_DUPLICATE (mode, val);
41171 insn = emit_insn (gen_rtx_SET (target, dup));
41172 if (recog_memoized (insn) < 0)
41174 rtx_insn *seq;
41175 machine_mode innermode = GET_MODE_INNER (mode);
41176 rtx reg;
41178 /* If that fails, force VAL into a register. */
41180 start_sequence ();
41181 reg = force_reg (innermode, val);
41182 if (GET_MODE (reg) != innermode)
41183 reg = gen_lowpart (innermode, reg);
41184 XEXP (dup, 0) = reg;
41185 seq = get_insns ();
41186 end_sequence ();
41187 if (seq)
41188 emit_insn_before (seq, insn);
41190 ok = recog_memoized (insn) >= 0;
41191 gcc_assert (ok);
41193 return true;
41196 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41197 with all elements equal to VAR. Return true if successful. */
41199 static bool
41200 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41201 rtx target, rtx val)
41203 bool ok;
41205 switch (mode)
41207 case E_V2SImode:
41208 case E_V2SFmode:
41209 if (!mmx_ok)
41210 return false;
41211 /* FALLTHRU */
41213 case E_V4DFmode:
41214 case E_V4DImode:
41215 case E_V8SFmode:
41216 case E_V8SImode:
41217 case E_V2DFmode:
41218 case E_V2DImode:
41219 case E_V4SFmode:
41220 case E_V4SImode:
41221 case E_V16SImode:
41222 case E_V8DImode:
41223 case E_V16SFmode:
41224 case E_V8DFmode:
41225 return ix86_vector_duplicate_value (mode, target, val);
41227 case E_V4HImode:
41228 if (!mmx_ok)
41229 return false;
41230 if (TARGET_SSE || TARGET_3DNOW_A)
41232 rtx x;
41234 val = gen_lowpart (SImode, val);
41235 x = gen_rtx_TRUNCATE (HImode, val);
41236 x = gen_rtx_VEC_DUPLICATE (mode, x);
41237 emit_insn (gen_rtx_SET (target, x));
41238 return true;
41240 goto widen;
41242 case E_V8QImode:
41243 if (!mmx_ok)
41244 return false;
41245 goto widen;
41247 case E_V8HImode:
41248 if (TARGET_AVX2)
41249 return ix86_vector_duplicate_value (mode, target, val);
41251 if (TARGET_SSE2)
41253 struct expand_vec_perm_d dperm;
41254 rtx tmp1, tmp2;
41256 permute:
41257 memset (&dperm, 0, sizeof (dperm));
41258 dperm.target = target;
41259 dperm.vmode = mode;
41260 dperm.nelt = GET_MODE_NUNITS (mode);
41261 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41262 dperm.one_operand_p = true;
41264 /* Extend to SImode using a paradoxical SUBREG. */
41265 tmp1 = gen_reg_rtx (SImode);
41266 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41268 /* Insert the SImode value as low element of a V4SImode vector. */
41269 tmp2 = gen_reg_rtx (V4SImode);
41270 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41271 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41273 ok = (expand_vec_perm_1 (&dperm)
41274 || expand_vec_perm_broadcast_1 (&dperm));
41275 gcc_assert (ok);
41276 return ok;
41278 goto widen;
41280 case E_V16QImode:
41281 if (TARGET_AVX2)
41282 return ix86_vector_duplicate_value (mode, target, val);
41284 if (TARGET_SSE2)
41285 goto permute;
41286 goto widen;
41288 widen:
41289 /* Replicate the value once into the next wider mode and recurse. */
41291 machine_mode smode, wsmode, wvmode;
41292 rtx x;
41294 smode = GET_MODE_INNER (mode);
41295 wvmode = get_mode_wider_vector (mode);
41296 wsmode = GET_MODE_INNER (wvmode);
41298 val = convert_modes (wsmode, smode, val, true);
41299 x = expand_simple_binop (wsmode, ASHIFT, val,
41300 GEN_INT (GET_MODE_BITSIZE (smode)),
41301 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41302 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41304 x = gen_reg_rtx (wvmode);
41305 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41306 gcc_assert (ok);
41307 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41308 return ok;
41311 case E_V16HImode:
41312 case E_V32QImode:
41313 if (TARGET_AVX2)
41314 return ix86_vector_duplicate_value (mode, target, val);
41315 else
41317 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41318 rtx x = gen_reg_rtx (hvmode);
41320 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41321 gcc_assert (ok);
41323 x = gen_rtx_VEC_CONCAT (mode, x, x);
41324 emit_insn (gen_rtx_SET (target, x));
41326 return true;
41328 case E_V64QImode:
41329 case E_V32HImode:
41330 if (TARGET_AVX512BW)
41331 return ix86_vector_duplicate_value (mode, target, val);
41332 else
41334 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41335 rtx x = gen_reg_rtx (hvmode);
41337 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41338 gcc_assert (ok);
41340 x = gen_rtx_VEC_CONCAT (mode, x, x);
41341 emit_insn (gen_rtx_SET (target, x));
41343 return true;
41345 default:
41346 return false;
41350 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41351 whose ONE_VAR element is VAR, and other elements are zero. Return true
41352 if successful. */
41354 static bool
41355 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41356 rtx target, rtx var, int one_var)
41358 machine_mode vsimode;
41359 rtx new_target;
41360 rtx x, tmp;
41361 bool use_vector_set = false;
41363 switch (mode)
41365 case E_V2DImode:
41366 /* For SSE4.1, we normally use vector set. But if the second
41367 element is zero and inter-unit moves are OK, we use movq
41368 instead. */
41369 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41370 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41371 && one_var == 0));
41372 break;
41373 case E_V16QImode:
41374 case E_V4SImode:
41375 case E_V4SFmode:
41376 use_vector_set = TARGET_SSE4_1;
41377 break;
41378 case E_V8HImode:
41379 use_vector_set = TARGET_SSE2;
41380 break;
41381 case E_V4HImode:
41382 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41383 break;
41384 case E_V32QImode:
41385 case E_V16HImode:
41386 case E_V8SImode:
41387 case E_V8SFmode:
41388 case E_V4DFmode:
41389 use_vector_set = TARGET_AVX;
41390 break;
41391 case E_V4DImode:
41392 /* Use ix86_expand_vector_set in 64bit mode only. */
41393 use_vector_set = TARGET_AVX && TARGET_64BIT;
41394 break;
41395 default:
41396 break;
41399 if (use_vector_set)
41401 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41402 var = force_reg (GET_MODE_INNER (mode), var);
41403 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41404 return true;
41407 switch (mode)
41409 case E_V2SFmode:
41410 case E_V2SImode:
41411 if (!mmx_ok)
41412 return false;
41413 /* FALLTHRU */
41415 case E_V2DFmode:
41416 case E_V2DImode:
41417 if (one_var != 0)
41418 return false;
41419 var = force_reg (GET_MODE_INNER (mode), var);
41420 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41421 emit_insn (gen_rtx_SET (target, x));
41422 return true;
41424 case E_V4SFmode:
41425 case E_V4SImode:
41426 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41427 new_target = gen_reg_rtx (mode);
41428 else
41429 new_target = target;
41430 var = force_reg (GET_MODE_INNER (mode), var);
41431 x = gen_rtx_VEC_DUPLICATE (mode, var);
41432 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41433 emit_insn (gen_rtx_SET (new_target, x));
41434 if (one_var != 0)
41436 /* We need to shuffle the value to the correct position, so
41437 create a new pseudo to store the intermediate result. */
41439 /* With SSE2, we can use the integer shuffle insns. */
41440 if (mode != V4SFmode && TARGET_SSE2)
41442 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41443 const1_rtx,
41444 GEN_INT (one_var == 1 ? 0 : 1),
41445 GEN_INT (one_var == 2 ? 0 : 1),
41446 GEN_INT (one_var == 3 ? 0 : 1)));
41447 if (target != new_target)
41448 emit_move_insn (target, new_target);
41449 return true;
41452 /* Otherwise convert the intermediate result to V4SFmode and
41453 use the SSE1 shuffle instructions. */
41454 if (mode != V4SFmode)
41456 tmp = gen_reg_rtx (V4SFmode);
41457 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41459 else
41460 tmp = new_target;
41462 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41463 const1_rtx,
41464 GEN_INT (one_var == 1 ? 0 : 1),
41465 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41466 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41468 if (mode != V4SFmode)
41469 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41470 else if (tmp != target)
41471 emit_move_insn (target, tmp);
41473 else if (target != new_target)
41474 emit_move_insn (target, new_target);
41475 return true;
41477 case E_V8HImode:
41478 case E_V16QImode:
41479 vsimode = V4SImode;
41480 goto widen;
41481 case E_V4HImode:
41482 case E_V8QImode:
41483 if (!mmx_ok)
41484 return false;
41485 vsimode = V2SImode;
41486 goto widen;
41487 widen:
41488 if (one_var != 0)
41489 return false;
41491 /* Zero extend the variable element to SImode and recurse. */
41492 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41494 x = gen_reg_rtx (vsimode);
41495 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41496 var, one_var))
41497 gcc_unreachable ();
41499 emit_move_insn (target, gen_lowpart (mode, x));
41500 return true;
41502 default:
41503 return false;
41507 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41508 consisting of the values in VALS. It is known that all elements
41509 except ONE_VAR are constants. Return true if successful. */
41511 static bool
41512 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41513 rtx target, rtx vals, int one_var)
41515 rtx var = XVECEXP (vals, 0, one_var);
41516 machine_mode wmode;
41517 rtx const_vec, x;
41519 const_vec = copy_rtx (vals);
41520 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41521 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41523 switch (mode)
41525 case E_V2DFmode:
41526 case E_V2DImode:
41527 case E_V2SFmode:
41528 case E_V2SImode:
41529 /* For the two element vectors, it's just as easy to use
41530 the general case. */
41531 return false;
41533 case E_V4DImode:
41534 /* Use ix86_expand_vector_set in 64bit mode only. */
41535 if (!TARGET_64BIT)
41536 return false;
41537 /* FALLTHRU */
41538 case E_V4DFmode:
41539 case E_V8SFmode:
41540 case E_V8SImode:
41541 case E_V16HImode:
41542 case E_V32QImode:
41543 case E_V4SFmode:
41544 case E_V4SImode:
41545 case E_V8HImode:
41546 case E_V4HImode:
41547 break;
41549 case E_V16QImode:
41550 if (TARGET_SSE4_1)
41551 break;
41552 wmode = V8HImode;
41553 goto widen;
41554 case E_V8QImode:
41555 wmode = V4HImode;
41556 goto widen;
41557 widen:
41558 /* There's no way to set one QImode entry easily. Combine
41559 the variable value with its adjacent constant value, and
41560 promote to an HImode set. */
41561 x = XVECEXP (vals, 0, one_var ^ 1);
41562 if (one_var & 1)
41564 var = convert_modes (HImode, QImode, var, true);
41565 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41566 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41567 x = GEN_INT (INTVAL (x) & 0xff);
41569 else
41571 var = convert_modes (HImode, QImode, var, true);
41572 x = gen_int_mode (INTVAL (x) << 8, HImode);
41574 if (x != const0_rtx)
41575 var = expand_simple_binop (HImode, IOR, var, x, var,
41576 1, OPTAB_LIB_WIDEN);
41578 x = gen_reg_rtx (wmode);
41579 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41580 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41582 emit_move_insn (target, gen_lowpart (mode, x));
41583 return true;
41585 default:
41586 return false;
41589 emit_move_insn (target, const_vec);
41590 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41591 return true;
41594 /* A subroutine of ix86_expand_vector_init_general. Use vector
41595 concatenate to handle the most general case: all values variable,
41596 and none identical. */
41598 static void
41599 ix86_expand_vector_init_concat (machine_mode mode,
41600 rtx target, rtx *ops, int n)
41602 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41603 rtx first[16], second[8], third[4];
41604 rtvec v;
41605 int i, j;
41607 switch (n)
41609 case 2:
41610 switch (mode)
41612 case E_V16SImode:
41613 cmode = V8SImode;
41614 break;
41615 case E_V16SFmode:
41616 cmode = V8SFmode;
41617 break;
41618 case E_V8DImode:
41619 cmode = V4DImode;
41620 break;
41621 case E_V8DFmode:
41622 cmode = V4DFmode;
41623 break;
41624 case E_V8SImode:
41625 cmode = V4SImode;
41626 break;
41627 case E_V8SFmode:
41628 cmode = V4SFmode;
41629 break;
41630 case E_V4DImode:
41631 cmode = V2DImode;
41632 break;
41633 case E_V4DFmode:
41634 cmode = V2DFmode;
41635 break;
41636 case E_V4SImode:
41637 cmode = V2SImode;
41638 break;
41639 case E_V4SFmode:
41640 cmode = V2SFmode;
41641 break;
41642 case E_V2DImode:
41643 cmode = DImode;
41644 break;
41645 case E_V2SImode:
41646 cmode = SImode;
41647 break;
41648 case E_V2DFmode:
41649 cmode = DFmode;
41650 break;
41651 case E_V2SFmode:
41652 cmode = SFmode;
41653 break;
41654 default:
41655 gcc_unreachable ();
41658 if (!register_operand (ops[1], cmode))
41659 ops[1] = force_reg (cmode, ops[1]);
41660 if (!register_operand (ops[0], cmode))
41661 ops[0] = force_reg (cmode, ops[0]);
41662 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
41663 ops[1])));
41664 break;
41666 case 4:
41667 switch (mode)
41669 case E_V4DImode:
41670 cmode = V2DImode;
41671 break;
41672 case E_V4DFmode:
41673 cmode = V2DFmode;
41674 break;
41675 case E_V4SImode:
41676 cmode = V2SImode;
41677 break;
41678 case E_V4SFmode:
41679 cmode = V2SFmode;
41680 break;
41681 default:
41682 gcc_unreachable ();
41684 goto half;
41686 case 8:
41687 switch (mode)
41689 case E_V8DImode:
41690 cmode = V2DImode;
41691 hmode = V4DImode;
41692 break;
41693 case E_V8DFmode:
41694 cmode = V2DFmode;
41695 hmode = V4DFmode;
41696 break;
41697 case E_V8SImode:
41698 cmode = V2SImode;
41699 hmode = V4SImode;
41700 break;
41701 case E_V8SFmode:
41702 cmode = V2SFmode;
41703 hmode = V4SFmode;
41704 break;
41705 default:
41706 gcc_unreachable ();
41708 goto half;
41710 case 16:
41711 switch (mode)
41713 case E_V16SImode:
41714 cmode = V2SImode;
41715 hmode = V4SImode;
41716 gmode = V8SImode;
41717 break;
41718 case E_V16SFmode:
41719 cmode = V2SFmode;
41720 hmode = V4SFmode;
41721 gmode = V8SFmode;
41722 break;
41723 default:
41724 gcc_unreachable ();
41726 goto half;
41728 half:
41729 /* FIXME: We process inputs backward to help RA. PR 36222. */
41730 i = n - 1;
41731 j = (n >> 1) - 1;
41732 for (; i > 0; i -= 2, j--)
41734 first[j] = gen_reg_rtx (cmode);
41735 v = gen_rtvec (2, ops[i - 1], ops[i]);
41736 ix86_expand_vector_init (false, first[j],
41737 gen_rtx_PARALLEL (cmode, v));
41740 n >>= 1;
41741 if (n > 4)
41743 gcc_assert (hmode != VOIDmode);
41744 gcc_assert (gmode != VOIDmode);
41745 for (i = j = 0; i < n; i += 2, j++)
41747 second[j] = gen_reg_rtx (hmode);
41748 ix86_expand_vector_init_concat (hmode, second [j],
41749 &first [i], 2);
41751 n >>= 1;
41752 for (i = j = 0; i < n; i += 2, j++)
41754 third[j] = gen_reg_rtx (gmode);
41755 ix86_expand_vector_init_concat (gmode, third[j],
41756 &second[i], 2);
41758 n >>= 1;
41759 ix86_expand_vector_init_concat (mode, target, third, n);
41761 else if (n > 2)
41763 gcc_assert (hmode != VOIDmode);
41764 for (i = j = 0; i < n; i += 2, j++)
41766 second[j] = gen_reg_rtx (hmode);
41767 ix86_expand_vector_init_concat (hmode, second [j],
41768 &first [i], 2);
41770 n >>= 1;
41771 ix86_expand_vector_init_concat (mode, target, second, n);
41773 else
41774 ix86_expand_vector_init_concat (mode, target, first, n);
41775 break;
41777 default:
41778 gcc_unreachable ();
41782 /* A subroutine of ix86_expand_vector_init_general. Use vector
41783 interleave to handle the most general case: all values variable,
41784 and none identical. */
41786 static void
41787 ix86_expand_vector_init_interleave (machine_mode mode,
41788 rtx target, rtx *ops, int n)
41790 machine_mode first_imode, second_imode, third_imode, inner_mode;
41791 int i, j;
41792 rtx op0, op1;
41793 rtx (*gen_load_even) (rtx, rtx, rtx);
41794 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
41795 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
41797 switch (mode)
41799 case E_V8HImode:
41800 gen_load_even = gen_vec_setv8hi;
41801 gen_interleave_first_low = gen_vec_interleave_lowv4si;
41802 gen_interleave_second_low = gen_vec_interleave_lowv2di;
41803 inner_mode = HImode;
41804 first_imode = V4SImode;
41805 second_imode = V2DImode;
41806 third_imode = VOIDmode;
41807 break;
41808 case E_V16QImode:
41809 gen_load_even = gen_vec_setv16qi;
41810 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
41811 gen_interleave_second_low = gen_vec_interleave_lowv4si;
41812 inner_mode = QImode;
41813 first_imode = V8HImode;
41814 second_imode = V4SImode;
41815 third_imode = V2DImode;
41816 break;
41817 default:
41818 gcc_unreachable ();
41821 for (i = 0; i < n; i++)
41823 /* Extend the odd element to SImode using a paradoxical SUBREG. */
41824 op0 = gen_reg_rtx (SImode);
41825 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
41827 /* Insert the SImode value as low element of V4SImode vector. */
41828 op1 = gen_reg_rtx (V4SImode);
41829 op0 = gen_rtx_VEC_MERGE (V4SImode,
41830 gen_rtx_VEC_DUPLICATE (V4SImode,
41831 op0),
41832 CONST0_RTX (V4SImode),
41833 const1_rtx);
41834 emit_insn (gen_rtx_SET (op1, op0));
41836 /* Cast the V4SImode vector back to a vector in the original mode. */
41837 op0 = gen_reg_rtx (mode);
41838 emit_move_insn (op0, gen_lowpart (mode, op1));
41840 /* Load even elements into the second position. */
41841 emit_insn (gen_load_even (op0,
41842 force_reg (inner_mode,
41843 ops [i + i + 1]),
41844 const1_rtx));
41846 /* Cast vector to FIRST_IMODE vector. */
41847 ops[i] = gen_reg_rtx (first_imode);
41848 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
41851 /* Interleave low FIRST_IMODE vectors. */
41852 for (i = j = 0; i < n; i += 2, j++)
41854 op0 = gen_reg_rtx (first_imode);
41855 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
41857 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
41858 ops[j] = gen_reg_rtx (second_imode);
41859 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
41862 /* Interleave low SECOND_IMODE vectors. */
41863 switch (second_imode)
41865 case E_V4SImode:
41866 for (i = j = 0; i < n / 2; i += 2, j++)
41868 op0 = gen_reg_rtx (second_imode);
41869 emit_insn (gen_interleave_second_low (op0, ops[i],
41870 ops[i + 1]));
41872 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
41873 vector. */
41874 ops[j] = gen_reg_rtx (third_imode);
41875 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
41877 second_imode = V2DImode;
41878 gen_interleave_second_low = gen_vec_interleave_lowv2di;
41879 /* FALLTHRU */
41881 case E_V2DImode:
41882 op0 = gen_reg_rtx (second_imode);
41883 emit_insn (gen_interleave_second_low (op0, ops[0],
41884 ops[1]));
41886 /* Cast the SECOND_IMODE vector back to a vector in the original
41887 mode. */
41888 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
41889 break;
41891 default:
41892 gcc_unreachable ();
41896 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
41897 all values variable, and none identical. */
41899 static void
41900 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
41901 rtx target, rtx vals)
41903 rtx ops[64], op0, op1, op2, op3, op4, op5;
41904 machine_mode half_mode = VOIDmode;
41905 machine_mode quarter_mode = VOIDmode;
41906 int n, i;
41908 switch (mode)
41910 case E_V2SFmode:
41911 case E_V2SImode:
41912 if (!mmx_ok && !TARGET_SSE)
41913 break;
41914 /* FALLTHRU */
41916 case E_V16SImode:
41917 case E_V16SFmode:
41918 case E_V8DFmode:
41919 case E_V8DImode:
41920 case E_V8SFmode:
41921 case E_V8SImode:
41922 case E_V4DFmode:
41923 case E_V4DImode:
41924 case E_V4SFmode:
41925 case E_V4SImode:
41926 case E_V2DFmode:
41927 case E_V2DImode:
41928 n = GET_MODE_NUNITS (mode);
41929 for (i = 0; i < n; i++)
41930 ops[i] = XVECEXP (vals, 0, i);
41931 ix86_expand_vector_init_concat (mode, target, ops, n);
41932 return;
41934 case E_V2TImode:
41935 for (i = 0; i < 2; i++)
41936 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
41937 op0 = gen_reg_rtx (V4DImode);
41938 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
41939 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
41940 return;
41942 case E_V4TImode:
41943 for (i = 0; i < 4; i++)
41944 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
41945 ops[4] = gen_reg_rtx (V4DImode);
41946 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
41947 ops[5] = gen_reg_rtx (V4DImode);
41948 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
41949 op0 = gen_reg_rtx (V8DImode);
41950 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
41951 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
41952 return;
41954 case E_V32QImode:
41955 half_mode = V16QImode;
41956 goto half;
41958 case E_V16HImode:
41959 half_mode = V8HImode;
41960 goto half;
41962 half:
41963 n = GET_MODE_NUNITS (mode);
41964 for (i = 0; i < n; i++)
41965 ops[i] = XVECEXP (vals, 0, i);
41966 op0 = gen_reg_rtx (half_mode);
41967 op1 = gen_reg_rtx (half_mode);
41968 ix86_expand_vector_init_interleave (half_mode, op0, ops,
41969 n >> 2);
41970 ix86_expand_vector_init_interleave (half_mode, op1,
41971 &ops [n >> 1], n >> 2);
41972 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
41973 return;
41975 case E_V64QImode:
41976 quarter_mode = V16QImode;
41977 half_mode = V32QImode;
41978 goto quarter;
41980 case E_V32HImode:
41981 quarter_mode = V8HImode;
41982 half_mode = V16HImode;
41983 goto quarter;
41985 quarter:
41986 n = GET_MODE_NUNITS (mode);
41987 for (i = 0; i < n; i++)
41988 ops[i] = XVECEXP (vals, 0, i);
41989 op0 = gen_reg_rtx (quarter_mode);
41990 op1 = gen_reg_rtx (quarter_mode);
41991 op2 = gen_reg_rtx (quarter_mode);
41992 op3 = gen_reg_rtx (quarter_mode);
41993 op4 = gen_reg_rtx (half_mode);
41994 op5 = gen_reg_rtx (half_mode);
41995 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
41996 n >> 3);
41997 ix86_expand_vector_init_interleave (quarter_mode, op1,
41998 &ops [n >> 2], n >> 3);
41999 ix86_expand_vector_init_interleave (quarter_mode, op2,
42000 &ops [n >> 1], n >> 3);
42001 ix86_expand_vector_init_interleave (quarter_mode, op3,
42002 &ops [(n >> 1) | (n >> 2)], n >> 3);
42003 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42004 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42005 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42006 return;
42008 case E_V16QImode:
42009 if (!TARGET_SSE4_1)
42010 break;
42011 /* FALLTHRU */
42013 case E_V8HImode:
42014 if (!TARGET_SSE2)
42015 break;
42017 /* Don't use ix86_expand_vector_init_interleave if we can't
42018 move from GPR to SSE register directly. */
42019 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42020 break;
42022 n = GET_MODE_NUNITS (mode);
42023 for (i = 0; i < n; i++)
42024 ops[i] = XVECEXP (vals, 0, i);
42025 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42026 return;
42028 case E_V4HImode:
42029 case E_V8QImode:
42030 break;
42032 default:
42033 gcc_unreachable ();
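/* Fallback for the remaining small vector modes: pack the elements into
   word_mode values with shifts and ORs, then assemble the vector from
   those words.  */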
42037 int i, j, n_elts, n_words, n_elt_per_word;
42038 machine_mode inner_mode;
42039 rtx words[4], shift;
42041 inner_mode = GET_MODE_INNER (mode);
42042 n_elts = GET_MODE_NUNITS (mode);
42043 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42044 n_elt_per_word = n_elts / n_words;
42045 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42047 for (i = 0; i < n_words; ++i)
42049 rtx word = NULL_RTX;
42051 for (j = 0; j < n_elt_per_word; ++j)
42053 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42054 elt = convert_modes (word_mode, inner_mode, elt, true);
42056 if (j == 0)
42057 word = elt;
42058 else
42060 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42061 word, 1, OPTAB_LIB_WIDEN);
42062 word = expand_simple_binop (word_mode, IOR, word, elt,
42063 word, 1, OPTAB_LIB_WIDEN);
42067 words[i] = word;
42070 if (n_words == 1)
42071 emit_move_insn (target, gen_lowpart (mode, words[0]));
42072 else if (n_words == 2)
42074 rtx tmp = gen_reg_rtx (mode);
42075 emit_clobber (tmp);
42076 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42077 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42078 emit_move_insn (target, tmp);
42080 else if (n_words == 4)
42082 rtx tmp = gen_reg_rtx (V4SImode);
42083 gcc_assert (word_mode == SImode);
42084 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42085 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42086 emit_move_insn (target, gen_lowpart (mode, tmp));
42088 else
42089 gcc_unreachable ();
42093 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42094 instructions unless MMX_OK is true. */
42096 void
42097 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42099 machine_mode mode = GET_MODE (target);
42100 machine_mode inner_mode = GET_MODE_INNER (mode);
42101 int n_elts = GET_MODE_NUNITS (mode);
42102 int n_var = 0, one_var = -1;
42103 bool all_same = true, all_const_zero = true;
42104 int i;
42105 rtx x;
42107 /* Handle first initialization from vector elts. */
42108 if (n_elts != XVECLEN (vals, 0))
42110 rtx subtarget = target;
42111 x = XVECEXP (vals, 0, 0);
42112 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42113 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42115 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42116 if (inner_mode == QImode || inner_mode == HImode)
42118 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42119 mode = mode_for_vector (SImode, n_bits / 4).require ();
42120 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42121 ops[0] = gen_lowpart (inner_mode, ops[0]);
42122 ops[1] = gen_lowpart (inner_mode, ops[1]);
42123 subtarget = gen_reg_rtx (mode);
42125 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42126 if (subtarget != target)
42127 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42128 return;
42130 gcc_unreachable ();
42133 for (i = 0; i < n_elts; ++i)
42135 x = XVECEXP (vals, 0, i);
42136 if (!(CONST_SCALAR_INT_P (x)
42137 || CONST_DOUBLE_P (x)
42138 || CONST_FIXED_P (x)))
42139 n_var++, one_var = i;
42140 else if (x != CONST0_RTX (inner_mode))
42141 all_const_zero = false;
42142 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42143 all_same = false;
42146 /* Constants are best loaded from the constant pool. */
42147 if (n_var == 0)
42149 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42150 return;
42153 /* If all values are identical, broadcast the value. */
42154 if (all_same
42155 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42156 XVECEXP (vals, 0, 0)))
42157 return;
42159 /* Values where only one field is non-constant are best loaded from
42160 the pool and overwritten via move later. */
42161 if (n_var == 1)
42163 if (all_const_zero
42164 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42165 XVECEXP (vals, 0, one_var),
42166 one_var))
42167 return;
42169 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42170 return;
42173 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42176 void
42177 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42179 machine_mode mode = GET_MODE (target);
42180 machine_mode inner_mode = GET_MODE_INNER (mode);
42181 machine_mode half_mode;
42182 bool use_vec_merge = false;
42183 rtx tmp;
42184 static rtx (*gen_extract[6][2]) (rtx, rtx)
42186 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42187 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42188 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42189 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42190 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42191 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42193 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42195 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42196 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42197 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42198 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42199 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42200 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42202 int i, j, n;
42203 machine_mode mmode = VOIDmode;
42204 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42206 switch (mode)
42208 case E_V2SFmode:
42209 case E_V2SImode:
42210 if (mmx_ok)
42212 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42213 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42214 if (elt == 0)
42215 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42216 else
42217 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42218 emit_insn (gen_rtx_SET (target, tmp));
42219 return;
42221 break;
42223 case E_V2DImode:
42224 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42225 if (use_vec_merge)
42226 break;
42228 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42229 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42230 if (elt == 0)
42231 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42232 else
42233 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42234 emit_insn (gen_rtx_SET (target, tmp));
42235 return;
42237 case E_V2DFmode:
42239 rtx op0, op1;
42241 /* For the two element vectors, we implement a VEC_CONCAT with
42242 the extraction of the other element. */
42244 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42245 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42247 if (elt == 0)
42248 op0 = val, op1 = tmp;
42249 else
42250 op0 = tmp, op1 = val;
42252 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42253 emit_insn (gen_rtx_SET (target, tmp));
42255 return;
42257 case E_V4SFmode:
42258 use_vec_merge = TARGET_SSE4_1;
42259 if (use_vec_merge)
42260 break;
42262 switch (elt)
42264 case 0:
42265 use_vec_merge = true;
42266 break;
42268 case 1:
42269 /* tmp = target = A B C D */
42270 tmp = copy_to_reg (target);
42271 /* target = A A B B */
42272 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42273 /* target = X A B B */
42274 ix86_expand_vector_set (false, target, val, 0);
42275 /* target = A X C D */
42276 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42277 const1_rtx, const0_rtx,
42278 GEN_INT (2+4), GEN_INT (3+4)));
42279 return;
42281 case 2:
42282 /* tmp = target = A B C D */
42283 tmp = copy_to_reg (target);
42284 /* tmp = X B C D */
42285 ix86_expand_vector_set (false, tmp, val, 0);
42286 /* target = A B X D */
42287 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42288 const0_rtx, const1_rtx,
42289 GEN_INT (0+4), GEN_INT (3+4)));
42290 return;
42292 case 3:
42293 /* tmp = target = A B C D */
42294 tmp = copy_to_reg (target);
42295 /* tmp = X B C D */
42296 ix86_expand_vector_set (false, tmp, val, 0);
42297 /* target = A B C X */
42298 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42299 const0_rtx, const1_rtx,
42300 GEN_INT (2+4), GEN_INT (0+4)));
42301 return;
42303 default:
42304 gcc_unreachable ();
42306 break;
42308 case E_V4SImode:
42309 use_vec_merge = TARGET_SSE4_1;
42310 if (use_vec_merge)
42311 break;
42313 /* Element 0 handled by vec_merge below. */
42314 if (elt == 0)
42316 use_vec_merge = true;
42317 break;
42320 if (TARGET_SSE2)
42322 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42323 store into element 0, then shuffle them back. */
42325 rtx order[4];
42327 order[0] = GEN_INT (elt);
42328 order[1] = const1_rtx;
42329 order[2] = const2_rtx;
42330 order[3] = GEN_INT (3);
42331 order[elt] = const0_rtx;
42333 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42334 order[1], order[2], order[3]));
42336 ix86_expand_vector_set (false, target, val, 0);
42338 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42339 order[1], order[2], order[3]));
42341 else
42343 /* For SSE1, we have to reuse the V4SF code. */
42344 rtx t = gen_reg_rtx (V4SFmode);
42345 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42346 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42347 emit_move_insn (target, gen_lowpart (mode, t));
42349 return;
42351 case E_V8HImode:
42352 use_vec_merge = TARGET_SSE2;
42353 break;
42354 case E_V4HImode:
42355 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42356 break;
42358 case E_V16QImode:
42359 use_vec_merge = TARGET_SSE4_1;
42360 break;
42362 case E_V8QImode:
42363 break;
42365 case E_V32QImode:
42366 half_mode = V16QImode;
42367 j = 0;
42368 n = 16;
42369 goto half;
42371 case E_V16HImode:
42372 half_mode = V8HImode;
42373 j = 1;
42374 n = 8;
42375 goto half;
42377 case E_V8SImode:
42378 half_mode = V4SImode;
42379 j = 2;
42380 n = 4;
42381 goto half;
42383 case E_V4DImode:
42384 half_mode = V2DImode;
42385 j = 3;
42386 n = 2;
42387 goto half;
42389 case E_V8SFmode:
42390 half_mode = V4SFmode;
42391 j = 4;
42392 n = 4;
42393 goto half;
42395 case E_V4DFmode:
42396 half_mode = V2DFmode;
42397 j = 5;
42398 n = 2;
42399 goto half;
42401 half:
42402 /* Compute offset. */
42403 i = elt / n;
42404 elt %= n;
42406 gcc_assert (i <= 1);
42408 /* Extract the half. */
42409 tmp = gen_reg_rtx (half_mode);
42410 emit_insn (gen_extract[j][i] (tmp, target));
42412 /* Put val in tmp at elt. */
42413 ix86_expand_vector_set (false, tmp, val, elt);
42415 /* Put it back. */
42416 emit_insn (gen_insert[j][i] (target, target, tmp));
42417 return;
42419 case E_V8DFmode:
42420 if (TARGET_AVX512F)
42422 mmode = QImode;
42423 gen_blendm = gen_avx512f_blendmv8df;
42425 break;
42427 case E_V8DImode:
42428 if (TARGET_AVX512F)
42430 mmode = QImode;
42431 gen_blendm = gen_avx512f_blendmv8di;
42433 break;
42435 case E_V16SFmode:
42436 if (TARGET_AVX512F)
42438 mmode = HImode;
42439 gen_blendm = gen_avx512f_blendmv16sf;
42441 break;
42443 case E_V16SImode:
42444 if (TARGET_AVX512F)
42446 mmode = HImode;
42447 gen_blendm = gen_avx512f_blendmv16si;
42449 break;
42451 case E_V32HImode:
42452 if (TARGET_AVX512F && TARGET_AVX512BW)
42454 mmode = SImode;
42455 gen_blendm = gen_avx512bw_blendmv32hi;
42457 break;
42459 case E_V64QImode:
42460 if (TARGET_AVX512F && TARGET_AVX512BW)
42462 mmode = DImode;
42463 gen_blendm = gen_avx512bw_blendmv64qi;
42465 break;
42467 default:
42468 break;
42471 if (mmode != VOIDmode)
42473 tmp = gen_reg_rtx (mode);
42474 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42475 /* The avx512*_blendm<mode> expanders have different operand order
42476 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42477 elements where the mask is set and second input operand otherwise,
42478 in {sse,avx}*_*blend* the first input operand is used for elements
42479 where the mask is clear and second input operand otherwise. */
42480 emit_insn (gen_blendm (target, target, tmp,
42481 force_reg (mmode,
42482 gen_int_mode (1 << elt, mmode))));
42484 else if (use_vec_merge)
42486 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42487 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42488 emit_insn (gen_rtx_SET (target, tmp));
42490 else
42492 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42494 emit_move_insn (mem, target);
42496 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42497 emit_move_insn (tmp, val);
42499 emit_move_insn (target, mem);
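/* Illustrative sketch (not part of the original file): the fallback path
   above spills the vector to a stack slot, overwrites the selected element
   through an inner-mode view of that slot, and reloads the whole vector.
   A scalar C model of the same idea for a hypothetical 4 x float vector: */
#if 0
#include <string.h>

typedef struct { float e[4]; } v4sf;

static v4sf
set_elt (v4sf v, float val, int elt)
{
  float mem[4];
  memcpy (mem, v.e, sizeof mem);   /* emit_move_insn (mem, target) */
  mem[elt] = val;                  /* store VAL at the element's offset */
  memcpy (v.e, mem, sizeof mem);   /* emit_move_insn (target, mem) */
  return v;
}
#endif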
42503 void
42504 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42506 machine_mode mode = GET_MODE (vec);
42507 machine_mode inner_mode = GET_MODE_INNER (mode);
42508 bool use_vec_extr = false;
42509 rtx tmp;
42511 switch (mode)
42513 case E_V2SImode:
42514 case E_V2SFmode:
42515 if (!mmx_ok)
42516 break;
42517 /* FALLTHRU */
42519 case E_V2DFmode:
42520 case E_V2DImode:
42521 case E_V2TImode:
42522 case E_V4TImode:
42523 use_vec_extr = true;
42524 break;
42526 case E_V4SFmode:
42527 use_vec_extr = TARGET_SSE4_1;
42528 if (use_vec_extr)
42529 break;
42531 switch (elt)
42533 case 0:
42534 tmp = vec;
42535 break;
42537 case 1:
42538 case 3:
42539 tmp = gen_reg_rtx (mode);
42540 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42541 GEN_INT (elt), GEN_INT (elt),
42542 GEN_INT (elt+4), GEN_INT (elt+4)));
42543 break;
42545 case 2:
42546 tmp = gen_reg_rtx (mode);
42547 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42548 break;
42550 default:
42551 gcc_unreachable ();
42553 vec = tmp;
42554 use_vec_extr = true;
42555 elt = 0;
42556 break;
42558 case E_V4SImode:
42559 use_vec_extr = TARGET_SSE4_1;
42560 if (use_vec_extr)
42561 break;
42563 if (TARGET_SSE2)
42565 switch (elt)
42567 case 0:
42568 tmp = vec;
42569 break;
42571 case 1:
42572 case 3:
42573 tmp = gen_reg_rtx (mode);
42574 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42575 GEN_INT (elt), GEN_INT (elt),
42576 GEN_INT (elt), GEN_INT (elt)));
42577 break;
42579 case 2:
42580 tmp = gen_reg_rtx (mode);
42581 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42582 break;
42584 default:
42585 gcc_unreachable ();
42587 vec = tmp;
42588 use_vec_extr = true;
42589 elt = 0;
42591 else
42593 /* For SSE1, we have to reuse the V4SF code. */
42594 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42595 gen_lowpart (V4SFmode, vec), elt);
42596 return;
42598 break;
42600 case E_V8HImode:
42601 use_vec_extr = TARGET_SSE2;
42602 break;
42603 case E_V4HImode:
42604 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42605 break;
42607 case E_V16QImode:
42608 use_vec_extr = TARGET_SSE4_1;
42609 break;
42611 case E_V8SFmode:
42612 if (TARGET_AVX)
42614 tmp = gen_reg_rtx (V4SFmode);
42615 if (elt < 4)
42616 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42617 else
42618 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42619 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42620 return;
42622 break;
42624 case E_V4DFmode:
42625 if (TARGET_AVX)
42627 tmp = gen_reg_rtx (V2DFmode);
42628 if (elt < 2)
42629 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
42630 else
42631 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
42632 ix86_expand_vector_extract (false, target, tmp, elt & 1);
42633 return;
42635 break;
42637 case E_V32QImode:
42638 if (TARGET_AVX)
42640 tmp = gen_reg_rtx (V16QImode);
42641 if (elt < 16)
42642 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
42643 else
42644 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
42645 ix86_expand_vector_extract (false, target, tmp, elt & 15);
42646 return;
42648 break;
42650 case E_V16HImode:
42651 if (TARGET_AVX)
42653 tmp = gen_reg_rtx (V8HImode);
42654 if (elt < 8)
42655 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
42656 else
42657 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
42658 ix86_expand_vector_extract (false, target, tmp, elt & 7);
42659 return;
42661 break;
42663 case E_V8SImode:
42664 if (TARGET_AVX)
42666 tmp = gen_reg_rtx (V4SImode);
42667 if (elt < 4)
42668 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
42669 else
42670 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
42671 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42672 return;
42674 break;
42676 case E_V4DImode:
42677 if (TARGET_AVX)
42679 tmp = gen_reg_rtx (V2DImode);
42680 if (elt < 2)
42681 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
42682 else
42683 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
42684 ix86_expand_vector_extract (false, target, tmp, elt & 1);
42685 return;
42687 break;
42689 case E_V32HImode:
42690 if (TARGET_AVX512BW)
42692 tmp = gen_reg_rtx (V16HImode);
42693 if (elt < 16)
42694 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
42695 else
42696 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
42697 ix86_expand_vector_extract (false, target, tmp, elt & 15);
42698 return;
42700 break;
42702 case E_V64QImode:
42703 if (TARGET_AVX512BW)
42705 tmp = gen_reg_rtx (V32QImode);
42706 if (elt < 32)
42707 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
42708 else
42709 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
42710 ix86_expand_vector_extract (false, target, tmp, elt & 31);
42711 return;
42713 break;
42715 case E_V16SFmode:
42716 tmp = gen_reg_rtx (V8SFmode);
42717 if (elt < 8)
42718 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
42719 else
42720 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
42721 ix86_expand_vector_extract (false, target, tmp, elt & 7);
42722 return;
42724 case E_V8DFmode:
42725 tmp = gen_reg_rtx (V4DFmode);
42726 if (elt < 4)
42727 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
42728 else
42729 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
42730 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42731 return;
42733 case E_V16SImode:
42734 tmp = gen_reg_rtx (V8SImode);
42735 if (elt < 8)
42736 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
42737 else
42738 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
42739 ix86_expand_vector_extract (false, target, tmp, elt & 7);
42740 return;
42742 case E_V8DImode:
42743 tmp = gen_reg_rtx (V4DImode);
42744 if (elt < 4)
42745 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
42746 else
42747 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
42748 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42749 return;
42751 case E_V8QImode:
42752 /* ??? Could extract the appropriate HImode element and shift. */
42753 default:
42754 break;
42757 if (use_vec_extr)
42759 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
42760 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
42762 /* Let the rtl optimizers know about the zero extension performed. */
42763 if (inner_mode == QImode || inner_mode == HImode)
42765 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
42766 target = gen_lowpart (SImode, target);
42769 emit_insn (gen_rtx_SET (target, tmp));
42771 else
42773 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42775 emit_move_insn (mem, vec);
42777 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42778 emit_move_insn (target, tmp);
42782 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
42783 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
42784 The upper bits of DEST are undefined, though they shouldn't cause
42785 exceptions (some bits from src or all zeros are ok). */
42787 static void
42788 emit_reduc_half (rtx dest, rtx src, int i)
42790 rtx tem, d = dest;
42791 switch (GET_MODE (src))
42793 case E_V4SFmode:
42794 if (i == 128)
42795 tem = gen_sse_movhlps (dest, src, src);
42796 else
42797 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
42798 GEN_INT (1 + 4), GEN_INT (1 + 4));
42799 break;
42800 case E_V2DFmode:
42801 tem = gen_vec_interleave_highv2df (dest, src, src);
42802 break;
42803 case E_V16QImode:
42804 case E_V8HImode:
42805 case E_V4SImode:
42806 case E_V2DImode:
42807 d = gen_reg_rtx (V1TImode);
42808 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
42809 GEN_INT (i / 2));
42810 break;
42811 case E_V8SFmode:
42812 if (i == 256)
42813 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
42814 else
42815 tem = gen_avx_shufps256 (dest, src, src,
42816 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
42817 break;
42818 case E_V4DFmode:
42819 if (i == 256)
42820 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
42821 else
42822 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
42823 break;
42824 case E_V32QImode:
42825 case E_V16HImode:
42826 case E_V8SImode:
42827 case E_V4DImode:
42828 if (i == 256)
42830 if (GET_MODE (dest) != V4DImode)
42831 d = gen_reg_rtx (V4DImode);
42832 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
42833 gen_lowpart (V4DImode, src),
42834 const1_rtx);
42836 else
42838 d = gen_reg_rtx (V2TImode);
42839 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
42840 GEN_INT (i / 2));
42842 break;
42843 case E_V64QImode:
42844 case E_V32HImode:
42845 case E_V16SImode:
42846 case E_V16SFmode:
42847 case E_V8DImode:
42848 case E_V8DFmode:
42849 if (i > 128)
42850 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
42851 gen_lowpart (V16SImode, src),
42852 gen_lowpart (V16SImode, src),
42853 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
42854 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
42855 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
42856 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
42857 GEN_INT (0xC), GEN_INT (0xD),
42858 GEN_INT (0xE), GEN_INT (0xF),
42859 GEN_INT (0x10), GEN_INT (0x11),
42860 GEN_INT (0x12), GEN_INT (0x13),
42861 GEN_INT (0x14), GEN_INT (0x15),
42862 GEN_INT (0x16), GEN_INT (0x17));
42863 else
42864 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
42865 gen_lowpart (V16SImode, src),
42866 GEN_INT (i == 128 ? 0x2 : 0x1),
42867 GEN_INT (0x3),
42868 GEN_INT (0x3),
42869 GEN_INT (0x3),
42870 GEN_INT (i == 128 ? 0x6 : 0x5),
42871 GEN_INT (0x7),
42872 GEN_INT (0x7),
42873 GEN_INT (0x7),
42874 GEN_INT (i == 128 ? 0xA : 0x9),
42875 GEN_INT (0xB),
42876 GEN_INT (0xB),
42877 GEN_INT (0xB),
42878 GEN_INT (i == 128 ? 0xE : 0xD),
42879 GEN_INT (0xF),
42880 GEN_INT (0xF),
42881 GEN_INT (0xF));
42882 break;
42883 default:
42884 gcc_unreachable ();
42886 emit_insn (tem);
42887 if (d != dest)
42888 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
42891 /* Expand a vector reduction. FN is the binary pattern to reduce;
42892 DEST is the destination; IN is the input vector. */
42894 void
42895 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
42897 rtx half, dst, vec = in;
42898 machine_mode mode = GET_MODE (in);
42899 int i;
42901 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
42902 if (TARGET_SSE4_1
42903 && mode == V8HImode
42904 && fn == gen_uminv8hi3)
42906 emit_insn (gen_sse4_1_phminposuw (dest, in));
42907 return;
42910 for (i = GET_MODE_BITSIZE (mode);
42911 i > GET_MODE_UNIT_BITSIZE (mode);
42912 i >>= 1)
42914 half = gen_reg_rtx (mode);
42915 emit_reduc_half (half, vec, i);
42916 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
42917 dst = dest;
42918 else
42919 dst = gen_reg_rtx (mode);
42920 emit_insn (fn (dst, half, vec));
42921 vec = dst;
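/* Illustrative sketch (not part of the original file): a scalar model of the
   halving reduction above for a hypothetical 4-element integer vector with an
   ADD combiner.  Each step moves the upper half down (emit_reduc_half) and
   combines it element-wise with what is already there, so after log2 (nelts)
   steps element 0 holds the reduction result.  */
#if 0
static int
reduce_add_4 (const int v[4])
{
  int a[4] = { v[0], v[1], v[2], v[3] }, half[4];
  for (int n = 4; n > 1; n /= 2)
    {
      for (int k = 0; k < n / 2; ++k)
        half[k] = a[k + n / 2];          /* emit_reduc_half */
      for (int k = 0; k < n / 2; ++k)
        a[k] += half[k];                 /* fn (dst, half, vec) */
    }
  return a[0];
}
#endif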
42925 /* Target hook for scalar_mode_supported_p. */
42926 static bool
42927 ix86_scalar_mode_supported_p (scalar_mode mode)
42929 if (DECIMAL_FLOAT_MODE_P (mode))
42930 return default_decimal_float_supported_p ();
42931 else if (mode == TFmode)
42932 return true;
42933 else
42934 return default_scalar_mode_supported_p (mode);
42937 /* Implements target hook vector_mode_supported_p. */
42938 static bool
42939 ix86_vector_mode_supported_p (machine_mode mode)
42941 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
42942 return true;
42943 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
42944 return true;
42945 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
42946 return true;
42947 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
42948 return true;
42949 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
42950 return true;
42951 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
42952 return true;
42953 return false;
42956 /* Target hook for c_mode_for_suffix. */
42957 static machine_mode
42958 ix86_c_mode_for_suffix (char suffix)
42960 if (suffix == 'q')
42961 return TFmode;
42962 if (suffix == 'w')
42963 return XFmode;
42965 return VOIDmode;
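/* Illustrative usage (not part of the original file): the suffixes handled
   above correspond to the GCC extended floating-constant suffixes on x86,
   e.g. (assuming the documented __float128/__float80 extensions):  */
#if 0
__float128 ln2_q = 0.693147180559945309417232121458176568Q;  /* TFmode, 'q' */
__float80  pi_w  = 3.14159265358979323846264338327950288W;   /* XFmode, 'w' */
#endif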
42968 /* Worker function for TARGET_MD_ASM_ADJUST.
42970 We implement asm flag outputs, and maintain source compatibility
42971 with the old cc0-based compiler. */
42973 static rtx_insn *
42974 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
42975 vec<const char *> &constraints,
42976 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
42978 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
42979 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
42981 bool saw_asm_flag = false;
42983 start_sequence ();
42984 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
42986 const char *con = constraints[i];
42987 if (strncmp (con, "=@cc", 4) != 0)
42988 continue;
42989 con += 4;
42990 if (strchr (con, ',') != NULL)
42992 error ("alternatives not allowed in asm flag output");
42993 continue;
42996 bool invert = false;
42997 if (con[0] == 'n')
42998 invert = true, con++;
43000 machine_mode mode = CCmode;
43001 rtx_code code = UNKNOWN;
43003 switch (con[0])
43005 case 'a':
43006 if (con[1] == 0)
43007 mode = CCAmode, code = EQ;
43008 else if (con[1] == 'e' && con[2] == 0)
43009 mode = CCCmode, code = NE;
43010 break;
43011 case 'b':
43012 if (con[1] == 0)
43013 mode = CCCmode, code = EQ;
43014 else if (con[1] == 'e' && con[2] == 0)
43015 mode = CCAmode, code = NE;
43016 break;
43017 case 'c':
43018 if (con[1] == 0)
43019 mode = CCCmode, code = EQ;
43020 break;
43021 case 'e':
43022 if (con[1] == 0)
43023 mode = CCZmode, code = EQ;
43024 break;
43025 case 'g':
43026 if (con[1] == 0)
43027 mode = CCGCmode, code = GT;
43028 else if (con[1] == 'e' && con[2] == 0)
43029 mode = CCGCmode, code = GE;
43030 break;
43031 case 'l':
43032 if (con[1] == 0)
43033 mode = CCGCmode, code = LT;
43034 else if (con[1] == 'e' && con[2] == 0)
43035 mode = CCGCmode, code = LE;
43036 break;
43037 case 'o':
43038 if (con[1] == 0)
43039 mode = CCOmode, code = EQ;
43040 break;
43041 case 'p':
43042 if (con[1] == 0)
43043 mode = CCPmode, code = EQ;
43044 break;
43045 case 's':
43046 if (con[1] == 0)
43047 mode = CCSmode, code = EQ;
43048 break;
43049 case 'z':
43050 if (con[1] == 0)
43051 mode = CCZmode, code = EQ;
43052 break;
43054 if (code == UNKNOWN)
43056 error ("unknown asm flag output %qs", constraints[i]);
43057 continue;
43059 if (invert)
43060 code = reverse_condition (code);
43062 rtx dest = outputs[i];
43063 if (!saw_asm_flag)
43065 /* This is the first asm flag output. Here we put the flags
43066 register in as the real output and adjust the condition to
43067 allow it. */
43068 constraints[i] = "=Bf";
43069 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43070 saw_asm_flag = true;
43072 else
43074 /* We don't need the flags register as output twice. */
43075 constraints[i] = "=X";
43076 outputs[i] = gen_rtx_SCRATCH (SImode);
43079 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43080 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43082 machine_mode dest_mode = GET_MODE (dest);
43083 if (!SCALAR_INT_MODE_P (dest_mode))
43085 error ("invalid type for asm flag output");
43086 continue;
43089 if (dest_mode == DImode && !TARGET_64BIT)
43090 dest_mode = SImode;
43092 if (dest_mode != QImode)
43094 rtx destqi = gen_reg_rtx (QImode);
43095 emit_insn (gen_rtx_SET (destqi, x));
43097 if (TARGET_ZERO_EXTEND_WITH_AND
43098 && optimize_function_for_speed_p (cfun))
43100 x = force_reg (dest_mode, const0_rtx);
43102 emit_insn (gen_movstrictqi
43103 (gen_lowpart (QImode, x), destqi));
43105 else
43106 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43109 if (dest_mode != GET_MODE (dest))
43111 rtx tmp = gen_reg_rtx (SImode);
43113 emit_insn (gen_rtx_SET (tmp, x));
43114 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43116 else
43117 emit_insn (gen_rtx_SET (dest, x));
43119 rtx_insn *seq = get_insns ();
43120 end_sequence ();
43122 if (saw_asm_flag)
43123 return seq;
43124 else
43126 /* If we had no asm flag outputs, clobber the flags. */
43127 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43128 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43129 return NULL;
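/* Illustrative sketch (not part of the original file): what the "=@cc<cond>"
   flag output constraints handled above look like from user code.  A minimal
   example using the carry condition ('c' above); the add sets CF on unsigned
   overflow and the asm binds that flag to an int output.  */
#if 0
static int
add_carries (unsigned int a, unsigned int b, unsigned int *sum)
{
  unsigned int s = a;
  int carry;
  __asm__ ("addl %2, %0"
           : "+r" (s), "=@ccc" (carry)
           : "r" (b));
  *sum = s;
  return carry;
}
#endif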
43133 /* Implements the target hook targetm.asm.encode_section_info. */
43135 static void ATTRIBUTE_UNUSED
43136 ix86_encode_section_info (tree decl, rtx rtl, int first)
43138 default_encode_section_info (decl, rtl, first);
43140 if (ix86_in_large_data_p (decl))
43141 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43144 /* Worker function for REVERSE_CONDITION. */
43146 enum rtx_code
43147 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43149 return (mode != CCFPmode && mode != CCFPUmode
43150 ? reverse_condition (code)
43151 : reverse_condition_maybe_unordered (code));
43154 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43155 to OPERANDS[0]. */
43157 const char *
43158 output_387_reg_move (rtx_insn *insn, rtx *operands)
43160 if (REG_P (operands[0]))
43162 if (REG_P (operands[1])
43163 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43165 if (REGNO (operands[0]) == FIRST_STACK_REG)
43166 return output_387_ffreep (operands, 0);
43167 return "fstp\t%y0";
43169 if (STACK_TOP_P (operands[0]))
43170 return "fld%Z1\t%y1";
43171 return "fst\t%y0";
43173 else if (MEM_P (operands[0]))
43175 gcc_assert (REG_P (operands[1]));
43176 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43177 return "fstp%Z0\t%y0";
43178 else
43180 /* There is no non-popping store to memory for XFmode.
43181 So if we need one, follow the store with a load. */
43182 if (GET_MODE (operands[0]) == XFmode)
43183 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43184 else
43185 return "fst%Z0\t%y0";
43188 else
43189 gcc_unreachable();
43192 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43193 FP status register is set. */
43195 void
43196 ix86_emit_fp_unordered_jump (rtx label)
43198 rtx reg = gen_reg_rtx (HImode);
43199 rtx temp;
43201 emit_insn (gen_x86_fnstsw_1 (reg));
43203 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43205 emit_insn (gen_x86_sahf_1 (reg));
43207 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43208 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43210 else
43212 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43214 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43215 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43218 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43219 gen_rtx_LABEL_REF (VOIDmode, label),
43220 pc_rtx);
43221 temp = gen_rtx_SET (pc_rtx, temp);
43223 emit_jump_insn (temp);
43224 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43227 /* Output code to perform a log1p XFmode calculation. */
43229 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43231 rtx_code_label *label1 = gen_label_rtx ();
43232 rtx_code_label *label2 = gen_label_rtx ();
43234 rtx tmp = gen_reg_rtx (XFmode);
43235 rtx tmp2 = gen_reg_rtx (XFmode);
43236 rtx test;
43238 emit_insn (gen_absxf2 (tmp, op1));
43239 test = gen_rtx_GE (VOIDmode, tmp,
43240 const_double_from_real_value (
43241 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43242 XFmode));
43243 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43245 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43246 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43247 emit_jump (label2);
43249 emit_label (label1);
43250 emit_move_insn (tmp, CONST1_RTX (XFmode));
43251 emit_insn (gen_addxf3 (tmp, op1, tmp));
43252 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43253 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43255 emit_label (label2);
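/* Illustrative sketch (not part of the original file): a scalar model of the
   branch above.  Below |x| = 1 - sqrt(2)/2 the fyl2xp1 path is used
   (y * log2 (x + 1) with y = ln 2), which avoids the cancellation of forming
   1 + x explicitly; otherwise 1 + x is computed and fyl2x is used.  The libm
   calls here only stand in for those i387 sequences.  */
#if 0
#include <math.h>

static double
model_log1p (double x)
{
  const double thresh = 0.29289321881345247561810596348408353; /* 1 - sqrt(2)/2 */
  if (fabs (x) < thresh)
    return log1p (x);        /* fldln2; fyl2xp1 */
  return log (1.0 + x);      /* fld1; fadd; fldln2; fyl2x */
}
#endif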
43258 /* Output code to perform an x87 rounding of OP1 to the nearest integer, storing the result into OP0. */
43259 void ix86_emit_i387_round (rtx op0, rtx op1)
43261 machine_mode inmode = GET_MODE (op1);
43262 machine_mode outmode = GET_MODE (op0);
43263 rtx e1, e2, res, tmp, tmp1, half;
43264 rtx scratch = gen_reg_rtx (HImode);
43265 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43266 rtx_code_label *jump_label = gen_label_rtx ();
43267 rtx insn;
43268 rtx (*gen_abs) (rtx, rtx);
43269 rtx (*gen_neg) (rtx, rtx);
43271 switch (inmode)
43273 case E_SFmode:
43274 gen_abs = gen_abssf2;
43275 break;
43276 case E_DFmode:
43277 gen_abs = gen_absdf2;
43278 break;
43279 case E_XFmode:
43280 gen_abs = gen_absxf2;
43281 break;
43282 default:
43283 gcc_unreachable ();
43286 switch (outmode)
43288 case E_SFmode:
43289 gen_neg = gen_negsf2;
43290 break;
43291 case E_DFmode:
43292 gen_neg = gen_negdf2;
43293 break;
43294 case E_XFmode:
43295 gen_neg = gen_negxf2;
43296 break;
43297 case E_HImode:
43298 gen_neg = gen_neghi2;
43299 break;
43300 case E_SImode:
43301 gen_neg = gen_negsi2;
43302 break;
43303 case E_DImode:
43304 gen_neg = gen_negdi2;
43305 break;
43306 default:
43307 gcc_unreachable ();
43310 e1 = gen_reg_rtx (inmode);
43311 e2 = gen_reg_rtx (inmode);
43312 res = gen_reg_rtx (outmode);
43314 half = const_double_from_real_value (dconsthalf, inmode);
43316 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
43318 /* scratch = fxam(op1) */
43319 emit_insn (gen_rtx_SET (scratch,
43320 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43321 UNSPEC_FXAM)));
43322 /* e1 = fabs(op1) */
43323 emit_insn (gen_abs (e1, op1));
43325 /* e2 = e1 + 0.5 */
43326 half = force_reg (inmode, half);
43327 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43329 /* res = floor(e2) */
43330 if (inmode != XFmode)
43332 tmp1 = gen_reg_rtx (XFmode);
43334 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43336 else
43337 tmp1 = e2;
43339 switch (outmode)
43341 case E_SFmode:
43342 case E_DFmode:
43344 rtx tmp0 = gen_reg_rtx (XFmode);
43346 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43348 emit_insn (gen_rtx_SET (res,
43349 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43350 UNSPEC_TRUNC_NOOP)));
43352 break;
43353 case E_XFmode:
43354 emit_insn (gen_frndintxf2_floor (res, tmp1));
43355 break;
43356 case E_HImode:
43357 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43358 break;
43359 case E_SImode:
43360 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43361 break;
43362 case E_DImode:
43363 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43364 break;
43365 default:
43366 gcc_unreachable ();
43369 /* flags = signbit(a) */
43370 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43372 /* if (flags) then res = -res */
43373 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43374 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43375 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43376 pc_rtx);
43377 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43378 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43379 JUMP_LABEL (insn) = jump_label;
43381 emit_insn (gen_neg (res, res));
43383 emit_label (jump_label);
43384 LABEL_NUSES (jump_label) = 1;
43386 emit_move_insn (op0, res);
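/* Illustrative sketch (not part of the original file): a scalar model of the
   identity round(a) = sgn(a) * floor(fabs(a) + 0.5) used above.  The sign is
   read separately (fxam in the expansion, signbit here) so the magnitude can
   be floored and the result negated afterwards.  */
#if 0
#include <math.h>

static double
model_i387_round (double a)
{
  double res = floor (fabs (a) + 0.5);
  return signbit (a) ? -res : res;
}
#endif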
43389 /* Output code to perform a Newton-Raphson approximation of a single precision
43390 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43392 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43394 rtx x0, x1, e0, e1;
43396 x0 = gen_reg_rtx (mode);
43397 e0 = gen_reg_rtx (mode);
43398 e1 = gen_reg_rtx (mode);
43399 x1 = gen_reg_rtx (mode);
43401 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
43403 b = force_reg (mode, b);
43405 /* x0 = rcp(b) estimate */
43406 if (mode == V16SFmode || mode == V8DFmode)
43408 if (TARGET_AVX512ER)
43410 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43411 UNSPEC_RCP28)));
43412 /* res = a * x0 */
43413 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43414 return;
43416 else
43417 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43418 UNSPEC_RCP14)));
43420 else
43421 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43422 UNSPEC_RCP)));
43424 /* e0 = x0 * b */
43425 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43427 /* e0 = x0 * e0 */
43428 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43430 /* e1 = x0 + x0 */
43431 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43433 /* x1 = e1 - e0 */
43434 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43436 /* res = a * x1 */
43437 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
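/* Illustrative sketch (not part of the original file): a scalar model of the
   sequence above.  The exact reciprocal here only stands in for the ~12-bit
   rcpps/rcp14 estimate; the point is the single Newton-Raphson refinement
   x1 = x0 * (2 - b * x0), written as (x0 + x0) - x0 * (b * x0) exactly as the
   expander emits it.  */
#if 0
static float
model_swdiv (float a, float b)
{
  float x0 = 1.0f / b;        /* rcp(b) estimate */
  float e0 = x0 * b;
  e0 = x0 * e0;               /* x0 * b * x0 */
  float e1 = x0 + x0;         /* 2 * x0 */
  float x1 = e1 - e0;         /* refined reciprocal */
  return a * x1;
}
#endif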
43440 /* Output code to perform a Newton-Raphson approximation of a
43441 single precision floating point [reciprocal] square root. */
43443 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43445 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43446 REAL_VALUE_TYPE r;
43447 int unspec;
43449 x0 = gen_reg_rtx (mode);
43450 e0 = gen_reg_rtx (mode);
43451 e1 = gen_reg_rtx (mode);
43452 e2 = gen_reg_rtx (mode);
43453 e3 = gen_reg_rtx (mode);
43455 if (TARGET_AVX512ER && mode == V16SFmode)
43457 if (recip)
43458 /* res = rsqrt28(a) estimate */
43459 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43460 UNSPEC_RSQRT28)));
43461 else
43463 /* x0 = rsqrt28(a) estimate */
43464 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43465 UNSPEC_RSQRT28)));
43466 /* res = rcp28(x0) estimate */
43467 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43468 UNSPEC_RCP28)));
43470 return;
43473 real_from_integer (&r, VOIDmode, -3, SIGNED);
43474 mthree = const_double_from_real_value (r, SFmode);
43476 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43477 mhalf = const_double_from_real_value (r, SFmode);
43478 unspec = UNSPEC_RSQRT;
43480 if (VECTOR_MODE_P (mode))
43482 mthree = ix86_build_const_vector (mode, true, mthree);
43483 mhalf = ix86_build_const_vector (mode, true, mhalf);
43484 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43485 if (GET_MODE_SIZE (mode) == 64)
43486 unspec = UNSPEC_RSQRT14;
43489 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43490 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
43492 a = force_reg (mode, a);
43494 /* x0 = rsqrt(a) estimate */
43495 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43496 unspec)));
43498 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
43499 if (!recip)
43501 rtx zero = force_reg (mode, CONST0_RTX(mode));
43502 rtx mask;
43504 /* Handle masked compare. */
43505 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43507 mask = gen_reg_rtx (HImode);
43508 /* Imm value 0x4 corresponds to not-equal comparison. */
43509 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43510 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43512 else
43514 mask = gen_reg_rtx (mode);
43515 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43516 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43520 /* e0 = x0 * a */
43521 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43522 /* e1 = e0 * x0 */
43523 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43525 /* e2 = e1 - 3. */
43526 mthree = force_reg (mode, mthree);
43527 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43529 mhalf = force_reg (mode, mhalf);
43530 if (recip)
43531 /* e3 = -.5 * x0 */
43532 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43533 else
43534 /* e3 = -.5 * e0 */
43535 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43536 /* ret = e2 * e3 */
43537 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
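/* Illustrative sketch (not part of the original file): a scalar model of the
   rsqrt sequence above, with 1/sqrt standing in for the rsqrtps/rsqrt14
   estimate.  With r = rsqrt(a), a*r*r - 3 is about -2, and multiplying by
   -0.5*r (or -0.5*a*r for the sqrt case) yields the refined rsqrt (sqrt).  */
#if 0
#include <math.h>

static float
model_swsqrt (float a, int recip)
{
  float x0 = 1.0f / sqrtf (a);       /* rsqrt(a) estimate */
  if (!recip && a == 0.0f)
    x0 = 0.0f;                       /* filter the infinite estimate for sqrt(0) */
  float e0 = x0 * a;
  float e1 = e0 * x0;                /* a * x0 * x0 */
  float e2 = e1 - 3.0f;
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;
}
#endif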
43540 #ifdef TARGET_SOLARIS
43541 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43543 static void
43544 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43545 tree decl)
43547 /* With Binutils 2.15, the "@unwind" marker must be specified on
43548 every occurrence of the ".eh_frame" section, not just the first
43549 one. */
43550 if (TARGET_64BIT
43551 && strcmp (name, ".eh_frame") == 0)
43553 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43554 flags & SECTION_WRITE ? "aw" : "a");
43555 return;
43558 #ifndef USE_GAS
43559 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43561 solaris_elf_asm_comdat_section (name, flags, decl);
43562 return;
43564 #endif
43566 default_elf_asm_named_section (name, flags, decl);
43568 #endif /* TARGET_SOLARIS */
43570 /* Return the mangling of TYPE if it is an extended fundamental type. */
43572 static const char *
43573 ix86_mangle_type (const_tree type)
43575 type = TYPE_MAIN_VARIANT (type);
43577 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43578 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43579 return NULL;
43581 switch (TYPE_MODE (type))
43583 case E_TFmode:
43584 /* __float128 is "g". */
43585 return "g";
43586 case E_XFmode:
43587 /* "long double" or __float80 is "e". */
43588 return "e";
43589 default:
43590 return NULL;
43594 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43596 static tree
43597 ix86_stack_protect_guard (void)
43599 if (TARGET_SSP_TLS_GUARD)
43601 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43602 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43603 tree type = build_qualified_type (type_node, qual);
43604 tree t;
43606 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43608 t = ix86_tls_stack_chk_guard_decl;
43610 if (t == NULL)
43612 rtx x;
43614 t = build_decl
43615 (UNKNOWN_LOCATION, VAR_DECL,
43616 get_identifier (ix86_stack_protector_guard_symbol_str),
43617 type);
43618 TREE_STATIC (t) = 1;
43619 TREE_PUBLIC (t) = 1;
43620 DECL_EXTERNAL (t) = 1;
43621 TREE_USED (t) = 1;
43622 TREE_THIS_VOLATILE (t) = 1;
43623 DECL_ARTIFICIAL (t) = 1;
43624 DECL_IGNORED_P (t) = 1;
43626 /* Do not share RTL as the declaration is visible outside of
43627 current function. */
43628 x = DECL_RTL (t);
43629 RTX_FLAG (x, used) = 1;
43631 ix86_tls_stack_chk_guard_decl = t;
43634 else
43636 tree asptrtype = build_pointer_type (type);
43638 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
43639 t = build2 (MEM_REF, asptrtype, t,
43640 build_int_cst (asptrtype, 0));
43643 return t;
43646 return default_stack_protect_guard ();
43649 /* For 32-bit code we can save PIC register setup by using
43650 __stack_chk_fail_local hidden function instead of calling
43651 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
43652 register, so it is better to call __stack_chk_fail directly. */
43654 static tree ATTRIBUTE_UNUSED
43655 ix86_stack_protect_fail (void)
43657 return TARGET_64BIT
43658 ? default_external_stack_protect_fail ()
43659 : default_hidden_stack_protect_fail ();
43662 /* Select a format to encode pointers in exception handling data. CODE
43663 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
43664 true if the symbol may be affected by dynamic relocations.
43666 ??? All x86 object file formats are capable of representing this.
43667 After all, the relocation needed is the same as for the call insn.
43668 Whether or not a particular assembler allows us to enter such, I
43669 guess we'll have to see. */
43670 int
43671 asm_preferred_eh_data_format (int code, int global)
43673 if (flag_pic)
43675 int type = DW_EH_PE_sdata8;
43676 if (!TARGET_64BIT
43677 || ix86_cmodel == CM_SMALL_PIC
43678 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
43679 type = DW_EH_PE_sdata4;
43680 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
43682 if (ix86_cmodel == CM_SMALL
43683 || (ix86_cmodel == CM_MEDIUM && code))
43684 return DW_EH_PE_udata4;
43685 return DW_EH_PE_absptr;
43688 /* Expand copysign from SIGN to the positive value ABS_VALUE
43689 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
43690 the sign-bit. */
43691 static void
43692 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
43694 machine_mode mode = GET_MODE (sign);
43695 rtx sgn = gen_reg_rtx (mode);
43696 if (mask == NULL_RTX)
43698 machine_mode vmode;
43700 if (mode == SFmode)
43701 vmode = V4SFmode;
43702 else if (mode == DFmode)
43703 vmode = V2DFmode;
43704 else
43705 vmode = mode;
43707 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
43708 if (!VECTOR_MODE_P (mode))
43710 /* We need to generate a scalar mode mask in this case. */
43711 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
43712 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
43713 mask = gen_reg_rtx (mode);
43714 emit_insn (gen_rtx_SET (mask, tmp));
43717 else
43718 mask = gen_rtx_NOT (mode, mask);
43719 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
43720 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
43723 /* Expand fabs (OP0) and return a new rtx that holds the result. The
43724 mask for masking out the sign-bit is stored in *SMASK, if that is
43725 non-null. */
43726 static rtx
43727 ix86_expand_sse_fabs (rtx op0, rtx *smask)
43729 machine_mode vmode, mode = GET_MODE (op0);
43730 rtx xa, mask;
43732 xa = gen_reg_rtx (mode);
43733 if (mode == SFmode)
43734 vmode = V4SFmode;
43735 else if (mode == DFmode)
43736 vmode = V2DFmode;
43737 else
43738 vmode = mode;
43739 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
43740 if (!VECTOR_MODE_P (mode))
43742 /* We need to generate a scalar mode mask in this case. */
43743 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
43744 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
43745 mask = gen_reg_rtx (mode);
43746 emit_insn (gen_rtx_SET (mask, tmp));
43748 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
43750 if (smask)
43751 *smask = mask;
43753 return xa;
43756 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
43757 swapping the operands if SWAP_OPERANDS is true. The expanded
43758 code is a forward jump to a newly created label in case the
43759 comparison is true. The generated label rtx is returned. */
43760 static rtx_code_label *
43761 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
43762 bool swap_operands)
43764 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
43765 rtx_code_label *label;
43766 rtx tmp;
43768 if (swap_operands)
43769 std::swap (op0, op1);
43771 label = gen_label_rtx ();
43772 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
43773 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
43774 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
43775 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
43776 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
43777 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43778 JUMP_LABEL (tmp) = label;
43780 return label;
43783 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
43784 using comparison code CODE. Operands are swapped for the comparison if
43785 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
43786 static rtx
43787 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
43788 bool swap_operands)
43790 rtx (*insn)(rtx, rtx, rtx, rtx);
43791 machine_mode mode = GET_MODE (op0);
43792 rtx mask = gen_reg_rtx (mode);
43794 if (swap_operands)
43795 std::swap (op0, op1);
43797 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
43799 emit_insn (insn (mask, op0, op1,
43800 gen_rtx_fmt_ee (code, mode, op0, op1)));
43801 return mask;
43804 /* Generate and return a rtx of mode MODE for 2**n where n is the number
43805 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
43806 static rtx
43807 ix86_gen_TWO52 (machine_mode mode)
43809 REAL_VALUE_TYPE TWO52r;
43810 rtx TWO52;
43812 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
43813 TWO52 = const_double_from_real_value (TWO52r, mode);
43814 TWO52 = force_reg (mode, TWO52);
43816 return TWO52;
43819 /* Expand SSE sequence for computing lround from OP1 storing
43820 into OP0. */
43821 void
43822 ix86_expand_lround (rtx op0, rtx op1)
43824 /* C code for the stuff we're doing below:
43825 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
43826 return (long)tmp;
43828 machine_mode mode = GET_MODE (op1);
43829 const struct real_format *fmt;
43830 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
43831 rtx adj;
43833 /* load nextafter (0.5, 0.0) */
43834 fmt = REAL_MODE_FORMAT (mode);
43835 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
43836 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
43838 /* adj = copysign (0.5, op1) */
43839 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
43840 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
43842 /* adj = op1 + adj */
43843 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
43845 /* op0 = (imode)adj */
43846 expand_fix (op0, adj, 0);
43849 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
43850 into OPERAND0. */
43851 void
43852 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
43854 /* C code for the stuff we're doing below (for do_floor):
43855 xi = (long)op1;
43856 xi -= (double)xi > op1 ? 1 : 0;
43857 return xi;
43859 machine_mode fmode = GET_MODE (op1);
43860 machine_mode imode = GET_MODE (op0);
43861 rtx ireg, freg, tmp;
43862 rtx_code_label *label;
43864 /* reg = (long)op1 */
43865 ireg = gen_reg_rtx (imode);
43866 expand_fix (ireg, op1, 0);
43868 /* freg = (double)reg */
43869 freg = gen_reg_rtx (fmode);
43870 expand_float (freg, ireg, 0);
43872 /* ireg = (freg > op1) ? ireg - 1 : ireg */
43873 label = ix86_expand_sse_compare_and_jump (UNLE,
43874 freg, op1, !do_floor);
43875 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
43876 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
43877 emit_move_insn (ireg, tmp);
43879 emit_label (label);
43880 LABEL_NUSES (label) = 1;
43882 emit_move_insn (op0, ireg);
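/* Illustrative sketch (not part of the original file): a scalar model of the
   do_floor case above.  The conversion to integer truncates toward zero; when
   converting that integer back gives a value greater than the input (negative,
   non-integral inputs), one is subtracted to land on floor.  */
#if 0
static long
model_lfloor (double x)
{
  long xi = (long) x;          /* expand_fix: truncation */
  if ((double) xi > x)         /* the UNLE branch was not taken */
    xi -= 1;
  return xi;
}
#endif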
43885 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
43886 result in OPERAND0. */
43887 void
43888 ix86_expand_rint (rtx operand0, rtx operand1)
43890 /* C code for the stuff we're doing below:
43891 xa = fabs (operand1);
43892 if (!isless (xa, 2**52))
43893 return operand1;
43894 xa = xa + 2**52 - 2**52;
43895 return copysign (xa, operand1);
43897 machine_mode mode = GET_MODE (operand0);
43898 rtx res, xa, TWO52, mask;
43899 rtx_code_label *label;
43901 res = gen_reg_rtx (mode);
43902 emit_move_insn (res, operand1);
43904 /* xa = abs (operand1) */
43905 xa = ix86_expand_sse_fabs (res, &mask);
43907 /* if (!isless (xa, TWO52)) goto label; */
43908 TWO52 = ix86_gen_TWO52 (mode);
43909 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
43911 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
43912 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
43914 ix86_sse_copysign_to_positive (res, xa, res, mask);
43916 emit_label (label);
43917 LABEL_NUSES (label) = 1;
43919 emit_move_insn (operand0, res);
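/* Illustrative sketch (not part of the original file): a scalar model of the
   2**52 trick above, assuming the default round-to-nearest mode and no
   -ffast-math style re-association (the volatile temporary guards against the
   latter).  Adding and then subtracting 2**52 rounds the fraction away,
   because doubles of that magnitude have no fractional bits left.  */
#if 0
#include <math.h>

static double
model_rint (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  double xa = fabs (x);
  if (!(xa < two52))
    return x;                                /* already integral, or NaN */
  volatile double t = xa + two52;            /* fraction rounded away here */
  xa = t - two52;
  return copysign (xa, x);
}
#endif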
43922 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
43923 into OPERAND0. */
43924 void
43925 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
43927 /* C code for the stuff we expand below.
43928 double xa = fabs (x), x2;
43929 if (!isless (xa, TWO52))
43930 return x;
43931 xa = xa + TWO52 - TWO52;
43932 x2 = copysign (xa, x);
43933 Compensate. Floor:
43934 if (x2 > x)
43935 x2 -= 1;
43936 Compensate. Ceil:
43937 if (x2 < x)
43938 x2 -= -1;
43939 return x2;
43941 machine_mode mode = GET_MODE (operand0);
43942 rtx xa, TWO52, tmp, one, res, mask;
43943 rtx_code_label *label;
43945 TWO52 = ix86_gen_TWO52 (mode);
43947 /* Temporary for holding the result, initialized to the input
43948 operand to ease control flow. */
43949 res = gen_reg_rtx (mode);
43950 emit_move_insn (res, operand1);
43952 /* xa = abs (operand1) */
43953 xa = ix86_expand_sse_fabs (res, &mask);
43955 /* if (!isless (xa, TWO52)) goto label; */
43956 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
43958 /* xa = xa + TWO52 - TWO52; */
43959 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
43960 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
43962 /* xa = copysign (xa, operand1) */
43963 ix86_sse_copysign_to_positive (xa, xa, res, mask);
43965 /* generate 1.0 or -1.0 */
43966 one = force_reg (mode,
43967 const_double_from_real_value (do_floor
43968 ? dconst1 : dconstm1, mode));
43970 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
43971 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
43972 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
43973 /* We always need to subtract here to preserve signed zero. */
43974 tmp = expand_simple_binop (mode, MINUS,
43975 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
43976 emit_move_insn (res, tmp);
43978 emit_label (label);
43979 LABEL_NUSES (label) = 1;
43981 emit_move_insn (operand0, res);
43984 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
43985 into OPERAND0. */
43986 void
43987 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
43989 /* C code for the stuff we expand below.
43990 double xa = fabs (x), x2;
43991 if (!isless (xa, TWO52))
43992 return x;
43993 x2 = (double)(long)x;
43994 Compensate. Floor:
43995 if (x2 > x)
43996 x2 -= 1;
43997 Compensate. Ceil:
43998 if (x2 < x)
43999 x2 += 1;
44000 if (HONOR_SIGNED_ZEROS (mode))
44001 return copysign (x2, x);
44002 return x2;
44004 machine_mode mode = GET_MODE (operand0);
44005 rtx xa, xi, TWO52, tmp, one, res, mask;
44006 rtx_code_label *label;
44008 TWO52 = ix86_gen_TWO52 (mode);
44010 /* Temporary for holding the result, initialized to the input
44011 operand to ease control flow. */
44012 res = gen_reg_rtx (mode);
44013 emit_move_insn (res, operand1);
44015 /* xa = abs (operand1) */
44016 xa = ix86_expand_sse_fabs (res, &mask);
44018 /* if (!isless (xa, TWO52)) goto label; */
44019 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44021 /* xa = (double)(long)x */
44022 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44023 expand_fix (xi, res, 0);
44024 expand_float (xa, xi, 0);
44026 /* generate 1.0 */
44027 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44029 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44030 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44031 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44032 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44033 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44034 emit_move_insn (res, tmp);
44036 if (HONOR_SIGNED_ZEROS (mode))
44037 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44039 emit_label (label);
44040 LABEL_NUSES (label) = 1;
44042 emit_move_insn (operand0, res);
44045 /* Expand SSE sequence for computing round from OPERAND1 storing
44046 into OPERAND0. Sequence that works without relying on DImode truncation
44047 via cvttsd2siq that is only available on 64bit targets. */
44048 void
44049 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44051 /* C code for the stuff we expand below.
44052 double xa = fabs (x), xa2, x2;
44053 if (!isless (xa, TWO52))
44054 return x;
44055 Using the absolute value and copying back sign makes
44056 -0.0 -> -0.0 correct.
44057 xa2 = xa + TWO52 - TWO52;
44058 Compensate.
44059 dxa = xa2 - xa;
44060 if (dxa <= -0.5)
44061 xa2 += 1;
44062 else if (dxa > 0.5)
44063 xa2 -= 1;
44064 x2 = copysign (xa2, x);
44065 return x2;
44067 machine_mode mode = GET_MODE (operand0);
44068 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44069 rtx_code_label *label;
44071 TWO52 = ix86_gen_TWO52 (mode);
44073 /* Temporary for holding the result, initialized to the input
44074 operand to ease control flow. */
44075 res = gen_reg_rtx (mode);
44076 emit_move_insn (res, operand1);
44078 /* xa = abs (operand1) */
44079 xa = ix86_expand_sse_fabs (res, &mask);
44081 /* if (!isless (xa, TWO52)) goto label; */
44082 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44084 /* xa2 = xa + TWO52 - TWO52; */
44085 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44086 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44088 /* dxa = xa2 - xa; */
44089 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44091 /* generate 0.5, 1.0 and -0.5 */
44092 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44093 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44094 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44095 0, OPTAB_DIRECT);
44097 /* Compensate. */
44098 tmp = gen_reg_rtx (mode);
44099 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44100 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44101 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44102 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44103 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44104 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44105 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44106 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44108 /* res = copysign (xa2, operand1) */
44109 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44111 emit_label (label);
44112 LABEL_NUSES (label) = 1;
44114 emit_move_insn (operand0, res);
44117 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44118 into OPERAND0. */
44119 void
44120 ix86_expand_trunc (rtx operand0, rtx operand1)
44122 /* C code for SSE variant we expand below.
44123 double xa = fabs (x), x2;
44124 if (!isless (xa, TWO52))
44125 return x;
44126 x2 = (double)(long)x;
44127 if (HONOR_SIGNED_ZEROS (mode))
44128 return copysign (x2, x);
44129 return x2;
44131 machine_mode mode = GET_MODE (operand0);
44132 rtx xa, xi, TWO52, res, mask;
44133 rtx_code_label *label;
44135 TWO52 = ix86_gen_TWO52 (mode);
44137 /* Temporary for holding the result, initialized to the input
44138 operand to ease control flow. */
44139 res = gen_reg_rtx (mode);
44140 emit_move_insn (res, operand1);
44142 /* xa = abs (operand1) */
44143 xa = ix86_expand_sse_fabs (res, &mask);
44145 /* if (!isless (xa, TWO52)) goto label; */
44146 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44148 /* x = (double)(long)x */
44149 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44150 expand_fix (xi, res, 0);
44151 expand_float (res, xi, 0);
44153 if (HONOR_SIGNED_ZEROS (mode))
44154 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44156 emit_label (label);
44157 LABEL_NUSES (label) = 1;
44159 emit_move_insn (operand0, res);
44162 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44163 into OPERAND0. */
44164 void
44165 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44167 machine_mode mode = GET_MODE (operand0);
44168 rtx xa, mask, TWO52, one, res, smask, tmp;
44169 rtx_code_label *label;
44171 /* C code for SSE variant we expand below.
44172 double xa = fabs (x), x2;
44173 if (!isless (xa, TWO52))
44174 return x;
44175 xa2 = xa + TWO52 - TWO52;
44176 Compensate:
44177 if (xa2 > xa)
44178 xa2 -= 1.0;
44179 x2 = copysign (xa2, x);
44180 return x2;
44183 TWO52 = ix86_gen_TWO52 (mode);
44185 /* Temporary for holding the result, initialized to the input
44186 operand to ease control flow. */
44187 res = gen_reg_rtx (mode);
44188 emit_move_insn (res, operand1);
44190 /* xa = abs (operand1) */
44191 xa = ix86_expand_sse_fabs (res, &smask);
44193 /* if (!isless (xa, TWO52)) goto label; */
44194 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44196 /* res = xa + TWO52 - TWO52; */
44197 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44198 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44199 emit_move_insn (res, tmp);
44201 /* generate 1.0 */
44202 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44204 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44205 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44206 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44207 tmp = expand_simple_binop (mode, MINUS,
44208 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44209 emit_move_insn (res, tmp);
44211 /* res = copysign (res, operand1) */
44212 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44214 emit_label (label);
44215 LABEL_NUSES (label) = 1;
44217 emit_move_insn (operand0, res);
44220 /* Expand SSE sequence for computing round from OPERAND1 storing
44221 into OPERAND0. */
44222 void
44223 ix86_expand_round (rtx operand0, rtx operand1)
44225 /* C code for the stuff we're doing below:
44226 double xa = fabs (x);
44227 if (!isless (xa, TWO52))
44228 return x;
44229 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44230 return copysign (xa, x);
44232 machine_mode mode = GET_MODE (operand0);
44233 rtx res, TWO52, xa, xi, half, mask;
44234 rtx_code_label *label;
44235 const struct real_format *fmt;
44236 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44238 /* Temporary for holding the result, initialized to the input
44239 operand to ease control flow. */
44240 res = gen_reg_rtx (mode);
44241 emit_move_insn (res, operand1);
44243 TWO52 = ix86_gen_TWO52 (mode);
44244 xa = ix86_expand_sse_fabs (res, &mask);
44245 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44247 /* load nextafter (0.5, 0.0) */
44248 fmt = REAL_MODE_FORMAT (mode);
44249 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44250 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44252 /* xa = xa + nextafter (0.5, 0.0) */
44253 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44254 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44256 /* xa = (double)(int64_t)xa */
44257 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44258 expand_fix (xi, xa, 0);
44259 expand_float (xa, xi, 0);
44261 /* res = copysign (xa, operand1) */
44262 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44264 emit_label (label);
44265 LABEL_NUSES (label) = 1;
44267 emit_move_insn (operand0, res);
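/* Illustrative scalar sketch of the sequence above (an assumed reading,
   not the emitted RTL; TWO52 stands for 0x1p52 in DFmode).  Adding an
   exact 0.5 would be wrong for the largest double below 0.5: under
   round-to-nearest, 0x1.fffffffffffffp-2 + 0.5 rounds up to 1.0 and
   would then truncate to 1.0 instead of 0.0.  Adding the predecessor
   of 0.5, as loaded above, keeps the sum below 1.0:

     double round_sketch (double x)
     {
       double xa = fabs (x);
       if (!isless (xa, 0x1p52))
         return x;
       xa = (double) (int64_t) (xa + nextafter (0.5, 0.0));
       return copysign (xa, x);
     }
*/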
44270 /* Expand SSE sequence for computing round
44271 from OP1 storing into OP0 using sse4 round insn. */
44272 void
44273 ix86_expand_round_sse4 (rtx op0, rtx op1)
44275 machine_mode mode = GET_MODE (op0);
44276 rtx e1, e2, res, half;
44277 const struct real_format *fmt;
44278 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44279 rtx (*gen_copysign) (rtx, rtx, rtx);
44280 rtx (*gen_round) (rtx, rtx, rtx);
44282 switch (mode)
44284 case E_SFmode:
44285 gen_copysign = gen_copysignsf3;
44286 gen_round = gen_sse4_1_roundsf2;
44287 break;
44288 case E_DFmode:
44289 gen_copysign = gen_copysigndf3;
44290 gen_round = gen_sse4_1_rounddf2;
44291 break;
44292 default:
44293 gcc_unreachable ();
44296 /* round (a) = trunc (a + copysign (0.5, a)) */
44298 /* load nextafter (0.5, 0.0) */
44299 fmt = REAL_MODE_FORMAT (mode);
44300 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44301 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44302 half = const_double_from_real_value (pred_half, mode);
44304 /* e1 = copysign (0.5, op1) */
44305 e1 = gen_reg_rtx (mode);
44306 emit_insn (gen_copysign (e1, half, op1));
44308 /* e2 = op1 + e1 */
44309 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44311 /* res = trunc (e2) */
44312 res = gen_reg_rtx (mode);
44313 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44315 emit_move_insn (op0, res);
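/* A minimal intrinsics sketch of the same trick, assuming SSE4.1
   (the helper name and the use of intrinsics are illustrative only):

     #include <smmintrin.h>
     #include <math.h>

     double round_sse4_sketch (double x)
     {
       double e1 = copysign (nextafter (0.5, 0.0), x);   // e1 = copysign (0.5-, x)
       __m128d e2 = _mm_set_sd (x + e1);                 // e2 = x + e1
       // roundsd with truncation, matching ROUND_TRUNC above.
       e2 = _mm_round_sd (e2, e2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
       return _mm_cvtsd_f64 (e2);
     }
*/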
44319 /* Table of valid machine attributes. */
44320 static const struct attribute_spec ix86_attribute_table[] =
44322 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44323 affects_type_identity } */
44324 /* Stdcall attribute says callee is responsible for popping arguments
44325 if they are not variable. */
44326 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44327 true },
44328 /* Fastcall attribute says callee is responsible for popping arguments
44329 if they are not variable. */
44330 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44331 true },
44332 /* Thiscall attribute says callee is responsible for popping arguments
44333 if they are not variable. */
44334 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44335 true },
44336 /* Cdecl attribute says the callee is a normal C declaration */
44337 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44338 true },
44339 /* Regparm attribute specifies how many integer arguments are to be
44340 passed in registers. */
44341 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44342 true },
44343 /* Sseregparm attribute says we are using x86_64 calling conventions
44344 for FP arguments. */
44345 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44346 true },
44347 /* The transactional memory builtins are implicitly regparm or fastcall
44348 depending on the ABI. Override the generic do-nothing attribute that
44349 these builtins were declared with. */
44350 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44351 true },
44352 /* force_align_arg_pointer says this function realigns the stack at entry. */
44353 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44354 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44355 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44356 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44357 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44358 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44359 false },
44360 #endif
44361 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44362 false },
44363 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44364 false },
44365 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44366 SUBTARGET_ATTRIBUTE_TABLE,
44367 #endif
44368 /* ms_abi and sysv_abi calling convention function attributes. */
44369 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44370 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44371 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44372 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
44373 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44374 false },
44375 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44376 ix86_handle_callee_pop_aggregate_return, true },
44377 { "interrupt", 0, 0, false, true, true,
44378 ix86_handle_interrupt_attribute, false },
44379 { "no_caller_saved_registers", 0, 0, false, true, true,
44380 ix86_handle_no_caller_saved_registers_attribute, false },
44381 { "naked", 0, 0, true, false, false,
44382 ix86_handle_fndecl_attribute, false },
44384 /* End element. */
44385 { NULL, 0, 0, false, false, false, NULL, false }
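/* Example of how user code requests these attributes (illustrative only;
   the declarations below are hypothetical):

     int  __attribute__ ((regparm (3)))  add3 (int a, int b, int c);
     int  __attribute__ ((fastcall))     fc (int a, int b);
     int  __attribute__ ((ms_abi))       win64_cb (void *ctx);
     void __attribute__ ((interrupt))    isr (struct interrupt_frame *frame);
     struct __attribute__ ((ms_struct))  pkt { char tag; int val; };
*/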
44388 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44389 static int
44390 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44391 tree vectype, int)
44393 bool fp = false;
44394 machine_mode mode = TImode;
44395 int index;
44396 if (vectype != NULL)
44398 fp = FLOAT_TYPE_P (vectype);
44399 mode = TYPE_MODE (vectype);
44402 switch (type_of_cost)
44404 case scalar_stmt:
44405 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44407 case scalar_load:
44408 /* Load/store costs are relative to a register move, which costs 2.
44409 Recompute them in COSTS_N_INSNS units so everything has the same base. */
44410 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44411 : ix86_cost->int_load [2]) / 2;
44413 case scalar_store:
44414 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44415 : ix86_cost->int_store [2]) / 2;
44417 case vector_stmt:
44418 return ix86_vec_cost (mode,
44419 fp ? ix86_cost->addss : ix86_cost->sse_op,
44420 true);
44422 case vector_load:
44423 index = sse_store_index (mode);
44424 gcc_assert (index >= 0);
44425 return ix86_vec_cost (mode,
44426 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44427 true);
44429 case vector_store:
44430 index = sse_store_index (mode);
44431 return ix86_vec_cost (mode,
44432 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44433 true);
44435 case vec_to_scalar:
44436 case scalar_to_vec:
44437 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44439 /* We should have separate costs for unaligned loads and gather/scatter.
44440 Do that incrementally. */
44441 case unaligned_load:
44442 case vector_gather_load:
44443 index = sse_store_index (mode);
44444 return ix86_vec_cost (mode,
44445 COSTS_N_INSNS
44446 (ix86_cost->sse_unaligned_load[index]) / 2,
44447 true);
44449 case unaligned_store:
44450 case vector_scatter_store:
44451 index = sse_store_index (mode);
44452 return ix86_vec_cost (mode,
44453 COSTS_N_INSNS
44454 (ix86_cost->sse_unaligned_store[index]) / 2,
44455 true);
44457 case cond_branch_taken:
44458 return ix86_cost->cond_taken_branch_cost;
44460 case cond_branch_not_taken:
44461 return ix86_cost->cond_not_taken_branch_cost;
44463 case vec_perm:
44464 case vec_promote_demote:
44465 return ix86_vec_cost (mode,
44466 ix86_cost->sse_op, true);
44468 case vec_construct:
44469 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44471 default:
44472 gcc_unreachable ();
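/* Worked example of the scaling above (the cost-table value is assumed):
   with COSTS_N_INSNS (N) == N * 4 and the load/store tables expressed
   relative to a register move of cost 2, an sse_load[0] entry of 4
   (i.e. twice a register move) yields COSTS_N_INSNS (4) / 2 == 8
   == COSTS_N_INSNS (2), so the result is back in instruction units. */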
44476 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44477 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44478 insn every time. */
44480 static GTY(()) rtx_insn *vselect_insn;
44482 /* Initialize vselect_insn. */
44484 static void
44485 init_vselect_insn (void)
44487 unsigned i;
44488 rtx x;
44490 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44491 for (i = 0; i < MAX_VECT_LEN; ++i)
44492 XVECEXP (x, 0, i) = const0_rtx;
44493 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44494 const0_rtx), x);
44495 x = gen_rtx_SET (const0_rtx, x);
44496 start_sequence ();
44497 vselect_insn = emit_insn (x);
44498 end_sequence ();
44501 /* Construct (set target (vec_select op0 (parallel perm))) and
44502 return true if that's a valid instruction in the active ISA. */
44504 static bool
44505 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44506 unsigned nelt, bool testing_p)
44508 unsigned int i;
44509 rtx x, save_vconcat;
44510 int icode;
44512 if (vselect_insn == NULL_RTX)
44513 init_vselect_insn ();
44515 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44516 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44517 for (i = 0; i < nelt; ++i)
44518 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44519 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44520 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44521 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44522 SET_DEST (PATTERN (vselect_insn)) = target;
44523 icode = recog_memoized (vselect_insn);
44525 if (icode >= 0 && !testing_p)
44526 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44528 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44529 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44530 INSN_CODE (vselect_insn) = -1;
44532 return icode >= 0;
44535 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44537 static bool
44538 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44539 const unsigned char *perm, unsigned nelt,
44540 bool testing_p)
44542 machine_mode v2mode;
44543 rtx x;
44544 bool ok;
44546 if (vselect_insn == NULL_RTX)
44547 init_vselect_insn ();
44549 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44550 return false;
44551 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44552 PUT_MODE (x, v2mode);
44553 XEXP (x, 0) = op0;
44554 XEXP (x, 1) = op1;
44555 ok = expand_vselect (target, x, perm, nelt, testing_p);
44556 XEXP (x, 0) = const0_rtx;
44557 XEXP (x, 1) = const0_rtx;
44558 return ok;
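/* For reference, the shape the cached insn takes once filled in, e.g.
   for an interleave-low of two V4SI operands (perm = {0, 4, 1, 5};
   the register numbers are illustrative):

     (set (reg:V4SI 100)
          (vec_select:V4SI
            (vec_concat:V8SI (reg:V4SI 101) (reg:V4SI 102))
            (parallel [(const_int 0) (const_int 4)
                       (const_int 1) (const_int 5)])))

   expand_vselect alone omits the vec_concat and selects directly from a
   single operand. */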
44561 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44562 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44564 static bool
44565 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44567 machine_mode mmode, vmode = d->vmode;
44568 unsigned i, mask, nelt = d->nelt;
44569 rtx target, op0, op1, maskop, x;
44570 rtx rperm[32], vperm;
44572 if (d->one_operand_p)
44573 return false;
44574 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44575 && (TARGET_AVX512BW
44576 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44578 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44580 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44582 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44584 else
44585 return false;
44587 /* This is a blend, not a permute. Elements must stay in their
44588 respective lanes. */
44589 for (i = 0; i < nelt; ++i)
44591 unsigned e = d->perm[i];
44592 if (!(e == i || e == i + nelt))
44593 return false;
44596 if (d->testing_p)
44597 return true;
44599 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
44600 decision should be extracted elsewhere, so that we only try that
44601 sequence once all budget==3 options have been tried. */
44602 target = d->target;
44603 op0 = d->op0;
44604 op1 = d->op1;
44605 mask = 0;
44607 switch (vmode)
44609 case E_V8DFmode:
44610 case E_V16SFmode:
44611 case E_V4DFmode:
44612 case E_V8SFmode:
44613 case E_V2DFmode:
44614 case E_V4SFmode:
44615 case E_V8HImode:
44616 case E_V8SImode:
44617 case E_V32HImode:
44618 case E_V64QImode:
44619 case E_V16SImode:
44620 case E_V8DImode:
44621 for (i = 0; i < nelt; ++i)
44622 mask |= (d->perm[i] >= nelt) << i;
44623 break;
44625 case E_V2DImode:
44626 for (i = 0; i < 2; ++i)
44627 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
44628 vmode = V8HImode;
44629 goto do_subreg;
44631 case E_V4SImode:
44632 for (i = 0; i < 4; ++i)
44633 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
44634 vmode = V8HImode;
44635 goto do_subreg;
44637 case E_V16QImode:
44638 /* See if bytes move in pairs so we can use pblendw with
44639 an immediate argument, rather than pblendvb with a vector
44640 argument. */
44641 for (i = 0; i < 16; i += 2)
44642 if (d->perm[i] + 1 != d->perm[i + 1])
44644 use_pblendvb:
44645 for (i = 0; i < nelt; ++i)
44646 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
44648 finish_pblendvb:
44649 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
44650 vperm = force_reg (vmode, vperm);
44652 if (GET_MODE_SIZE (vmode) == 16)
44653 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
44654 else
44655 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
44656 if (target != d->target)
44657 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
44658 return true;
44661 for (i = 0; i < 8; ++i)
44662 mask |= (d->perm[i * 2] >= 16) << i;
44663 vmode = V8HImode;
44664 /* FALLTHRU */
44666 do_subreg:
44667 target = gen_reg_rtx (vmode);
44668 op0 = gen_lowpart (vmode, op0);
44669 op1 = gen_lowpart (vmode, op1);
44670 break;
44672 case E_V32QImode:
44673 /* See if bytes move in pairs. If not, vpblendvb must be used. */
44674 for (i = 0; i < 32; i += 2)
44675 if (d->perm[i] + 1 != d->perm[i + 1])
44676 goto use_pblendvb;
44677 /* See if bytes move in quadruplets. If yes, vpblendd
44678 with immediate can be used. */
44679 for (i = 0; i < 32; i += 4)
44680 if (d->perm[i] + 2 != d->perm[i + 2])
44681 break;
44682 if (i < 32)
44684 /* See if bytes move the same in both lanes. If yes,
44685 vpblendw with immediate can be used. */
44686 for (i = 0; i < 16; i += 2)
44687 if (d->perm[i] + 16 != d->perm[i + 16])
44688 goto use_pblendvb;
44690 /* Use vpblendw. */
44691 for (i = 0; i < 16; ++i)
44692 mask |= (d->perm[i * 2] >= 32) << i;
44693 vmode = V16HImode;
44694 goto do_subreg;
44697 /* Use vpblendd. */
44698 for (i = 0; i < 8; ++i)
44699 mask |= (d->perm[i * 4] >= 32) << i;
44700 vmode = V8SImode;
44701 goto do_subreg;
44703 case E_V16HImode:
44704 /* See if words move in pairs. If yes, vpblendd can be used. */
44705 for (i = 0; i < 16; i += 2)
44706 if (d->perm[i] + 1 != d->perm[i + 1])
44707 break;
44708 if (i < 16)
44710 /* See if words move the same in both lanes. If not,
44711 vpblendvb must be used. */
44712 for (i = 0; i < 8; i++)
44713 if (d->perm[i] + 8 != d->perm[i + 8])
44715 /* Use vpblendvb. */
44716 for (i = 0; i < 32; ++i)
44717 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
44719 vmode = V32QImode;
44720 nelt = 32;
44721 target = gen_reg_rtx (vmode);
44722 op0 = gen_lowpart (vmode, op0);
44723 op1 = gen_lowpart (vmode, op1);
44724 goto finish_pblendvb;
44727 /* Use vpblendw. */
44728 for (i = 0; i < 16; ++i)
44729 mask |= (d->perm[i] >= 16) << i;
44730 break;
44733 /* Use vpblendd. */
44734 for (i = 0; i < 8; ++i)
44735 mask |= (d->perm[i * 2] >= 16) << i;
44736 vmode = V8SImode;
44737 goto do_subreg;
44739 case E_V4DImode:
44740 /* Use vpblendd. */
44741 for (i = 0; i < 4; ++i)
44742 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
44743 vmode = V8SImode;
44744 goto do_subreg;
44746 default:
44747 gcc_unreachable ();
44750 switch (vmode)
44752 case E_V8DFmode:
44753 case E_V8DImode:
44754 mmode = QImode;
44755 break;
44756 case E_V16SFmode:
44757 case E_V16SImode:
44758 mmode = HImode;
44759 break;
44760 case E_V32HImode:
44761 mmode = SImode;
44762 break;
44763 case E_V64QImode:
44764 mmode = DImode;
44765 break;
44766 default:
44767 mmode = VOIDmode;
44770 if (mmode != VOIDmode)
44771 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
44772 else
44773 maskop = GEN_INT (mask);
44775 /* This matches five different patterns with the different modes. */
44776 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
44777 x = gen_rtx_SET (target, x);
44778 emit_insn (x);
44779 if (target != d->target)
44780 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
44782 return true;
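/* Worked example of the immediate-mask form above: for V8SFmode with
   perm = {0, 9, 2, 11, 4, 13, 6, 15}, elements 1, 3, 5 and 7 come from
   op1 (perm[i] >= nelt), so mask = 0xaa and the emitted vec_merge is
   what vblendps with immediate 0xaa performs, roughly
   _mm256_blend_ps (op0, op1, 0xaa) in intrinsics terms (the intrinsic
   spelling here is illustrative). */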
44785 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44786 in terms of the variable form of vpermilps.
44788 Note that we will have already failed the immediate input vpermilps,
44789 which requires that the high and low part shuffle be identical; the
44790 variable form doesn't require that. */
44792 static bool
44793 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
44795 rtx rperm[8], vperm;
44796 unsigned i;
44798 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
44799 return false;
44801 /* We can only permute within the 128-bit lane. */
44802 for (i = 0; i < 8; ++i)
44804 unsigned e = d->perm[i];
44805 if (i < 4 ? e >= 4 : e < 4)
44806 return false;
44809 if (d->testing_p)
44810 return true;
44812 for (i = 0; i < 8; ++i)
44814 unsigned e = d->perm[i];
44816 /* Within each 128-bit lane, the elements of op0 are numbered
44817 from 0 and the elements of op1 are numbered from 4. */
44818 if (e >= 8 + 4)
44819 e -= 8;
44820 else if (e >= 4)
44821 e -= 4;
44823 rperm[i] = GEN_INT (e);
44826 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
44827 vperm = force_reg (V8SImode, vperm);
44828 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
44830 return true;
44833 /* Return true if permutation D can instead be performed as a VMODE
44834 permutation. */
44836 static bool
44837 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
44839 unsigned int i, j, chunk;
44841 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
44842 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
44843 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
44844 return false;
44846 if (GET_MODE_NUNITS (vmode) >= d->nelt)
44847 return true;
44849 chunk = d->nelt / GET_MODE_NUNITS (vmode);
44850 for (i = 0; i < d->nelt; i += chunk)
44851 if (d->perm[i] & (chunk - 1))
44852 return false;
44853 else
44854 for (j = 1; j < chunk; ++j)
44855 if (d->perm[i] + j != d->perm[i + j])
44856 return false;
44858 return true;
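/* Worked example: the V16QImode permutation
   {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13} moves bytes in aligned
   consecutive pairs (each chunk starts on an even index and stays
   consecutive), so it is also expressible as the V8HImode permutation
   {1, 0, 3, 2, 5, 4, 7, 6} and this function returns true for
   vmode == V8HImode. */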
44861 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44862 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
44864 static bool
44865 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
44867 unsigned i, nelt, eltsz, mask;
44868 unsigned char perm[64];
44869 machine_mode vmode = V16QImode;
44870 rtx rperm[64], vperm, target, op0, op1;
44872 nelt = d->nelt;
44874 if (!d->one_operand_p)
44876 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
44878 if (TARGET_AVX2
44879 && valid_perm_using_mode_p (V2TImode, d))
44881 if (d->testing_p)
44882 return true;
44884 /* Use vperm2i128 insn. The pattern uses
44885 V4DImode instead of V2TImode. */
44886 target = d->target;
44887 if (d->vmode != V4DImode)
44888 target = gen_reg_rtx (V4DImode);
44889 op0 = gen_lowpart (V4DImode, d->op0);
44890 op1 = gen_lowpart (V4DImode, d->op1);
44891 rperm[0]
44892 = GEN_INT ((d->perm[0] / (nelt / 2))
44893 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
44894 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
44895 if (target != d->target)
44896 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
44897 return true;
44899 return false;
44902 else
44904 if (GET_MODE_SIZE (d->vmode) == 16)
44906 if (!TARGET_SSSE3)
44907 return false;
44909 else if (GET_MODE_SIZE (d->vmode) == 32)
44911 if (!TARGET_AVX2)
44912 return false;
44914 /* V4DImode should already have been handled through
44915 expand_vselect by the vpermq instruction. */
44916 gcc_assert (d->vmode != V4DImode);
44918 vmode = V32QImode;
44919 if (d->vmode == V8SImode
44920 || d->vmode == V16HImode
44921 || d->vmode == V32QImode)
44923 /* First see if vpermq can be used for
44924 V8SImode/V16HImode/V32QImode. */
44925 if (valid_perm_using_mode_p (V4DImode, d))
44927 for (i = 0; i < 4; i++)
44928 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
44929 if (d->testing_p)
44930 return true;
44931 target = gen_reg_rtx (V4DImode);
44932 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
44933 perm, 4, false))
44935 emit_move_insn (d->target,
44936 gen_lowpart (d->vmode, target));
44937 return true;
44939 return false;
44942 /* Next see if vpermd can be used. */
44943 if (valid_perm_using_mode_p (V8SImode, d))
44944 vmode = V8SImode;
44946 /* Or if vpermps can be used. */
44947 else if (d->vmode == V8SFmode)
44948 vmode = V8SImode;
44950 if (vmode == V32QImode)
44952 /* vpshufb only works within lanes; it is not
44953 possible to shuffle bytes between the lanes. */
44954 for (i = 0; i < nelt; ++i)
44955 if ((d->perm[i] ^ i) & (nelt / 2))
44956 return false;
44959 else if (GET_MODE_SIZE (d->vmode) == 64)
44961 if (!TARGET_AVX512BW)
44962 return false;
44964 /* If vpermq didn't work, vpshufb won't work either. */
44965 if (d->vmode == V8DFmode || d->vmode == V8DImode)
44966 return false;
44968 vmode = V64QImode;
44969 if (d->vmode == V16SImode
44970 || d->vmode == V32HImode
44971 || d->vmode == V64QImode)
44973 /* First see if vpermq can be used for
44974 V16SImode/V32HImode/V64QImode. */
44975 if (valid_perm_using_mode_p (V8DImode, d))
44977 for (i = 0; i < 8; i++)
44978 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
44979 if (d->testing_p)
44980 return true;
44981 target = gen_reg_rtx (V8DImode);
44982 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
44983 perm, 8, false))
44985 emit_move_insn (d->target,
44986 gen_lowpart (d->vmode, target));
44987 return true;
44989 return false;
44992 /* Next see if vpermd can be used. */
44993 if (valid_perm_using_mode_p (V16SImode, d))
44994 vmode = V16SImode;
44996 /* Or if vpermps can be used. */
44997 else if (d->vmode == V16SFmode)
44998 vmode = V16SImode;
44999 if (vmode == V64QImode)
45001 /* vpshufb only works within lanes; it is not
45002 possible to shuffle bytes between the lanes. */
45003 for (i = 0; i < nelt; ++i)
45004 if ((d->perm[i] ^ i) & (nelt / 4))
45005 return false;
45008 else
45009 return false;
45012 if (d->testing_p)
45013 return true;
45015 if (vmode == V8SImode)
45016 for (i = 0; i < 8; ++i)
45017 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45018 else if (vmode == V16SImode)
45019 for (i = 0; i < 16; ++i)
45020 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45021 else
45023 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45024 if (!d->one_operand_p)
45025 mask = 2 * nelt - 1;
45026 else if (vmode == V16QImode)
45027 mask = nelt - 1;
45028 else if (vmode == V64QImode)
45029 mask = nelt / 4 - 1;
45030 else
45031 mask = nelt / 2 - 1;
45033 for (i = 0; i < nelt; ++i)
45035 unsigned j, e = d->perm[i] & mask;
45036 for (j = 0; j < eltsz; ++j)
45037 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45041 vperm = gen_rtx_CONST_VECTOR (vmode,
45042 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45043 vperm = force_reg (vmode, vperm);
45045 target = d->target;
45046 if (d->vmode != vmode)
45047 target = gen_reg_rtx (vmode);
45048 op0 = gen_lowpart (vmode, d->op0);
45049 if (d->one_operand_p)
45051 if (vmode == V16QImode)
45052 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45053 else if (vmode == V32QImode)
45054 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45055 else if (vmode == V64QImode)
45056 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45057 else if (vmode == V8SFmode)
45058 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45059 else if (vmode == V8SImode)
45060 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45061 else if (vmode == V16SFmode)
45062 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45063 else if (vmode == V16SImode)
45064 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45065 else
45066 gcc_unreachable ();
45068 else
45070 op1 = gen_lowpart (vmode, d->op1);
45071 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45073 if (target != d->target)
45074 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45076 return true;
45079 /* For V*[QHS]Imode permutations, check whether the same permutation
45080 can instead be performed in a 2x, 4x or 8x wider inner mode. */
45082 static bool
45083 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45084 struct expand_vec_perm_d *nd)
45086 int i;
45087 machine_mode mode = VOIDmode;
45089 switch (d->vmode)
45091 case E_V16QImode: mode = V8HImode; break;
45092 case E_V32QImode: mode = V16HImode; break;
45093 case E_V64QImode: mode = V32HImode; break;
45094 case E_V8HImode: mode = V4SImode; break;
45095 case E_V16HImode: mode = V8SImode; break;
45096 case E_V32HImode: mode = V16SImode; break;
45097 case E_V4SImode: mode = V2DImode; break;
45098 case E_V8SImode: mode = V4DImode; break;
45099 case E_V16SImode: mode = V8DImode; break;
45100 default: return false;
45102 for (i = 0; i < d->nelt; i += 2)
45103 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45104 return false;
45105 nd->vmode = mode;
45106 nd->nelt = d->nelt / 2;
45107 for (i = 0; i < nd->nelt; i++)
45108 nd->perm[i] = d->perm[2 * i] / 2;
45109 if (GET_MODE_INNER (mode) != DImode)
45110 canonicalize_vector_int_perm (nd, nd);
45111 if (nd != d)
45113 nd->one_operand_p = d->one_operand_p;
45114 nd->testing_p = d->testing_p;
45115 if (d->op0 == d->op1)
45116 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45117 else
45119 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45120 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45122 if (d->testing_p)
45123 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45124 else
45125 nd->target = gen_reg_rtx (nd->vmode);
45127 return true;
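/* Worked example: the V8HImode permutation {2,3, 6,7, 0,1, 4,5} keeps
   HImode elements in even, consecutive pairs, so it is first narrowed
   to the V4SImode permutation {1, 3, 0, 2}.  The recursive call then
   gives up (1 is odd), leaving ND as the V4SImode form, which
   expand_vec_perm_1 can often handle with a single pshufd. */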
45130 /* Try to expand one-operand permutation with constant mask. */
45132 static bool
45133 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45135 machine_mode mode = GET_MODE (d->op0);
45136 machine_mode maskmode = mode;
45137 rtx (*gen) (rtx, rtx, rtx) = NULL;
45138 rtx target, op0, mask;
45139 rtx vec[64];
45141 if (!rtx_equal_p (d->op0, d->op1))
45142 return false;
45144 if (!TARGET_AVX512F)
45145 return false;
45147 switch (mode)
45149 case E_V16SImode:
45150 gen = gen_avx512f_permvarv16si;
45151 break;
45152 case E_V16SFmode:
45153 gen = gen_avx512f_permvarv16sf;
45154 maskmode = V16SImode;
45155 break;
45156 case E_V8DImode:
45157 gen = gen_avx512f_permvarv8di;
45158 break;
45159 case E_V8DFmode:
45160 gen = gen_avx512f_permvarv8df;
45161 maskmode = V8DImode;
45162 break;
45163 default:
45164 return false;
45167 target = d->target;
45168 op0 = d->op0;
45169 for (int i = 0; i < d->nelt; ++i)
45170 vec[i] = GEN_INT (d->perm[i]);
45171 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45172 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45173 return true;
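/* In intrinsics terms the constant-mask case above is a single
   vpermd/vpermps/vpermq/vpermpd, e.g. (a sketch assuming AVX-512F;
   the helper name is hypothetical):

     #include <immintrin.h>

     __m512i reverse_dwords (__m512i a)
     {
       const __m512i idx = _mm512_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                             8, 9, 10, 11, 12, 13, 14, 15);
       return _mm512_permutexvar_epi32 (idx, a);   // element i <- a[15 - i]
     }
*/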
45176 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45177 in a single instruction. */
45179 static bool
45180 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45182 unsigned i, nelt = d->nelt;
45183 struct expand_vec_perm_d nd;
45185 /* Check plain VEC_SELECT first, because AVX has instructions that could
45186 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45187 input where SEL+CONCAT may not. */
45188 if (d->one_operand_p)
45190 int mask = nelt - 1;
45191 bool identity_perm = true;
45192 bool broadcast_perm = true;
45194 for (i = 0; i < nelt; i++)
45196 nd.perm[i] = d->perm[i] & mask;
45197 if (nd.perm[i] != i)
45198 identity_perm = false;
45199 if (nd.perm[i])
45200 broadcast_perm = false;
45203 if (identity_perm)
45205 if (!d->testing_p)
45206 emit_move_insn (d->target, d->op0);
45207 return true;
45209 else if (broadcast_perm && TARGET_AVX2)
45211 /* Use vpbroadcast{b,w,d}. */
45212 rtx (*gen) (rtx, rtx) = NULL;
45213 switch (d->vmode)
45215 case E_V64QImode:
45216 if (TARGET_AVX512BW)
45217 gen = gen_avx512bw_vec_dupv64qi_1;
45218 break;
45219 case E_V32QImode:
45220 gen = gen_avx2_pbroadcastv32qi_1;
45221 break;
45222 case E_V32HImode:
45223 if (TARGET_AVX512BW)
45224 gen = gen_avx512bw_vec_dupv32hi_1;
45225 break;
45226 case E_V16HImode:
45227 gen = gen_avx2_pbroadcastv16hi_1;
45228 break;
45229 case E_V16SImode:
45230 if (TARGET_AVX512F)
45231 gen = gen_avx512f_vec_dupv16si_1;
45232 break;
45233 case E_V8SImode:
45234 gen = gen_avx2_pbroadcastv8si_1;
45235 break;
45236 case E_V16QImode:
45237 gen = gen_avx2_pbroadcastv16qi;
45238 break;
45239 case E_V8HImode:
45240 gen = gen_avx2_pbroadcastv8hi;
45241 break;
45242 case E_V16SFmode:
45243 if (TARGET_AVX512F)
45244 gen = gen_avx512f_vec_dupv16sf_1;
45245 break;
45246 case E_V8SFmode:
45247 gen = gen_avx2_vec_dupv8sf_1;
45248 break;
45249 case E_V8DFmode:
45250 if (TARGET_AVX512F)
45251 gen = gen_avx512f_vec_dupv8df_1;
45252 break;
45253 case E_V8DImode:
45254 if (TARGET_AVX512F)
45255 gen = gen_avx512f_vec_dupv8di_1;
45256 break;
45257 /* For other modes prefer other shuffles this function creates. */
45258 default: break;
45260 if (gen != NULL)
45262 if (!d->testing_p)
45263 emit_insn (gen (d->target, d->op0));
45264 return true;
45268 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45269 return true;
45271 /* There are plenty of patterns in sse.md that are written for
45272 SEL+CONCAT and are not replicated for a single op. Perhaps
45273 that should be changed, to avoid the nastiness here. */
45275 /* Recognize interleave style patterns, which means incrementing
45276 every other permutation operand. */
45277 for (i = 0; i < nelt; i += 2)
45279 nd.perm[i] = d->perm[i] & mask;
45280 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45282 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45283 d->testing_p))
45284 return true;
45286 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45287 if (nelt >= 4)
45289 for (i = 0; i < nelt; i += 4)
45291 nd.perm[i + 0] = d->perm[i + 0] & mask;
45292 nd.perm[i + 1] = d->perm[i + 1] & mask;
45293 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45294 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45297 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45298 d->testing_p))
45299 return true;
45303 /* Finally, try the fully general two operand permute. */
45304 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45305 d->testing_p))
45306 return true;
45308 /* Recognize interleave style patterns with reversed operands. */
45309 if (!d->one_operand_p)
45311 for (i = 0; i < nelt; ++i)
45313 unsigned e = d->perm[i];
45314 if (e >= nelt)
45315 e -= nelt;
45316 else
45317 e += nelt;
45318 nd.perm[i] = e;
45321 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45322 d->testing_p))
45323 return true;
45326 /* Try the SSE4.1 blend variable merge instructions. */
45327 if (expand_vec_perm_blend (d))
45328 return true;
45330 /* Try one of the AVX vpermil variable permutations. */
45331 if (expand_vec_perm_vpermil (d))
45332 return true;
45334 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45335 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45336 if (expand_vec_perm_pshufb (d))
45337 return true;
45339 /* Try the AVX2 vpalignr instruction. */
45340 if (expand_vec_perm_palignr (d, true))
45341 return true;
45343 /* Try the AVX512F vperm{s,d} instructions. */
45344 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45345 return true;
45347 /* Try the AVX512F vpermi2 instructions. */
45348 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45349 return true;
45351 /* See if we can get the same permutation in a different vector integer
45352 mode. */
45353 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45355 if (!d->testing_p)
45356 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45357 return true;
45359 return false;
45362 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45363 in terms of a pair of pshuflw + pshufhw instructions. */
45365 static bool
45366 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45368 unsigned char perm2[MAX_VECT_LEN];
45369 unsigned i;
45370 bool ok;
45372 if (d->vmode != V8HImode || !d->one_operand_p)
45373 return false;
45375 /* The two permutations only operate in 64-bit lanes. */
45376 for (i = 0; i < 4; ++i)
45377 if (d->perm[i] >= 4)
45378 return false;
45379 for (i = 4; i < 8; ++i)
45380 if (d->perm[i] < 4)
45381 return false;
45383 if (d->testing_p)
45384 return true;
45386 /* Emit the pshuflw. */
45387 memcpy (perm2, d->perm, 4);
45388 for (i = 4; i < 8; ++i)
45389 perm2[i] = i;
45390 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45391 gcc_assert (ok);
45393 /* Emit the pshufhw. */
45394 memcpy (perm2 + 4, d->perm + 4, 4);
45395 for (i = 0; i < 4; ++i)
45396 perm2[i] = i;
45397 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45398 gcc_assert (ok);
45400 return true;
45403 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45404 the permutation using the SSSE3 palignr instruction. This succeeds
45405 when all of the elements in PERM fit within one vector and we merely
45406 need to shift them down so that a single vector permutation has a
45407 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45408 the vpalignr instruction itself can perform the requested permutation. */
45410 static bool
45411 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45413 unsigned i, nelt = d->nelt;
45414 unsigned min, max, minswap, maxswap;
45415 bool in_order, ok, swap = false;
45416 rtx shift, target;
45417 struct expand_vec_perm_d dcopy;
45419 /* Even with AVX, palignr only operates on 128-bit vectors;
45420 with AVX2, palignr operates on each of the two 128-bit lanes. */
45421 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45422 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45423 return false;
45425 min = 2 * nelt;
45426 max = 0;
45427 minswap = 2 * nelt;
45428 maxswap = 0;
45429 for (i = 0; i < nelt; ++i)
45431 unsigned e = d->perm[i];
45432 unsigned eswap = d->perm[i] ^ nelt;
45433 if (GET_MODE_SIZE (d->vmode) == 32)
45435 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45436 eswap = e ^ (nelt / 2);
45438 if (e < min)
45439 min = e;
45440 if (e > max)
45441 max = e;
45442 if (eswap < minswap)
45443 minswap = eswap;
45444 if (eswap > maxswap)
45445 maxswap = eswap;
45447 if (min == 0
45448 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45450 if (d->one_operand_p
45451 || minswap == 0
45452 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45453 ? nelt / 2 : nelt))
45454 return false;
45455 swap = true;
45456 min = minswap;
45457 max = maxswap;
45460 /* Given that we have SSSE3, we know we'll be able to implement the
45461 single operand permutation after the palignr with pshufb for
45462 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45463 first. */
45464 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45465 return true;
45467 dcopy = *d;
45468 if (swap)
45470 dcopy.op0 = d->op1;
45471 dcopy.op1 = d->op0;
45472 for (i = 0; i < nelt; ++i)
45473 dcopy.perm[i] ^= nelt;
45476 in_order = true;
45477 for (i = 0; i < nelt; ++i)
45479 unsigned e = dcopy.perm[i];
45480 if (GET_MODE_SIZE (d->vmode) == 32
45481 && e >= nelt
45482 && (e & (nelt / 2 - 1)) < min)
45483 e = e - min - (nelt / 2);
45484 else
45485 e = e - min;
45486 if (e != i)
45487 in_order = false;
45488 dcopy.perm[i] = e;
45490 dcopy.one_operand_p = true;
45492 if (single_insn_only_p && !in_order)
45493 return false;
45495 /* For AVX2, test whether we can permute the result in one instruction. */
45496 if (d->testing_p)
45498 if (in_order)
45499 return true;
45500 dcopy.op1 = dcopy.op0;
45501 return expand_vec_perm_1 (&dcopy);
45504 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45505 if (GET_MODE_SIZE (d->vmode) == 16)
45507 target = gen_reg_rtx (TImode);
45508 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45509 gen_lowpart (TImode, dcopy.op0), shift));
45511 else
45513 target = gen_reg_rtx (V2TImode);
45514 emit_insn (gen_avx2_palignrv2ti (target,
45515 gen_lowpart (V2TImode, dcopy.op1),
45516 gen_lowpart (V2TImode, dcopy.op0),
45517 shift));
45520 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45522 /* Test for the degenerate case where the alignment by itself
45523 produces the desired permutation. */
45524 if (in_order)
45526 emit_move_insn (d->target, dcopy.op0);
45527 return true;
45530 ok = expand_vec_perm_1 (&dcopy);
45531 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45533 return ok;
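/* Worked example: for a V16QImode permutation {5, 6, ..., 20} all
   selected bytes sit in one 16-byte window of the op1:op0 concatenation,
   so min == 5 and the remapped permutation is already in order; the
   single palignr is the whole expansion, roughly
   _mm_alignr_epi8 (op1, op0, 5) in SSSE3 intrinsics terms (the intrinsic
   spelling is illustrative).  When the window is not in order, the
   palignr still reduces the problem to a one-operand shuffle handled by
   expand_vec_perm_1. */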
45536 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45537 the permutation using the SSE4_1 pblendv instruction. Potentially
45538 reduces the permutation from 2 pshufb and an or to 1 pshufb and a pblendv. */
45540 static bool
45541 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45543 unsigned i, which, nelt = d->nelt;
45544 struct expand_vec_perm_d dcopy, dcopy1;
45545 machine_mode vmode = d->vmode;
45546 bool ok;
45548 /* Use the same checks as in expand_vec_perm_blend. */
45549 if (d->one_operand_p)
45550 return false;
45551 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45553 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45555 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45557 else
45558 return false;
45560 /* Figure out which permutation elements do not stay in their
45561 respective lanes. */
45562 for (i = 0, which = 0; i < nelt; ++i)
45564 unsigned e = d->perm[i];
45565 if (e != i)
45566 which |= (e < nelt ? 1 : 2);
45568 /* We can pblend the part whose elements do not stay in their
45569 respective lanes only when those elements all come from one
45570 half of the permutation.
45571 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
45572 lanes, but both 8 and 9 are >= 8.
45573 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
45574 respective lanes, and 8 >= 8 but 2 is not. */
45575 if (which != 1 && which != 2)
45576 return false;
45577 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45578 return true;
45580 /* First we apply a one-operand permutation to the part whose
45581 elements do not stay in their respective lanes. */
45582 dcopy = *d;
45583 if (which == 2)
45584 dcopy.op0 = dcopy.op1 = d->op1;
45585 else
45586 dcopy.op0 = dcopy.op1 = d->op0;
45587 if (!d->testing_p)
45588 dcopy.target = gen_reg_rtx (vmode);
45589 dcopy.one_operand_p = true;
45591 for (i = 0; i < nelt; ++i)
45592 dcopy.perm[i] = d->perm[i] & (nelt - 1);
45594 ok = expand_vec_perm_1 (&dcopy);
45595 if (GET_MODE_SIZE (vmode) != 16 && !ok)
45596 return false;
45597 else
45598 gcc_assert (ok);
45599 if (d->testing_p)
45600 return true;
45602 /* Next we put permuted elements into their positions. */
45603 dcopy1 = *d;
45604 if (which == 2)
45605 dcopy1.op1 = dcopy.target;
45606 else
45607 dcopy1.op0 = dcopy.target;
45609 for (i = 0; i < nelt; ++i)
45610 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
45612 ok = expand_vec_perm_blend (&dcopy1);
45613 gcc_assert (ok);
45615 return true;
45618 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
45620 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45621 a two vector permutation into a single vector permutation by using
45622 an interleave operation to merge the vectors. */
45624 static bool
45625 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
45627 struct expand_vec_perm_d dremap, dfinal;
45628 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
45629 unsigned HOST_WIDE_INT contents;
45630 unsigned char remap[2 * MAX_VECT_LEN];
45631 rtx_insn *seq;
45632 bool ok, same_halves = false;
45634 if (GET_MODE_SIZE (d->vmode) == 16)
45636 if (d->one_operand_p)
45637 return false;
45639 else if (GET_MODE_SIZE (d->vmode) == 32)
45641 if (!TARGET_AVX)
45642 return false;
45643 /* For 32-byte modes, allow this even when d->one_operand_p.
45644 The lack of cross-lane shuffling in some instructions
45645 might prevent a single-insn shuffle. */
45646 dfinal = *d;
45647 dfinal.testing_p = true;
45648 /* If expand_vec_perm_interleave3 can expand this into
45649 a 3-insn sequence, give up and let it be expanded that
45650 way. While that is one insn longer, it doesn't need a
45651 memory operand, and in the common case where the
45652 interleave-low and interleave-high permutations with the
45653 same operands are adjacent, it needs only 4 insns for
45654 both after CSE. */
45655 if (expand_vec_perm_interleave3 (&dfinal))
45656 return false;
45658 else
45659 return false;
45661 /* Examine from whence the elements come. */
45662 contents = 0;
45663 for (i = 0; i < nelt; ++i)
45664 contents |= HOST_WIDE_INT_1U << d->perm[i];
45666 memset (remap, 0xff, sizeof (remap));
45667 dremap = *d;
45669 if (GET_MODE_SIZE (d->vmode) == 16)
45671 unsigned HOST_WIDE_INT h1, h2, h3, h4;
45673 /* Split the two input vectors into 4 halves. */
45674 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
45675 h2 = h1 << nelt2;
45676 h3 = h2 << nelt2;
45677 h4 = h3 << nelt2;
45679 /* If all elements come from the low halves, use interleave low, and
45680 similarly for interleave high. If the elements come from mismatched
45681 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
45682 if ((contents & (h1 | h3)) == contents)
45684 /* punpckl* */
45685 for (i = 0; i < nelt2; ++i)
45687 remap[i] = i * 2;
45688 remap[i + nelt] = i * 2 + 1;
45689 dremap.perm[i * 2] = i;
45690 dremap.perm[i * 2 + 1] = i + nelt;
45692 if (!TARGET_SSE2 && d->vmode == V4SImode)
45693 dremap.vmode = V4SFmode;
45695 else if ((contents & (h2 | h4)) == contents)
45697 /* punpckh* */
45698 for (i = 0; i < nelt2; ++i)
45700 remap[i + nelt2] = i * 2;
45701 remap[i + nelt + nelt2] = i * 2 + 1;
45702 dremap.perm[i * 2] = i + nelt2;
45703 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
45705 if (!TARGET_SSE2 && d->vmode == V4SImode)
45706 dremap.vmode = V4SFmode;
45708 else if ((contents & (h1 | h4)) == contents)
45710 /* shufps */
45711 for (i = 0; i < nelt2; ++i)
45713 remap[i] = i;
45714 remap[i + nelt + nelt2] = i + nelt2;
45715 dremap.perm[i] = i;
45716 dremap.perm[i + nelt2] = i + nelt + nelt2;
45718 if (nelt != 4)
45720 /* shufpd */
45721 dremap.vmode = V2DImode;
45722 dremap.nelt = 2;
45723 dremap.perm[0] = 0;
45724 dremap.perm[1] = 3;
45727 else if ((contents & (h2 | h3)) == contents)
45729 /* shufps */
45730 for (i = 0; i < nelt2; ++i)
45732 remap[i + nelt2] = i;
45733 remap[i + nelt] = i + nelt2;
45734 dremap.perm[i] = i + nelt2;
45735 dremap.perm[i + nelt2] = i + nelt;
45737 if (nelt != 4)
45739 /* shufpd */
45740 dremap.vmode = V2DImode;
45741 dremap.nelt = 2;
45742 dremap.perm[0] = 1;
45743 dremap.perm[1] = 2;
45746 else
45747 return false;
45749 else
45751 unsigned int nelt4 = nelt / 4, nzcnt = 0;
45752 unsigned HOST_WIDE_INT q[8];
45753 unsigned int nonzero_halves[4];
45755 /* Split the two input vectors into 8 quarters. */
45756 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
45757 for (i = 1; i < 8; ++i)
45758 q[i] = q[0] << (nelt4 * i);
45759 for (i = 0; i < 4; ++i)
45760 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
45762 nonzero_halves[nzcnt] = i;
45763 ++nzcnt;
45766 if (nzcnt == 1)
45768 gcc_assert (d->one_operand_p);
45769 nonzero_halves[1] = nonzero_halves[0];
45770 same_halves = true;
45772 else if (d->one_operand_p)
45774 gcc_assert (nonzero_halves[0] == 0);
45775 gcc_assert (nonzero_halves[1] == 1);
45778 if (nzcnt <= 2)
45780 if (d->perm[0] / nelt2 == nonzero_halves[1])
45782 /* Attempt to increase the likelihood that dfinal
45783 shuffle will be intra-lane. */
45784 std::swap (nonzero_halves[0], nonzero_halves[1]);
45787 /* vperm2f128 or vperm2i128. */
45788 for (i = 0; i < nelt2; ++i)
45790 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
45791 remap[i + nonzero_halves[0] * nelt2] = i;
45792 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
45793 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
45796 if (d->vmode != V8SFmode
45797 && d->vmode != V4DFmode
45798 && d->vmode != V8SImode)
45800 dremap.vmode = V8SImode;
45801 dremap.nelt = 8;
45802 for (i = 0; i < 4; ++i)
45804 dremap.perm[i] = i + nonzero_halves[0] * 4;
45805 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
45809 else if (d->one_operand_p)
45810 return false;
45811 else if (TARGET_AVX2
45812 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
45814 /* vpunpckl* */
45815 for (i = 0; i < nelt4; ++i)
45817 remap[i] = i * 2;
45818 remap[i + nelt] = i * 2 + 1;
45819 remap[i + nelt2] = i * 2 + nelt2;
45820 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
45821 dremap.perm[i * 2] = i;
45822 dremap.perm[i * 2 + 1] = i + nelt;
45823 dremap.perm[i * 2 + nelt2] = i + nelt2;
45824 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
45827 else if (TARGET_AVX2
45828 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
45830 /* vpunpckh* */
45831 for (i = 0; i < nelt4; ++i)
45833 remap[i + nelt4] = i * 2;
45834 remap[i + nelt + nelt4] = i * 2 + 1;
45835 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
45836 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
45837 dremap.perm[i * 2] = i + nelt4;
45838 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
45839 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
45840 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
45843 else
45844 return false;
45847 /* Use the remapping array set up above to move the elements from their
45848 swizzled locations into their final destinations. */
45849 dfinal = *d;
45850 for (i = 0; i < nelt; ++i)
45852 unsigned e = remap[d->perm[i]];
45853 gcc_assert (e < nelt);
45854 /* If same_halves is true, both halves of the remapped vector are the
45855 same. Avoid cross-lane accesses if possible. */
45856 if (same_halves && i >= nelt2)
45858 gcc_assert (e < nelt2);
45859 dfinal.perm[i] = e + nelt2;
45861 else
45862 dfinal.perm[i] = e;
45864 if (!d->testing_p)
45866 dremap.target = gen_reg_rtx (dremap.vmode);
45867 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
45869 dfinal.op1 = dfinal.op0;
45870 dfinal.one_operand_p = true;
45872 /* Test if the final remap can be done with a single insn. For V4SFmode or
45873 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
45874 start_sequence ();
45875 ok = expand_vec_perm_1 (&dfinal);
45876 seq = get_insns ();
45877 end_sequence ();
45879 if (!ok)
45880 return false;
45882 if (d->testing_p)
45883 return true;
45885 if (dremap.vmode != dfinal.vmode)
45887 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
45888 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
45891 ok = expand_vec_perm_1 (&dremap);
45892 gcc_assert (ok);
45894 emit_insn (seq);
45895 return true;
45898 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45899 a single vector cross-lane permutation into vpermq followed
45900 by any of the single insn permutations. */
45902 static bool
45903 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
45905 struct expand_vec_perm_d dremap, dfinal;
45906 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
45907 unsigned contents[2];
45908 bool ok;
45910 if (!(TARGET_AVX2
45911 && (d->vmode == V32QImode || d->vmode == V16HImode)
45912 && d->one_operand_p))
45913 return false;
45915 contents[0] = 0;
45916 contents[1] = 0;
45917 for (i = 0; i < nelt2; ++i)
45919 contents[0] |= 1u << (d->perm[i] / nelt4);
45920 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
45923 for (i = 0; i < 2; ++i)
45925 unsigned int cnt = 0;
45926 for (j = 0; j < 4; ++j)
45927 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
45928 return false;
45931 if (d->testing_p)
45932 return true;
45934 dremap = *d;
45935 dremap.vmode = V4DImode;
45936 dremap.nelt = 4;
45937 dremap.target = gen_reg_rtx (V4DImode);
45938 dremap.op0 = gen_lowpart (V4DImode, d->op0);
45939 dremap.op1 = dremap.op0;
45940 dremap.one_operand_p = true;
45941 for (i = 0; i < 2; ++i)
45943 unsigned int cnt = 0;
45944 for (j = 0; j < 4; ++j)
45945 if ((contents[i] & (1u << j)) != 0)
45946 dremap.perm[2 * i + cnt++] = j;
45947 for (; cnt < 2; ++cnt)
45948 dremap.perm[2 * i + cnt] = 0;
45951 dfinal = *d;
45952 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
45953 dfinal.op1 = dfinal.op0;
45954 dfinal.one_operand_p = true;
45955 for (i = 0, j = 0; i < nelt; ++i)
45957 if (i == nelt2)
45958 j = 2;
45959 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
45960 if ((d->perm[i] / nelt4) == dremap.perm[j])
45962 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
45963 dfinal.perm[i] |= nelt4;
45964 else
45965 gcc_unreachable ();
45968 ok = expand_vec_perm_1 (&dremap);
45969 gcc_assert (ok);
45971 ok = expand_vec_perm_1 (&dfinal);
45972 gcc_assert (ok);
45974 return true;
45977 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
45978 a vector permutation using two instructions, vperm2f128 or
45979 vperm2i128, followed by any single in-lane permutation. */
45981 static bool
45982 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
45984 struct expand_vec_perm_d dfirst, dsecond;
45985 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
45986 bool ok;
45988 if (!TARGET_AVX
45989 || GET_MODE_SIZE (d->vmode) != 32
45990 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
45991 return false;
45993 dsecond = *d;
45994 dsecond.one_operand_p = false;
45995 dsecond.testing_p = true;
45997 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
45998 immediate. For perm < 16 the second permutation uses
45999 d->op0 as first operand, for perm >= 16 it uses d->op1
46000 as first operand. The second operand is the result of
46001 vperm2[fi]128. */
46002 for (perm = 0; perm < 32; perm++)
46004 /* Ignore permutations which do not move anything cross-lane. */
46005 if (perm < 16)
46007 /* The second shuffle for e.g. V4DFmode has
46008 0123 and ABCD operands.
46009 Ignore AB23, as 23 is already in the second lane
46010 of the first operand. */
46011 if ((perm & 0xc) == (1 << 2)) continue;
46012 /* And 01CD, as 01 is in the first lane of the first
46013 operand. */
46014 if ((perm & 3) == 0) continue;
46015 /* And 4567, as then the vperm2[fi]128 doesn't change
46016 anything on the original 4567 second operand. */
46017 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46019 else
46021 /* The second shuffle for e.g. V4DFmode has
46022 4567 and ABCD operands.
46023 Ignore AB67, as 67 is already in the second lane
46024 of the first operand. */
46025 if ((perm & 0xc) == (3 << 2)) continue;
46026 /* And 45CD, as 45 is in the first lane of the first
46027 operand. */
46028 if ((perm & 3) == 2) continue;
46029 /* And 0123, as then the vperm2[fi]128 doesn't change
46030 anything on the original 0123 first operand. */
46031 if ((perm & 0xf) == (1 << 2)) continue;
46034 for (i = 0; i < nelt; i++)
46036 j = d->perm[i] / nelt2;
46037 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46038 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46039 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46040 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46041 else
46042 break;
46045 if (i == nelt)
46047 start_sequence ();
46048 ok = expand_vec_perm_1 (&dsecond);
46049 end_sequence ();
46051 else
46052 ok = false;
46054 if (ok)
46056 if (d->testing_p)
46057 return true;
46059 /* Found a usable second shuffle. dfirst will be
46060 vperm2f128 on d->op0 and d->op1. */
46061 dsecond.testing_p = false;
46062 dfirst = *d;
46063 dfirst.target = gen_reg_rtx (d->vmode);
46064 for (i = 0; i < nelt; i++)
46065 dfirst.perm[i] = (i & (nelt2 - 1))
46066 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46068 canonicalize_perm (&dfirst);
46069 ok = expand_vec_perm_1 (&dfirst);
46070 gcc_assert (ok);
46072 /* And dsecond is some single insn shuffle, taking
46073 d->op0 and result of vperm2f128 (if perm < 16) or
46074 d->op1 and result of vperm2f128 (otherwise). */
46075 if (perm >= 16)
46076 dsecond.op0 = dsecond.op1;
46077 dsecond.op1 = dfirst.target;
46079 ok = expand_vec_perm_1 (&dsecond);
46080 gcc_assert (ok);
46082 return true;
46085 /* For one operand, the only useful vperm2f128 permutation is 0x01
46086 aka lanes swap. */
46087 if (d->one_operand_p)
46088 return false;
46091 return false;
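/* For reference, the vperm2[fi]128 immediate built above selects one
   128-bit lane per result half: bits [1:0] pick the low result lane and
   bits [5:4] the high result lane, with 0/1 = low/high lane of the first
   source and 2/3 = low/high lane of the second source.  E.g. for
   perm == 0b0110 (low half from lane 2, high half from lane 1) the
   immediate is ((0x6 << 2) | 0x6) & 0x33 == 0x12. */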
46094 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46095 a two vector permutation using 2 intra-lane interleave insns
46096 and cross-lane shuffle for 32-byte vectors. */
46098 static bool
46099 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46101 unsigned i, nelt;
46102 rtx (*gen) (rtx, rtx, rtx);
46104 if (d->one_operand_p)
46105 return false;
46106 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46108 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46110 else
46111 return false;
46113 nelt = d->nelt;
46114 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46115 return false;
46116 for (i = 0; i < nelt; i += 2)
46117 if (d->perm[i] != d->perm[0] + i / 2
46118 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46119 return false;
46121 if (d->testing_p)
46122 return true;
46124 switch (d->vmode)
46126 case E_V32QImode:
46127 if (d->perm[0])
46128 gen = gen_vec_interleave_highv32qi;
46129 else
46130 gen = gen_vec_interleave_lowv32qi;
46131 break;
46132 case E_V16HImode:
46133 if (d->perm[0])
46134 gen = gen_vec_interleave_highv16hi;
46135 else
46136 gen = gen_vec_interleave_lowv16hi;
46137 break;
46138 case E_V8SImode:
46139 if (d->perm[0])
46140 gen = gen_vec_interleave_highv8si;
46141 else
46142 gen = gen_vec_interleave_lowv8si;
46143 break;
46144 case E_V4DImode:
46145 if (d->perm[0])
46146 gen = gen_vec_interleave_highv4di;
46147 else
46148 gen = gen_vec_interleave_lowv4di;
46149 break;
46150 case E_V8SFmode:
46151 if (d->perm[0])
46152 gen = gen_vec_interleave_highv8sf;
46153 else
46154 gen = gen_vec_interleave_lowv8sf;
46155 break;
46156 case E_V4DFmode:
46157 if (d->perm[0])
46158 gen = gen_vec_interleave_highv4df;
46159 else
46160 gen = gen_vec_interleave_lowv4df;
46161 break;
46162 default:
46163 gcc_unreachable ();
46166 emit_insn (gen (d->target, d->op0, d->op1));
46167 return true;
46170 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46171 a single vector permutation using a single intra-lane vector
46172 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46173 the non-swapped and swapped vectors together. */
46175 static bool
46176 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46178 struct expand_vec_perm_d dfirst, dsecond;
46179 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46180 rtx_insn *seq;
46181 bool ok;
46182 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46184 if (!TARGET_AVX
46185 || TARGET_AVX2
46186 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46187 || !d->one_operand_p)
46188 return false;
46190 dfirst = *d;
46191 for (i = 0; i < nelt; i++)
46192 dfirst.perm[i] = 0xff;
46193 for (i = 0, msk = 0; i < nelt; i++)
46195 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46196 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46197 return false;
46198 dfirst.perm[j] = d->perm[i];
46199 if (j != i)
46200 msk |= (1 << i);
46202 for (i = 0; i < nelt; i++)
46203 if (dfirst.perm[i] == 0xff)
46204 dfirst.perm[i] = i;
46206 if (!d->testing_p)
46207 dfirst.target = gen_reg_rtx (dfirst.vmode);
46209 start_sequence ();
46210 ok = expand_vec_perm_1 (&dfirst);
46211 seq = get_insns ();
46212 end_sequence ();
46214 if (!ok)
46215 return false;
46217 if (d->testing_p)
46218 return true;
46220 emit_insn (seq);
46222 dsecond = *d;
46223 dsecond.op0 = dfirst.target;
46224 dsecond.op1 = dfirst.target;
46225 dsecond.one_operand_p = true;
46226 dsecond.target = gen_reg_rtx (dsecond.vmode);
46227 for (i = 0; i < nelt; i++)
46228 dsecond.perm[i] = i ^ nelt2;
46230 ok = expand_vec_perm_1 (&dsecond);
46231 gcc_assert (ok);
46233 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46234 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46235 return true;
46238 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46239 permutation using two vperm2f128, followed by a vshufpd insn blending
46240 the two vectors together. */
46242 static bool
46243 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46245 struct expand_vec_perm_d dfirst, dsecond, dthird;
46246 bool ok;
46248 if (!TARGET_AVX || (d->vmode != V4DFmode))
46249 return false;
46251 if (d->testing_p)
46252 return true;
46254 dfirst = *d;
46255 dsecond = *d;
46256 dthird = *d;
46258 dfirst.perm[0] = (d->perm[0] & ~1);
46259 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46260 dfirst.perm[2] = (d->perm[2] & ~1);
46261 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46262 dsecond.perm[0] = (d->perm[1] & ~1);
46263 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46264 dsecond.perm[2] = (d->perm[3] & ~1);
46265 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46266 dthird.perm[0] = (d->perm[0] % 2);
46267 dthird.perm[1] = (d->perm[1] % 2) + 4;
46268 dthird.perm[2] = (d->perm[2] % 2) + 2;
46269 dthird.perm[3] = (d->perm[3] % 2) + 6;
46271 dfirst.target = gen_reg_rtx (dfirst.vmode);
46272 dsecond.target = gen_reg_rtx (dsecond.vmode);
46273 dthird.op0 = dfirst.target;
46274 dthird.op1 = dsecond.target;
46275 dthird.one_operand_p = false;
46277 canonicalize_perm (&dfirst);
46278 canonicalize_perm (&dsecond);
46280 ok = expand_vec_perm_1 (&dfirst)
46281 && expand_vec_perm_1 (&dsecond)
46282 && expand_vec_perm_1 (&dthird);
46284 gcc_assert (ok);
46286 return true;
46289 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46290 permutation with two pshufb insns and an ior. We should have already
46291 failed all two instruction sequences. */
46293 static bool
46294 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46296 rtx rperm[2][16], vperm, l, h, op, m128;
46297 unsigned int i, nelt, eltsz;
46299 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46300 return false;
46301 gcc_assert (!d->one_operand_p);
46303 if (d->testing_p)
46304 return true;
46306 nelt = d->nelt;
46307 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46309 /* Generate two permutation masks. If the required element is within
46310 the given vector it is shuffled into the proper lane. If the required
46311 element is in the other vector, force a zero into the lane by setting
46312 bit 7 in the permutation mask. */
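/* Purely as an illustration of the mask layout (V8HImode, eltsz == 2):
   for the selector { 0 8 1 9 2 10 3 11 } the mask built for op0 is the
   byte vector { 0 1 -128 -128 2 3 -128 -128 4 5 -128 -128 6 7 -128 -128 }
   and the mask for op1 is { -128 -128 0 1 -128 -128 2 3 -128 -128 4 5
   -128 -128 6 7 }; each pshufb zeroes the positions owned by the other
   operand and the final por merges the two results.  */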
46313 m128 = GEN_INT (-128);
46314 for (i = 0; i < nelt; ++i)
46316 unsigned j, e = d->perm[i];
46317 unsigned which = (e >= nelt);
46318 if (e >= nelt)
46319 e -= nelt;
46321 for (j = 0; j < eltsz; ++j)
46323 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46324 rperm[1-which][i*eltsz + j] = m128;
46328 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46329 vperm = force_reg (V16QImode, vperm);
46331 l = gen_reg_rtx (V16QImode);
46332 op = gen_lowpart (V16QImode, d->op0);
46333 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46335 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46336 vperm = force_reg (V16QImode, vperm);
46338 h = gen_reg_rtx (V16QImode);
46339 op = gen_lowpart (V16QImode, d->op1);
46340 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46342 op = d->target;
46343 if (d->vmode != V16QImode)
46344 op = gen_reg_rtx (V16QImode);
46345 emit_insn (gen_iorv16qi3 (op, l, h));
46346 if (op != d->target)
46347 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46349 return true;
46352 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46353 with two vpshufb insns, vpermq and vpor. We should have already failed
46354 all two or three instruction sequences. */
46356 static bool
46357 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46359 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46360 unsigned int i, nelt, eltsz;
46362 if (!TARGET_AVX2
46363 || !d->one_operand_p
46364 || (d->vmode != V32QImode && d->vmode != V16HImode))
46365 return false;
46367 if (d->testing_p)
46368 return true;
46370 nelt = d->nelt;
46371 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46373 /* Generate two permutation masks. If the required element is within
46374 the same lane, it is shuffled in. If the required element is from the
46375 other lane, force a zero by setting bit 7 in the permutation mask.
46376 The other mask has non-negative elements where the element is
46377 requested from the other lane; such elements are also moved to the
46378 other lane, so that the result of vpshufb can have the two V2TImode
46379 halves swapped. */
46380 m128 = GEN_INT (-128);
46381 for (i = 0; i < nelt; ++i)
46383 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46384 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46386 for (j = 0; j < eltsz; ++j)
46388 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46389 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46393 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46394 vperm = force_reg (V32QImode, vperm);
46396 h = gen_reg_rtx (V32QImode);
46397 op = gen_lowpart (V32QImode, d->op0);
46398 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46400 /* Swap the 128-bit lanes of h into hp. */
46401 hp = gen_reg_rtx (V4DImode);
46402 op = gen_lowpart (V4DImode, h);
46403 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46404 const1_rtx));
46406 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46407 vperm = force_reg (V32QImode, vperm);
46409 l = gen_reg_rtx (V32QImode);
46410 op = gen_lowpart (V32QImode, d->op0);
46411 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46413 op = d->target;
46414 if (d->vmode != V32QImode)
46415 op = gen_reg_rtx (V32QImode);
46416 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46417 if (op != d->target)
46418 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46420 return true;
46423 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46424 and extract-odd permutations of two V32QImode or V16HImode operands
46425 with two vpshufb insns, vpor and vpermq. We should have already
46426 failed all two or three instruction sequences. */
46428 static bool
46429 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46431 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46432 unsigned int i, nelt, eltsz;
46434 if (!TARGET_AVX2
46435 || d->one_operand_p
46436 || (d->vmode != V32QImode && d->vmode != V16HImode))
46437 return false;
46439 for (i = 0; i < d->nelt; ++i)
46440 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46441 return false;
46443 if (d->testing_p)
46444 return true;
46446 nelt = d->nelt;
46447 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46449 /* Generate two permutation masks. In the first permutation mask
46450 the first quarter will contain indexes for the first half
46451 of the op0, the second quarter will contain bit 7 set, third quarter
46452 will contain indexes for the second half of the op0 and the
46453 last quarter bit 7 set. In the second permutation mask
46454 the first quarter will contain bit 7 set, the second quarter
46455 indexes for the first half of the op1, the third quarter bit 7 set
46456 and last quarter indexes for the second half of the op1.
46457 I.e. the first mask e.g. for V32QImode extract even will be:
46458 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46459 (all values masked with 0xf except for -128) and second mask
46460 for extract even will be
46461 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46462 m128 = GEN_INT (-128);
46463 for (i = 0; i < nelt; ++i)
46465 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46466 unsigned which = d->perm[i] >= nelt;
46467 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46469 for (j = 0; j < eltsz; ++j)
46471 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46472 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46476 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46477 vperm = force_reg (V32QImode, vperm);
46479 l = gen_reg_rtx (V32QImode);
46480 op = gen_lowpart (V32QImode, d->op0);
46481 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46483 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46484 vperm = force_reg (V32QImode, vperm);
46486 h = gen_reg_rtx (V32QImode);
46487 op = gen_lowpart (V32QImode, d->op1);
46488 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46490 ior = gen_reg_rtx (V32QImode);
46491 emit_insn (gen_iorv32qi3 (ior, l, h));
46493 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46494 op = gen_reg_rtx (V4DImode);
46495 ior = gen_lowpart (V4DImode, ior);
46496 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46497 const1_rtx, GEN_INT (3)));
46498 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46500 return true;
46503 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46504 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46505 with two "and" and "pack" or two "shift" and "pack" insns. We should
46506 have already failed all two instruction sequences. */
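/* A sketch of the approach: for even extraction each wide element is
   ANDed with a mask keeping only its low narrow element (0x00ff per
   word, 0x0000ffff per dword), so the odd elements become zero and an
   unsigned saturating pack simply concatenates the surviving even
   elements; for odd extraction a logical right shift by the narrow
   element width first moves the odd elements into that low half.  In
   the 256-bit cases a final vpermq with the { 0, 2, 1, 3 } order fixes
   up the lane-interleaved pack result, as done below.  */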
46508 static bool
46509 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46511 rtx op, dop0, dop1, t, rperm[16];
46512 unsigned i, odd, c, s, nelt = d->nelt;
46513 bool end_perm = false;
46514 machine_mode half_mode;
46515 rtx (*gen_and) (rtx, rtx, rtx);
46516 rtx (*gen_pack) (rtx, rtx, rtx);
46517 rtx (*gen_shift) (rtx, rtx, rtx);
46519 if (d->one_operand_p)
46520 return false;
46522 switch (d->vmode)
46524 case E_V8HImode:
46525 /* Required for "pack". */
46526 if (!TARGET_SSE4_1)
46527 return false;
46528 c = 0xffff;
46529 s = 16;
46530 half_mode = V4SImode;
46531 gen_and = gen_andv4si3;
46532 gen_pack = gen_sse4_1_packusdw;
46533 gen_shift = gen_lshrv4si3;
46534 break;
46535 case E_V16QImode:
46536 /* No check as all instructions are SSE2. */
46537 c = 0xff;
46538 s = 8;
46539 half_mode = V8HImode;
46540 gen_and = gen_andv8hi3;
46541 gen_pack = gen_sse2_packuswb;
46542 gen_shift = gen_lshrv8hi3;
46543 break;
46544 case E_V16HImode:
46545 if (!TARGET_AVX2)
46546 return false;
46547 c = 0xffff;
46548 s = 16;
46549 half_mode = V8SImode;
46550 gen_and = gen_andv8si3;
46551 gen_pack = gen_avx2_packusdw;
46552 gen_shift = gen_lshrv8si3;
46553 end_perm = true;
46554 break;
46555 case E_V32QImode:
46556 if (!TARGET_AVX2)
46557 return false;
46558 c = 0xff;
46559 s = 8;
46560 half_mode = V16HImode;
46561 gen_and = gen_andv16hi3;
46562 gen_pack = gen_avx2_packuswb;
46563 gen_shift = gen_lshrv16hi3;
46564 end_perm = true;
46565 break;
46566 default:
46567 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46568 general shuffles. */
46569 return false;
46572 /* Check that permutation is even or odd. */
46573 odd = d->perm[0];
46574 if (odd > 1)
46575 return false;
46577 for (i = 1; i < nelt; ++i)
46578 if (d->perm[i] != 2 * i + odd)
46579 return false;
46581 if (d->testing_p)
46582 return true;
46584 dop0 = gen_reg_rtx (half_mode);
46585 dop1 = gen_reg_rtx (half_mode);
46586 if (odd == 0)
46588 for (i = 0; i < nelt / 2; i++)
46589 rperm[i] = GEN_INT (c);
46590 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
46591 t = force_reg (half_mode, t);
46592 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
46593 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
46595 else
46597 emit_insn (gen_shift (dop0,
46598 gen_lowpart (half_mode, d->op0),
46599 GEN_INT (s)));
46600 emit_insn (gen_shift (dop1,
46601 gen_lowpart (half_mode, d->op1),
46602 GEN_INT (s)));
46604 /* In the AVX2 256-bit case we need to permute the pack result. */
46605 if (TARGET_AVX2 && end_perm)
46607 op = gen_reg_rtx (d->vmode);
46608 t = gen_reg_rtx (V4DImode);
46609 emit_insn (gen_pack (op, dop0, dop1));
46610 emit_insn (gen_avx2_permv4di_1 (t,
46611 gen_lowpart (V4DImode, op),
46612 const0_rtx,
46613 const2_rtx,
46614 const1_rtx,
46615 GEN_INT (3)));
46616 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
46618 else
46619 emit_insn (gen_pack (d->target, dop0, dop1));
46621 return true;
46624 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46625 and extract-odd permutations of two V64QI operands
46626 with two "shifts", two "truncs" and one "concat" insns for "odd"
46627 and two "truncs" and one "concat" insn for "even".
46628 We should have already failed all two instruction sequences. */
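/* Roughly: in the odd case the logical right shift by 8 moves every odd
   byte into the low byte of its 16-bit word, the V32HImode -> V32QImode
   truncations (vpmovwb under AVX512BW) keep exactly those low bytes, and
   the concat glues the two 32-byte halves into the V64QImode result; the
   even case needs no shifts because the even bytes already occupy the
   low byte of each word.  */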
46630 static bool
46631 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
46633 rtx t1, t2, t3, t4;
46634 unsigned i, odd, nelt = d->nelt;
46636 if (!TARGET_AVX512BW
46637 || d->one_operand_p
46638 || d->vmode != V64QImode)
46639 return false;
46641 /* Check that permutation is even or odd. */
46642 odd = d->perm[0];
46643 if (odd > 1)
46644 return false;
46646 for (i = 1; i < nelt; ++i)
46647 if (d->perm[i] != 2 * i + odd)
46648 return false;
46650 if (d->testing_p)
46651 return true;
46654 if (odd)
46656 t1 = gen_reg_rtx (V32HImode);
46657 t2 = gen_reg_rtx (V32HImode);
46658 emit_insn (gen_lshrv32hi3 (t1,
46659 gen_lowpart (V32HImode, d->op0),
46660 GEN_INT (8)));
46661 emit_insn (gen_lshrv32hi3 (t2,
46662 gen_lowpart (V32HImode, d->op1),
46663 GEN_INT (8)));
46665 else
46667 t1 = gen_lowpart (V32HImode, d->op0);
46668 t2 = gen_lowpart (V32HImode, d->op1);
46671 t3 = gen_reg_rtx (V32QImode);
46672 t4 = gen_reg_rtx (V32QImode);
46673 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
46674 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
46675 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
46677 return true;
46680 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
46681 and extract-odd permutations. */
46683 static bool
46684 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
46686 rtx t1, t2, t3, t4, t5;
46688 switch (d->vmode)
46690 case E_V4DFmode:
46691 if (d->testing_p)
46692 break;
46693 t1 = gen_reg_rtx (V4DFmode);
46694 t2 = gen_reg_rtx (V4DFmode);
46696 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
46697 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
46698 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
46700 /* Now an unpck[lh]pd will produce the result required. */
46701 if (odd)
46702 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
46703 else
46704 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
46705 emit_insn (t3);
46706 break;
46708 case E_V8SFmode:
46710 int mask = odd ? 0xdd : 0x88;
46712 if (d->testing_p)
46713 break;
46714 t1 = gen_reg_rtx (V8SFmode);
46715 t2 = gen_reg_rtx (V8SFmode);
46716 t3 = gen_reg_rtx (V8SFmode);
46718 /* Shuffle within the 128-bit lanes to produce:
46719 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
46720 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
46721 GEN_INT (mask)));
46723 /* Shuffle the lanes around to produce:
46724 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
46725 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
46726 GEN_INT (0x3)));
46728 /* Shuffle within the 128-bit lanes to produce:
46729 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
46730 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
46732 /* Shuffle within the 128-bit lanes to produce:
46733 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
46734 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
46736 /* Shuffle the lanes around to produce:
46737 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
46738 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
46739 GEN_INT (0x20)));
46741 break;
46743 case E_V2DFmode:
46744 case E_V4SFmode:
46745 case E_V2DImode:
46746 case E_V4SImode:
46747 /* These are always directly implementable by expand_vec_perm_1. */
46748 gcc_unreachable ();
46750 case E_V8HImode:
46751 if (TARGET_SSE4_1)
46752 return expand_vec_perm_even_odd_pack (d);
46753 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
46754 return expand_vec_perm_pshufb2 (d);
46755 else
46757 if (d->testing_p)
46758 break;
46759 /* We need 2*log2(N)-1 operations to achieve odd/even
46760 with interleave. */
46761 t1 = gen_reg_rtx (V8HImode);
46762 t2 = gen_reg_rtx (V8HImode);
46763 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
46764 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
46765 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
46766 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
46767 if (odd)
46768 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
46769 else
46770 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
46771 emit_insn (t3);
46773 break;
46775 case E_V16QImode:
46776 return expand_vec_perm_even_odd_pack (d);
46778 case E_V16HImode:
46779 case E_V32QImode:
46780 return expand_vec_perm_even_odd_pack (d);
46782 case E_V64QImode:
46783 return expand_vec_perm_even_odd_trunc (d);
46785 case E_V4DImode:
46786 if (!TARGET_AVX2)
46788 struct expand_vec_perm_d d_copy = *d;
46789 d_copy.vmode = V4DFmode;
46790 if (d->testing_p)
46791 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
46792 else
46793 d_copy.target = gen_reg_rtx (V4DFmode);
46794 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
46795 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
46796 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
46798 if (!d->testing_p)
46799 emit_move_insn (d->target,
46800 gen_lowpart (V4DImode, d_copy.target));
46801 return true;
46803 return false;
46806 if (d->testing_p)
46807 break;
46809 t1 = gen_reg_rtx (V4DImode);
46810 t2 = gen_reg_rtx (V4DImode);
46812 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
46813 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
46814 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
46816 /* Now a vpunpck[lh]qdq will produce the result required. */
46817 if (odd)
46818 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
46819 else
46820 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
46821 emit_insn (t3);
46822 break;
46824 case E_V8SImode:
46825 if (!TARGET_AVX2)
46827 struct expand_vec_perm_d d_copy = *d;
46828 d_copy.vmode = V8SFmode;
46829 if (d->testing_p)
46830 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
46831 else
46832 d_copy.target = gen_reg_rtx (V8SFmode);
46833 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
46834 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
46835 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
46837 if (!d->testing_p)
46838 emit_move_insn (d->target,
46839 gen_lowpart (V8SImode, d_copy.target));
46840 return true;
46842 return false;
46845 if (d->testing_p)
46846 break;
46848 t1 = gen_reg_rtx (V8SImode);
46849 t2 = gen_reg_rtx (V8SImode);
46850 t3 = gen_reg_rtx (V4DImode);
46851 t4 = gen_reg_rtx (V4DImode);
46852 t5 = gen_reg_rtx (V4DImode);
46854 /* Shuffle the lanes around into
46855 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
46856 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
46857 gen_lowpart (V4DImode, d->op1),
46858 GEN_INT (0x20)));
46859 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
46860 gen_lowpart (V4DImode, d->op1),
46861 GEN_INT (0x31)));
46863 /* Swap the 2nd and 3rd position in each lane into
46864 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
46865 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
46866 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
46867 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
46868 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
46870 /* Now a vpunpck[lh]qdq will produce
46871 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
46872 if (odd)
46873 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
46874 gen_lowpart (V4DImode, t2));
46875 else
46876 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
46877 gen_lowpart (V4DImode, t2));
46878 emit_insn (t3);
46879 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
46880 break;
46882 default:
46883 gcc_unreachable ();
46886 return true;
46889 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
46890 extract-even and extract-odd permutations. */
46892 static bool
46893 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
46895 unsigned i, odd, nelt = d->nelt;
46897 odd = d->perm[0];
46898 if (odd != 0 && odd != 1)
46899 return false;
46901 for (i = 1; i < nelt; ++i)
46902 if (d->perm[i] != 2 * i + odd)
46903 return false;
46905 return expand_vec_perm_even_odd_1 (d, odd);
46908 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
46909 permutations. We assume that expand_vec_perm_1 has already failed. */
46911 static bool
46912 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
46914 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
46915 machine_mode vmode = d->vmode;
46916 unsigned char perm2[4];
46917 rtx op0 = d->op0, dest;
46918 bool ok;
46920 switch (vmode)
46922 case E_V4DFmode:
46923 case E_V8SFmode:
46924 /* These are special-cased in sse.md so that we can optionally
46925 use the vbroadcast instruction. They expand to two insns
46926 if the input happens to be in a register. */
46927 gcc_unreachable ();
46929 case E_V2DFmode:
46930 case E_V2DImode:
46931 case E_V4SFmode:
46932 case E_V4SImode:
46933 /* These are always implementable using standard shuffle patterns. */
46934 gcc_unreachable ();
46936 case E_V8HImode:
46937 case E_V16QImode:
46938 /* These can be implemented via interleave. We save one insn by
46939 stopping once we have promoted to V4SImode and then using pshufd. */
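/* For example, broadcasting byte 5 of a V16QImode vector: the byte lies
   in the low half, so an interleave-low duplicates it at the V8HImode
   level; the copies then lie in the high half of that vector, so an
   interleave-high follows, leaving four copies in one V4SImode element;
   a single pshufd finally replicates that element across the vector.  */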
46940 if (d->testing_p)
46941 return true;
46944 rtx dest;
46945 rtx (*gen) (rtx, rtx, rtx)
46946 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
46947 : gen_vec_interleave_lowv8hi;
46949 if (elt >= nelt2)
46951 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
46952 : gen_vec_interleave_highv8hi;
46953 elt -= nelt2;
46955 nelt2 /= 2;
46957 dest = gen_reg_rtx (vmode);
46958 emit_insn (gen (dest, op0, op0));
46959 vmode = get_mode_wider_vector (vmode);
46960 op0 = gen_lowpart (vmode, dest);
46962 while (vmode != V4SImode);
46964 memset (perm2, elt, 4);
46965 dest = gen_reg_rtx (V4SImode);
46966 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
46967 gcc_assert (ok);
46968 if (!d->testing_p)
46969 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
46970 return true;
46972 case E_V64QImode:
46973 case E_V32QImode:
46974 case E_V16HImode:
46975 case E_V8SImode:
46976 case E_V4DImode:
46977 /* For AVX2 broadcasts of the first element vpbroadcast* or
46978 vpermq should be used by expand_vec_perm_1. */
46979 gcc_assert (!TARGET_AVX2 || d->perm[0]);
46980 return false;
46982 default:
46983 gcc_unreachable ();
46987 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
46988 broadcast permutations. */
46990 static bool
46991 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
46993 unsigned i, elt, nelt = d->nelt;
46995 if (!d->one_operand_p)
46996 return false;
46998 elt = d->perm[0];
46999 for (i = 1; i < nelt; ++i)
47000 if (d->perm[i] != elt)
47001 return false;
47003 return expand_vec_perm_broadcast_1 (d);
47006 /* Implement arbitrary permutations of two V64QImode operands
47007 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
47008 static bool
47009 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
47011 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47012 return false;
47014 if (d->testing_p)
47015 return true;
47017 struct expand_vec_perm_d ds[2];
47018 rtx rperm[128], vperm, target0, target1;
47019 unsigned int i, nelt;
47020 machine_mode vmode;
47022 nelt = d->nelt;
47023 vmode = V64QImode;
47025 for (i = 0; i < 2; i++)
47027 ds[i] = *d;
47028 ds[i].vmode = V32HImode;
47029 ds[i].nelt = 32;
47030 ds[i].target = gen_reg_rtx (V32HImode);
47031 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47032 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47035 /* Prepare permutations such that the first one (ds[0]) takes care of
47036 putting the even bytes into the right positions or one position
47037 higher, and the second one (ds[1]) takes care of
47038 putting the odd bytes into the right positions or one position
47039 lower. */
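/* In effect: the first V32HImode permutation brings, for every even
   destination byte, the 16-bit word containing that byte into word
   position i/2; the vpshufb that follows picks the low or high byte of
   that word and zeroes all odd destination bytes.  The second
   permutation/vpshufb pair does the same for the odd destination bytes,
   and the final vpor merges the two zero-interleaved results.  */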
47041 for (i = 0; i < nelt; i++)
47043 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47044 if (i & 1)
47046 rperm[i] = constm1_rtx;
47047 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47049 else
47051 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47052 rperm[i + 64] = constm1_rtx;
47056 bool ok = expand_vec_perm_1 (&ds[0]);
47057 gcc_assert (ok);
47058 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47060 ok = expand_vec_perm_1 (&ds[1]);
47061 gcc_assert (ok);
47062 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47064 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47065 vperm = force_reg (vmode, vperm);
47066 target0 = gen_reg_rtx (V64QImode);
47067 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47069 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47070 vperm = force_reg (vmode, vperm);
47071 target1 = gen_reg_rtx (V64QImode);
47072 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47074 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47075 return true;
47078 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47079 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47080 all the shorter instruction sequences. */
47082 static bool
47083 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47085 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47086 unsigned int i, nelt, eltsz;
47087 bool used[4];
47089 if (!TARGET_AVX2
47090 || d->one_operand_p
47091 || (d->vmode != V32QImode && d->vmode != V16HImode))
47092 return false;
47094 if (d->testing_p)
47095 return true;
47097 nelt = d->nelt;
47098 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47100 /* Generate 4 permutation masks. If the required element is within
47101 the same lane, it is shuffled in. If the required element is from the
47102 other lane, force a zero by setting bit 7 in the permutation mask.
47103 The other mask has non-negative elements where the element is
47104 requested from the other lane; such elements are also moved to the
47105 other lane, so that the result of vpshufb can have the two V2TImode
47106 halves swapped. */
47107 m128 = GEN_INT (-128);
47108 for (i = 0; i < 32; ++i)
47110 rperm[0][i] = m128;
47111 rperm[1][i] = m128;
47112 rperm[2][i] = m128;
47113 rperm[3][i] = m128;
47115 used[0] = false;
47116 used[1] = false;
47117 used[2] = false;
47118 used[3] = false;
47119 for (i = 0; i < nelt; ++i)
47121 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47122 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47123 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47125 for (j = 0; j < eltsz; ++j)
47126 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47127 used[which] = true;
47130 for (i = 0; i < 2; ++i)
47132 if (!used[2 * i + 1])
47134 h[i] = NULL_RTX;
47135 continue;
47137 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47138 gen_rtvec_v (32, rperm[2 * i + 1]));
47139 vperm = force_reg (V32QImode, vperm);
47140 h[i] = gen_reg_rtx (V32QImode);
47141 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47142 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47145 /* Swap the 128-bit lanes of h[X]. */
47146 for (i = 0; i < 2; ++i)
47148 if (h[i] == NULL_RTX)
47149 continue;
47150 op = gen_reg_rtx (V4DImode);
47151 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47152 const2_rtx, GEN_INT (3), const0_rtx,
47153 const1_rtx));
47154 h[i] = gen_lowpart (V32QImode, op);
47157 for (i = 0; i < 2; ++i)
47159 if (!used[2 * i])
47161 l[i] = NULL_RTX;
47162 continue;
47164 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47165 vperm = force_reg (V32QImode, vperm);
47166 l[i] = gen_reg_rtx (V32QImode);
47167 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47168 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47171 for (i = 0; i < 2; ++i)
47173 if (h[i] && l[i])
47175 op = gen_reg_rtx (V32QImode);
47176 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47177 l[i] = op;
47179 else if (h[i])
47180 l[i] = h[i];
47183 gcc_assert (l[0] && l[1]);
47184 op = d->target;
47185 if (d->vmode != V32QImode)
47186 op = gen_reg_rtx (V32QImode);
47187 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47188 if (op != d->target)
47189 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47190 return true;
47193 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47194 With all of the interface bits taken care of, perform the expansion
47195 in D and return true on success. */
47197 static bool
47198 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47200 /* Try a single instruction expansion. */
47201 if (expand_vec_perm_1 (d))
47202 return true;
47204 /* Try sequences of two instructions. */
47206 if (expand_vec_perm_pshuflw_pshufhw (d))
47207 return true;
47209 if (expand_vec_perm_palignr (d, false))
47210 return true;
47212 if (expand_vec_perm_interleave2 (d))
47213 return true;
47215 if (expand_vec_perm_broadcast (d))
47216 return true;
47218 if (expand_vec_perm_vpermq_perm_1 (d))
47219 return true;
47221 if (expand_vec_perm_vperm2f128 (d))
47222 return true;
47224 if (expand_vec_perm_pblendv (d))
47225 return true;
47227 /* Try sequences of three instructions. */
47229 if (expand_vec_perm_even_odd_pack (d))
47230 return true;
47232 if (expand_vec_perm_2vperm2f128_vshuf (d))
47233 return true;
47235 if (expand_vec_perm_pshufb2 (d))
47236 return true;
47238 if (expand_vec_perm_interleave3 (d))
47239 return true;
47241 if (expand_vec_perm_vperm2f128_vblend (d))
47242 return true;
47244 /* Try sequences of four instructions. */
47246 if (expand_vec_perm_even_odd_trunc (d))
47247 return true;
47248 if (expand_vec_perm_vpshufb2_vpermq (d))
47249 return true;
47251 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47252 return true;
47254 if (expand_vec_perm_vpermi2_vpshub2 (d))
47255 return true;
47257 /* ??? Look for narrow permutations whose element orderings would
47258 allow the promotion to a wider mode. */
47260 /* ??? Look for sequences of interleave or a wider permute that place
47261 the data into the correct lanes for a half-vector shuffle like
47262 pshuf[lh]w or vpermilps. */
47264 /* ??? Look for sequences of interleave that produce the desired results.
47265 The combinatorics of punpck[lh] get pretty ugly... */
47267 if (expand_vec_perm_even_odd (d))
47268 return true;
47270 /* Even longer sequences. */
47271 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47272 return true;
47274 /* See if we can get the same permutation in a different vector integer
47275 mode. */
47276 struct expand_vec_perm_d nd;
47277 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47279 if (!d->testing_p)
47280 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47281 return true;
47284 return false;
47287 /* If a permutation only uses one operand, make it clear. Returns true
47288 if the permutation references both operands. */
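/* For example, with nelt == 4 a selector of { 4, 5, 6, 7 } references
   only the second operand (which == 2); it is folded to { 0, 1, 2, 3 }
   with op0 replaced by op1, and the function returns false because,
   after the rewrite, only a single operand is referenced.  */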
47290 static bool
47291 canonicalize_perm (struct expand_vec_perm_d *d)
47293 int i, which, nelt = d->nelt;
47295 for (i = which = 0; i < nelt; ++i)
47296 which |= (d->perm[i] < nelt ? 1 : 2);
47298 d->one_operand_p = true;
47299 switch (which)
47301 default:
47302 gcc_unreachable();
47304 case 3:
47305 if (!rtx_equal_p (d->op0, d->op1))
47307 d->one_operand_p = false;
47308 break;
47310 /* The elements of PERM do not suggest that only the first operand
47311 is used, but both operands are identical. Allow easier matching
47312 of the permutation by folding the permutation into the single
47313 input vector. */
47314 /* FALLTHRU */
47316 case 2:
47317 for (i = 0; i < nelt; ++i)
47318 d->perm[i] &= nelt - 1;
47319 d->op0 = d->op1;
47320 break;
47322 case 1:
47323 d->op1 = d->op0;
47324 break;
47327 return (which == 3);
47330 bool
47331 ix86_expand_vec_perm_const (rtx operands[4])
47333 struct expand_vec_perm_d d;
47334 unsigned char perm[MAX_VECT_LEN];
47335 int i, nelt;
47336 bool two_args;
47337 rtx sel;
47339 d.target = operands[0];
47340 d.op0 = operands[1];
47341 d.op1 = operands[2];
47342 sel = operands[3];
47344 d.vmode = GET_MODE (d.target);
47345 gcc_assert (VECTOR_MODE_P (d.vmode));
47346 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47347 d.testing_p = false;
47349 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47350 gcc_assert (XVECLEN (sel, 0) == nelt);
47351 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47353 for (i = 0; i < nelt; ++i)
47355 rtx e = XVECEXP (sel, 0, i);
47356 int ei = INTVAL (e) & (2 * nelt - 1);
47357 d.perm[i] = ei;
47358 perm[i] = ei;
47361 two_args = canonicalize_perm (&d);
47363 if (ix86_expand_vec_perm_const_1 (&d))
47364 return true;
47366 /* If the selector says both arguments are needed, but the operands are the
47367 same, the above tried to expand with one_operand_p and flattened selector.
47368 If that didn't work, retry without one_operand_p; we succeeded with that
47369 during testing. */
47370 if (two_args && d.one_operand_p)
47372 d.one_operand_p = false;
47373 memcpy (d.perm, perm, sizeof (perm));
47374 return ix86_expand_vec_perm_const_1 (&d);
47377 return false;
47380 /* Implement targetm.vectorize.vec_perm_const_ok. */
47382 static bool
47383 ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
47385 struct expand_vec_perm_d d;
47386 unsigned int i, nelt, which;
47387 bool ret;
47389 d.vmode = vmode;
47390 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47391 d.testing_p = true;
47393 /* Given sufficient ISA support we can just return true here
47394 for selected vector modes. */
47395 switch (d.vmode)
47397 case E_V16SFmode:
47398 case E_V16SImode:
47399 case E_V8DImode:
47400 case E_V8DFmode:
47401 if (TARGET_AVX512F)
47402 /* All implementable with a single vpermi2 insn. */
47403 return true;
47404 break;
47405 case E_V32HImode:
47406 if (TARGET_AVX512BW)
47407 /* All implementable with a single vpermi2 insn. */
47408 return true;
47409 break;
47410 case E_V64QImode:
47411 if (TARGET_AVX512BW)
47412 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
47413 return true;
47414 break;
47415 case E_V8SImode:
47416 case E_V8SFmode:
47417 case E_V4DFmode:
47418 case E_V4DImode:
47419 if (TARGET_AVX512VL)
47420 /* All implementable with a single vpermi2 insn. */
47421 return true;
47422 break;
47423 case E_V16HImode:
47424 if (TARGET_AVX2)
47425 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47426 return true;
47427 break;
47428 case E_V32QImode:
47429 if (TARGET_AVX2)
47430 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47431 return true;
47432 break;
47433 case E_V4SImode:
47434 case E_V4SFmode:
47435 case E_V8HImode:
47436 case E_V16QImode:
47437 /* All implementable with a single vpperm insn. */
47438 if (TARGET_XOP)
47439 return true;
47440 /* All implementable with 2 pshufb + 1 ior. */
47441 if (TARGET_SSSE3)
47442 return true;
47443 break;
47444 case E_V2DImode:
47445 case E_V2DFmode:
47446 /* All implementable with shufpd or unpck[lh]pd. */
47447 return true;
47448 default:
47449 return false;
47452 /* Extract the values from the vector CST into the permutation
47453 array in D. */
47454 for (i = which = 0; i < nelt; ++i)
47456 unsigned char e = sel[i];
47457 gcc_assert (e < 2 * nelt);
47458 d.perm[i] = e;
47459 which |= (e < nelt ? 1 : 2);
47462 /* If all elements are from the second vector, fold them to the first. */
47463 if (which == 2)
47464 for (i = 0; i < nelt; ++i)
47465 d.perm[i] -= nelt;
47467 /* Check whether the mask can be applied to the vector type. */
47468 d.one_operand_p = (which != 3);
47470 /* Implementable with shufps or pshufd. */
47471 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47472 return true;
47474 /* Otherwise we have to go through the motions and see if we can
47475 figure out how to generate the requested permutation. */
47476 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47477 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47478 if (!d.one_operand_p)
47479 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47481 start_sequence ();
47482 ret = ix86_expand_vec_perm_const_1 (&d);
47483 end_sequence ();
47485 return ret;
47488 void
47489 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47491 struct expand_vec_perm_d d;
47492 unsigned i, nelt;
47494 d.target = targ;
47495 d.op0 = op0;
47496 d.op1 = op1;
47497 d.vmode = GET_MODE (targ);
47498 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47499 d.one_operand_p = false;
47500 d.testing_p = false;
47502 for (i = 0; i < nelt; ++i)
47503 d.perm[i] = i * 2 + odd;
47505 /* We'll either be able to implement the permutation directly... */
47506 if (expand_vec_perm_1 (&d))
47507 return;
47509 /* ... or we use the special-case patterns. */
47510 expand_vec_perm_even_odd_1 (&d, odd);
47513 static void
47514 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47516 struct expand_vec_perm_d d;
47517 unsigned i, nelt, base;
47518 bool ok;
47520 d.target = targ;
47521 d.op0 = op0;
47522 d.op1 = op1;
47523 d.vmode = GET_MODE (targ);
47524 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47525 d.one_operand_p = false;
47526 d.testing_p = false;
47528 base = high_p ? nelt / 2 : 0;
47529 for (i = 0; i < nelt / 2; ++i)
47531 d.perm[i * 2] = i + base;
47532 d.perm[i * 2 + 1] = i + base + nelt;
47535 /* Note that for AVX this isn't one instruction. */
47536 ok = ix86_expand_vec_perm_const_1 (&d);
47537 gcc_assert (ok);
47541 /* Expand a vector operation CODE for a V*QImode in terms of the
47542 same operation on V*HImode. */
47544 void
47545 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47547 machine_mode qimode = GET_MODE (dest);
47548 machine_mode himode;
47549 rtx (*gen_il) (rtx, rtx, rtx);
47550 rtx (*gen_ih) (rtx, rtx, rtx);
47551 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47552 struct expand_vec_perm_d d;
47553 bool ok, full_interleave;
47554 bool uns_p = false;
47555 int i;
47557 switch (qimode)
47559 case E_V16QImode:
47560 himode = V8HImode;
47561 gen_il = gen_vec_interleave_lowv16qi;
47562 gen_ih = gen_vec_interleave_highv16qi;
47563 break;
47564 case E_V32QImode:
47565 himode = V16HImode;
47566 gen_il = gen_avx2_interleave_lowv32qi;
47567 gen_ih = gen_avx2_interleave_highv32qi;
47568 break;
47569 case E_V64QImode:
47570 himode = V32HImode;
47571 gen_il = gen_avx512bw_interleave_lowv64qi;
47572 gen_ih = gen_avx512bw_interleave_highv64qi;
47573 break;
47574 default:
47575 gcc_unreachable ();
47578 op2_l = op2_h = op2;
47579 switch (code)
47581 case MULT:
47582 /* Unpack data such that we've got a source byte in each low byte of
47583 each word. We don't care what goes into the high byte of each word.
47584 Rather than trying to get zero in there, it is most convenient to let
47585 it be a copy of the low byte. */
47586 op2_l = gen_reg_rtx (qimode);
47587 op2_h = gen_reg_rtx (qimode);
47588 emit_insn (gen_il (op2_l, op2, op2));
47589 emit_insn (gen_ih (op2_h, op2, op2));
47590 /* FALLTHRU */
47592 op1_l = gen_reg_rtx (qimode);
47593 op1_h = gen_reg_rtx (qimode);
47594 emit_insn (gen_il (op1_l, op1, op1));
47595 emit_insn (gen_ih (op1_h, op1, op1));
47596 full_interleave = qimode == V16QImode;
47597 break;
47599 case ASHIFT:
47600 case LSHIFTRT:
47601 uns_p = true;
47602 /* FALLTHRU */
47603 case ASHIFTRT:
47604 op1_l = gen_reg_rtx (himode);
47605 op1_h = gen_reg_rtx (himode);
47606 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
47607 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
47608 full_interleave = true;
47609 break;
47610 default:
47611 gcc_unreachable ();
47614 /* Perform the operation. */
47615 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
47616 1, OPTAB_DIRECT);
47617 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
47618 1, OPTAB_DIRECT);
47619 gcc_assert (res_l && res_h);
47621 /* Merge the data back into the right place. */
47622 d.target = dest;
47623 d.op0 = gen_lowpart (qimode, res_l);
47624 d.op1 = gen_lowpart (qimode, res_h);
47625 d.vmode = qimode;
47626 d.nelt = GET_MODE_NUNITS (qimode);
47627 d.one_operand_p = false;
47628 d.testing_p = false;
47630 if (full_interleave)
47632 /* For SSE2, we used a full interleave, so the desired
47633 results are in the even elements. */
47634 for (i = 0; i < d.nelt; ++i)
47635 d.perm[i] = i * 2;
47637 else
47639 /* For AVX, the interleave used above was not cross-lane. So the
47640 extraction is of the even elements, but with the second and third quarters swapped.
47641 Happily, that is even one insn shorter than even extraction.
47642 For AVX512BW we have 4 lanes. We extract evens from within a lane,
47643 always first from the first and then from the second source operand;
47644 the index bits above the low 4 bits remain the same.
47645 Thus, for d.nelt == 32 we want permutation
47646 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
47647 and for d.nelt == 64 we want permutation
47648 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
47649 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
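/* Decoding the expression below: (i * 2) & 14 yields the even offset
   within an 8-element run, (i & 8) ? d.nelt : 0 switches to the second
   source operand for every other run of 8, and i & ~15 restores the
   16-element group base; together these reproduce the orderings listed
   above.  */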
47650 for (i = 0; i < d.nelt; ++i)
47651 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
47654 ok = ix86_expand_vec_perm_const_1 (&d);
47655 gcc_assert (ok);
47657 set_unique_reg_note (get_last_insn (), REG_EQUAL,
47658 gen_rtx_fmt_ee (code, qimode, op1, op2));
47661 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
47662 if OP is a CONST_VECTOR with all odd elements equal to their
47663 preceding element. */
47665 static bool
47666 const_vector_equal_evenodd_p (rtx op)
47668 machine_mode mode = GET_MODE (op);
47669 int i, nunits = GET_MODE_NUNITS (mode);
47670 if (GET_CODE (op) != CONST_VECTOR
47671 || nunits != CONST_VECTOR_NUNITS (op))
47672 return false;
47673 for (i = 0; i < nunits; i += 2)
47674 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
47675 return false;
47676 return true;
47679 void
47680 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
47681 bool uns_p, bool odd_p)
47683 machine_mode mode = GET_MODE (op1);
47684 machine_mode wmode = GET_MODE (dest);
47685 rtx x;
47686 rtx orig_op1 = op1, orig_op2 = op2;
47688 if (!nonimmediate_operand (op1, mode))
47689 op1 = force_reg (mode, op1);
47690 if (!nonimmediate_operand (op2, mode))
47691 op2 = force_reg (mode, op2);
47693 /* We only play even/odd games with vectors of SImode. */
47694 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
47696 /* If we're looking for the odd results, shift those members down to
47697 the even slots. For some CPUs this is faster than a PSHUFD. */
47698 if (odd_p)
47700 /* For XOP use vpmacsdqh, but only for smult, as it is only
47701 signed. */
47702 if (TARGET_XOP && mode == V4SImode && !uns_p)
47704 x = force_reg (wmode, CONST0_RTX (wmode));
47705 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
47706 return;
47709 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
47710 if (!const_vector_equal_evenodd_p (orig_op1))
47711 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
47712 x, NULL, 1, OPTAB_DIRECT);
47713 if (!const_vector_equal_evenodd_p (orig_op2))
47714 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
47715 x, NULL, 1, OPTAB_DIRECT);
47716 op1 = gen_lowpart (mode, op1);
47717 op2 = gen_lowpart (mode, op2);
47720 if (mode == V16SImode)
47722 if (uns_p)
47723 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
47724 else
47725 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
47727 else if (mode == V8SImode)
47729 if (uns_p)
47730 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
47731 else
47732 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
47734 else if (uns_p)
47735 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
47736 else if (TARGET_SSE4_1)
47737 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
47738 else
47740 rtx s1, s2, t0, t1, t2;
47742 /* The easiest way to implement this without PMULDQ is to go through
47743 the motions as if we were performing a full 64-bit multiply, with
47744 the exception that we need to do less shuffling of the elements. */
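/* The identity being used, with a and b standing for op1 and op2 read
   as unsigned 32-bit values: signed(a) * signed(b) == a * b
   - 2^32 * ((signed(a) < 0 ? b : 0) + (signed(b) < 0 ? a : 0)) modulo
   2^64.  The all-ones compare masks s1 and s2 computed below, pushed
   through unsigned widening multiplies and shifted into the high half,
   in effect apply exactly that correction.  */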
47746 /* Compute the sign-extension, aka highparts, of the two operands. */
47747 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
47748 op1, pc_rtx, pc_rtx);
47749 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
47750 op2, pc_rtx, pc_rtx);
47752 /* Multiply LO(A) * HI(B), and vice-versa. */
47753 t1 = gen_reg_rtx (wmode);
47754 t2 = gen_reg_rtx (wmode);
47755 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
47756 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
47758 /* Multiply LO(A) * LO(B). */
47759 t0 = gen_reg_rtx (wmode);
47760 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
47762 /* Combine and shift the highparts into place. */
47763 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
47764 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
47765 1, OPTAB_DIRECT);
47767 /* Combine high and low parts. */
47768 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
47769 return;
47771 emit_insn (x);
47774 void
47775 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
47776 bool uns_p, bool high_p)
47778 machine_mode wmode = GET_MODE (dest);
47779 machine_mode mode = GET_MODE (op1);
47780 rtx t1, t2, t3, t4, mask;
47782 switch (mode)
47784 case E_V4SImode:
47785 t1 = gen_reg_rtx (mode);
47786 t2 = gen_reg_rtx (mode);
47787 if (TARGET_XOP && !uns_p)
47789 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
47790 shuffle the elements once so that all elements are in the right
47791 place for immediate use: { A C B D }. */
47792 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
47793 const1_rtx, GEN_INT (3)));
47794 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
47795 const1_rtx, GEN_INT (3)));
47797 else
47799 /* Put the elements into place for the multiply. */
47800 ix86_expand_vec_interleave (t1, op1, op1, high_p);
47801 ix86_expand_vec_interleave (t2, op2, op2, high_p);
47802 high_p = false;
47804 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
47805 break;
47807 case E_V8SImode:
47808 /* Shuffle the elements between the lanes. After this we
47809 have { A B E F | C D G H } for each operand. */
47810 t1 = gen_reg_rtx (V4DImode);
47811 t2 = gen_reg_rtx (V4DImode);
47812 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
47813 const0_rtx, const2_rtx,
47814 const1_rtx, GEN_INT (3)));
47815 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
47816 const0_rtx, const2_rtx,
47817 const1_rtx, GEN_INT (3)));
47819 /* Shuffle the elements within the lanes. After this we
47820 have { A A B B | C C D D } or { E E F F | G G H H }. */
47821 t3 = gen_reg_rtx (V8SImode);
47822 t4 = gen_reg_rtx (V8SImode);
47823 mask = GEN_INT (high_p
47824 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
47825 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
47826 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
47827 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
47829 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
47830 break;
47832 case E_V8HImode:
47833 case E_V16HImode:
47834 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
47835 uns_p, OPTAB_DIRECT);
47836 t2 = expand_binop (mode,
47837 uns_p ? umul_highpart_optab : smul_highpart_optab,
47838 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
47839 gcc_assert (t1 && t2);
47841 t3 = gen_reg_rtx (mode);
47842 ix86_expand_vec_interleave (t3, t1, t2, high_p);
47843 emit_move_insn (dest, gen_lowpart (wmode, t3));
47844 break;
47846 case E_V16QImode:
47847 case E_V32QImode:
47848 case E_V32HImode:
47849 case E_V16SImode:
47850 case E_V64QImode:
47851 t1 = gen_reg_rtx (wmode);
47852 t2 = gen_reg_rtx (wmode);
47853 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
47854 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
47856 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
47857 break;
47859 default:
47860 gcc_unreachable ();
47864 void
47865 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
47867 rtx res_1, res_2, res_3, res_4;
47869 res_1 = gen_reg_rtx (V4SImode);
47870 res_2 = gen_reg_rtx (V4SImode);
47871 res_3 = gen_reg_rtx (V2DImode);
47872 res_4 = gen_reg_rtx (V2DImode);
47873 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
47874 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
47876 /* Move the results in element 2 down to element 1; we don't care
47877 what goes in elements 2 and 3. Then we can merge the parts
47878 back together with an interleave.
47880 Note that two other sequences were tried:
47881 (1) Use interleaves at the start instead of psrldq, which allows
47882 us to use a single shufps to merge things back at the end.
47883 (2) Use shufps here to combine the two vectors, then pshufd to
47884 put the elements in the correct order.
47885 In both cases the cost of the reformatting stall was too high
47886 and the overall sequence slower. */
47888 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
47889 const0_rtx, const2_rtx,
47890 const0_rtx, const0_rtx));
47891 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
47892 const0_rtx, const2_rtx,
47893 const0_rtx, const0_rtx));
47894 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
47896 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
47899 void
47900 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
47902 machine_mode mode = GET_MODE (op0);
47903 rtx t1, t2, t3, t4, t5, t6;
47905 if (TARGET_AVX512DQ && mode == V8DImode)
47906 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
47907 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
47908 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
47909 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
47910 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
47911 else if (TARGET_XOP && mode == V2DImode)
47913 /* op1: A,B,C,D, op2: E,F,G,H */
47914 op1 = gen_lowpart (V4SImode, op1);
47915 op2 = gen_lowpart (V4SImode, op2);
47917 t1 = gen_reg_rtx (V4SImode);
47918 t2 = gen_reg_rtx (V4SImode);
47919 t3 = gen_reg_rtx (V2DImode);
47920 t4 = gen_reg_rtx (V2DImode);
47922 /* t1: B,A,D,C */
47923 emit_insn (gen_sse2_pshufd_1 (t1, op1,
47924 GEN_INT (1),
47925 GEN_INT (0),
47926 GEN_INT (3),
47927 GEN_INT (2)));
47929 /* t2: (B*E),(A*F),(D*G),(C*H) */
47930 emit_insn (gen_mulv4si3 (t2, t1, op2));
47932 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
47933 emit_insn (gen_xop_phadddq (t3, t2));
47935 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
47936 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
47938 /* Multiply lower parts and add all.  */
47939 t5 = gen_reg_rtx (V2DImode);
47940 emit_insn (gen_vec_widen_umult_even_v4si (t5,
47941 gen_lowpart (V4SImode, op1),
47942 gen_lowpart (V4SImode, op2)));
47943 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
47946 else
47948 machine_mode nmode;
47949 rtx (*umul) (rtx, rtx, rtx);
47951 if (mode == V2DImode)
47953 umul = gen_vec_widen_umult_even_v4si;
47954 nmode = V4SImode;
47956 else if (mode == V4DImode)
47958 umul = gen_vec_widen_umult_even_v8si;
47959 nmode = V8SImode;
47961 else if (mode == V8DImode)
47963 umul = gen_vec_widen_umult_even_v16si;
47964 nmode = V16SImode;
47966 else
47967 gcc_unreachable ();
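/* The decomposition used from here on, per 64-bit lane with
   a == ah * 2^32 + al and b == bh * 2^32 + bl:
   a * b == al * bl + ((ah * bl + al * bh) << 32) modulo 2^64;
   the ah * bh term drops out entirely modulo 2^64.  */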
47970 /* Multiply low parts. */
47971 t1 = gen_reg_rtx (mode);
47972 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
47974 /* Shift input vectors right 32 bits so we can multiply high parts. */
47975 t6 = GEN_INT (32);
47976 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
47977 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
47979 /* Multiply high parts by low parts. */
47980 t4 = gen_reg_rtx (mode);
47981 t5 = gen_reg_rtx (mode);
47982 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
47983 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
47985 /* Combine and shift the highparts back. */
47986 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
47987 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
47989 /* Combine high and low parts. */
47990 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
47993 set_unique_reg_note (get_last_insn (), REG_EQUAL,
47994 gen_rtx_MULT (mode, op1, op2));
47997 /* Return 1 if control transfer instruction INSN
47998 should be encoded with the bnd prefix.
47999 If INSN is NULL then return 1 when control
48000 transfer instructions should be prefixed with
48001 bnd by default for the current function. */
48003 bool
48004 ix86_bnd_prefixed_insn_p (rtx insn)
48006 /* For call insns check special flag. */
48007 if (insn && CALL_P (insn))
48009 rtx call = get_call_rtx_from (insn);
48010 if (call)
48011 return CALL_EXPR_WITH_BOUNDS_P (call);
48014 /* All other insns are prefixed only if function is instrumented. */
48015 return chkp_function_instrumented_p (current_function_decl);
48018 /* Return 1 if control transfer instruction INSN
48019 should be encoded with the notrack prefix. */
48021 static bool
48022 ix86_notrack_prefixed_insn_p (rtx insn)
48024 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48025 return false;
48027 if (CALL_P (insn))
48029 rtx call = get_call_rtx_from (insn);
48030 gcc_assert (call != NULL_RTX);
48031 rtx addr = XEXP (call, 0);
48033 /* Do not emit 'notrack' if it's not an indirect call. */
48034 if (MEM_P (addr)
48035 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48036 return false;
48037 else
48038 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48041 if (JUMP_P (insn) && !flag_cet_switch)
48043 rtx target = JUMP_LABEL (insn);
48044 if (target == NULL_RTX || ANY_RETURN_P (target))
48045 return false;
48047 /* Check whether the jump is through a switch jump table. */
48048 rtx_insn *label = as_a<rtx_insn *> (target);
48049 rtx_insn *table = next_insn (label);
48050 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48051 return false;
48052 else
48053 return true;
48055 return false;
48058 /* Calculate integer abs() using only SSE2 instructions. */
48060 void
48061 ix86_expand_sse2_abs (rtx target, rtx input)
48063 machine_mode mode = GET_MODE (target);
48064 rtx tmp0, tmp1, x;
48066 switch (mode)
48068 /* For 32-bit signed integer X, the best way to calculate the absolute
48069 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
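/* For example, X = -5 with W == 32: (signed) X >> 31 == -1,
   (-1 ^ -5) == 4 and 4 - (-1) == 5; for non-negative X the shift
   yields 0 and the expression reduces to X itself.  */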
48070 case E_V4SImode:
48071 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48072 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48073 NULL, 0, OPTAB_DIRECT);
48074 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48075 NULL, 0, OPTAB_DIRECT);
48076 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48077 target, 0, OPTAB_DIRECT);
48078 break;
48080 /* For 16-bit signed integer X, the best way to calculate the absolute
48081 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48082 case E_V8HImode:
48083 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48085 x = expand_simple_binop (mode, SMAX, tmp0, input,
48086 target, 0, OPTAB_DIRECT);
48087 break;
48089 /* For 8-bit signed integer X, the best way to calculate the absolute
48090 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48091 as SSE2 provides the PMINUB insn. */
48092 case E_V16QImode:
48093 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48095 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48096 target, 0, OPTAB_DIRECT);
48097 break;
48099 default:
48100 gcc_unreachable ();
48103 if (x != target)
48104 emit_move_insn (target, x);
48107 /* Expand an extract from a vector register through pextr insn.
48108 Return true if successful. */
48110 bool
48111 ix86_expand_pextr (rtx *operands)
48113 rtx dst = operands[0];
48114 rtx src = operands[1];
48116 unsigned int size = INTVAL (operands[2]);
48117 unsigned int pos = INTVAL (operands[3]);
48119 if (SUBREG_P (dst))
48121 /* Reject non-lowpart subregs. */
48122 if (SUBREG_BYTE (dst) > 0)
48123 return false;
48124 dst = SUBREG_REG (dst);
48127 if (SUBREG_P (src))
48129 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48130 src = SUBREG_REG (src);
48133 switch (GET_MODE (src))
48135 case E_V16QImode:
48136 case E_V8HImode:
48137 case E_V4SImode:
48138 case E_V2DImode:
48139 case E_V1TImode:
48140 case E_TImode:
48142 machine_mode srcmode, dstmode;
48143 rtx d, pat;
48145 if (!int_mode_for_size (size, 0).exists (&dstmode))
48146 return false;
48148 switch (dstmode)
48150 case E_QImode:
48151 if (!TARGET_SSE4_1)
48152 return false;
48153 srcmode = V16QImode;
48154 break;
48156 case E_HImode:
48157 if (!TARGET_SSE2)
48158 return false;
48159 srcmode = V8HImode;
48160 break;
48162 case E_SImode:
48163 if (!TARGET_SSE4_1)
48164 return false;
48165 srcmode = V4SImode;
48166 break;
48168 case E_DImode:
48169 gcc_assert (TARGET_64BIT);
48170 if (!TARGET_SSE4_1)
48171 return false;
48172 srcmode = V2DImode;
48173 break;
48175 default:
48176 return false;
48179 /* Reject extractions from misaligned positions. */
48180 if (pos & (size-1))
48181 return false;
48183 if (GET_MODE (dst) == dstmode)
48184 d = dst;
48185 else
48186 d = gen_reg_rtx (dstmode);
48188 /* Construct insn pattern. */
48189 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48190 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48192 /* Let the rtl optimizers know about the zero extension performed. */
48193 if (dstmode == QImode || dstmode == HImode)
48195 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48196 d = gen_lowpart (SImode, d);
48199 emit_insn (gen_rtx_SET (d, pat));
48201 if (d != dst)
48202 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48203 return true;
48206 default:
48207 return false;
48211 /* Expand an insert into a vector register through pinsr insn.
48212 Return true if successful. */
48214 bool
48215 ix86_expand_pinsr (rtx *operands)
48217 rtx dst = operands[0];
48218 rtx src = operands[3];
48220 unsigned int size = INTVAL (operands[1]);
48221 unsigned int pos = INTVAL (operands[2]);
48223 if (SUBREG_P (dst))
48225 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48226 dst = SUBREG_REG (dst);
48229 switch (GET_MODE (dst))
48231 case E_V16QImode:
48232 case E_V8HImode:
48233 case E_V4SImode:
48234 case E_V2DImode:
48235 case E_V1TImode:
48236 case E_TImode:
48238 machine_mode srcmode, dstmode;
48239 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48240 rtx d;
48242 if (!int_mode_for_size (size, 0).exists (&srcmode))
48243 return false;
48245 switch (srcmode)
48247 case E_QImode:
48248 if (!TARGET_SSE4_1)
48249 return false;
48250 dstmode = V16QImode;
48251 pinsr = gen_sse4_1_pinsrb;
48252 break;
48254 case E_HImode:
48255 if (!TARGET_SSE2)
48256 return false;
48257 dstmode = V8HImode;
48258 pinsr = gen_sse2_pinsrw;
48259 break;
48261 case E_SImode:
48262 if (!TARGET_SSE4_1)
48263 return false;
48264 dstmode = V4SImode;
48265 pinsr = gen_sse4_1_pinsrd;
48266 break;
48268 case E_DImode:
48269 gcc_assert (TARGET_64BIT);
48270 if (!TARGET_SSE4_1)
48271 return false;
48272 dstmode = V2DImode;
48273 pinsr = gen_sse4_1_pinsrq;
48274 break;
48276 default:
48277 return false;
48280 /* Reject insertions to misaligned positions. */
48281 if (pos & (size-1))
48282 return false;
48284 if (SUBREG_P (src))
48286 unsigned int srcpos = SUBREG_BYTE (src);
48288 if (srcpos > 0)
48290 rtx extr_ops[4];
48292 extr_ops[0] = gen_reg_rtx (srcmode);
48293 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48294 extr_ops[2] = GEN_INT (size);
48295 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48297 if (!ix86_expand_pextr (extr_ops))
48298 return false;
48300 src = extr_ops[0];
48302 else
48303 src = gen_lowpart (srcmode, SUBREG_REG (src));
48306 if (GET_MODE (dst) == dstmode)
48307 d = dst;
48308 else
48309 d = gen_reg_rtx (dstmode);
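      /* The last operand passed to the pinsr generator below is
	 1 << (POS / SIZE), i.e. a one-bit mask selecting destination
	 element POS / SIZE; e.g. inserting a 16-bit value at bit
	 position 48 of a V8HImode vector targets element 3.  */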
48311 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48312 gen_lowpart (srcmode, src),
48313 GEN_INT (1 << (pos / size))));
48314 if (d != dst)
48315 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48316 return true;
48319 default:
48320 return false;
48324 /* This function returns the calling-ABI-specific va_list type node,
48325 i.e. the va_list type appropriate for FNDECL's calling convention. */
48327 static tree
48328 ix86_fn_abi_va_list (tree fndecl)
48330 if (!TARGET_64BIT)
48331 return va_list_type_node;
48332 gcc_assert (fndecl != NULL_TREE);
48334 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48335 return ms_va_list_type_node;
48336 else
48337 return sysv_va_list_type_node;
48340 /* Returns the canonical va_list type specified by TYPE. If there
48341 is no valid TYPE provided, it returns NULL_TREE. */
48343 static tree
48344 ix86_canonical_va_list_type (tree type)
48346 if (TARGET_64BIT)
48348 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48349 return ms_va_list_type_node;
48351 if ((TREE_CODE (type) == ARRAY_TYPE
48352 && integer_zerop (array_type_nelts (type)))
48353 || POINTER_TYPE_P (type))
48355 tree elem_type = TREE_TYPE (type);
48356 if (TREE_CODE (elem_type) == RECORD_TYPE
48357 && lookup_attribute ("sysv_abi va_list",
48358 TYPE_ATTRIBUTES (elem_type)))
48359 return sysv_va_list_type_node;
48362 return NULL_TREE;
48365 return std_canonical_va_list_type (type);
48368 /* Iterate through the target-specific builtin types for va_list.
48369 IDX denotes the iterator, *PTREE is set to the result type of
48370 the va_list builtin, and *PNAME to its internal name.
48371 Returns zero if there is no element for this index, otherwise
48372 IDX should be increased upon the next call.
48373 Note, do not iterate a base builtin's name like __builtin_va_list.
48374 Used from c_common_nodes_and_builtins. */
48376 static int
48377 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48379 if (TARGET_64BIT)
48381 switch (idx)
48383 default:
48384 break;
48386 case 0:
48387 *ptree = ms_va_list_type_node;
48388 *pname = "__builtin_ms_va_list";
48389 return 1;
48391 case 1:
48392 *ptree = sysv_va_list_type_node;
48393 *pname = "__builtin_sysv_va_list";
48394 return 1;
48398 return 0;
48401 #undef TARGET_SCHED_DISPATCH
48402 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48403 #undef TARGET_SCHED_DISPATCH_DO
48404 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48405 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48406 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48407 #undef TARGET_SCHED_REORDER
48408 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48409 #undef TARGET_SCHED_ADJUST_PRIORITY
48410 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48411 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48412 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48413 ix86_dependencies_evaluation_hook
48416 /* Implementation of the reassociation_width target hook, used by the
48417 reassoc phase to identify the parallelism level in a reassociated
48418 tree. The statement's tree_code is passed in OP and the arguments'
48419 type in MODE. */
48421 static int
48422 ix86_reassociation_width (unsigned int op, machine_mode mode)
48424 int width = 1;
48425 /* Vector part. */
48426 if (VECTOR_MODE_P (mode))
48428 int div = 1;
48429 if (INTEGRAL_MODE_P (mode))
48430 width = ix86_cost->reassoc_vec_int;
48431 else if (FLOAT_MODE_P (mode))
48432 width = ix86_cost->reassoc_vec_fp;
48434 if (width == 1)
48435 return 1;
48437 /* Integer vector instructions execute in the FP unit and can
48438 execute 3 additions and one multiplication per cycle. */
48439 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48440 && op != PLUS && op != MINUS)
48441 return 1;
48443 /* Account for targets that split wide vectors into multiple parts. */
48444 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48445 div = GET_MODE_BITSIZE (mode) / 128;
48446 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48447 div = GET_MODE_BITSIZE (mode) / 64;
48448 width = (width + div - 1) / div;
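      /* For example, with reassoc_vec_fp == 2 on a TARGET_AVX128_OPTIMAL
	 tuning, a 256-bit float vector mode gets div == 2 and therefore
	 width == (2 + 2 - 1) / 2 == 1.  */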
48450 /* Scalar part. */
48451 else if (INTEGRAL_MODE_P (mode))
48452 width = ix86_cost->reassoc_int;
48453 else if (FLOAT_MODE_P (mode))
48454 width = ix86_cost->reassoc_fp;
48456 /* Avoid using too many registers in 32bit mode. */
48457 if (!TARGET_64BIT && width > 2)
48458 width = 2;
48459 return width;
48462 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48463 place emms and femms instructions. */
48465 static machine_mode
48466 ix86_preferred_simd_mode (scalar_mode mode)
48468 if (!TARGET_SSE)
48469 return word_mode;
48471 switch (mode)
48473 case E_QImode:
48474 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48475 return V64QImode;
48476 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48477 return V32QImode;
48478 else
48479 return V16QImode;
48481 case E_HImode:
48482 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48483 return V32HImode;
48484 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48485 return V16HImode;
48486 else
48487 return V8HImode;
48489 case E_SImode:
48490 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48491 return V16SImode;
48492 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48493 return V8SImode;
48494 else
48495 return V4SImode;
48497 case E_DImode:
48498 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48499 return V8DImode;
48500 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48501 return V4DImode;
48502 else
48503 return V2DImode;
48505 case E_SFmode:
48506 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48507 return V16SFmode;
48508 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48509 return V8SFmode;
48510 else
48511 return V4SFmode;
48513 case E_DFmode:
48514 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48515 return V8DFmode;
48516 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48517 return V4DFmode;
48518 else if (TARGET_SSE2)
48519 return V2DFmode;
48520 /* FALLTHRU */
48522 default:
48523 return word_mode;
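/* For example, the hook above returns V16SImode for SImode elements with
   AVX-512F, V8SImode with AVX (absent a 128/256-bit preference) and
   V4SImode otherwise; without SSE it falls back to word_mode.  */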
48527 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
48528 vectors. If AVX512F is enabled then try vectorizing with 512bit,
48529 256bit and 128bit vectors. */
48531 static unsigned int
48532 ix86_autovectorize_vector_sizes (void)
48534 unsigned int bytesizes = 0;
48536 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48537 bytesizes |= (64 | 32 | 16);
48538 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48539 bytesizes |= (32 | 16);
48541 return bytesizes;
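/* For example, an AVX-512F target without a 256-bit preference returns
   64 | 32 | 16, allowing the vectorizer to also try 32-byte and 16-byte
   vectors when 64-byte vectorization does not succeed.  */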
48544 /* Implementation of targetm.vectorize.get_mask_mode. */
48546 static opt_machine_mode
48547 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
48549 unsigned elem_size = vector_size / nunits;
48551 /* Scalar mask case. */
48552 if ((TARGET_AVX512F && vector_size == 64)
48553 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48555 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48556 return smallest_int_mode_for_size (nunits);
48559 scalar_int_mode elem_mode
48560 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48562 gcc_assert (elem_size * nunits == vector_size);
48564 return mode_for_vector (elem_mode, nunits);
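/* For example, a 64-byte vector with 16 units (V16SImode) gets an HImode
   scalar mask, one bit per element, while a 16-byte V2DFmode vector
   without AVX-512VL gets a V2DImode vector mask.  */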
48569 /* Return class of registers which could be used for pseudo of MODE
48570 and of class RCLASS for spilling instead of memory. Return NO_REGS
48571 if it is not possible or non-profitable. */
48573 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48575 static reg_class_t
48576 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48578 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48579 && TARGET_SSE2
48580 && TARGET_INTER_UNIT_MOVES_TO_VEC
48581 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48582 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48583 && INTEGER_CLASS_P (rclass))
48584 return ALL_SSE_REGS;
48585 return NO_REGS;
48588 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
48589 but returns a lower bound. */
48591 static unsigned int
48592 ix86_max_noce_ifcvt_seq_cost (edge e)
48594 bool predictable_p = predictable_edge_p (e);
48596 enum compiler_param param
48597 = (predictable_p
48598 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
48599 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
48601 /* If we have a parameter set, use that, otherwise take a guess using
48602 BRANCH_COST. */
48603 if (global_options_set.x_param_values[param])
48604 return PARAM_VALUE (param);
48605 else
48606 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
48609 /* Return true if SEQ is a good candidate as a replacement for the
48610 if-convertible sequence described in IF_INFO. */
48612 static bool
48613 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
48615 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
48617 int cmov_cnt = 0;
48618 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
48619 Maybe we should allow even more conditional moves as long as they
48620 are used far enough not to stall the CPU, or also consider
48621 IF_INFO->TEST_BB succ edge probabilities. */
48622 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
48624 rtx set = single_set (insn);
48625 if (!set)
48626 continue;
48627 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
48628 continue;
48629 rtx src = SET_SRC (set);
48630 machine_mode mode = GET_MODE (src);
48631 if (GET_MODE_CLASS (mode) != MODE_INT
48632 && GET_MODE_CLASS (mode) != MODE_FLOAT)
48633 continue;
48634 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
48635 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
48636 continue;
48637 /* insn is CMOV or FCMOV. */
48638 if (++cmov_cnt > 1)
48639 return false;
48642 return default_noce_conversion_profitable_p (seq, if_info);
48645 /* Implement targetm.vectorize.init_cost. */
48647 static void *
48648 ix86_init_cost (struct loop *)
48650 unsigned *cost = XNEWVEC (unsigned, 3);
48651 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
48652 return cost;
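/* The three counters accumulate the vectorizer's prologue, body and
   epilogue costs; ix86_add_stmt_cost below bumps them and
   ix86_finish_cost reads them back.  */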
48655 /* Implement targetm.vectorize.add_stmt_cost. */
48657 static unsigned
48658 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
48659 struct _stmt_vec_info *stmt_info, int misalign,
48660 enum vect_cost_model_location where)
48662 unsigned *cost = (unsigned *) data;
48663 unsigned retval = 0;
48665 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
48666 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
48668 /* Penalize DFmode vector operations for Bonnell. */
48669 if (TARGET_BONNELL && kind == vector_stmt
48670 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
48671 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
48673 /* Statements in an inner loop relative to the loop being
48674 vectorized are weighted more heavily. The value here is
48675 arbitrary and could potentially be improved with analysis. */
48676 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
48677 count *= 50; /* FIXME. */
48679 retval = (unsigned) (count * stmt_cost);
48681 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
48682 for Silvermont, as it has an out-of-order integer pipeline and can
48683 execute 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
48684 if ((TARGET_SILVERMONT || TARGET_INTEL)
48685 && stmt_info && stmt_info->stmt)
48687 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
48688 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
48689 retval = (retval * 17) / 10;
48692 cost[where] += retval;
48694 return retval;
48697 /* Implement targetm.vectorize.finish_cost. */
48699 static void
48700 ix86_finish_cost (void *data, unsigned *prologue_cost,
48701 unsigned *body_cost, unsigned *epilogue_cost)
48703 unsigned *cost = (unsigned *) data;
48704 *prologue_cost = cost[vect_prologue];
48705 *body_cost = cost[vect_body];
48706 *epilogue_cost = cost[vect_epilogue];
48709 /* Implement targetm.vectorize.destroy_cost_data. */
48711 static void
48712 ix86_destroy_cost_data (void *data)
48714 free (data);
48717 /* Validate target specific memory model bits in VAL. */
48719 static unsigned HOST_WIDE_INT
48720 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
48722 enum memmodel model = memmodel_from_int (val);
48723 bool strong;
48725 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
48726 |MEMMODEL_MASK)
48727 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
48729 warning (OPT_Winvalid_memory_model,
48730 "Unknown architecture specific memory model");
48731 return MEMMODEL_SEQ_CST;
48733 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
48734 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
48736 warning (OPT_Winvalid_memory_model,
48737 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
48738 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
48740 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
48742 warning (OPT_Winvalid_memory_model,
48743 "HLE_RELEASE not used with RELEASE or stronger memory model");
48744 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
48746 return val;
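/* For example, __atomic_store_n (p, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE)
   reaches this hook with the IX86_HLE_RELEASE bit set on top of a release
   model and is accepted unchanged, whereas pairing HLE_RELEASE with a plain
   acquire model is diagnosed above.  */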
48749 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
48750 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
48751 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
48752 or number of vecsize_mangle variants that should be emitted. */
48754 static int
48755 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
48756 struct cgraph_simd_clone *clonei,
48757 tree base_type, int num)
48759 int ret = 1;
48761 if (clonei->simdlen
48762 && (clonei->simdlen < 2
48763 || clonei->simdlen > 1024
48764 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
48766 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
48767 "unsupported simdlen %d", clonei->simdlen);
48768 return 0;
48771 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
48772 if (TREE_CODE (ret_type) != VOID_TYPE)
48773 switch (TYPE_MODE (ret_type))
48775 case E_QImode:
48776 case E_HImode:
48777 case E_SImode:
48778 case E_DImode:
48779 case E_SFmode:
48780 case E_DFmode:
48781 /* case E_SCmode: */
48782 /* case E_DCmode: */
48783 break;
48784 default:
48785 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
48786 "unsupported return type %qT for simd\n", ret_type);
48787 return 0;
48790 tree t;
48791 int i;
48793 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
48794 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
48795 switch (TYPE_MODE (TREE_TYPE (t)))
48797 case E_QImode:
48798 case E_HImode:
48799 case E_SImode:
48800 case E_DImode:
48801 case E_SFmode:
48802 case E_DFmode:
48803 /* case E_SCmode: */
48804 /* case E_DCmode: */
48805 break;
48806 default:
48807 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
48808 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
48809 return 0;
48812 if (clonei->cilk_elemental)
48814 /* Parse the processor clause here. If not present, default to 'b'. */
48815 clonei->vecsize_mangle = 'b';
48817 else if (!TREE_PUBLIC (node->decl))
48819 /* If the function isn't exported, we can pick up just one ISA
48820 for the clones. */
48821 if (TARGET_AVX512F)
48822 clonei->vecsize_mangle = 'e';
48823 else if (TARGET_AVX2)
48824 clonei->vecsize_mangle = 'd';
48825 else if (TARGET_AVX)
48826 clonei->vecsize_mangle = 'c';
48827 else
48828 clonei->vecsize_mangle = 'b';
48829 ret = 1;
48831 else
48833 clonei->vecsize_mangle = "bcde"[num];
48834 ret = 4;
48836 clonei->mask_mode = VOIDmode;
48837 switch (clonei->vecsize_mangle)
48839 case 'b':
48840 clonei->vecsize_int = 128;
48841 clonei->vecsize_float = 128;
48842 break;
48843 case 'c':
48844 clonei->vecsize_int = 128;
48845 clonei->vecsize_float = 256;
48846 break;
48847 case 'd':
48848 clonei->vecsize_int = 256;
48849 clonei->vecsize_float = 256;
48850 break;
48851 case 'e':
48852 clonei->vecsize_int = 512;
48853 clonei->vecsize_float = 512;
48854 if (TYPE_MODE (base_type) == QImode)
48855 clonei->mask_mode = DImode;
48856 else
48857 clonei->mask_mode = SImode;
48858 break;
48860 if (clonei->simdlen == 0)
48862 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
48863 clonei->simdlen = clonei->vecsize_int;
48864 else
48865 clonei->simdlen = clonei->vecsize_float;
48866 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
48868 else if (clonei->simdlen > 16)
48870 /* For compatibility with ICC, use the same upper bounds
48871 for simdlen. In particular, for CTYPE below, use the return type,
48872 unless the function returns void, in which case use the characteristic
48873 type. If it is possible for the given SIMDLEN to pass a CTYPE value
48874 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
48875 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
48876 emit the corresponding clone. */
48877 tree ctype = ret_type;
48878 if (TREE_CODE (ret_type) == VOID_TYPE)
48879 ctype = base_type;
48880 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
48881 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
48882 cnt /= clonei->vecsize_int;
48883 else
48884 cnt /= clonei->vecsize_float;
48885 if (cnt > (TARGET_64BIT ? 16 : 8))
48887 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
48888 "unsupported simdlen %d", clonei->simdlen);
48889 return 0;
48892 return ret;
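/* For example, a non-exported function with a float characteristic type,
   compiled for AVX2 without AVX-512F, gets vecsize_mangle 'd',
   vecsize_float 256 and a default simdlen of 256 / 32 == 8.  */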
48895 /* Add target attribute to SIMD clone NODE if needed. */
48897 static void
48898 ix86_simd_clone_adjust (struct cgraph_node *node)
48900 const char *str = NULL;
48901 gcc_assert (node->decl == cfun->decl);
48902 switch (node->simdclone->vecsize_mangle)
48904 case 'b':
48905 if (!TARGET_SSE2)
48906 str = "sse2";
48907 break;
48908 case 'c':
48909 if (!TARGET_AVX)
48910 str = "avx";
48911 break;
48912 case 'd':
48913 if (!TARGET_AVX2)
48914 str = "avx2";
48915 break;
48916 case 'e':
48917 if (!TARGET_AVX512F)
48918 str = "avx512f";
48919 break;
48920 default:
48921 gcc_unreachable ();
48923 if (str == NULL)
48924 return;
48925 push_cfun (NULL);
48926 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
48927 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
48928 gcc_assert (ok);
48929 pop_cfun ();
48930 ix86_reset_previous_fndecl ();
48931 ix86_set_current_function (node->decl);
48934 /* If SIMD clone NODE can't be used in a vectorized loop
48935 in current function, return -1, otherwise return a badness of using it
48936 (0 if it is most desirable from vecsize_mangle point of view, 1
48937 slightly less desirable, etc.). */
48939 static int
48940 ix86_simd_clone_usable (struct cgraph_node *node)
48942 switch (node->simdclone->vecsize_mangle)
48944 case 'b':
48945 if (!TARGET_SSE2)
48946 return -1;
48947 if (!TARGET_AVX)
48948 return 0;
48949 return TARGET_AVX2 ? 2 : 1;
48950 case 'c':
48951 if (!TARGET_AVX)
48952 return -1;
48953 return TARGET_AVX2 ? 1 : 0;
48954 case 'd':
48955 if (!TARGET_AVX2)
48956 return -1;
48957 return 0;
48958 case 'e':
48959 if (!TARGET_AVX512F)
48960 return -1;
48961 return 0;
48962 default:
48963 gcc_unreachable ();
48967 /* This function adjusts the unroll factor based on
48968 the hardware capabilities. For example, bdver3 has
48969 a loop buffer which makes unrolling of smaller
48970 loops less important. This function decides the
48971 unroll factor using the number of memory references
48972 (value 32 is used) as a heuristic. */
48974 static unsigned
48975 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
48977 basic_block *bbs;
48978 rtx_insn *insn;
48979 unsigned i;
48980 unsigned mem_count = 0;
48982 if (!TARGET_ADJUST_UNROLL)
48983 return nunroll;
48985 /* Count the number of memory references within the loop body.
48986 This value determines the unrolling factor for bdver3 and bdver4
48987 architectures. */
48988 subrtx_iterator::array_type array;
48989 bbs = get_loop_body (loop);
48990 for (i = 0; i < loop->num_nodes; i++)
48991 FOR_BB_INSNS (bbs[i], insn)
48992 if (NONDEBUG_INSN_P (insn))
48993 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
48994 if (const_rtx x = *iter)
48995 if (MEM_P (x))
48997 machine_mode mode = GET_MODE (x);
48998 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
48999 if (n_words > 4)
49000 mem_count += 2;
49001 else
49002 mem_count += 1;
49004 free (bbs);
49006 if (mem_count && mem_count <= 32)
49007 return 32 / mem_count;
49009 return nunroll;
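/* For example, a loop body with 8 counted memory references gets an
   unroll factor of 32 / 8 == 4, independently of the NUNROLL value
   computed by the generic heuristics.  */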
49013 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49015 static bool
49016 ix86_float_exceptions_rounding_supported_p (void)
49018 /* For x87 floating point with standard excess precision handling,
49019 there is no adddf3 pattern (since x87 floating point only has
49020 XFmode operations) so the default hook implementation gets this
49021 wrong. */
49022 return TARGET_80387 || TARGET_SSE_MATH;
49025 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49027 static void
49028 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49030 if (!TARGET_80387 && !TARGET_SSE_MATH)
49031 return;
49032 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49033 if (TARGET_80387)
49035 tree fenv_index_type = build_index_type (size_int (6));
49036 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49037 tree fenv_var = create_tmp_var_raw (fenv_type);
49038 TREE_ADDRESSABLE (fenv_var) = 1;
49039 tree fenv_ptr = build_pointer_type (fenv_type);
49040 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49041 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49042 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49043 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49044 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49045 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49046 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49047 tree hold_fnclex = build_call_expr (fnclex, 0);
49048 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49049 NULL_TREE, NULL_TREE);
49050 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49051 hold_fnclex);
49052 *clear = build_call_expr (fnclex, 0);
49053 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49054 tree fnstsw_call = build_call_expr (fnstsw, 0);
49055 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49056 sw_var, fnstsw_call);
49057 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49058 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49059 exceptions_var, exceptions_x87);
49060 *update = build2 (COMPOUND_EXPR, integer_type_node,
49061 sw_mod, update_mod);
49062 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49063 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49065 if (TARGET_SSE_MATH)
49067 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49068 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49069 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49070 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49071 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49072 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49073 mxcsr_orig_var, stmxcsr_hold_call);
49074 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49075 mxcsr_orig_var,
49076 build_int_cst (unsigned_type_node, 0x1f80));
49077 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49078 build_int_cst (unsigned_type_node, 0xffffffc0));
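      /* The OR with 0x1f80 above sets the six MXCSR exception mask bits
	 (bits 7-12) and the AND with 0xffffffc0 clears the sticky exception
	 flags (bits 0-5), so the value loaded for the HOLD sequence has all
	 exceptions masked and no stale flags.  */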
49079 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49080 mxcsr_mod_var, hold_mod_val);
49081 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49082 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49083 hold_assign_orig, hold_assign_mod);
49084 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49085 ldmxcsr_hold_call);
49086 if (*hold)
49087 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49088 else
49089 *hold = hold_all;
49090 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49091 if (*clear)
49092 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49093 ldmxcsr_clear_call);
49094 else
49095 *clear = ldmxcsr_clear_call;
49096 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49097 tree exceptions_sse = fold_convert (integer_type_node,
49098 stxmcsr_update_call);
49099 if (*update)
49101 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49102 exceptions_var, exceptions_sse);
49103 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49104 exceptions_var, exceptions_mod);
49105 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49106 exceptions_assign);
49108 else
49109 *update = build2 (MODIFY_EXPR, integer_type_node,
49110 exceptions_var, exceptions_sse);
49111 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49112 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49113 ldmxcsr_update_call);
49115 tree atomic_feraiseexcept
49116 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49117 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49118 1, exceptions_var);
49119 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49120 atomic_feraiseexcept_call);
49123 /* Return mode to be used for bounds or VOIDmode
49124 if bounds are not supported. */
49126 static machine_mode
49127 ix86_mpx_bound_mode ()
49129 /* Do not support pointer checker if MPX
49130 is not enabled. */
49131 if (!TARGET_MPX)
49133 if (flag_check_pointer_bounds)
49134 warning (0, "Pointer Checker requires MPX support on this target."
49135 " Use -mmpx options to enable MPX.");
49136 return VOIDmode;
49139 return BNDmode;
49142 /* Return constant used to statically initialize constant bounds.
49144 This function is used to create special bound values. For now
49145 only INIT bounds and NONE bounds are expected. More special
49146 values may be added later. */
49148 static tree
49149 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49151 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49152 : build_zero_cst (pointer_sized_int_node);
49153 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49154 : build_minus_one_cst (pointer_sized_int_node);
49156 /* This function is supposed to be used to create INIT and
49157 NONE bounds only. */
49158 gcc_assert ((lb == 0 && ub == -1)
49159 || (lb == -1 && ub == 0));
49161 return build_complex (NULL, low, high);
49164 /* Generate a list of statements STMTS to initialize pointer bounds
49165 variable VAR with bounds LB and UB. Return the number of generated
49166 statements. */
49168 static int
49169 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49171 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49172 tree lhs, modify, var_p;
49174 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49175 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49177 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49178 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49179 append_to_statement_list (modify, stmts);
49181 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49182 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49183 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49184 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49185 append_to_statement_list (modify, stmts);
49187 return 2;
49190 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49191 /* For i386, a common symbol is local only for non-PIE binaries. For
49192 x86-64, a common symbol is local only for non-PIE binaries, or if the
49193 linker supports copy relocations in PIE binaries. */
49195 static bool
49196 ix86_binds_local_p (const_tree exp)
49198 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49199 (!flag_pic
49200 || (TARGET_64BIT
49201 && HAVE_LD_PIE_COPYRELOC != 0)));
49203 #endif
49205 /* If MEM is in the form of [base+offset], extract the two parts
49206 of the address into BASE and OFFSET, otherwise return false. */
49208 static bool
49209 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49211 rtx addr;
49213 gcc_assert (MEM_P (mem));
49215 addr = XEXP (mem, 0);
49217 if (GET_CODE (addr) == CONST)
49218 addr = XEXP (addr, 0);
49220 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49222 *base = addr;
49223 *offset = const0_rtx;
49224 return true;
49227 if (GET_CODE (addr) == PLUS
49228 && (REG_P (XEXP (addr, 0))
49229 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49230 && CONST_INT_P (XEXP (addr, 1)))
49232 *base = XEXP (addr, 0);
49233 *offset = XEXP (addr, 1);
49234 return true;
49237 return false;
49240 /* Given OPERANDS of consecutive load/store, check if we can merge
49241 them into move multiple. LOAD is true if they are load instructions.
49242 MODE is the mode of memory operands. */
49244 bool
49245 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49246 machine_mode mode)
49248 HOST_WIDE_INT offval_1, offval_2, msize;
49249 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49251 if (load)
49253 mem_1 = operands[1];
49254 mem_2 = operands[3];
49255 reg_1 = operands[0];
49256 reg_2 = operands[2];
49258 else
49260 mem_1 = operands[0];
49261 mem_2 = operands[2];
49262 reg_1 = operands[1];
49263 reg_2 = operands[3];
49266 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49268 if (REGNO (reg_1) != REGNO (reg_2))
49269 return false;
49271 /* Check if the addresses are in the form of [base+offset]. */
49272 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49273 return false;
49274 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49275 return false;
49277 /* Check if the bases are the same. */
49278 if (!rtx_equal_p (base_1, base_2))
49279 return false;
49281 offval_1 = INTVAL (offset_1);
49282 offval_2 = INTVAL (offset_2);
49283 msize = GET_MODE_SIZE (mode);
49284 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
49285 if (offval_1 + msize != offval_2)
49286 return false;
49288 return true;
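/* For example, two DImode loads from (plus (reg) (const_int 8)) and
   (plus (reg) (const_int 16)) with the same base register and the same
   destination register pass: the offsets differ by exactly
   GET_MODE_SIZE (DImode) == 8.  */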
49291 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49293 static bool
49294 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49295 optimization_type opt_type)
49297 switch (op)
49299 case asin_optab:
49300 case acos_optab:
49301 case log1p_optab:
49302 case exp_optab:
49303 case exp10_optab:
49304 case exp2_optab:
49305 case expm1_optab:
49306 case ldexp_optab:
49307 case scalb_optab:
49308 case round_optab:
49309 return opt_type == OPTIMIZE_FOR_SPEED;
49311 case rint_optab:
49312 if (SSE_FLOAT_MODE_P (mode1)
49313 && TARGET_SSE_MATH
49314 && !flag_trapping_math
49315 && !TARGET_SSE4_1)
49316 return opt_type == OPTIMIZE_FOR_SPEED;
49317 return true;
49319 case floor_optab:
49320 case ceil_optab:
49321 case btrunc_optab:
49322 if (SSE_FLOAT_MODE_P (mode1)
49323 && TARGET_SSE_MATH
49324 && !flag_trapping_math
49325 && TARGET_SSE4_1)
49326 return true;
49327 return opt_type == OPTIMIZE_FOR_SPEED;
49329 case rsqrt_optab:
49330 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49332 default:
49333 return true;
49337 /* Address space support.
49339 This is not "far pointers" in the 16-bit sense, but an easy way
49340 to use %fs and %gs segment prefixes. Therefore:
49342 (a) All address spaces have the same modes,
49343 (b) All address spaces have the same address forms,
49344 (c) While %fs and %gs are technically subsets of the generic
49345 address space, they are probably not subsets of each other.
49346 (d) Since we have no access to the segment base register values
49347 without resorting to a system call, we cannot convert a
49348 non-default address space to a default address space.
49349 Therefore we do not claim %fs or %gs are subsets of generic.
49351 Therefore we can (mostly) use the default hooks. */
49353 /* All use of segmentation is assumed to make address 0 valid. */
49355 static bool
49356 ix86_addr_space_zero_address_valid (addr_space_t as)
49358 return as != ADDR_SPACE_GENERIC;
49361 static void
49362 ix86_init_libfuncs (void)
49364 if (TARGET_64BIT)
49366 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49367 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49369 else
49371 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49372 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49375 #if TARGET_MACHO
49376 darwin_rename_builtins ();
49377 #endif
49380 /* Generate a call to __divmoddi4 or __divmodti4. */
49382 static void
49383 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49384 rtx op0, rtx op1,
49385 rtx *quot_p, rtx *rem_p)
49387 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49389 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49390 mode,
49391 op0, GET_MODE (op0),
49392 op1, GET_MODE (op1),
49393 XEXP (rem, 0), Pmode);
49394 *quot_p = quot;
49395 *rem_p = rem;
49398 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49399 FPU, assume that the fpcw is set to extended precision; when using
49400 only SSE, rounding is correct; when using both SSE and the FPU,
49401 the rounding precision is indeterminate, since either may be chosen
49402 apparently at random. */
49404 static enum flt_eval_method
49405 ix86_excess_precision (enum excess_precision_type type)
49407 switch (type)
49409 case EXCESS_PRECISION_TYPE_FAST:
49410 /* The fastest type to promote to will always be the native type,
49411 whether that occurs with implicit excess precision or
49412 otherwise. */
49413 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49414 case EXCESS_PRECISION_TYPE_STANDARD:
49415 case EXCESS_PRECISION_TYPE_IMPLICIT:
49416 /* Otherwise, the excess precision we want when we are
49417 in a standards compliant mode, and the implicit precision we
49418 provide would be identical were it not for the unpredictable
49419 cases. */
49420 if (!TARGET_80387)
49421 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49422 else if (!TARGET_MIX_SSE_I387)
49424 if (!TARGET_SSE_MATH)
49425 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49426 else if (TARGET_SSE2)
49427 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49430 /* If we are in standards compliant mode, but we know we will
49431 calculate in unpredictable precision, return
49432 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49433 excess precision if the target can't guarantee it will honor
49434 it. */
49435 return (type == EXCESS_PRECISION_TYPE_STANDARD
49436 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49437 : FLT_EVAL_METHOD_UNPREDICTABLE);
49438 default:
49439 gcc_unreachable ();
49442 return FLT_EVAL_METHOD_UNPREDICTABLE;
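/* For example, x87-only math (-mfpmath=387 without SSE math) in a
   standards-compliant mode yields FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE,
   matching the FPU's extended-precision evaluation.  */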
49445 /* Target-specific selftests. */
49447 #if CHECKING_P
49449 namespace selftest {
49451 /* Verify that hard regs are dumped as expected (in compact mode). */
49453 static void
49454 ix86_test_dumping_hard_regs ()
49456 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49457 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49460 /* Test dumping an insn with repeated references to the same SCRATCH,
49461 to verify the rtx_reuse code. */
49463 static void
49464 ix86_test_dumping_memory_blockage ()
49466 set_new_first_and_last_insn (NULL, NULL);
49468 rtx pat = gen_memory_blockage ();
49469 rtx_reuse_manager r;
49470 r.preprocess (pat);
49472 /* Verify that the repeated references to the SCRATCH are dumped
49473 using reuse IDs. The first should be prefixed with a reuse ID,
49474 and the second should be dumped as a "reuse_rtx" of that ID.
49475 The expected string assumes Pmode == DImode. */
49476 if (Pmode == DImode)
49477 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49478 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
49479 " (unspec:BLK [\n"
49480 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
49481 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
49484 /* Verify loading an RTL dump; specifically a dump of copying
49485 a param on x86_64 from a hard reg into the frame.
49486 This test is target-specific since the dump contains target-specific
49487 hard reg names. */
49489 static void
49490 ix86_test_loading_dump_fragment_1 ()
49492 rtl_dump_test t (SELFTEST_LOCATION,
49493 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
49495 rtx_insn *insn = get_insn_by_uid (1);
49497 /* The block structure and indentation here are purely for
49498 readability; it mirrors the structure of the rtx. */
49499 tree mem_expr;
49501 rtx pat = PATTERN (insn);
49502 ASSERT_EQ (SET, GET_CODE (pat));
49504 rtx dest = SET_DEST (pat);
49505 ASSERT_EQ (MEM, GET_CODE (dest));
49506 /* Verify the "/c" was parsed. */
49507 ASSERT_TRUE (RTX_FLAG (dest, call));
49508 ASSERT_EQ (SImode, GET_MODE (dest));
49510 rtx addr = XEXP (dest, 0);
49511 ASSERT_EQ (PLUS, GET_CODE (addr));
49512 ASSERT_EQ (DImode, GET_MODE (addr));
49514 rtx lhs = XEXP (addr, 0);
49515 /* Verify that the "frame" REG was consolidated. */
49516 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
49519 rtx rhs = XEXP (addr, 1);
49520 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
49521 ASSERT_EQ (-4, INTVAL (rhs));
49524 /* Verify the "[1 i+0 S4 A32]" was parsed. */
49525 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
49526 /* "i" should have been handled by synthesizing a global int
49527 variable named "i". */
49528 mem_expr = MEM_EXPR (dest);
49529 ASSERT_NE (mem_expr, NULL);
49530 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
49531 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
49532 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
49533 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
49534 /* "+0". */
49535 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
49536 ASSERT_EQ (0, MEM_OFFSET (dest));
49537 /* "S4". */
49538 ASSERT_EQ (4, MEM_SIZE (dest));
49539 /* "A32. */
49540 ASSERT_EQ (32, MEM_ALIGN (dest));
49543 rtx src = SET_SRC (pat);
49544 ASSERT_EQ (REG, GET_CODE (src));
49545 ASSERT_EQ (SImode, GET_MODE (src));
49546 ASSERT_EQ (5, REGNO (src));
49547 tree reg_expr = REG_EXPR (src);
49548 /* "i" here should point to the same var as for the MEM_EXPR. */
49549 ASSERT_EQ (reg_expr, mem_expr);
49554 /* Verify that the RTL loader copes with a call_insn dump.
49555 This test is target-specific since the dump contains a target-specific
49556 hard reg name. */
49558 static void
49559 ix86_test_loading_call_insn ()
49561 /* The test dump includes register "xmm0", which requires TARGET_SSE
49562 to exist. */
49563 if (!TARGET_SSE)
49564 return;
49566 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
49568 rtx_insn *insn = get_insns ();
49569 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
49571 /* "/j". */
49572 ASSERT_TRUE (RTX_FLAG (insn, jump));
49574 rtx pat = PATTERN (insn);
49575 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
49577 /* Verify REG_NOTES. */
49579 /* "(expr_list:REG_CALL_DECL". */
49580 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
49581 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
49582 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
49584 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
49585 rtx_expr_list *note1 = note0->next ();
49586 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
49588 ASSERT_EQ (NULL, note1->next ());
49591 /* Verify CALL_INSN_FUNCTION_USAGE. */
49593 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
49594 rtx_expr_list *usage
49595 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
49596 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
49597 ASSERT_EQ (DFmode, GET_MODE (usage));
49598 ASSERT_EQ (USE, GET_CODE (usage->element ()));
49599 ASSERT_EQ (NULL, usage->next ());
49603 /* Verify that the RTL loader copes with a dump from print_rtx_function.
49604 This test is target-specific since the dump contains target-specific
49605 hard reg names. */
49607 static void
49608 ix86_test_loading_full_dump ()
49610 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
49612 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
49614 rtx_insn *insn_1 = get_insn_by_uid (1);
49615 ASSERT_EQ (NOTE, GET_CODE (insn_1));
49617 rtx_insn *insn_7 = get_insn_by_uid (7);
49618 ASSERT_EQ (INSN, GET_CODE (insn_7));
49619 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
49621 rtx_insn *insn_15 = get_insn_by_uid (15);
49622 ASSERT_EQ (INSN, GET_CODE (insn_15));
49623 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
49625 /* Verify crtl->return_rtx. */
49626 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
49627 ASSERT_EQ (0, REGNO (crtl->return_rtx));
49628 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
49631 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
49632 In particular, verify that it correctly loads the 2nd operand.
49633 This test is target-specific since these are machine-specific
49634 operands (and enums). */
49636 static void
49637 ix86_test_loading_unspec ()
49639 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
49641 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
49643 ASSERT_TRUE (cfun);
49645 /* Test of an UNSPEC. */
49646 rtx_insn *insn = get_insns ();
49647 ASSERT_EQ (INSN, GET_CODE (insn));
49648 rtx set = single_set (insn);
49649 ASSERT_NE (NULL, set);
49650 rtx dst = SET_DEST (set);
49651 ASSERT_EQ (MEM, GET_CODE (dst));
49652 rtx src = SET_SRC (set);
49653 ASSERT_EQ (UNSPEC, GET_CODE (src));
49654 ASSERT_EQ (BLKmode, GET_MODE (src));
49655 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
49657 rtx v0 = XVECEXP (src, 0, 0);
49659 /* Verify that the two uses of the first SCRATCH have pointer
49660 equality. */
49661 rtx scratch_a = XEXP (dst, 0);
49662 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
49664 rtx scratch_b = XEXP (v0, 0);
49665 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
49667 ASSERT_EQ (scratch_a, scratch_b);
49669 /* Verify that the two mems are thus treated as equal. */
49670 ASSERT_TRUE (rtx_equal_p (dst, v0));
49672 /* Verify that the insn is recognized. */
49673 ASSERT_NE (-1, recog_memoized (insn));
49675 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
49676 insn = NEXT_INSN (insn);
49677 ASSERT_EQ (INSN, GET_CODE (insn));
49679 set = single_set (insn);
49680 ASSERT_NE (NULL, set);
49682 src = SET_SRC (set);
49683 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
49684 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
49687 /* Run all target-specific selftests. */
49689 static void
49690 ix86_run_selftests (void)
49692 ix86_test_dumping_hard_regs ();
49693 ix86_test_dumping_memory_blockage ();
49695 /* Various tests of loading RTL dumps, here because they contain
49696 ix86-isms (e.g. names of hard regs). */
49697 ix86_test_loading_dump_fragment_1 ();
49698 ix86_test_loading_call_insn ();
49699 ix86_test_loading_full_dump ();
49700 ix86_test_loading_unspec ();
49703 } // namespace selftest
49705 #endif /* CHECKING_P */
49707 /* Initialize the GCC target structure. */
49708 #undef TARGET_RETURN_IN_MEMORY
49709 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
49711 #undef TARGET_LEGITIMIZE_ADDRESS
49712 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
49714 #undef TARGET_ATTRIBUTE_TABLE
49715 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
49716 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
49717 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
49718 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
49719 # undef TARGET_MERGE_DECL_ATTRIBUTES
49720 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
49721 #endif
49723 #undef TARGET_COMP_TYPE_ATTRIBUTES
49724 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
49726 #undef TARGET_INIT_BUILTINS
49727 #define TARGET_INIT_BUILTINS ix86_init_builtins
49728 #undef TARGET_BUILTIN_DECL
49729 #define TARGET_BUILTIN_DECL ix86_builtin_decl
49730 #undef TARGET_EXPAND_BUILTIN
49731 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
49733 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
49734 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
49735 ix86_builtin_vectorized_function
49737 #undef TARGET_VECTORIZE_BUILTIN_GATHER
49738 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
49740 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
49741 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
49743 #undef TARGET_BUILTIN_RECIPROCAL
49744 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
49746 #undef TARGET_ASM_FUNCTION_EPILOGUE
49747 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
49749 #undef TARGET_ENCODE_SECTION_INFO
49750 #ifndef SUBTARGET_ENCODE_SECTION_INFO
49751 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
49752 #else
49753 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
49754 #endif
49756 #undef TARGET_ASM_OPEN_PAREN
49757 #define TARGET_ASM_OPEN_PAREN ""
49758 #undef TARGET_ASM_CLOSE_PAREN
49759 #define TARGET_ASM_CLOSE_PAREN ""
49761 #undef TARGET_ASM_BYTE_OP
49762 #define TARGET_ASM_BYTE_OP ASM_BYTE
49764 #undef TARGET_ASM_ALIGNED_HI_OP
49765 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
49766 #undef TARGET_ASM_ALIGNED_SI_OP
49767 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
49768 #ifdef ASM_QUAD
49769 #undef TARGET_ASM_ALIGNED_DI_OP
49770 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
49771 #endif
49773 #undef TARGET_PROFILE_BEFORE_PROLOGUE
49774 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
49776 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
49777 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
49779 #undef TARGET_ASM_UNALIGNED_HI_OP
49780 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
49781 #undef TARGET_ASM_UNALIGNED_SI_OP
49782 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
49783 #undef TARGET_ASM_UNALIGNED_DI_OP
49784 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
49786 #undef TARGET_PRINT_OPERAND
49787 #define TARGET_PRINT_OPERAND ix86_print_operand
49788 #undef TARGET_PRINT_OPERAND_ADDRESS
49789 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
49790 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
49791 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
49792 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
49793 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
49795 #undef TARGET_SCHED_INIT_GLOBAL
49796 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
49797 #undef TARGET_SCHED_ADJUST_COST
49798 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
49799 #undef TARGET_SCHED_ISSUE_RATE
49800 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
49801 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
49802 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
49803 ia32_multipass_dfa_lookahead
49804 #undef TARGET_SCHED_MACRO_FUSION_P
49805 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
49806 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
49807 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
49809 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
49810 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
49812 #undef TARGET_MEMMODEL_CHECK
49813 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
49815 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
49816 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
49818 #ifdef HAVE_AS_TLS
49819 #undef TARGET_HAVE_TLS
49820 #define TARGET_HAVE_TLS true
49821 #endif
49822 #undef TARGET_CANNOT_FORCE_CONST_MEM
49823 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
49824 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
49825 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
49827 #undef TARGET_DELEGITIMIZE_ADDRESS
49828 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
49830 #undef TARGET_MS_BITFIELD_LAYOUT_P
49831 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
49833 #if TARGET_MACHO
49834 #undef TARGET_BINDS_LOCAL_P
49835 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
49836 #else
49837 #undef TARGET_BINDS_LOCAL_P
49838 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
49839 #endif
49840 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
49841 #undef TARGET_BINDS_LOCAL_P
49842 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
49843 #endif
49845 #undef TARGET_ASM_OUTPUT_MI_THUNK
49846 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
49847 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
49848 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
49850 #undef TARGET_ASM_FILE_START
49851 #define TARGET_ASM_FILE_START x86_file_start
49853 #undef TARGET_OPTION_OVERRIDE
49854 #define TARGET_OPTION_OVERRIDE ix86_option_override
49856 #undef TARGET_REGISTER_MOVE_COST
49857 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
49858 #undef TARGET_MEMORY_MOVE_COST
49859 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
49860 #undef TARGET_RTX_COSTS
49861 #define TARGET_RTX_COSTS ix86_rtx_costs
49862 #undef TARGET_ADDRESS_COST
49863 #define TARGET_ADDRESS_COST ix86_address_cost
49865 #undef TARGET_FLAGS_REGNUM
49866 #define TARGET_FLAGS_REGNUM FLAGS_REG
49867 #undef TARGET_FIXED_CONDITION_CODE_REGS
49868 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
49869 #undef TARGET_CC_MODES_COMPATIBLE
49870 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
49872 #undef TARGET_MACHINE_DEPENDENT_REORG
49873 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
49875 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
49876 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
49878 #undef TARGET_BUILD_BUILTIN_VA_LIST
49879 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
49881 #undef TARGET_FOLD_BUILTIN
49882 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
49884 #undef TARGET_GIMPLE_FOLD_BUILTIN
49885 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
49887 #undef TARGET_COMPARE_VERSION_PRIORITY
49888 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
49890 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
49891 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
49892 ix86_generate_version_dispatcher_body
49894 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
49895 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
49896 ix86_get_function_versions_dispatcher
49898 #undef TARGET_ENUM_VA_LIST_P
49899 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
49901 #undef TARGET_FN_ABI_VA_LIST
49902 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
49904 #undef TARGET_CANONICAL_VA_LIST_TYPE
49905 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
49907 #undef TARGET_EXPAND_BUILTIN_VA_START
49908 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
49910 #undef TARGET_MD_ASM_ADJUST
49911 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
49913 #undef TARGET_C_EXCESS_PRECISION
49914 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
49915 #undef TARGET_PROMOTE_PROTOTYPES
49916 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
49917 #undef TARGET_SETUP_INCOMING_VARARGS
49918 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
49919 #undef TARGET_MUST_PASS_IN_STACK
49920 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
49921 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
49922 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
49923 #undef TARGET_FUNCTION_ARG_ADVANCE
49924 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
49925 #undef TARGET_FUNCTION_ARG
49926 #define TARGET_FUNCTION_ARG ix86_function_arg
49927 #undef TARGET_INIT_PIC_REG
49928 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
49929 #undef TARGET_USE_PSEUDO_PIC_REG
49930 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
49931 #undef TARGET_FUNCTION_ARG_BOUNDARY
49932 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
49933 #undef TARGET_PASS_BY_REFERENCE
49934 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
49935 #undef TARGET_INTERNAL_ARG_POINTER
49936 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
49937 #undef TARGET_UPDATE_STACK_BOUNDARY
49938 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
49939 #undef TARGET_GET_DRAP_RTX
49940 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
49941 #undef TARGET_STRICT_ARGUMENT_NAMING
49942 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
49943 #undef TARGET_STATIC_CHAIN
49944 #define TARGET_STATIC_CHAIN ix86_static_chain
49945 #undef TARGET_TRAMPOLINE_INIT
49946 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
49947 #undef TARGET_RETURN_POPS_ARGS
49948 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
49950 #undef TARGET_WARN_FUNC_RETURN
49951 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
49953 #undef TARGET_LEGITIMATE_COMBINED_INSN
49954 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
49956 #undef TARGET_ASAN_SHADOW_OFFSET
49957 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
49959 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
49960 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
49962 #undef TARGET_SCALAR_MODE_SUPPORTED_P
49963 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
49965 #undef TARGET_VECTOR_MODE_SUPPORTED_P
49966 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
49968 #undef TARGET_C_MODE_FOR_SUFFIX
49969 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
49971 #ifdef HAVE_AS_TLS
49972 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
49973 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
49974 #endif
49976 #ifdef SUBTARGET_INSERT_ATTRIBUTES
49977 #undef TARGET_INSERT_ATTRIBUTES
49978 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
49979 #endif
49981 #undef TARGET_MANGLE_TYPE
49982 #define TARGET_MANGLE_TYPE ix86_mangle_type
49984 #undef TARGET_STACK_PROTECT_GUARD
49985 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
49987 #if !TARGET_MACHO
49988 #undef TARGET_STACK_PROTECT_FAIL
49989 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
49990 #endif
49992 #undef TARGET_FUNCTION_VALUE
49993 #define TARGET_FUNCTION_VALUE ix86_function_value
49995 #undef TARGET_FUNCTION_VALUE_REGNO_P
49996 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
49998 #undef TARGET_PROMOTE_FUNCTION_MODE
49999 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50001 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50002 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50004 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50005 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50007 #undef TARGET_INSTANTIATE_DECLS
50008 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50010 #undef TARGET_SECONDARY_RELOAD
50011 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50012 #undef TARGET_SECONDARY_MEMORY_NEEDED
50013 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50014 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50015 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50017 #undef TARGET_CLASS_MAX_NREGS
50018 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50020 #undef TARGET_PREFERRED_RELOAD_CLASS
50021 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50022 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50023 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50024 #undef TARGET_CLASS_LIKELY_SPILLED_P
50025 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
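/* Vectorizer hooks: cost model, constant permutation support, and
   preferred SIMD/mask modes.  */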
50027 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50028 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50029 ix86_builtin_vectorization_cost
50030 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50031 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50032 ix86_vectorize_vec_perm_const_ok
50033 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50034 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50035 ix86_preferred_simd_mode
50036 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50037 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50038 ix86_autovectorize_vector_sizes
50039 #undef TARGET_VECTORIZE_GET_MASK_MODE
50040 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50041 #undef TARGET_VECTORIZE_INIT_COST
50042 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50043 #undef TARGET_VECTORIZE_ADD_STMT_COST
50044 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50045 #undef TARGET_VECTORIZE_FINISH_COST
50046 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50047 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50048 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50050 #undef TARGET_SET_CURRENT_FUNCTION
50051 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
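/* Hooks for saving, restoring and printing function-specific option
   settings (used by the "target" attribute machinery).  */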
50053 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50054 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50056 #undef TARGET_OPTION_SAVE
50057 #define TARGET_OPTION_SAVE ix86_function_specific_save
50059 #undef TARGET_OPTION_RESTORE
50060 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50062 #undef TARGET_OPTION_POST_STREAM_IN
50063 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50065 #undef TARGET_OPTION_PRINT
50066 #define TARGET_OPTION_PRINT ix86_function_specific_print
50068 #undef TARGET_OPTION_FUNCTION_VERSIONS
50069 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50071 #undef TARGET_CAN_INLINE_P
50072 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50074 #undef TARGET_LEGITIMATE_ADDRESS_P
50075 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50077 #undef TARGET_REGISTER_PRIORITY
50078 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50080 #undef TARGET_REGISTER_USAGE_LEVELING_P
50081 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50083 #undef TARGET_LEGITIMATE_CONSTANT_P
50084 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50086 #undef TARGET_COMPUTE_FRAME_LAYOUT
50087 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50089 #undef TARGET_FRAME_POINTER_REQUIRED
50090 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50092 #undef TARGET_CAN_ELIMINATE
50093 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50095 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50096 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50098 #undef TARGET_ASM_CODE_END
50099 #define TARGET_ASM_CODE_END ix86_code_end
50101 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50102 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50104 #undef TARGET_CANONICALIZE_COMPARISON
50105 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50107 #undef TARGET_LOOP_UNROLL_ADJUST
50108 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50110 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50111 #undef TARGET_SPILL_CLASS
50112 #define TARGET_SPILL_CLASS ix86_spill_class
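/* SIMD clone (OpenMP "declare simd") hooks.  */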
50114 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50115 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50116 ix86_simd_clone_compute_vecsize_and_simdlen
50118 #undef TARGET_SIMD_CLONE_ADJUST
50119 #define TARGET_SIMD_CLONE_ADJUST \
50120 ix86_simd_clone_adjust
50122 #undef TARGET_SIMD_CLONE_USABLE
50123 #define TARGET_SIMD_CLONE_USABLE \
50124 ix86_simd_clone_usable
50126 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50127 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50128 ix86_float_exceptions_rounding_supported_p
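/* Mode-switching hooks, used by the optimize_mode_switching pass.  */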
50130 #undef TARGET_MODE_EMIT
50131 #define TARGET_MODE_EMIT ix86_emit_mode_set
50133 #undef TARGET_MODE_NEEDED
50134 #define TARGET_MODE_NEEDED ix86_mode_needed
50136 #undef TARGET_MODE_AFTER
50137 #define TARGET_MODE_AFTER ix86_mode_after
50139 #undef TARGET_MODE_ENTRY
50140 #define TARGET_MODE_ENTRY ix86_mode_entry
50142 #undef TARGET_MODE_EXIT
50143 #define TARGET_MODE_EXIT ix86_mode_exit
50145 #undef TARGET_MODE_PRIORITY
50146 #define TARGET_MODE_PRIORITY ix86_mode_priority
50148 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50149 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
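/* Pointer Bounds Checker (MPX) hooks.  */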
50151 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50152 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50154 #undef TARGET_STORE_BOUNDS_FOR_ARG
50155 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50157 #undef TARGET_LOAD_RETURNED_BOUNDS
50158 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50160 #undef TARGET_STORE_RETURNED_BOUNDS
50161 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50163 #undef TARGET_CHKP_BOUND_MODE
50164 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50166 #undef TARGET_BUILTIN_CHKP_FUNCTION
50167 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50169 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50170 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50172 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50173 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50175 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50176 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50178 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50179 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50181 #undef TARGET_OFFLOAD_OPTIONS
50182 #define TARGET_OFFLOAD_OPTIONS \
50183 ix86_offload_options
50185 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50186 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50188 #undef TARGET_OPTAB_SUPPORTED_P
50189 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50191 #undef TARGET_HARD_REGNO_SCRATCH_OK
50192 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50194 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50195 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50197 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50198 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50200 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50201 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50203 #undef TARGET_INIT_LIBFUNCS
50204 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50206 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50207 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50209 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50210 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50212 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50213 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50215 #undef TARGET_HARD_REGNO_NREGS
50216 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50217 #undef TARGET_HARD_REGNO_MODE_OK
50218 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50220 #undef TARGET_MODES_TIEABLE_P
50221 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50223 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50224 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50225 ix86_hard_regno_call_part_clobbered
50227 #undef TARGET_CAN_CHANGE_MODE_CLASS
50228 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50230 #undef TARGET_CONSTANT_ALIGNMENT
50231 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50233 #if CHECKING_P
50234 #undef TARGET_RUN_TARGET_SELFTESTS
50235 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50236 #endif /* #if CHECKING_P */
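/* Build the target vector from the TARGET_* hook macros defined above;
   TARGET_INITIALIZER is provided by target-def.h.  */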
50238 struct gcc_target targetm = TARGET_INITIALIZER;
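/* Garbage-collector root tables for this file, generated by gengtype.  */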
50240 #include "gt-i386.h"