[official-gcc.git] / gcc / config / i386 / i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
94 /* This file should be included last. */
95 #include "target-def.h"
97 #include "x86-tune-costs.h"
99 static rtx legitimize_dllimport_symbol (rtx, bool);
100 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
101 static rtx legitimize_pe_coff_symbol (rtx, bool);
102 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
103 static bool ix86_save_reg (unsigned int, bool, bool);
104 static bool ix86_function_naked (const_tree);
105 static bool ix86_notrack_prefixed_insn_p (rtx);
106 static void ix86_emit_restore_reg_using_pop (rtx);
109 #ifndef CHECK_STACK_LIMIT
110 #define CHECK_STACK_LIMIT (-1)
111 #endif
113 /* Return index of given mode in mult and division cost tables. */
114 #define MODE_INDEX(mode) \
115 ((mode) == QImode ? 0 \
116 : (mode) == HImode ? 1 \
117 : (mode) == SImode ? 2 \
118 : (mode) == DImode ? 3 \
119 : 4)
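/* The resulting index picks the per-mode entry in cost tables such as
   mult_init[] and divide[] in struct processor_costs; an illustrative
   lookup would be

     int div_cost = ix86_cost->divide[MODE_INDEX (mode)];

   with QImode..DImode mapping to slots 0..3 and every other mode sharing
   slot 4.  */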
122 /* Set by -mtune. */
123 const struct processor_costs *ix86_tune_cost = NULL;
125 /* Set by -mtune or -Os. */
126 const struct processor_costs *ix86_cost = NULL;
128 /* Processor feature/optimization bitmasks. */
129 #define m_386 (1U<<PROCESSOR_I386)
130 #define m_486 (1U<<PROCESSOR_I486)
131 #define m_PENT (1U<<PROCESSOR_PENTIUM)
132 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
133 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
134 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
135 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
136 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
137 #define m_CORE2 (1U<<PROCESSOR_CORE2)
138 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
139 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
140 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
141 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
142 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
143 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
144 #define m_KNL (1U<<PROCESSOR_KNL)
145 #define m_KNM (1U<<PROCESSOR_KNM)
146 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
147 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
148 #define m_INTEL (1U<<PROCESSOR_INTEL)
150 #define m_GEODE (1U<<PROCESSOR_GEODE)
151 #define m_K6 (1U<<PROCESSOR_K6)
152 #define m_K6_GEODE (m_K6 | m_GEODE)
153 #define m_K8 (1U<<PROCESSOR_K8)
154 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
155 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
156 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
157 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
158 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
159 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
160 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
161 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
162 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
163 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
164 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
165 #define m_BTVER (m_BTVER1 | m_BTVER2)
166 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
167 | m_ZNVER1)
169 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
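/* These m_* masks are what the DEF_TUNE selectors in x86-tune.def are
   built from: a tuning flag is enabled for every processor whose bit is
   set in the selector, e.g. a selector of (m_CORE_ALL | m_GENERIC) turns
   the feature on for the Core-derived CPUs and the generic tuning.  */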
171 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
172 #undef DEF_TUNE
173 #define DEF_TUNE(tune, name, selector) name,
174 #include "x86-tune.def"
175 #undef DEF_TUNE
178 /* Feature tests against the various tunings. */
179 unsigned char ix86_tune_features[X86_TUNE_LAST];
181 /* Feature tests against the various tunings used to create ix86_tune_features
182 based on the processor mask. */
183 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
184 #undef DEF_TUNE
185 #define DEF_TUNE(tune, name, selector) selector,
186 #include "x86-tune.def"
187 #undef DEF_TUNE
190 /* Feature tests against the various architecture variations. */
191 unsigned char ix86_arch_features[X86_ARCH_LAST];
193 /* Feature tests against the various architecture variations, used to create
194 ix86_arch_features based on the processor mask. */
195 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
196 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
197 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
199 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
200 ~m_386,
202 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
203 ~(m_386 | m_486),
205 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
206 ~m_386,
208 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
209 ~m_386,
212 /* In case the average insn count for single function invocation is
213 lower than this constant, emit fast (but longer) prologue and
214 epilogue code. */
215 #define FAST_PROLOGUE_INSN_COUNT 20
217 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
218 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
219 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
220 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
222 /* Array of the smallest class containing reg number REGNO, indexed by
223 REGNO. Used by REGNO_REG_CLASS in i386.h. */
225 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
227 /* ax, dx, cx, bx */
228 AREG, DREG, CREG, BREG,
229 /* si, di, bp, sp */
230 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
231 /* FP registers */
232 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
233 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
234 /* arg pointer */
235 NON_Q_REGS,
236 /* flags, fpsr, fpcr, frame */
237 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
238 /* SSE registers */
239 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
240 SSE_REGS, SSE_REGS,
241 /* MMX registers */
242 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
243 MMX_REGS, MMX_REGS,
244 /* REX registers */
245 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
246 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
247 /* SSE REX registers */
248 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
249 SSE_REGS, SSE_REGS,
250 /* AVX-512 SSE registers */
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 /* Mask registers. */
256 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
257 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
258 /* MPX bound registers */
259 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
262 /* The "default" register map used in 32bit mode. */
264 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
266 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
267 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
268 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
269 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
270 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
272 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
273 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
274 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
275 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
276 101, 102, 103, 104, /* bound registers */
279 /* The "default" register map used in 64bit mode. */
281 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
283 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
284 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
285 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
286 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
287 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
288 8,9,10,11,12,13,14,15, /* extended integer registers */
289 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
290 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
291 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
292 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
293 126, 127, 128, 129, /* bound registers */
296 /* Define the register numbers to be used in Dwarf debugging information.
297 The SVR4 reference port C compiler uses the following register numbers
298 in its Dwarf output code:
299 0 for %eax (gcc regno = 0)
300 1 for %ecx (gcc regno = 2)
301 2 for %edx (gcc regno = 1)
302 3 for %ebx (gcc regno = 3)
303 4 for %esp (gcc regno = 7)
304 5 for %ebp (gcc regno = 6)
305 6 for %esi (gcc regno = 4)
306 7 for %edi (gcc regno = 5)
307 The following three DWARF register numbers are never generated by
308 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
309 believed these numbers have these meanings.
310 8 for %eip (no gcc equivalent)
311 9 for %eflags (gcc regno = 17)
312 10 for %trapno (no gcc equivalent)
313 It is not at all clear how we should number the FP stack registers
314 for the x86 architecture. If the version of SDB on x86/svr4 were
315 a bit less brain dead with respect to floating-point then we would
316 have a precedent to follow with respect to DWARF register numbers
317 for x86 FP registers, but the SDB on x86/svr4 was so completely
318 broken with respect to FP registers that it is hardly worth thinking
319 of it as something to strive for compatibility with.
320 The version of x86/svr4 SDB I had does (partially)
321 seem to believe that DWARF register number 11 is associated with
322 the x86 register %st(0), but that's about all. Higher DWARF
323 register numbers don't seem to be associated with anything in
324 particular, and even for DWARF regno 11, SDB only seemed to under-
325 stand that it should say that a variable lives in %st(0) (when
326 asked via an `=' command) if we said it was in DWARF regno 11,
327 but SDB still printed garbage when asked for the value of the
328 variable in question (via a `/' command).
329 (Also note that the labels SDB printed for various FP stack regs
330 when doing an `x' command were all wrong.)
331 Note that these problems generally don't affect the native SVR4
332 C compiler because it doesn't allow the use of -O with -g and
333 because when it is *not* optimizing, it allocates a memory
334 location for each floating-point variable, and the memory
335 location is what gets described in the DWARF AT_location
336 attribute for the variable in question.
337 Regardless of the severe mental illness of the x86/svr4 SDB, we
338 do something sensible here and we use the following DWARF
339 register numbers. Note that these are all stack-top-relative
340 numbers.
341 11 for %st(0) (gcc regno = 8)
342 12 for %st(1) (gcc regno = 9)
343 13 for %st(2) (gcc regno = 10)
344 14 for %st(3) (gcc regno = 11)
345 15 for %st(4) (gcc regno = 12)
346 16 for %st(5) (gcc regno = 13)
347 17 for %st(6) (gcc regno = 14)
348 18 for %st(7) (gcc regno = 15)
350 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
352 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
353 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
354 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
355 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
356 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
358 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
359 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
360 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
361 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
362 101, 102, 103, 104, /* bound registers */
365 /* Define parameter passing and return registers. */
367 static int const x86_64_int_parameter_registers[6] =
369 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
372 static int const x86_64_ms_abi_int_parameter_registers[4] =
374 CX_REG, DX_REG, R8_REG, R9_REG
377 static int const x86_64_int_return_registers[4] =
379 AX_REG, DX_REG, DI_REG, SI_REG
382 /* Additional registers that are clobbered by SYSV calls. */
384 #define NUM_X86_64_MS_CLOBBERED_REGS 12
385 static int const x86_64_ms_sysv_extra_clobbered_registers
386 [NUM_X86_64_MS_CLOBBERED_REGS] =
388 SI_REG, DI_REG,
389 XMM6_REG, XMM7_REG,
390 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
391 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
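/* These are the registers the Microsoft ABI treats as callee-saved but
   the System V ABI treats as call-clobbered (RSI, RDI and XMM6-XMM15),
   so an ms_abi function making a sysv_abi call has to preserve them
   itself; the out-of-line save/restore stubs below exist for that case.  */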
394 enum xlogue_stub {
395 XLOGUE_STUB_SAVE,
396 XLOGUE_STUB_RESTORE,
397 XLOGUE_STUB_RESTORE_TAIL,
398 XLOGUE_STUB_SAVE_HFP,
399 XLOGUE_STUB_RESTORE_HFP,
400 XLOGUE_STUB_RESTORE_HFP_TAIL,
402 XLOGUE_STUB_COUNT
405 enum xlogue_stub_sets {
406 XLOGUE_SET_ALIGNED,
407 XLOGUE_SET_ALIGNED_PLUS_8,
408 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
409 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
411 XLOGUE_SET_COUNT
414 /* Register save/restore layout used by out-of-line stubs. */
415 class xlogue_layout {
416 public:
417 struct reginfo
419 unsigned regno;
420 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
421 rsi) to where each register is stored. */
424 unsigned get_nregs () const {return m_nregs;}
425 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
427 const reginfo &get_reginfo (unsigned reg) const
429 gcc_assert (reg < m_nregs);
430 return m_regs[reg];
433 static const char *get_stub_name (enum xlogue_stub stub,
434 unsigned n_extra_args);
436 /* Returns an rtx for the stub's symbol based upon
437 1.) the specified stub (save, restore or restore_ret) and
438 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
 439    3.) whether or not stack alignment is being performed.  */
440 static rtx get_stub_rtx (enum xlogue_stub stub);
442 /* Returns the amount of stack space (including padding) that the stub
443 needs to store registers based upon data in the machine_function. */
444 HOST_WIDE_INT get_stack_space_used () const
446 const struct machine_function *m = cfun->machine;
447 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
449 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
450 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
453 /* Returns the offset for the base pointer used by the stub. */
454 HOST_WIDE_INT get_stub_ptr_offset () const
456 return STUB_INDEX_OFFSET + m_stack_align_off_in;
459 static const struct xlogue_layout &get_instance ();
460 static unsigned count_stub_managed_regs ();
461 static bool is_stub_managed_reg (unsigned regno, unsigned count);
463 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
464 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
465 static const unsigned MAX_REGS = 18;
466 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
467 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
468 static const unsigned STUB_NAME_MAX_LEN = 20;
469 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
470 static const unsigned REG_ORDER[MAX_REGS];
471 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
473 private:
474 xlogue_layout ();
475 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
476 xlogue_layout (const xlogue_layout &);
478 /* True if hard frame pointer is used. */
479 bool m_hfp;
 481   /* Maximum number of registers this layout manages.  */
482 unsigned m_nregs;
484 /* Incoming offset from 16-byte alignment. */
485 HOST_WIDE_INT m_stack_align_off_in;
487 /* Register order and offsets. */
488 struct reginfo m_regs[MAX_REGS];
490 /* Lazy-inited cache of symbol names for stubs. */
491 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
492 [STUB_NAME_MAX_LEN];
494 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
497 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
498 "savms64",
499 "resms64",
500 "resms64x",
501 "savms64f",
502 "resms64f",
503 "resms64fx"
506 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
507 /* The below offset values are where each register is stored for the layout
508 relative to incoming stack pointer. The value of each m_regs[].offset will
509 be relative to the incoming base pointer (rax or rsi) used by the stub.
511 s_instances: 0 1 2 3
512 Offset: realigned or aligned + 8
513 Register aligned aligned + 8 aligned w/HFP w/HFP */
514 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
515 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
516 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
517 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
518 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
519 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
520 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
521 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
522 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
523 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
524 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
525 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
526 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
527 BP_REG, /* 0xc0 0xc8 N/A N/A */
528 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
529 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
530 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
531 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
534 /* Instantiate static const values. */
535 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
536 const unsigned xlogue_layout::MIN_REGS;
537 const unsigned xlogue_layout::MAX_REGS;
538 const unsigned xlogue_layout::MAX_EXTRA_REGS;
539 const unsigned xlogue_layout::VARIANT_COUNT;
540 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
542 /* Initialize xlogue_layout::s_stub_names to zero. */
543 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
544 [STUB_NAME_MAX_LEN];
546 /* Instantiates all xlogue_layout instances. */
547 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
548 xlogue_layout (0, false),
549 xlogue_layout (8, false),
550 xlogue_layout (0, true),
551 xlogue_layout (8, true)
554 /* Return an appropriate const instance of xlogue_layout based upon values
555 in cfun->machine and crtl. */
556 const struct xlogue_layout &
557 xlogue_layout::get_instance ()
559 enum xlogue_stub_sets stub_set;
560 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
562 if (stack_realign_fp)
563 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
564 else if (frame_pointer_needed)
565 stub_set = aligned_plus_8
566 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
567 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
568 else
569 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
571 return s_instances[stub_set];
574 /* Determine how many clobbered registers can be saved by the stub.
575 Returns the count of registers the stub will save and restore. */
576 unsigned
577 xlogue_layout::count_stub_managed_regs ()
579 bool hfp = frame_pointer_needed || stack_realign_fp;
580 unsigned i, count;
581 unsigned regno;
583 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
585 regno = REG_ORDER[i];
586 if (regno == BP_REG && hfp)
587 continue;
588 if (!ix86_save_reg (regno, false, false))
589 break;
590 ++count;
592 return count;
595 /* Determine if register REGNO is a stub managed register given the
596 total COUNT of stub managed registers. */
597 bool
598 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
600 bool hfp = frame_pointer_needed || stack_realign_fp;
601 unsigned i;
603 for (i = 0; i < count; ++i)
605 gcc_assert (i < MAX_REGS);
606 if (REG_ORDER[i] == BP_REG && hfp)
607 ++count;
608 else if (REG_ORDER[i] == regno)
609 return true;
611 return false;
614 /* Constructor for xlogue_layout. */
615 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
616 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
617 m_stack_align_off_in (stack_align_off_in)
619 HOST_WIDE_INT offset = stack_align_off_in;
620 unsigned i, j;
622 for (i = j = 0; i < MAX_REGS; ++i)
624 unsigned regno = REG_ORDER[i];
626 if (regno == BP_REG && hfp)
627 continue;
628 if (SSE_REGNO_P (regno))
630 offset += 16;
631 /* Verify that SSE regs are always aligned. */
632 gcc_assert (!((stack_align_off_in + offset) & 15));
634 else
635 offset += 8;
637 m_regs[j].regno = regno;
638 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
640 gcc_assert (j == m_nregs);
643 const char *
644 xlogue_layout::get_stub_name (enum xlogue_stub stub,
645 unsigned n_extra_regs)
647 const int have_avx = TARGET_AVX;
648 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
650 /* Lazy init */
651 if (!*name)
653 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
654 (have_avx ? "avx" : "sse"),
655 STUB_BASE_NAMES[stub],
656 MIN_REGS + n_extra_regs);
657 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
660 return name;
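/* For example, with the constants above this produces names such as
   "__sse_savms64_12" or "__avx_resms64x_17"; the matching stubs are
   expected to be provided by libgcc.  */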
663 /* Return rtx of a symbol ref for the entry point (based upon
664 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
666 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
668 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
669 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
670 gcc_assert (stub < XLOGUE_STUB_COUNT);
671 gcc_assert (crtl->stack_realign_finalized);
673 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
676 /* Define the structure for the machine field in struct function. */
678 struct GTY(()) stack_local_entry {
679 unsigned short mode;
680 unsigned short n;
681 rtx rtl;
682 struct stack_local_entry *next;
685 /* Which cpu are we scheduling for. */
686 enum attr_cpu ix86_schedule;
688 /* Which cpu are we optimizing for. */
689 enum processor_type ix86_tune;
691 /* Which instruction set architecture to use. */
692 enum processor_type ix86_arch;
694 /* True if processor has SSE prefetch instruction. */
695 unsigned char x86_prefetch_sse;
697 /* -mstackrealign option */
698 static const char ix86_force_align_arg_pointer_string[]
699 = "force_align_arg_pointer";
701 static rtx (*ix86_gen_leave) (void);
702 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
703 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
704 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
705 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
706 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_clzero) (rtx);
709 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
711 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
716 /* Preferred alignment for stack boundary in bits. */
717 unsigned int ix86_preferred_stack_boundary;
719 /* Alignment for incoming stack boundary in bits specified at
720 command line. */
721 static unsigned int ix86_user_incoming_stack_boundary;
723 /* Default alignment for incoming stack boundary in bits. */
724 static unsigned int ix86_default_incoming_stack_boundary;
726 /* Alignment for incoming stack boundary in bits. */
727 unsigned int ix86_incoming_stack_boundary;
729 /* Calling abi specific va_list type nodes. */
730 static GTY(()) tree sysv_va_list_type_node;
731 static GTY(()) tree ms_va_list_type_node;
733 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
734 char internal_label_prefix[16];
735 int internal_label_prefix_len;
737 /* Fence to use after loop using movnt. */
738 tree x86_mfence;
740 /* Register class used for passing given 64bit part of the argument.
741 These represent classes as documented by the PS ABI, with the exception
742 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
743 use SF or DFmode move instead of DImode to avoid reformatting penalties.
745 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
746 whenever possible (upper half does contain padding). */
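/* As a concrete example of the psABI classification these values model:
   a struct { double d; long l; } passed by value has its first eightbyte
   classified as SSE (SSEDF here) and its second as INTEGER, so it travels
   in one XMM register and one general register.  */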
747 enum x86_64_reg_class
749 X86_64_NO_CLASS,
750 X86_64_INTEGER_CLASS,
751 X86_64_INTEGERSI_CLASS,
752 X86_64_SSE_CLASS,
753 X86_64_SSESF_CLASS,
754 X86_64_SSEDF_CLASS,
755 X86_64_SSEUP_CLASS,
756 X86_64_X87_CLASS,
757 X86_64_X87UP_CLASS,
758 X86_64_COMPLEX_X87_CLASS,
759 X86_64_MEMORY_CLASS
762 #define MAX_CLASSES 8
764 /* Table of constants used by fldpi, fldln2, etc.... */
765 static REAL_VALUE_TYPE ext_80387_constants_table [5];
766 static bool ext_80387_constants_init;
769 static struct machine_function * ix86_init_machine_status (void);
770 static rtx ix86_function_value (const_tree, const_tree, bool);
771 static bool ix86_function_value_regno_p (const unsigned int);
772 static unsigned int ix86_function_arg_boundary (machine_mode,
773 const_tree);
774 static rtx ix86_static_chain (const_tree, bool);
775 static int ix86_function_regparm (const_tree, const_tree);
776 static void ix86_compute_frame_layout (void);
777 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
778 rtx, rtx, int);
779 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
780 static tree ix86_canonical_va_list_type (tree);
781 static void predict_jump (int);
782 static unsigned int split_stack_prologue_scratch_regno (void);
783 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
785 enum ix86_function_specific_strings
787 IX86_FUNCTION_SPECIFIC_ARCH,
788 IX86_FUNCTION_SPECIFIC_TUNE,
789 IX86_FUNCTION_SPECIFIC_MAX
792 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
793 const char *, const char *, enum fpmath_unit,
794 bool);
795 static void ix86_function_specific_save (struct cl_target_option *,
796 struct gcc_options *opts);
797 static void ix86_function_specific_restore (struct gcc_options *opts,
798 struct cl_target_option *);
799 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
800 static void ix86_function_specific_print (FILE *, int,
801 struct cl_target_option *);
802 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
803 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
804 struct gcc_options *,
805 struct gcc_options *,
806 struct gcc_options *);
807 static bool ix86_can_inline_p (tree, tree);
808 static void ix86_set_current_function (tree);
809 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
811 static enum calling_abi ix86_function_abi (const_tree);
814 #ifndef SUBTARGET32_DEFAULT_CPU
815 #define SUBTARGET32_DEFAULT_CPU "i386"
816 #endif
818 /* Whether -mtune= or -march= were specified */
819 static int ix86_tune_defaulted;
820 static int ix86_arch_specified;
822 /* Vectorization library interface and handlers. */
823 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
825 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
826 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
828 /* Processor target table, indexed by processor number */
829 struct ptt
831 const char *const name; /* processor name */
832 const struct processor_costs *cost; /* Processor costs */
833 const int align_loop; /* Default alignments. */
834 const int align_loop_max_skip;
835 const int align_jump;
836 const int align_jump_max_skip;
837 const int align_func;
840 /* This table must be in sync with enum processor_type in i386.h. */
841 static const struct ptt processor_target_table[PROCESSOR_max] =
843 {"generic", &generic_cost, 16, 10, 16, 10, 16},
844 {"i386", &i386_cost, 4, 3, 4, 3, 4},
845 {"i486", &i486_cost, 16, 15, 16, 15, 16},
846 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
847 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
848 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
849 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
850 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
851 {"core2", &core_cost, 16, 10, 16, 10, 16},
852 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
853 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
854 {"haswell", &core_cost, 16, 10, 16, 10, 16},
855 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
856 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
857 {"knl", &slm_cost, 16, 15, 16, 7, 16},
858 {"knm", &slm_cost, 16, 15, 16, 7, 16},
859 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
860 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
861 {"intel", &intel_cost, 16, 15, 16, 7, 16},
862 {"geode", &geode_cost, 0, 0, 0, 0, 0},
863 {"k6", &k6_cost, 32, 7, 32, 7, 32},
864 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
865 {"k8", &k8_cost, 16, 7, 16, 7, 16},
866 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
867 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
868 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
869 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
870 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
871 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
872 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
873 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
876 static unsigned int
877 rest_of_handle_insert_vzeroupper (void)
879 int i;
881 /* vzeroupper instructions are inserted immediately after reload to
882 account for possible spills from 256bit or 512bit registers. The pass
 883      reuses the mode switching infrastructure by re-running the mode
 884      insertion pass, so disable entities that have already been processed.  */
885 for (i = 0; i < MAX_386_ENTITIES; i++)
886 ix86_optimize_mode_switching[i] = 0;
888 ix86_optimize_mode_switching[AVX_U128] = 1;
890 /* Call optimize_mode_switching. */
891 g->get_passes ()->execute_pass_mode_switching ();
892 return 0;
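/* The vzeroupper itself is what avoids the SSE/AVX transition penalty:
   once a 256-bit or 512-bit register has been written, executing legacy
   SSE code while the upper halves are dirty is expensive on most
   AVX-capable CPUs, so the pass inserts vzeroupper before such
   transitions.  */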
895 /* Return 1 if INSN uses or defines a hard register.
896 Hard register uses in a memory address are ignored.
897 Clobbers and flags definitions are ignored. */
899 static bool
900 has_non_address_hard_reg (rtx_insn *insn)
902 df_ref ref;
903 FOR_EACH_INSN_DEF (ref, insn)
904 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
905 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
906 && DF_REF_REGNO (ref) != FLAGS_REG)
907 return true;
909 FOR_EACH_INSN_USE (ref, insn)
910 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
911 return true;
913 return false;
916 /* Check if comparison INSN may be transformed
917 into vector comparison. Currently we transform
918 zero checks only which look like:
920 (set (reg:CCZ 17 flags)
921 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
922 (subreg:SI (reg:DI x) 0))
923 (const_int 0 [0]))) */
925 static bool
926 convertible_comparison_p (rtx_insn *insn)
928 if (!TARGET_SSE4_1)
929 return false;
931 rtx def_set = single_set (insn);
933 gcc_assert (def_set);
935 rtx src = SET_SRC (def_set);
936 rtx dst = SET_DEST (def_set);
938 gcc_assert (GET_CODE (src) == COMPARE);
940 if (GET_CODE (dst) != REG
941 || REGNO (dst) != FLAGS_REG
942 || GET_MODE (dst) != CCZmode)
943 return false;
945 rtx op1 = XEXP (src, 0);
946 rtx op2 = XEXP (src, 1);
948 if (op2 != CONST0_RTX (GET_MODE (op2)))
949 return false;
951 if (GET_CODE (op1) != IOR)
952 return false;
954 op2 = XEXP (op1, 1);
955 op1 = XEXP (op1, 0);
957 if (!SUBREG_P (op1)
958 || !SUBREG_P (op2)
959 || GET_MODE (op1) != SImode
960 || GET_MODE (op2) != SImode
961 || ((SUBREG_BYTE (op1) != 0
962 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
963 && (SUBREG_BYTE (op2) != 0
964 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
965 return false;
967 op1 = SUBREG_REG (op1);
968 op2 = SUBREG_REG (op2);
970 if (op1 != op2
971 || !REG_P (op1)
972 || GET_MODE (op1) != DImode)
973 return false;
975 return true;
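/* When such a zero check is converted, the OR of the two SImode halves
   can be replaced by a single 128-bit test of the DImode value, which is
   why the conversion is gated on TARGET_SSE4_1 (ptest) above.  */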
978 /* The DImode version of scalar_to_vector_candidate_p. */
980 static bool
981 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
983 rtx def_set = single_set (insn);
985 if (!def_set)
986 return false;
988 if (has_non_address_hard_reg (insn))
989 return false;
991 rtx src = SET_SRC (def_set);
992 rtx dst = SET_DEST (def_set);
994 if (GET_CODE (src) == COMPARE)
995 return convertible_comparison_p (insn);
997 /* We are interested in DImode promotion only. */
998 if ((GET_MODE (src) != DImode
999 && !CONST_INT_P (src))
1000 || GET_MODE (dst) != DImode)
1001 return false;
1003 if (!REG_P (dst) && !MEM_P (dst))
1004 return false;
1006 switch (GET_CODE (src))
1008 case ASHIFTRT:
1009 if (!TARGET_AVX512VL)
1010 return false;
1011 /* FALLTHRU */
1013 case ASHIFT:
1014 case LSHIFTRT:
1015 if (!REG_P (XEXP (src, 1))
1016 && (!SUBREG_P (XEXP (src, 1))
1017 || SUBREG_BYTE (XEXP (src, 1)) != 0
1018 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1019 && (!CONST_INT_P (XEXP (src, 1))
1020 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1021 return false;
1023 if (GET_MODE (XEXP (src, 1)) != QImode
1024 && !CONST_INT_P (XEXP (src, 1)))
1025 return false;
1026 break;
1028 case PLUS:
1029 case MINUS:
1030 case IOR:
1031 case XOR:
1032 case AND:
1033 if (!REG_P (XEXP (src, 1))
1034 && !MEM_P (XEXP (src, 1))
1035 && !CONST_INT_P (XEXP (src, 1)))
1036 return false;
1038 if (GET_MODE (XEXP (src, 1)) != DImode
1039 && !CONST_INT_P (XEXP (src, 1)))
1040 return false;
1041 break;
1043 case NEG:
1044 case NOT:
1045 break;
1047 case REG:
1048 return true;
1050 case MEM:
1051 case CONST_INT:
1052 return REG_P (dst);
1054 default:
1055 return false;
1058 if (!REG_P (XEXP (src, 0))
1059 && !MEM_P (XEXP (src, 0))
1060 && !CONST_INT_P (XEXP (src, 0))
1061 /* Check for andnot case. */
1062 && (GET_CODE (src) != AND
1063 || GET_CODE (XEXP (src, 0)) != NOT
1064 || !REG_P (XEXP (XEXP (src, 0), 0))))
1065 return false;
1067 if (GET_MODE (XEXP (src, 0)) != DImode
1068 && !CONST_INT_P (XEXP (src, 0)))
1069 return false;
1071 return true;
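/* The point of the DImode chains on !TARGET_64BIT: a candidate like

     (set (reg:DI x) (and:DI (reg:DI y) (mem:DI ...)))

   would otherwise be lowered to two SImode instructions, while after
   conversion it becomes a single V2DImode logic operation on an SSE
   register.  */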
1074 /* The TImode version of scalar_to_vector_candidate_p. */
1076 static bool
1077 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1079 rtx def_set = single_set (insn);
1081 if (!def_set)
1082 return false;
1084 if (has_non_address_hard_reg (insn))
1085 return false;
1087 rtx src = SET_SRC (def_set);
1088 rtx dst = SET_DEST (def_set);
1090 /* Only TImode load and store are allowed. */
1091 if (GET_MODE (dst) != TImode)
1092 return false;
1094 if (MEM_P (dst))
1096       /* Check for store.  The memory must be aligned, or an unaligned
1097	  store must be optimal.  Only support stores from a register, a
1098	  standard SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
1100	  ??? Verify performance impact before enabling CONST_INT for
1101	  __int128 store.  */
1102 if (misaligned_operand (dst, TImode)
1103 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1104 return false;
1106 switch (GET_CODE (src))
1108 default:
1109 return false;
1111 case REG:
1112 case CONST_WIDE_INT:
1113 return true;
1115 case CONST_INT:
1116 return standard_sse_constant_p (src, TImode);
1119 else if (MEM_P (src))
1121       /* Check for load.  The memory must be aligned, or an unaligned load
1122	  must be optimal.  */
1123 return (REG_P (dst)
1124 && (!misaligned_operand (src, TImode)
1125 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1128 return false;
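/* In other words, the TImode chains only cover __int128 copies: an
   aligned 128-bit load or store that would otherwise need two 64-bit
   moves can be done with a single V1TImode SSE move instead.  */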
1131 /* Return true if INSN may be converted into a vector
1132    instruction.  */
1134 static bool
1135 scalar_to_vector_candidate_p (rtx_insn *insn)
1137 if (TARGET_64BIT)
1138 return timode_scalar_to_vector_candidate_p (insn);
1139 else
1140 return dimode_scalar_to_vector_candidate_p (insn);
1143 /* The DImode version of remove_non_convertible_regs. */
1145 static void
1146 dimode_remove_non_convertible_regs (bitmap candidates)
1148 bitmap_iterator bi;
1149 unsigned id;
1150 bitmap regs = BITMAP_ALLOC (NULL);
1152 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1154 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1155 rtx reg = SET_DEST (def_set);
1157 if (!REG_P (reg)
1158 || bitmap_bit_p (regs, REGNO (reg))
1159 || HARD_REGISTER_P (reg))
1160 continue;
1162 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1163 def;
1164 def = DF_REF_NEXT_REG (def))
1166 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1168 if (dump_file)
1169 fprintf (dump_file,
1170 "r%d has non convertible definition in insn %d\n",
1171 REGNO (reg), DF_REF_INSN_UID (def));
1173 bitmap_set_bit (regs, REGNO (reg));
1174 break;
1179 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1181 for (df_ref def = DF_REG_DEF_CHAIN (id);
1182 def;
1183 def = DF_REF_NEXT_REG (def))
1184 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1186 if (dump_file)
1187 fprintf (dump_file, "Removing insn %d from candidates list\n",
1188 DF_REF_INSN_UID (def));
1190 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1194 BITMAP_FREE (regs);
1197 /* For a register REGNO, scan instructions for its defs and uses.
1198 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1200 static void
1201 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1202 unsigned int regno)
1204 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1205 def;
1206 def = DF_REF_NEXT_REG (def))
1208 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1210 if (dump_file)
1211 fprintf (dump_file,
1212 "r%d has non convertible def in insn %d\n",
1213 regno, DF_REF_INSN_UID (def));
1215 bitmap_set_bit (regs, regno);
1216 break;
1220 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1221 ref;
1222 ref = DF_REF_NEXT_REG (ref))
1224 /* Debug instructions are skipped. */
1225 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1226 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1228 if (dump_file)
1229 fprintf (dump_file,
1230 "r%d has non convertible use in insn %d\n",
1231 regno, DF_REF_INSN_UID (ref));
1233 bitmap_set_bit (regs, regno);
1234 break;
1239 /* The TImode version of remove_non_convertible_regs. */
1241 static void
1242 timode_remove_non_convertible_regs (bitmap candidates)
1244 bitmap_iterator bi;
1245 unsigned id;
1246 bitmap regs = BITMAP_ALLOC (NULL);
1248 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1250 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1251 rtx dest = SET_DEST (def_set);
1252 rtx src = SET_SRC (def_set);
1254 if ((!REG_P (dest)
1255 || bitmap_bit_p (regs, REGNO (dest))
1256 || HARD_REGISTER_P (dest))
1257 && (!REG_P (src)
1258 || bitmap_bit_p (regs, REGNO (src))
1259 || HARD_REGISTER_P (src)))
1260 continue;
1262 if (REG_P (dest))
1263 timode_check_non_convertible_regs (candidates, regs,
1264 REGNO (dest));
1266 if (REG_P (src))
1267 timode_check_non_convertible_regs (candidates, regs,
1268 REGNO (src));
1271 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1273 for (df_ref def = DF_REG_DEF_CHAIN (id);
1274 def;
1275 def = DF_REF_NEXT_REG (def))
1276 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1278 if (dump_file)
1279 fprintf (dump_file, "Removing insn %d from candidates list\n",
1280 DF_REF_INSN_UID (def));
1282 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1285 for (df_ref ref = DF_REG_USE_CHAIN (id);
1286 ref;
1287 ref = DF_REF_NEXT_REG (ref))
1288 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1290 if (dump_file)
1291 fprintf (dump_file, "Removing insn %d from candidates list\n",
1292 DF_REF_INSN_UID (ref));
1294 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1298 BITMAP_FREE (regs);
1301 /* For a given bitmap of insn UIDs, scan all instructions and
1302    remove an insn from CANDIDATES if it has both convertible
1303    and non-convertible definitions.
1305 All insns in a bitmap are conversion candidates according to
1306 scalar_to_vector_candidate_p. Currently it implies all insns
1307 are single_set. */
1309 static void
1310 remove_non_convertible_regs (bitmap candidates)
1312 if (TARGET_64BIT)
1313 timode_remove_non_convertible_regs (candidates);
1314 else
1315 dimode_remove_non_convertible_regs (candidates);
1318 class scalar_chain
1320 public:
1321 scalar_chain ();
1322 virtual ~scalar_chain ();
1324 static unsigned max_id;
1326 /* ID of a chain. */
1327 unsigned int chain_id;
1328 /* A queue of instructions to be included into a chain. */
1329 bitmap queue;
1330 /* Instructions included into a chain. */
1331 bitmap insns;
1332 /* All registers defined by a chain. */
1333 bitmap defs;
1334   /* Registers used in both vector and scalar modes.  */
1335 bitmap defs_conv;
1337 void build (bitmap candidates, unsigned insn_uid);
1338 virtual int compute_convert_gain () = 0;
1339 int convert ();
1341 protected:
1342 void add_to_queue (unsigned insn_uid);
1343 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1345 private:
1346 void add_insn (bitmap candidates, unsigned insn_uid);
1347 void analyze_register_chain (bitmap candidates, df_ref ref);
1348 virtual void mark_dual_mode_def (df_ref def) = 0;
1349 virtual void convert_insn (rtx_insn *insn) = 0;
1350 virtual void convert_registers () = 0;
1353 class dimode_scalar_chain : public scalar_chain
1355 public:
1356 int compute_convert_gain ();
1357 private:
1358 void mark_dual_mode_def (df_ref def);
1359 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1360 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1361 void convert_insn (rtx_insn *insn);
1362 void convert_op (rtx *op, rtx_insn *insn);
1363 void convert_reg (unsigned regno);
1364 void make_vector_copies (unsigned regno);
1365 void convert_registers ();
1366 int vector_const_cost (rtx exp);
1369 class timode_scalar_chain : public scalar_chain
1371 public:
1372   /* Converting from TImode to V1TImode is always faster.  */
1373 int compute_convert_gain () { return 1; }
1375 private:
1376 void mark_dual_mode_def (df_ref def);
1377 void fix_debug_reg_uses (rtx reg);
1378 void convert_insn (rtx_insn *insn);
1379   /* We don't convert registers to a different size.  */
1380 void convert_registers () {}
1383 unsigned scalar_chain::max_id = 0;
1385 /* Initialize new chain. */
1387 scalar_chain::scalar_chain ()
1389 chain_id = ++max_id;
1391 if (dump_file)
1392 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1394 bitmap_obstack_initialize (NULL);
1395 insns = BITMAP_ALLOC (NULL);
1396 defs = BITMAP_ALLOC (NULL);
1397 defs_conv = BITMAP_ALLOC (NULL);
1398 queue = NULL;
1401 /* Free chain's data. */
1403 scalar_chain::~scalar_chain ()
1405 BITMAP_FREE (insns);
1406 BITMAP_FREE (defs);
1407 BITMAP_FREE (defs_conv);
1408 bitmap_obstack_release (NULL);
1411 /* Add instruction into chains' queue. */
1413 void
1414 scalar_chain::add_to_queue (unsigned insn_uid)
1416 if (bitmap_bit_p (insns, insn_uid)
1417 || bitmap_bit_p (queue, insn_uid))
1418 return;
1420 if (dump_file)
1421 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1422 insn_uid, chain_id);
1423 bitmap_set_bit (queue, insn_uid);
1426 /* For DImode conversion, mark register defined by DEF as requiring
1427 conversion. */
1429 void
1430 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1432 gcc_assert (DF_REF_REG_DEF_P (def));
1434 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1435 return;
1437 if (dump_file)
1438 fprintf (dump_file,
1439 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1440 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1442 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1445 /* For TImode conversion, it is unused. */
1447 void
1448 timode_scalar_chain::mark_dual_mode_def (df_ref)
1450 gcc_unreachable ();
1453 /* Check REF's chain to add new insns into a queue
1454 and find registers requiring conversion. */
1456 void
1457 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1459 df_link *chain;
1461 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1462 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1463 add_to_queue (DF_REF_INSN_UID (ref));
1465 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1467 unsigned uid = DF_REF_INSN_UID (chain->ref);
1469 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1470 continue;
1472 if (!DF_REF_REG_MEM_P (chain->ref))
1474 if (bitmap_bit_p (insns, uid))
1475 continue;
1477 if (bitmap_bit_p (candidates, uid))
1479 add_to_queue (uid);
1480 continue;
1484 if (DF_REF_REG_DEF_P (chain->ref))
1486 if (dump_file)
1487 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1488 DF_REF_REGNO (chain->ref), uid);
1489 mark_dual_mode_def (chain->ref);
1491 else
1493 if (dump_file)
1494 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1495 DF_REF_REGNO (chain->ref), uid);
1496 mark_dual_mode_def (ref);
1501 /* Add instruction into a chain. */
1503 void
1504 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1506 if (bitmap_bit_p (insns, insn_uid))
1507 return;
1509 if (dump_file)
1510 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1512 bitmap_set_bit (insns, insn_uid);
1514 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1515 rtx def_set = single_set (insn);
1516 if (def_set && REG_P (SET_DEST (def_set))
1517 && !HARD_REGISTER_P (SET_DEST (def_set)))
1518 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1520 df_ref ref;
1521 df_ref def;
1522 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1523 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1524 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1525 def;
1526 def = DF_REF_NEXT_REG (def))
1527 analyze_register_chain (candidates, def);
1528 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1529 if (!DF_REF_REG_MEM_P (ref))
1530 analyze_register_chain (candidates, ref);
1533 /* Build new chain starting from insn INSN_UID recursively
1534 adding all dependent uses and definitions. */
1536 void
1537 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1539 queue = BITMAP_ALLOC (NULL);
1540 bitmap_set_bit (queue, insn_uid);
1542 if (dump_file)
1543 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1545 while (!bitmap_empty_p (queue))
1547 insn_uid = bitmap_first_set_bit (queue);
1548 bitmap_clear_bit (queue, insn_uid);
1549 bitmap_clear_bit (candidates, insn_uid);
1550 add_insn (candidates, insn_uid);
1553 if (dump_file)
1555 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1556 fprintf (dump_file, " insns: ");
1557 dump_bitmap (dump_file, insns);
1558 if (!bitmap_empty_p (defs_conv))
1560 bitmap_iterator bi;
1561 unsigned id;
1562 const char *comma = "";
1563 fprintf (dump_file, " defs to convert: ");
1564 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1566 fprintf (dump_file, "%sr%d", comma, id);
1567 comma = ", ";
1569 fprintf (dump_file, "\n");
1573 BITMAP_FREE (queue);
1576 /* Return the cost of building a vector constant
1577    instead of using a scalar one.  */
1580 dimode_scalar_chain::vector_const_cost (rtx exp)
1582 gcc_assert (CONST_INT_P (exp));
1584 if (standard_sse_constant_p (exp, V2DImode))
1585 return COSTS_N_INSNS (1);
1586 return ix86_cost->sse_load[1];
1589 /* Compute a gain for chain conversion. */
1592 dimode_scalar_chain::compute_convert_gain ()
1594 bitmap_iterator bi;
1595 unsigned insn_uid;
1596 int gain = 0;
1597 int cost = 0;
1599 if (dump_file)
1600 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1602 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1604 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1605 rtx def_set = single_set (insn);
1606 rtx src = SET_SRC (def_set);
1607 rtx dst = SET_DEST (def_set);
1609 if (REG_P (src) && REG_P (dst))
1610 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1611 else if (REG_P (src) && MEM_P (dst))
1612 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1613 else if (MEM_P (src) && REG_P (dst))
1614 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1615 else if (GET_CODE (src) == ASHIFT
1616 || GET_CODE (src) == ASHIFTRT
1617 || GET_CODE (src) == LSHIFTRT)
1619 if (CONST_INT_P (XEXP (src, 0)))
1620 gain -= vector_const_cost (XEXP (src, 0));
1621 if (CONST_INT_P (XEXP (src, 1)))
1623 gain += ix86_cost->shift_const;
1624 if (INTVAL (XEXP (src, 1)) >= 32)
1625 gain -= COSTS_N_INSNS (1);
1627 else
1628 /* Additional gain for omitting two CMOVs. */
1629 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1631 else if (GET_CODE (src) == PLUS
1632 || GET_CODE (src) == MINUS
1633 || GET_CODE (src) == IOR
1634 || GET_CODE (src) == XOR
1635 || GET_CODE (src) == AND)
1637 gain += ix86_cost->add;
1638 /* Additional gain for andnot for targets without BMI. */
1639 if (GET_CODE (XEXP (src, 0)) == NOT
1640 && !TARGET_BMI)
1641 gain += 2 * ix86_cost->add;
1643 if (CONST_INT_P (XEXP (src, 0)))
1644 gain -= vector_const_cost (XEXP (src, 0));
1645 if (CONST_INT_P (XEXP (src, 1)))
1646 gain -= vector_const_cost (XEXP (src, 1));
1648 else if (GET_CODE (src) == NEG
1649 || GET_CODE (src) == NOT)
1650 gain += ix86_cost->add - COSTS_N_INSNS (1);
1651 else if (GET_CODE (src) == COMPARE)
1653 /* Assume comparison cost is the same. */
1655 else if (CONST_INT_P (src))
1657 if (REG_P (dst))
1658 gain += COSTS_N_INSNS (2);
1659 else if (MEM_P (dst))
1660 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1661 gain -= vector_const_cost (src);
1663 else
1664 gcc_unreachable ();
1667 if (dump_file)
1668 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1670 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1671 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1673 if (dump_file)
1674 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1676 gain -= cost;
1678 if (dump_file)
1679 fprintf (dump_file, " Total gain: %d\n", gain);
1681 return gain;
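/* The caller is only expected to convert a chain when the gain computed
   above is positive; a zero or negative gain means the scalar form is at
   least as cheap and the chain is left alone.  */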
1684 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1687 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1689 if (x == reg)
1690 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1692 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1693 int i, j;
1694 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1696 if (fmt[i] == 'e')
1697 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1698 else if (fmt[i] == 'E')
1699 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1700 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1701 reg, new_reg);
1704 return x;
1707 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1709 void
1710 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1711 rtx reg, rtx new_reg)
1713 replace_with_subreg (single_set (insn), reg, new_reg);
1716 /* Insert generated conversion instruction sequence INSNS
1717 after instruction AFTER. New BB may be required in case
1718 instruction has EH region attached. */
1720 void
1721 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1723 if (!control_flow_insn_p (after))
1725 emit_insn_after (insns, after);
1726 return;
1729 basic_block bb = BLOCK_FOR_INSN (after);
1730 edge e = find_fallthru_edge (bb->succs);
1731 gcc_assert (e);
1733 basic_block new_bb = split_edge (e);
1734 emit_insn_after (insns, BB_HEAD (new_bb));
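/* Splitting the fallthru edge keeps things correct when AFTER can throw:
   the conversion sequence then executes at the start of the new block on
   the normal path only, instead of being attached after a control flow
   insn.  */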
1737 /* Make vector copies for all register REGNO definitions
1738 and replace its uses in a chain. */
1740 void
1741 dimode_scalar_chain::make_vector_copies (unsigned regno)
1743 rtx reg = regno_reg_rtx[regno];
1744 rtx vreg = gen_reg_rtx (DImode);
1745 bool count_reg = false;
1746 df_ref ref;
1748 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1749 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1751 df_ref use;
1753 /* Detect the count register of a shift instruction. */
1754 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1755 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1757 rtx_insn *insn = DF_REF_INSN (use);
1758 rtx def_set = single_set (insn);
1760 gcc_assert (def_set);
1762 rtx src = SET_SRC (def_set);
1764 if ((GET_CODE (src) == ASHIFT
1765 || GET_CODE (src) == ASHIFTRT
1766 || GET_CODE (src) == LSHIFTRT)
1767 && !CONST_INT_P (XEXP (src, 1))
1768 && reg_or_subregno (XEXP (src, 1)) == regno)
1769 count_reg = true;
1772 start_sequence ();
1773 if (count_reg)
1775 rtx qreg = gen_lowpart (QImode, reg);
1776 rtx tmp = gen_reg_rtx (SImode);
1778 if (TARGET_ZERO_EXTEND_WITH_AND
1779 && optimize_function_for_speed_p (cfun))
1781 emit_move_insn (tmp, const0_rtx);
1782 emit_insn (gen_movstrictqi
1783 (gen_lowpart (QImode, tmp), qreg));
1785 else
1786 emit_insn (gen_rtx_SET
1787 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1789 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1791 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1792 emit_move_insn (slot, tmp);
1793 tmp = copy_rtx (slot);
1796 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1798 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1800 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1801 emit_move_insn (adjust_address (tmp, SImode, 0),
1802 gen_rtx_SUBREG (SImode, reg, 0));
1803 emit_move_insn (adjust_address (tmp, SImode, 4),
1804 gen_rtx_SUBREG (SImode, reg, 4));
1805 emit_move_insn (vreg, tmp);
1807 else if (TARGET_SSE4_1)
1809 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1810 CONST0_RTX (V4SImode),
1811 gen_rtx_SUBREG (SImode, reg, 0)));
1812 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 gen_rtx_SUBREG (V4SImode, vreg, 0),
1814 gen_rtx_SUBREG (SImode, reg, 4),
1815 GEN_INT (2)));
1817 else
1819 rtx tmp = gen_reg_rtx (DImode);
1820 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1821 CONST0_RTX (V4SImode),
1822 gen_rtx_SUBREG (SImode, reg, 0)));
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 4)));
1826 emit_insn (gen_vec_interleave_lowv4si
1827 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1828 gen_rtx_SUBREG (V4SImode, vreg, 0),
1829 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1831 rtx_insn *seq = get_insns ();
1832 end_sequence ();
1833 rtx_insn *insn = DF_REF_INSN (ref);
1834 emit_conversion_insns (seq, insn);
1836 if (dump_file)
1837 fprintf (dump_file,
1838 " Copied r%d to a vector register r%d for insn %d\n",
1839 regno, REGNO (vreg), INSN_UID (insn));
1842 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1843 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1845 rtx_insn *insn = DF_REF_INSN (ref);
1846 if (count_reg)
1848 rtx def_set = single_set (insn);
1849 gcc_assert (def_set);
1851 rtx src = SET_SRC (def_set);
1853 if ((GET_CODE (src) == ASHIFT
1854 || GET_CODE (src) == ASHIFTRT
1855 || GET_CODE (src) == LSHIFTRT)
1856 && !CONST_INT_P (XEXP (src, 1))
1857 && reg_or_subregno (XEXP (src, 1)) == regno)
1858 XEXP (src, 1) = vreg;
1860 else
1861 replace_with_subreg_in_insn (insn, reg, vreg);
1863 if (dump_file)
1864 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1865 regno, REGNO (vreg), INSN_UID (insn));
1869 /* Convert all definitions of register REGNO
1870 and fix its uses. Scalar copies may be created
1871    when the register is used in a non-convertible insn.  */
1873 void
1874 dimode_scalar_chain::convert_reg (unsigned regno)
1876 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1877 rtx reg = regno_reg_rtx[regno];
1878 rtx scopy = NULL_RTX;
1879 df_ref ref;
1880 bitmap conv;
1882 conv = BITMAP_ALLOC (NULL);
1883 bitmap_copy (conv, insns);
1885 if (scalar_copy)
1886 scopy = gen_reg_rtx (DImode);
1888 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1890 rtx_insn *insn = DF_REF_INSN (ref);
1891 rtx def_set = single_set (insn);
1892 rtx src = SET_SRC (def_set);
1893 rtx reg = DF_REF_REG (ref);
1895 if (!MEM_P (src))
1897 replace_with_subreg_in_insn (insn, reg, reg);
1898 bitmap_clear_bit (conv, INSN_UID (insn));
1901 if (scalar_copy)
1903 start_sequence ();
1904 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1906 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1907 emit_move_insn (tmp, reg);
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1909 adjust_address (tmp, SImode, 0));
1910 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1911 adjust_address (tmp, SImode, 4));
1913 else if (TARGET_SSE4_1)
1915 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1916 emit_insn
1917 (gen_rtx_SET
1918 (gen_rtx_SUBREG (SImode, scopy, 0),
1919 gen_rtx_VEC_SELECT (SImode,
1920 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1922 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1923 emit_insn
1924 (gen_rtx_SET
1925 (gen_rtx_SUBREG (SImode, scopy, 4),
1926 gen_rtx_VEC_SELECT (SImode,
1927 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1929 else
1931 rtx vcopy = gen_reg_rtx (V2DImode);
1932 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1933 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1934 gen_rtx_SUBREG (SImode, vcopy, 0));
1935 emit_move_insn (vcopy,
1936 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1937 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1938 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 rtx_insn *seq = get_insns ();
1941 end_sequence ();
1942 emit_conversion_insns (seq, insn);
1944 if (dump_file)
1945 fprintf (dump_file,
1946 " Copied r%d to a scalar register r%d for insn %d\n",
1947 regno, REGNO (scopy), INSN_UID (insn));
1951 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1952 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1954 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1956 rtx_insn *insn = DF_REF_INSN (ref);
1958 rtx def_set = single_set (insn);
1959 gcc_assert (def_set);
1961 rtx src = SET_SRC (def_set);
1962 rtx dst = SET_DEST (def_set);
1964 if ((GET_CODE (src) == ASHIFT
1965 || GET_CODE (src) == ASHIFTRT
1966 || GET_CODE (src) == LSHIFTRT)
1967 && !CONST_INT_P (XEXP (src, 1))
1968 && reg_or_subregno (XEXP (src, 1)) == regno)
1970 rtx tmp2 = gen_reg_rtx (V2DImode);
1972 start_sequence ();
1974 if (TARGET_SSE4_1)
1975 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1976 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1977 else
1979 rtx vec_cst
1980 = gen_rtx_CONST_VECTOR (V2DImode,
1981 gen_rtvec (2, GEN_INT (0xff),
1982 const0_rtx));
1983 vec_cst
1984 = validize_mem (force_const_mem (V2DImode, vec_cst));
1986 emit_insn (gen_rtx_SET
1987 (tmp2,
1988 gen_rtx_AND (V2DImode,
1989 gen_rtx_SUBREG (V2DImode, reg, 0),
1990 vec_cst)));
1992 rtx_insn *seq = get_insns ();
1993 end_sequence ();
1995 emit_insn_before (seq, insn);
1997 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1999 else if (!MEM_P (dst) || !REG_P (src))
2000 replace_with_subreg_in_insn (insn, reg, reg);
2002 bitmap_clear_bit (conv, INSN_UID (insn));
2005 /* Skip debug insns and uninitialized uses. */
2006 else if (DF_REF_CHAIN (ref)
2007 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2009 gcc_assert (scopy);
2010 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2011 df_insn_rescan (DF_REF_INSN (ref));
2014 BITMAP_FREE (conv);
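/* An illustrative sketch, not part of the original sources: when a use of
   REGNO remains in a non-convertible insn, the code above extracts a DImode
   copy SCOPY from the vector register.  Without inter-unit moves from vector
   registers this goes through a stack slot, roughly

     movq %xmm0, (%esp)   # spill the 64-bit value
     movl (%esp), %eax    # reload the low half
     movl 4(%esp), %edx   # reload the high half

   while with SSE4.1 the halves are extracted with the VEC_SELECT patterns
   above (typically matching pextrd).  The exact assembly depends on register
   allocation and frame layout.  */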
2017 /* Convert operand OP in INSN. We should handle
2018 memory operands and uninitialized registers.
2019 All other register uses are converted during
2020 register conversion. */
2022 void
2023 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2025 *op = copy_rtx_if_shared (*op);
2027 if (GET_CODE (*op) == NOT)
2029 convert_op (&XEXP (*op, 0), insn);
2030 PUT_MODE (*op, V2DImode);
2032 else if (MEM_P (*op))
2034 rtx tmp = gen_reg_rtx (DImode);
2036 emit_insn_before (gen_move_insn (tmp, *op), insn);
2037 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2039 if (dump_file)
2040 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2041 INSN_UID (insn), REGNO (tmp));
2043 else if (REG_P (*op))
2045 /* We may not have converted the register use in case
2046 this register has no definition. Otherwise it
2047 should have been converted in convert_reg. */
2048 df_ref ref;
2049 FOR_EACH_INSN_USE (ref, insn)
2050 if (DF_REF_REGNO (ref) == REGNO (*op))
2052 gcc_assert (!DF_REF_CHAIN (ref));
2053 break;
2055 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2057 else if (CONST_INT_P (*op))
2059 rtx vec_cst;
2060 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2062 /* Prefer an all-ones vector in case of -1. */
2063 if (constm1_operand (*op, GET_MODE (*op)))
2064 vec_cst = CONSTM1_RTX (V2DImode);
2065 else
2066 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2067 gen_rtvec (2, *op, const0_rtx));
2069 if (!standard_sse_constant_p (vec_cst, V2DImode))
2071 start_sequence ();
2072 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2073 rtx_insn *seq = get_insns ();
2074 end_sequence ();
2075 emit_insn_before (seq, insn);
2078 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2079 *op = tmp;
2081 else
2083 gcc_assert (SUBREG_P (*op));
2084 gcc_assert (GET_MODE (*op) == V2DImode);
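/* An illustrative sketch, not part of the original sources: for an immediate
   operand such as (const_int 42), convert_op rewrites it as the V2DImode
   vector { 42, 0 }.  Unless the vector is a standard SSE constant (all zeros
   or all ones), it is forced into the constant pool and loaded first, e.g.

     (set (subreg:V2DI (reg:DI tmp) 0) (mem:V2DI (symbol_ref ...)))

   and the original operand is replaced by that subreg.  */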
2088 /* Convert INSN to vector mode. */
2090 void
2091 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2093 rtx def_set = single_set (insn);
2094 rtx src = SET_SRC (def_set);
2095 rtx dst = SET_DEST (def_set);
2096 rtx subreg;
2098 if (MEM_P (dst) && !REG_P (src))
2100 /* There are no scalar integer instructions and therefore
2101 temporary register usage is required. */
2102 rtx tmp = gen_reg_rtx (DImode);
2103 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2104 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2107 switch (GET_CODE (src))
2109 case ASHIFT:
2110 case ASHIFTRT:
2111 case LSHIFTRT:
2112 convert_op (&XEXP (src, 0), insn);
2113 PUT_MODE (src, V2DImode);
2114 break;
2116 case PLUS:
2117 case MINUS:
2118 case IOR:
2119 case XOR:
2120 case AND:
2121 convert_op (&XEXP (src, 0), insn);
2122 convert_op (&XEXP (src, 1), insn);
2123 PUT_MODE (src, V2DImode);
2124 break;
2126 case NEG:
2127 src = XEXP (src, 0);
2128 convert_op (&src, insn);
2129 subreg = gen_reg_rtx (V2DImode);
2130 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2131 src = gen_rtx_MINUS (V2DImode, subreg, src);
2132 break;
2134 case NOT:
2135 src = XEXP (src, 0);
2136 convert_op (&src, insn);
2137 subreg = gen_reg_rtx (V2DImode);
2138 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2139 src = gen_rtx_XOR (V2DImode, src, subreg);
2140 break;
2142 case MEM:
2143 if (!REG_P (dst))
2144 convert_op (&src, insn);
2145 break;
2147 case REG:
2148 if (!MEM_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case SUBREG:
2153 gcc_assert (GET_MODE (src) == V2DImode);
2154 break;
2156 case COMPARE:
2157 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2159 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2160 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2162 if (REG_P (src))
2163 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2164 else
2165 subreg = copy_rtx_if_shared (src);
2166 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2167 copy_rtx_if_shared (subreg),
2168 copy_rtx_if_shared (subreg)),
2169 insn);
2170 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2171 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2172 copy_rtx_if_shared (src)),
2173 UNSPEC_PTEST);
2174 break;
2176 case CONST_INT:
2177 convert_op (&src, insn);
2178 break;
2180 default:
2181 gcc_unreachable ();
2184 SET_SRC (def_set) = src;
2185 SET_DEST (def_set) = dst;
2187 /* Drop possible dead definitions. */
2188 PATTERN (insn) = def_set;
2190 INSN_CODE (insn) = -1;
2191 recog_memoized (insn);
2192 df_insn_rescan (insn);
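/* An illustrative sketch, not part of the original sources: once the chain
   registers have been rewritten as V2DImode subregs, a DImode logical
   operation such as

     (set (reg:DI 100) (and:DI (reg:DI 100) (reg:DI 101)))

   becomes

     (set (subreg:V2DI (reg:DI 100) 0)
          (and:V2DI (subreg:V2DI (reg:DI 100) 0)
                    (subreg:V2DI (reg:DI 101) 0)))

   which matches a single pand instead of a pair of 32-bit ands; memory and
   constant operands are preloaded into registers by convert_op.  */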
2195 /* Fix uses of converted REG in debug insns. */
2197 void
2198 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2200 if (!flag_var_tracking)
2201 return;
2203 df_ref ref, next;
2204 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2206 rtx_insn *insn = DF_REF_INSN (ref);
2207 /* Make sure the next ref is for a different instruction,
2208 so that we're not affected by the rescan. */
2209 next = DF_REF_NEXT_REG (ref);
2210 while (next && DF_REF_INSN (next) == insn)
2211 next = DF_REF_NEXT_REG (next);
2213 if (DEBUG_INSN_P (insn))
2215 /* It may be a debug insn with a TImode variable in
2216 a register. */
2217 bool changed = false;
2218 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2220 rtx *loc = DF_REF_LOC (ref);
2221 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2223 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2224 changed = true;
2227 if (changed)
2228 df_insn_rescan (insn);
2233 /* Convert INSN from TImode to V1TImode. */
2235 void
2236 timode_scalar_chain::convert_insn (rtx_insn *insn)
2238 rtx def_set = single_set (insn);
2239 rtx src = SET_SRC (def_set);
2240 rtx dst = SET_DEST (def_set);
2242 switch (GET_CODE (dst))
2244 case REG:
2246 rtx tmp = find_reg_equal_equiv_note (insn);
2247 if (tmp)
2248 PUT_MODE (XEXP (tmp, 0), V1TImode);
2249 PUT_MODE (dst, V1TImode);
2250 fix_debug_reg_uses (dst);
2252 break;
2253 case MEM:
2254 PUT_MODE (dst, V1TImode);
2255 break;
2257 default:
2258 gcc_unreachable ();
2261 switch (GET_CODE (src))
2263 case REG:
2264 PUT_MODE (src, V1TImode);
2265 /* Call fix_debug_reg_uses only if SRC is never defined. */
2266 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2267 fix_debug_reg_uses (src);
2268 break;
2270 case MEM:
2271 PUT_MODE (src, V1TImode);
2272 break;
2274 case CONST_WIDE_INT:
2275 if (NONDEBUG_INSN_P (insn))
2277 /* Since there are no instructions to store a 128-bit constant,
2278 a temporary register is required. */
2279 rtx tmp = gen_reg_rtx (V1TImode);
2280 start_sequence ();
2281 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2282 src = validize_mem (force_const_mem (V1TImode, src));
2283 rtx_insn *seq = get_insns ();
2284 end_sequence ();
2285 if (seq)
2286 emit_insn_before (seq, insn);
2287 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2288 dst = tmp;
2290 break;
2292 case CONST_INT:
2293 switch (standard_sse_constant_p (src, TImode))
2295 case 1:
2296 src = CONST0_RTX (GET_MODE (dst));
2297 break;
2298 case 2:
2299 src = CONSTM1_RTX (GET_MODE (dst));
2300 break;
2301 default:
2302 gcc_unreachable ();
2304 if (NONDEBUG_INSN_P (insn))
2306 rtx tmp = gen_reg_rtx (V1TImode);
2307 /* Since there are no instructions to store a standard SSE
2308 constant, a temporary register is required. */
2309 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2310 dst = tmp;
2312 break;
2314 default:
2315 gcc_unreachable ();
2318 SET_SRC (def_set) = src;
2319 SET_DEST (def_set) = dst;
2321 /* Drop possible dead definitions. */
2322 PATTERN (insn) = def_set;
2324 INSN_CODE (insn) = -1;
2325 recog_memoized (insn);
2326 df_insn_rescan (insn);
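/* An illustrative sketch, not part of the original sources: a TImode move
   such as

     (set (reg:TI 90) (mem:TI (reg:DI 91)))

   is rewritten as (set (reg:V1TI 90) (mem:V1TI (reg:DI 91))), so the 128-bit
   copy can use a single SSE move instead of two 64-bit moves.  Constants
   other than the standard all-zeros/all-ones vectors are forced into the
   constant pool and copied through a temporary V1TImode register, as the
   CONST_WIDE_INT case above shows.  */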
2329 void
2330 dimode_scalar_chain::convert_registers ()
2332 bitmap_iterator bi;
2333 unsigned id;
2335 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2336 convert_reg (id);
2338 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2339 make_vector_copies (id);
2342 /* Convert the whole chain, creating the required register
2343 conversions and copies. */
2346 scalar_chain::convert ()
2348 bitmap_iterator bi;
2349 unsigned id;
2350 int converted_insns = 0;
2352 if (!dbg_cnt (stv_conversion))
2353 return 0;
2355 if (dump_file)
2356 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2358 convert_registers ();
2360 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2362 convert_insn (DF_INSN_UID_GET (id)->insn);
2363 converted_insns++;
2366 return converted_insns;
2369 /* Main STV pass function. Find and convert scalar
2370 instructions into vector mode when profitable. */
2372 static unsigned int
2373 convert_scalars_to_vector ()
2375 basic_block bb;
2376 bitmap candidates;
2377 int converted_insns = 0;
2379 bitmap_obstack_initialize (NULL);
2380 candidates = BITMAP_ALLOC (NULL);
2382 calculate_dominance_info (CDI_DOMINATORS);
2383 df_set_flags (DF_DEFER_INSN_RESCAN);
2384 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2385 df_md_add_problem ();
2386 df_analyze ();
2388 /* Find all instructions we want to convert into vector mode. */
2389 if (dump_file)
2390 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2392 FOR_EACH_BB_FN (bb, cfun)
2394 rtx_insn *insn;
2395 FOR_BB_INSNS (bb, insn)
2396 if (scalar_to_vector_candidate_p (insn))
2398 if (dump_file)
2399 fprintf (dump_file, " insn %d is marked as a candidate\n",
2400 INSN_UID (insn));
2402 bitmap_set_bit (candidates, INSN_UID (insn));
2406 remove_non_convertible_regs (candidates);
2408 if (bitmap_empty_p (candidates))
2409 if (dump_file)
2410 fprintf (dump_file, "There are no candidates for optimization.\n");
2412 while (!bitmap_empty_p (candidates))
2414 unsigned uid = bitmap_first_set_bit (candidates);
2415 scalar_chain *chain;
2417 if (TARGET_64BIT)
2418 chain = new timode_scalar_chain;
2419 else
2420 chain = new dimode_scalar_chain;
2422 /* Find the instruction chain we want to convert to vector mode.
2423 Check all uses and definitions to estimate all required
2424 conversions. */
2425 chain->build (candidates, uid);
2427 if (chain->compute_convert_gain () > 0)
2428 converted_insns += chain->convert ();
2429 else
2430 if (dump_file)
2431 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2432 chain->chain_id);
2434 delete chain;
2437 if (dump_file)
2438 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2440 BITMAP_FREE (candidates);
2441 bitmap_obstack_release (NULL);
2442 df_process_deferred_rescans ();
2444 /* Conversion means we may have 128-bit register spills/fills
2445 which require an aligned stack. */
2446 if (converted_insns)
2448 if (crtl->stack_alignment_needed < 128)
2449 crtl->stack_alignment_needed = 128;
2450 if (crtl->stack_alignment_estimated < 128)
2451 crtl->stack_alignment_estimated = 128;
2452 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2453 if (TARGET_64BIT)
2454 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2455 parm; parm = DECL_CHAIN (parm))
2457 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2458 continue;
2459 if (DECL_RTL_SET_P (parm)
2460 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2462 rtx r = DECL_RTL (parm);
2463 if (REG_P (r))
2464 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2466 if (DECL_INCOMING_RTL (parm)
2467 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2469 rtx r = DECL_INCOMING_RTL (parm);
2470 if (REG_P (r))
2471 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2476 return 0;
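/* An illustrative example, not part of the original sources: with
   -m32 -msse2 -mstv -O2, a 64-bit bitwise operation such as

     unsigned long long f (unsigned long long a, unsigned long long b)
     {
       return a & b;
     }

   can, when the chain's gain is positive, be converted by this pass into a
   single pand on xmm registers instead of two 32-bit ands; when conversions
   happen, the stack is realigned to 128 bits to allow vector spills.  */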
2479 namespace {
2481 const pass_data pass_data_insert_vzeroupper =
2483 RTL_PASS, /* type */
2484 "vzeroupper", /* name */
2485 OPTGROUP_NONE, /* optinfo_flags */
2486 TV_MACH_DEP, /* tv_id */
2487 0, /* properties_required */
2488 0, /* properties_provided */
2489 0, /* properties_destroyed */
2490 0, /* todo_flags_start */
2491 TODO_df_finish, /* todo_flags_finish */
2494 class pass_insert_vzeroupper : public rtl_opt_pass
2496 public:
2497 pass_insert_vzeroupper(gcc::context *ctxt)
2498 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2501 /* opt_pass methods: */
2502 virtual bool gate (function *)
2504 return TARGET_AVX
2505 && TARGET_VZEROUPPER && flag_expensive_optimizations
2506 && !optimize_size;
2509 virtual unsigned int execute (function *)
2511 return rest_of_handle_insert_vzeroupper ();
2514 }; // class pass_insert_vzeroupper
2516 const pass_data pass_data_stv =
2518 RTL_PASS, /* type */
2519 "stv", /* name */
2520 OPTGROUP_NONE, /* optinfo_flags */
2521 TV_MACH_DEP, /* tv_id */
2522 0, /* properties_required */
2523 0, /* properties_provided */
2524 0, /* properties_destroyed */
2525 0, /* todo_flags_start */
2526 TODO_df_finish, /* todo_flags_finish */
2529 class pass_stv : public rtl_opt_pass
2531 public:
2532 pass_stv (gcc::context *ctxt)
2533 : rtl_opt_pass (pass_data_stv, ctxt),
2534 timode_p (false)
2537 /* opt_pass methods: */
2538 virtual bool gate (function *)
2540 return (timode_p == !!TARGET_64BIT
2541 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2544 virtual unsigned int execute (function *)
2546 return convert_scalars_to_vector ();
2549 opt_pass *clone ()
2551 return new pass_stv (m_ctxt);
2554 void set_pass_param (unsigned int n, bool param)
2556 gcc_assert (n == 0);
2557 timode_p = param;
2560 private:
2561 bool timode_p;
2562 }; // class pass_stv
2564 } // anon namespace
2566 rtl_opt_pass *
2567 make_pass_insert_vzeroupper (gcc::context *ctxt)
2569 return new pass_insert_vzeroupper (ctxt);
2572 rtl_opt_pass *
2573 make_pass_stv (gcc::context *ctxt)
2575 return new pass_stv (ctxt);
2578 /* Inserting ENDBRANCH instructions. */
2580 static unsigned int
2581 rest_of_insert_endbranch (void)
2583 timevar_push (TV_MACH_DEP);
2585 rtx cet_eb;
2586 rtx_insn *insn;
2587 basic_block bb;
2589 /* Currently emit an ENDBR if this is a tracking function, i.e. 'nocf_check'
2590 is absent among the function attributes. Later an optimization will be
2591 introduced to analyze whether the address of a static function is
2592 taken. A static function whose address is not taken will get a
2593 nocf_check attribute. This will allow reducing the number of ENDBRs. */
2595 if (!lookup_attribute ("nocf_check",
2596 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2597 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2599 cet_eb = gen_nop_endbr ();
2601 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2602 insn = BB_HEAD (bb);
2603 emit_insn_before (cet_eb, insn);
2606 bb = 0;
2607 FOR_EACH_BB_FN (bb, cfun)
2609 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2610 insn = NEXT_INSN (insn))
2612 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2614 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2615 continue;
2616 /* Generate an ENDBRANCH after a CALL that may return more than
2617 once, i.e. after setjmp-like functions. */
2619 /* Skip notes and debug insns that must be next to the
2620 call insn. ??? This might skip a lot more than
2621 that... ??? Skipping barriers and emitting code
2622 after them surely looks like a mistake; we probably
2623 won't ever hit it, for we'll hit BB_END first. */
2624 rtx_insn *next_insn = insn;
2625 while ((next_insn != BB_END (bb))
2626 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2627 || NOTE_P (NEXT_INSN (next_insn))
2628 || BARRIER_P (NEXT_INSN (next_insn))))
2629 next_insn = NEXT_INSN (next_insn);
2631 cet_eb = gen_nop_endbr ();
2632 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2633 continue;
2636 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2638 rtx target = JUMP_LABEL (insn);
2639 if (target == NULL_RTX || ANY_RETURN_P (target))
2640 continue;
2642 /* Check whether the jump is a switch-table jump. */
2643 rtx_insn *label = as_a<rtx_insn *> (target);
2644 rtx_insn *table = next_insn (label);
2645 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2646 continue;
2648 /* For the indirect jump, find all the places it jumps to and insert
2649 an ENDBRANCH there. This is done under a special flag that
2650 controls ENDBRANCH generation for switch statements. */
2651 edge_iterator ei;
2652 edge e;
2653 basic_block dest_blk;
2655 FOR_EACH_EDGE (e, ei, bb->succs)
2657 rtx_insn *insn;
2659 dest_blk = e->dest;
2660 insn = BB_HEAD (dest_blk);
2661 gcc_assert (LABEL_P (insn));
2662 cet_eb = gen_nop_endbr ();
2663 emit_insn_after (cet_eb, insn);
2665 continue;
2668 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2669 || (NOTE_P (insn)
2670 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2671 /* TODO. Check /s bit also. */
2673 cet_eb = gen_nop_endbr ();
2674 emit_insn_after (cet_eb, insn);
2675 continue;
2680 timevar_pop (TV_MACH_DEP);
2681 return 0;
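/* An illustrative example, not part of the original sources: with
   -fcf-protection=branch on an IBT-enabled target, a function whose address
   may be taken gets an ENDBR at its entry, roughly (64-bit, AT&T syntax)

     f:
       endbr64
       leal 1(%rdi), %eax
       ret

   ENDBRs are also emitted after setjmp-like calls, at the targets of
   switch-table jumps under -mcet-switch, and after preserved labels, as the
   code above shows.  */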
2684 namespace {
2686 const pass_data pass_data_insert_endbranch =
2688 RTL_PASS, /* type. */
2689 "cet", /* name. */
2690 OPTGROUP_NONE, /* optinfo_flags. */
2691 TV_MACH_DEP, /* tv_id. */
2692 0, /* properties_required. */
2693 0, /* properties_provided. */
2694 0, /* properties_destroyed. */
2695 0, /* todo_flags_start. */
2696 0, /* todo_flags_finish. */
2699 class pass_insert_endbranch : public rtl_opt_pass
2701 public:
2702 pass_insert_endbranch (gcc::context *ctxt)
2703 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2706 /* opt_pass methods: */
2707 virtual bool gate (function *)
2709 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2712 virtual unsigned int execute (function *)
2714 return rest_of_insert_endbranch ();
2717 }; // class pass_insert_endbranch
2719 } // anon namespace
2721 rtl_opt_pass *
2722 make_pass_insert_endbranch (gcc::context *ctxt)
2724 return new pass_insert_endbranch (ctxt);
2727 /* Return true if a red-zone is in use. */
2729 bool
2730 ix86_using_red_zone (void)
2732 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 /* Return a string that documents the current -m options. The caller is
2736 responsible for freeing the string. */
2738 static char *
2739 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2740 int flags, int flags2,
2741 const char *arch, const char *tune,
2742 enum fpmath_unit fpmath, bool add_nl_p)
2744 struct ix86_target_opts
2746 const char *option; /* option string */
2747 HOST_WIDE_INT mask; /* isa mask options */
2750 /* This table is ordered so that options like -msse4.2 that imply other
2751 ISAs come first. The target string will be displayed in the same order. */
2752 static struct ix86_target_opts isa2_opts[] =
2754 { "-mmpx", OPTION_MASK_ISA_MPX },
2755 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2756 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2757 { "-mvaes", OPTION_MASK_ISA_VAES },
2758 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2759 { "-msgx", OPTION_MASK_ISA_SGX },
2760 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2761 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2762 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2763 { "-mibt", OPTION_MASK_ISA_IBT },
2764 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2766 static struct ix86_target_opts isa_opts[] =
2768 { "-mgfni", OPTION_MASK_ISA_GFNI },
2769 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2770 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2771 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2772 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2773 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2774 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2775 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2776 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2777 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2778 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2779 { "-mfma", OPTION_MASK_ISA_FMA },
2780 { "-mxop", OPTION_MASK_ISA_XOP },
2781 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2782 { "-mf16c", OPTION_MASK_ISA_F16C },
2783 { "-mavx", OPTION_MASK_ISA_AVX },
2784 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2785 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2786 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2787 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2788 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2789 { "-msse3", OPTION_MASK_ISA_SSE3 },
2790 { "-maes", OPTION_MASK_ISA_AES },
2791 { "-msha", OPTION_MASK_ISA_SHA },
2792 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2793 { "-msse2", OPTION_MASK_ISA_SSE2 },
2794 { "-msse", OPTION_MASK_ISA_SSE },
2795 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2796 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2797 { "-mmmx", OPTION_MASK_ISA_MMX },
2798 { "-mrtm", OPTION_MASK_ISA_RTM },
2799 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2800 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2801 { "-madx", OPTION_MASK_ISA_ADX },
2802 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2803 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2804 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2805 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2806 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2807 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2808 { "-mabm", OPTION_MASK_ISA_ABM },
2809 { "-mbmi", OPTION_MASK_ISA_BMI },
2810 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2811 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2812 { "-mtbm", OPTION_MASK_ISA_TBM },
2813 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2814 { "-mcx16", OPTION_MASK_ISA_CX16 },
2815 { "-msahf", OPTION_MASK_ISA_SAHF },
2816 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2817 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2818 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2819 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2820 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2821 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2822 { "-mpku", OPTION_MASK_ISA_PKU },
2823 { "-mlwp", OPTION_MASK_ISA_LWP },
2824 { "-mhle", OPTION_MASK_ISA_HLE },
2825 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2826 { "-mclwb", OPTION_MASK_ISA_CLWB }
2829 /* Flag options. */
2830 static struct ix86_target_opts flag_opts[] =
2832 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2833 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2834 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2835 { "-m80387", MASK_80387 },
2836 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2837 { "-malign-double", MASK_ALIGN_DOUBLE },
2838 { "-mcld", MASK_CLD },
2839 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2840 { "-mieee-fp", MASK_IEEE_FP },
2841 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2842 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2843 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2844 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2845 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2846 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2847 { "-mno-red-zone", MASK_NO_RED_ZONE },
2848 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2849 { "-mrecip", MASK_RECIP },
2850 { "-mrtd", MASK_RTD },
2851 { "-msseregparm", MASK_SSEREGPARM },
2852 { "-mstack-arg-probe", MASK_STACK_PROBE },
2853 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2854 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2855 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2856 { "-mvzeroupper", MASK_VZEROUPPER },
2857 { "-mstv", MASK_STV },
2858 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2859 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2860 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2863 /* Additional flag options. */
2864 static struct ix86_target_opts flag2_opts[] =
2866 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2869 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2870 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2872 char isa_other[40];
2873 char isa2_other[40];
2874 char flags_other[40];
2875 char flags2_other[40];
2876 unsigned num = 0;
2877 unsigned i, j;
2878 char *ret;
2879 char *ptr;
2880 size_t len;
2881 size_t line_len;
2882 size_t sep_len;
2883 const char *abi;
2885 memset (opts, '\0', sizeof (opts));
2887 /* Add -march= option. */
2888 if (arch)
2890 opts[num][0] = "-march=";
2891 opts[num++][1] = arch;
2894 /* Add -mtune= option. */
2895 if (tune)
2897 opts[num][0] = "-mtune=";
2898 opts[num++][1] = tune;
2901 /* Add -m32/-m64/-mx32. */
2902 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2904 if ((isa & OPTION_MASK_ABI_64) != 0)
2905 abi = "-m64";
2906 else
2907 abi = "-mx32";
2908 isa &= ~ (OPTION_MASK_ISA_64BIT
2909 | OPTION_MASK_ABI_64
2910 | OPTION_MASK_ABI_X32);
2912 else
2913 abi = "-m32";
2914 opts[num++][0] = abi;
2916 /* Pick out the options set in isa2. */
2917 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2919 if ((isa2 & isa2_opts[i].mask) != 0)
2921 opts[num++][0] = isa2_opts[i].option;
2922 isa2 &= ~ isa2_opts[i].mask;
2926 if (isa2 && add_nl_p)
2928 opts[num++][0] = isa2_other;
2929 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2932 /* Pick out the options set in isa. */
2933 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2935 if ((isa & isa_opts[i].mask) != 0)
2937 opts[num++][0] = isa_opts[i].option;
2938 isa &= ~ isa_opts[i].mask;
2942 if (isa && add_nl_p)
2944 opts[num++][0] = isa_other;
2945 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2948 /* Add flag options. */
2949 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2951 if ((flags & flag_opts[i].mask) != 0)
2953 opts[num++][0] = flag_opts[i].option;
2954 flags &= ~ flag_opts[i].mask;
2958 if (flags && add_nl_p)
2960 opts[num++][0] = flags_other;
2961 sprintf (flags_other, "(other flags: %#x)", flags);
2964 /* Add additional flag options. */
2965 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2967 if ((flags2 & flag2_opts[i].mask) != 0)
2969 opts[num++][0] = flag2_opts[i].option;
2970 flags2 &= ~ flag2_opts[i].mask;
2974 if (flags2 && add_nl_p)
2976 opts[num++][0] = flags2_other;
2977 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2980 /* Add -fpmath= option. */
2981 if (fpmath)
2983 opts[num][0] = "-mfpmath=";
2984 switch ((int) fpmath)
2986 case FPMATH_387:
2987 opts[num++][1] = "387";
2988 break;
2990 case FPMATH_SSE:
2991 opts[num++][1] = "sse";
2992 break;
2994 case FPMATH_387 | FPMATH_SSE:
2995 opts[num++][1] = "sse+387";
2996 break;
2998 default:
2999 gcc_unreachable ();
3003 /* Any options? */
3004 if (num == 0)
3005 return NULL;
3007 gcc_assert (num < ARRAY_SIZE (opts));
3009 /* Size the string. */
3010 len = 0;
3011 sep_len = (add_nl_p) ? 3 : 1;
3012 for (i = 0; i < num; i++)
3014 len += sep_len;
3015 for (j = 0; j < 2; j++)
3016 if (opts[i][j])
3017 len += strlen (opts[i][j]);
3020 /* Build the string. */
3021 ret = ptr = (char *) xmalloc (len);
3022 line_len = 0;
3024 for (i = 0; i < num; i++)
3026 size_t len2[2];
3028 for (j = 0; j < 2; j++)
3029 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3031 if (i != 0)
3033 *ptr++ = ' ';
3034 line_len++;
3036 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3038 *ptr++ = '\\';
3039 *ptr++ = '\n';
3040 line_len = 0;
3044 for (j = 0; j < 2; j++)
3045 if (opts[i][j])
3047 memcpy (ptr, opts[i][j], len2[j]);
3048 ptr += len2[j];
3049 line_len += len2[j];
3053 *ptr = '\0';
3054 gcc_assert (ret + len >= ptr);
3056 return ret;
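/* An illustrative example, not part of the original sources: for a 64-bit
   compile with -march=haswell, the returned string looks roughly like

     -march=haswell -mtune=haswell -m64 -mavx2 -mfma ... -mfpmath=sse

   i.e. -march=/-mtune= first, then the ABI switch, then the individual ISA
   and flag options in table order, with -mfpmath= last.  */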
3059 /* Return true if profiling code should be emitted before the
3060 prologue, and false otherwise.
3061 Note: For x86 with "hotfix", a sorry () is issued. */
3062 static bool
3063 ix86_profile_before_prologue (void)
3065 return flag_fentry != 0;
3068 /* Function that is callable from the debugger to print the current
3069 options. */
3070 void ATTRIBUTE_UNUSED
3071 ix86_debug_options (void)
3073 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3074 target_flags, ix86_target_flags,
3075 ix86_arch_string,ix86_tune_string,
3076 ix86_fpmath, true);
3078 if (opts)
3080 fprintf (stderr, "%s\n\n", opts);
3081 free (opts);
3083 else
3084 fputs ("<no options>\n\n", stderr);
3086 return;
3089 /* Return true if T is one of the bytes we should avoid with
3090 -fmitigate-rop, i.e. one of the RET opcodes. */
3092 static bool
3093 ix86_rop_should_change_byte_p (int t)
3095 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3098 static const char *stringop_alg_names[] = {
3099 #define DEF_ENUM
3100 #define DEF_ALG(alg, name) #name,
3101 #include "stringop.def"
3102 #undef DEF_ENUM
3103 #undef DEF_ALG
3106 /* Parse the parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3107 The string is of the following form (or a comma-separated list of such entries):
3109 strategy_alg:max_size:[align|noalign]
3111 where the full size range for the strategy is either [0, max_size] or
3112 [min_size, max_size], in which min_size is the max_size + 1 of the
3113 preceding range. The last size range must have max_size == -1.
3115 Examples:
3118 -mmemcpy-strategy=libcall:-1:noalign
3120 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3124 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3126 This is to tell the compiler to use the following strategy for memset
3127 1) when the expected size is between [1, 16], use rep_8byte strategy;
3128 2) when the size is between [17, 2048], use vector_loop;
3129 3) when the size is > 2048, use libcall. */
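/* A further illustrative case, not from the original comment: with
   -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign the parser
   below records two entries, { max = 256, alg = unrolled_loop, noalign = true }
   and { max = -1, alg = libcall, noalign = true }, so copies of at most 256
   bytes use the unrolled loop and anything larger calls the library routine.  */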
3131 struct stringop_size_range
3133 int max;
3134 stringop_alg alg;
3135 bool noalign;
3138 static void
3139 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3141 const struct stringop_algs *default_algs;
3142 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3143 char *curr_range_str, *next_range_str;
3144 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3145 int i = 0, n = 0;
3147 if (is_memset)
3148 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3149 else
3150 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3152 curr_range_str = strategy_str;
3156 int maxs;
3157 char alg_name[128];
3158 char align[16];
3159 next_range_str = strchr (curr_range_str, ',');
3160 if (next_range_str)
3161 *next_range_str++ = '\0';
3163 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3164 alg_name, &maxs, align))
3166 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3167 return;
3170 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3172 error ("size ranges of option %qs should be increasing", opt);
3173 return;
3176 for (i = 0; i < last_alg; i++)
3177 if (!strcmp (alg_name, stringop_alg_names[i]))
3178 break;
3180 if (i == last_alg)
3182 error ("wrong strategy name %qs specified for option %qs",
3183 alg_name, opt);
3185 auto_vec <const char *> candidates;
3186 for (i = 0; i < last_alg; i++)
3187 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3188 candidates.safe_push (stringop_alg_names[i]);
3190 char *s;
3191 const char *hint
3192 = candidates_list_and_hint (alg_name, s, candidates);
3193 if (hint)
3194 inform (input_location,
3195 "valid arguments to %qs are: %s; did you mean %qs?",
3196 opt, s, hint);
3197 else
3198 inform (input_location, "valid arguments to %qs are: %s",
3199 opt, s);
3200 XDELETEVEC (s);
3201 return;
3204 if ((stringop_alg) i == rep_prefix_8_byte
3205 && !TARGET_64BIT)
3207 /* rep; movq isn't available in 32-bit code. */
3208 error ("strategy name %qs specified for option %qs "
3209 "not supported for 32-bit code", alg_name, opt);
3210 return;
3213 input_ranges[n].max = maxs;
3214 input_ranges[n].alg = (stringop_alg) i;
3215 if (!strcmp (align, "align"))
3216 input_ranges[n].noalign = false;
3217 else if (!strcmp (align, "noalign"))
3218 input_ranges[n].noalign = true;
3219 else
3221 error ("unknown alignment %qs specified for option %qs", align, opt);
3222 return;
3224 n++;
3225 curr_range_str = next_range_str;
3227 while (curr_range_str);
3229 if (input_ranges[n - 1].max != -1)
3231 error ("the max value for the last size range should be -1"
3232 " for option %qs", opt);
3233 return;
3236 if (n > MAX_STRINGOP_ALGS)
3238 error ("too many size ranges specified in option %qs", opt);
3239 return;
3242 /* Now override the default algs array. */
3243 for (i = 0; i < n; i++)
3245 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3246 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3247 = input_ranges[i].alg;
3248 *const_cast<int *>(&default_algs->size[i].noalign)
3249 = input_ranges[i].noalign;
3254 /* Parse the -mtune-ctrl= option. When DUMP is true,
3255 print the features that are explicitly set. */
3257 static void
3258 parse_mtune_ctrl_str (bool dump)
3260 if (!ix86_tune_ctrl_string)
3261 return;
3263 char *next_feature_string = NULL;
3264 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3265 char *orig = curr_feature_string;
3266 int i;
3269 bool clear = false;
3271 next_feature_string = strchr (curr_feature_string, ',');
3272 if (next_feature_string)
3273 *next_feature_string++ = '\0';
3274 if (*curr_feature_string == '^')
3276 curr_feature_string++;
3277 clear = true;
3279 for (i = 0; i < X86_TUNE_LAST; i++)
3281 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3283 ix86_tune_features[i] = !clear;
3284 if (dump)
3285 fprintf (stderr, "Explicitly %s feature %s\n",
3286 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3287 break;
3290 if (i == X86_TUNE_LAST)
3291 error ("unknown parameter to option -mtune-ctrl: %s",
3292 clear ? curr_feature_string - 1 : curr_feature_string);
3293 curr_feature_string = next_feature_string;
3295 while (curr_feature_string);
3296 free (orig);
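/* An illustrative example, not part of the original sources: assuming the
   feature names listed in ix86_tune_feature_names[], a string such as
   -mtune-ctrl=pad_returns,^use_incdec sets X86_TUNE_PAD_RETURNS and clears
   X86_TUNE_USE_INCDEC; a leading '^' negates the feature that follows it.  */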
3299 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3300 processor type. */
3302 static void
3303 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3305 unsigned int ix86_tune_mask = 1u << ix86_tune;
3306 int i;
3308 for (i = 0; i < X86_TUNE_LAST; ++i)
3310 if (ix86_tune_no_default)
3311 ix86_tune_features[i] = 0;
3312 else
3313 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3316 if (dump)
3318 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3319 for (i = 0; i < X86_TUNE_LAST; i++)
3320 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3321 ix86_tune_features[i] ? "on" : "off");
3324 parse_mtune_ctrl_str (dump);
3328 /* Set the default align_* values from the processor table. */
3330 static void
3331 ix86_default_align (struct gcc_options *opts)
3333 if (opts->x_align_loops == 0)
3335 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3336 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3338 if (opts->x_align_jumps == 0)
3340 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3341 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3343 if (opts->x_align_functions == 0)
3345 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3349 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3351 static void
3352 ix86_override_options_after_change (void)
3354 ix86_default_align (&global_options);
3357 /* Override various settings based on options. If MAIN_ARGS_P, the
3358 options are from the command line, otherwise they are from
3359 attributes. Return false if there is an error related to the
3360 -march option. */
3362 static bool
3363 ix86_option_override_internal (bool main_args_p,
3364 struct gcc_options *opts,
3365 struct gcc_options *opts_set)
3367 int i;
3368 unsigned int ix86_arch_mask;
3369 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3371 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3372 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3373 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3374 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3375 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3376 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3377 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3378 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3379 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3380 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3381 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3382 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3383 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3384 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3385 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3386 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3387 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3388 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3389 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3390 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3391 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3392 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3393 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3394 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3395 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3396 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3397 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3398 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3399 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3400 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3401 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3402 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3403 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3404 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3405 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3406 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3407 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3408 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3409 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3410 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3411 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3412 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3413 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3414 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3415 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3416 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3417 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3418 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3419 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3420 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3421 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3422 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3423 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3424 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3425 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3426 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3427 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3428 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3429 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3430 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3431 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3432 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3433 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3434 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3436 #define PTA_CORE2 \
3437 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3438 | PTA_CX16 | PTA_FXSR)
3439 #define PTA_NEHALEM \
3440 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3441 #define PTA_WESTMERE \
3442 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3443 #define PTA_SANDYBRIDGE \
3444 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3445 #define PTA_IVYBRIDGE \
3446 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3447 #define PTA_HASWELL \
3448 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3449 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3450 #define PTA_BROADWELL \
3451 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3452 #define PTA_SKYLAKE \
3453 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3454 #define PTA_SKYLAKE_AVX512 \
3455 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3456 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3457 #define PTA_CANNONLAKE \
3458 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3459 #define PTA_KNL \
3460 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3461 #define PTA_BONNELL \
3462 (PTA_CORE2 | PTA_MOVBE)
3463 #define PTA_SILVERMONT \
3464 (PTA_WESTMERE | PTA_MOVBE)
3465 #define PTA_KNM \
3466 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3468 /* If this reaches 64, we need to widen the struct pta flags field below. */
3470 static struct pta
3472 const char *const name; /* processor name or nickname. */
3473 const enum processor_type processor;
3474 const enum attr_cpu schedule;
3475 const unsigned HOST_WIDE_INT flags;
3477 const processor_alias_table[] =
3479 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3480 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3481 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3482 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3483 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3484 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3485 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3486 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3487 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3488 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3489 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3490 PTA_MMX | PTA_SSE | PTA_FXSR},
3491 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3492 PTA_MMX | PTA_SSE | PTA_FXSR},
3493 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3494 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3495 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3496 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3497 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3498 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3499 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3500 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3501 PTA_MMX | PTA_SSE | PTA_FXSR},
3502 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3503 PTA_MMX | PTA_SSE | PTA_FXSR},
3504 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3505 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3506 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3507 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3508 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3509 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3510 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3512 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3513 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3514 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3515 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3516 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3517 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3518 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3519 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3520 PTA_SANDYBRIDGE},
3521 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3522 PTA_SANDYBRIDGE},
3523 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3524 PTA_IVYBRIDGE},
3525 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3526 PTA_IVYBRIDGE},
3527 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3528 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3529 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3530 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3531 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3532 PTA_SKYLAKE_AVX512},
3533 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3534 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3535 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3536 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3537 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3538 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3539 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3540 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3541 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3542 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3543 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3544 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3545 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3546 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3547 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3548 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3549 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3550 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3551 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3552 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3553 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3554 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3555 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3556 {"x86-64", PROCESSOR_K8, CPU_K8,
3557 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3558 {"eden-x2", PROCESSOR_K8, CPU_K8,
3559 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3560 {"nano", PROCESSOR_K8, CPU_K8,
3561 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3562 | PTA_SSSE3 | PTA_FXSR},
3563 {"nano-1000", PROCESSOR_K8, CPU_K8,
3564 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3565 | PTA_SSSE3 | PTA_FXSR},
3566 {"nano-2000", PROCESSOR_K8, CPU_K8,
3567 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3568 | PTA_SSSE3 | PTA_FXSR},
3569 {"nano-3000", PROCESSOR_K8, CPU_K8,
3570 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3571 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3572 {"nano-x2", PROCESSOR_K8, CPU_K8,
3573 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3574 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3575 {"eden-x4", PROCESSOR_K8, CPU_K8,
3576 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3577 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3578 {"nano-x4", PROCESSOR_K8, CPU_K8,
3579 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3580 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3581 {"k8", PROCESSOR_K8, CPU_K8,
3582 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3583 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3584 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3585 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3586 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3587 {"opteron", PROCESSOR_K8, CPU_K8,
3588 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3589 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3590 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3591 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3592 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3593 {"athlon64", PROCESSOR_K8, CPU_K8,
3594 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3595 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3596 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3597 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3598 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3599 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3600 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3601 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3602 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3603 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3604 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3605 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3606 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3607 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3608 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3609 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3610 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3611 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3612 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3613 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3614 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3615 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3616 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3617 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3618 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3619 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3620 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3621 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3622 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3623 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3624 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3625 | PTA_XSAVEOPT | PTA_FSGSBASE},
3626 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3627 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3628 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3629 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3630 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3631 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3632 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3633 | PTA_MOVBE | PTA_MWAITX},
3634 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3635 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3636 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3637 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3638 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3639 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3640 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3641 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3642 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3643 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3644 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3645 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3646 | PTA_FXSR | PTA_XSAVE},
3647 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3648 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3649 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3650 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3651 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3652 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3654 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3655 PTA_64BIT
3656 | PTA_HLE /* flags are only used for -march switch. */ },
3659 /* -mrecip options. */
3660 static struct
3662 const char *string; /* option name */
3663 unsigned int mask; /* mask bits to set */
3665 const recip_options[] =
3667 { "all", RECIP_MASK_ALL },
3668 { "none", RECIP_MASK_NONE },
3669 { "div", RECIP_MASK_DIV },
3670 { "sqrt", RECIP_MASK_SQRT },
3671 { "vec-div", RECIP_MASK_VEC_DIV },
3672 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3675 int const pta_size = ARRAY_SIZE (processor_alias_table);
3677 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3678 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3679 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3680 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3681 #ifdef TARGET_BI_ARCH
3682 else
3684 #if TARGET_BI_ARCH == 1
3685 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3686 is on and OPTION_MASK_ABI_X32 is off. We turn off
3687 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3688 -mx32. */
3689 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3690 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3691 #else
3692 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3693 on and OPTION_MASK_ABI_64 is off. We turn off
3694 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3695 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3696 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3697 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3698 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3699 #endif
3700 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3701 && TARGET_IAMCU_P (opts->x_target_flags))
3702 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3703 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3705 #endif
3707 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3709 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3710 OPTION_MASK_ABI_64 for TARGET_X32. */
3711 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3712 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3714 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3715 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3716 | OPTION_MASK_ABI_X32
3717 | OPTION_MASK_ABI_64);
3718 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3720 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3721 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3722 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3723 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3726 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3727 SUBTARGET_OVERRIDE_OPTIONS;
3728 #endif
3730 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3731 SUBSUBTARGET_OVERRIDE_OPTIONS;
3732 #endif
3734 /* -fPIC is the default for x86_64. */
3735 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3736 opts->x_flag_pic = 2;
3738 /* Need to check -mtune=generic first. */
3739 if (opts->x_ix86_tune_string)
3741 /* As special support for cross compilers, we treat -mtune=native
3742 as -mtune=generic. With native compilers we won't see
3743 -mtune=native, as it was already changed by the driver. */
3744 if (!strcmp (opts->x_ix86_tune_string, "native"))
3746 opts->x_ix86_tune_string = "generic";
3748 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3749 warning (OPT_Wdeprecated,
3750 main_args_p
3751 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3752 "or %<-mtune=generic%> instead as appropriate")
3753 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3754 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3755 " instead as appropriate"));
3757 else
3759 if (opts->x_ix86_arch_string)
3760 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3761 if (!opts->x_ix86_tune_string)
3763 opts->x_ix86_tune_string
3764 = processor_target_table[TARGET_CPU_DEFAULT].name;
3765 ix86_tune_defaulted = 1;
3768 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3769 or defaulted. We need to use a sensible tune option. */
3770 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3772 opts->x_ix86_tune_string = "generic";
3776 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3777 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3779 /* rep; movq isn't available in 32-bit code. */
3780 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3781 opts->x_ix86_stringop_alg = no_stringop;
3784 if (!opts->x_ix86_arch_string)
3785 opts->x_ix86_arch_string
3786 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3787 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3788 else
3789 ix86_arch_specified = 1;
3791 if (opts_set->x_ix86_pmode)
3793 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3794 && opts->x_ix86_pmode == PMODE_SI)
3795 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3796 && opts->x_ix86_pmode == PMODE_DI))
3797 error ("address mode %qs not supported in the %s bit mode",
3798 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3799 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3801 else
3802 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3803 ? PMODE_DI : PMODE_SI;
3805 if (!opts_set->x_ix86_abi)
3806 opts->x_ix86_abi = DEFAULT_ABI;
3808 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3809 error ("-mabi=ms not supported with X32 ABI");
3810 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3812 /* For targets using the MS ABI, enable ms-extensions if not
3813 explicitly turned off. For non-MS ABIs we turn this
3814 option off. */
3815 if (!opts_set->x_flag_ms_extensions)
3816 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3818 if (opts_set->x_ix86_cmodel)
3820 switch (opts->x_ix86_cmodel)
3822 case CM_SMALL:
3823 case CM_SMALL_PIC:
3824 if (opts->x_flag_pic)
3825 opts->x_ix86_cmodel = CM_SMALL_PIC;
3826 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3827 error ("code model %qs not supported in the %s bit mode",
3828 "small", "32");
3829 break;
3831 case CM_MEDIUM:
3832 case CM_MEDIUM_PIC:
3833 if (opts->x_flag_pic)
3834 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3835 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3836 error ("code model %qs not supported in the %s bit mode",
3837 "medium", "32");
3838 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3839 error ("code model %qs not supported in x32 mode",
3840 "medium");
3841 break;
3843 case CM_LARGE:
3844 case CM_LARGE_PIC:
3845 if (opts->x_flag_pic)
3846 opts->x_ix86_cmodel = CM_LARGE_PIC;
3847 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3848 error ("code model %qs not supported in the %s bit mode",
3849 "large", "32");
3850 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3851 error ("code model %qs not supported in x32 mode",
3852 "large");
3853 break;
3855 case CM_32:
3856 if (opts->x_flag_pic)
3857 error ("code model %s does not support PIC mode", "32");
3858 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3859 error ("code model %qs not supported in the %s bit mode",
3860 "32", "64");
3861 break;
3863 case CM_KERNEL:
3864 if (opts->x_flag_pic)
3866 error ("code model %s does not support PIC mode", "kernel");
3867 opts->x_ix86_cmodel = CM_32;
3869 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3870 error ("code model %qs not supported in the %s bit mode",
3871 "kernel", "32");
3872 break;
3874 default:
3875 gcc_unreachable ();
3878 else
3880 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3881 use of rip-relative addressing. This eliminates fixups that
3882 would otherwise be needed if this object is to be placed in a
3883 DLL, and is essentially just as efficient as direct addressing. */
3884 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3885 && (TARGET_RDOS || TARGET_PECOFF))
3886 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3887 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3888 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3889 else
3890 opts->x_ix86_cmodel = CM_32;
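  /* For example, with no explicit -mcmodel=: -m64 -fpic selects
     CM_SMALL_PIC, plain -m64 selects CM_SMALL, and any 32-bit compilation
     gets CM_32, per the defaults just above.  */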
3892 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3894 error ("-masm=intel not supported in this configuration");
3895 opts->x_ix86_asm_dialect = ASM_ATT;
3897 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3898 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3899 sorry ("%i-bit mode not compiled in",
3900 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3902 for (i = 0; i < pta_size; i++)
3903 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3905 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3907 error (main_args_p
3908 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3909 "switch")
3910 : G_("%<generic%> CPU can be used only for "
3911 "%<target(\"tune=\")%> attribute"));
3912 return false;
3914 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3916 error (main_args_p
3917 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3918 "switch")
3919 : G_("%<intel%> CPU can be used only for "
3920 "%<target(\"tune=\")%> attribute"));
3921 return false;
3924 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3925 && !(processor_alias_table[i].flags & PTA_64BIT))
3927 error ("CPU you selected does not support x86-64 "
3928 "instruction set");
3929 return false;
3932 ix86_schedule = processor_alias_table[i].schedule;
3933 ix86_arch = processor_alias_table[i].processor;
3934 /* Default cpu tuning to the architecture. */
3935 ix86_tune = ix86_arch;
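	/* The PTA_* capability bits recorded for the chosen -march= entry
	   turn on the matching ISA masks below, but only if the user has
	   not set that ISA explicitly; e.g. -march=foo -mno-ssse3 (for a
	   hypothetical table entry "foo" carrying PTA_SSSE3) keeps SSSE3
	   off because OPTION_MASK_ISA_SSSE3 is already recorded in
	   ..._isa_flags_explicit.  */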
3937 if (processor_alias_table[i].flags & PTA_MMX
3938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3940 if (processor_alias_table[i].flags & PTA_3DNOW
3941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3943 if (processor_alias_table[i].flags & PTA_3DNOW_A
3944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3946 if (processor_alias_table[i].flags & PTA_SSE
3947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3949 if (processor_alias_table[i].flags & PTA_SSE2
3950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3952 if (processor_alias_table[i].flags & PTA_SSE3
3953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3955 if (processor_alias_table[i].flags & PTA_SSSE3
3956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3958 if (processor_alias_table[i].flags & PTA_SSE4_1
3959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3961 if (processor_alias_table[i].flags & PTA_SSE4_2
3962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3964 if (processor_alias_table[i].flags & PTA_AVX
3965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3967 if (processor_alias_table[i].flags & PTA_AVX2
3968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3970 if (processor_alias_table[i].flags & PTA_FMA
3971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3973 if (processor_alias_table[i].flags & PTA_SSE4A
3974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3976 if (processor_alias_table[i].flags & PTA_FMA4
3977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3979 if (processor_alias_table[i].flags & PTA_XOP
3980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3982 if (processor_alias_table[i].flags & PTA_LWP
3983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3985 if (processor_alias_table[i].flags & PTA_ABM
3986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3988 if (processor_alias_table[i].flags & PTA_BMI
3989 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3990 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3991 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3992 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3993 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3994 if (processor_alias_table[i].flags & PTA_TBM
3995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3997 if (processor_alias_table[i].flags & PTA_BMI2
3998 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3999 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4000 if (processor_alias_table[i].flags & PTA_CX16
4001 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
4002 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
4003 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
4004 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4005 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4006 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4007 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4008 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4009 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4010 if (processor_alias_table[i].flags & PTA_MOVBE
4011 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
4012 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
4013 if (processor_alias_table[i].flags & PTA_AES
4014 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4015 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4016 if (processor_alias_table[i].flags & PTA_SHA
4017 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4018 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4019 if (processor_alias_table[i].flags & PTA_PCLMUL
4020 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4021 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4022 if (processor_alias_table[i].flags & PTA_FSGSBASE
4023 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4024 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4025 if (processor_alias_table[i].flags & PTA_RDRND
4026 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4027 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4028 if (processor_alias_table[i].flags & PTA_F16C
4029 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4030 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4031 if (processor_alias_table[i].flags & PTA_RTM
4032 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4033 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4034 if (processor_alias_table[i].flags & PTA_HLE
4035 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
4036 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
4037 if (processor_alias_table[i].flags & PTA_PRFCHW
4038 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4039 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4040 if (processor_alias_table[i].flags & PTA_RDSEED
4041 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4042 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4043 if (processor_alias_table[i].flags & PTA_ADX
4044 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4045 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4046 if (processor_alias_table[i].flags & PTA_FXSR
4047 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4048 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4049 if (processor_alias_table[i].flags & PTA_XSAVE
4050 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4051 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4052 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4053 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4054 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4055 if (processor_alias_table[i].flags & PTA_AVX512F
4056 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4057 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4058 if (processor_alias_table[i].flags & PTA_AVX512ER
4059 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4060 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4061 if (processor_alias_table[i].flags & PTA_AVX512PF
4062 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4063 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4064 if (processor_alias_table[i].flags & PTA_AVX512CD
4065 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4066 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4067 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4068 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4069 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4070 if (processor_alias_table[i].flags & PTA_CLWB
4071 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4072 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4073 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4074 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4075 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4076 if (processor_alias_table[i].flags & PTA_CLZERO
4077 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
4078 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
4079 if (processor_alias_table[i].flags & PTA_XSAVEC
4080 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4081 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4082 if (processor_alias_table[i].flags & PTA_XSAVES
4083 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4084 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4085 if (processor_alias_table[i].flags & PTA_AVX512DQ
4086 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4087 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4088 if (processor_alias_table[i].flags & PTA_AVX512BW
4089 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4090 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4091 if (processor_alias_table[i].flags & PTA_AVX512VL
4092 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4093 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4094 if (processor_alias_table[i].flags & PTA_MPX
4095 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4096 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4097 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4098 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4099 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4100 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4101 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4102 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4104 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4105 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
4106 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4107 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4108 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
4109 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4110 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4111 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4112 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4113 if (processor_alias_table[i].flags & PTA_SGX
4114 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4115 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4117 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4118 x86_prefetch_sse = true;
4119 if (processor_alias_table[i].flags & PTA_MWAITX
4120 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
4121 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
4122 if (processor_alias_table[i].flags & PTA_PKU
4123 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4124 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4126 /* Don't enable x87 instructions if only
4127 general registers are allowed. */
4128 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4129 && !(opts_set->x_target_flags & MASK_80387))
4131 if (processor_alias_table[i].flags & PTA_NO_80387)
4132 opts->x_target_flags &= ~MASK_80387;
4133 else
4134 opts->x_target_flags |= MASK_80387;
4136 break;
4139 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4140 error ("Intel MPX does not support x32");
4145 if (i == pta_size)
4147 error (main_args_p
4148 ? G_("bad value (%qs) for %<-march=%> switch")
4149 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4150 opts->x_ix86_arch_string);
4152 auto_vec <const char *> candidates;
4153 for (i = 0; i < pta_size; i++)
4154 if (strcmp (processor_alias_table[i].name, "generic")
4155 && strcmp (processor_alias_table[i].name, "intel")
4156 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4157 || (processor_alias_table[i].flags & PTA_64BIT)))
4158 candidates.safe_push (processor_alias_table[i].name);
4160 char *s;
4161 const char *hint
4162 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4163 if (hint)
4164 inform (input_location,
4165 main_args_p
4166 ? G_("valid arguments to %<-march=%> switch are: "
4167 "%s; did you mean %qs?")
4168 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4169 "%s; did you mean %qs?"), s, hint);
4170 else
4171 inform (input_location,
4172 main_args_p
4173 ? G_("valid arguments to %<-march=%> switch are: %s")
4174 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4175 "are: %s"), s);
4176 XDELETEVEC (s);
4179 ix86_arch_mask = 1u << ix86_arch;
4180 for (i = 0; i < X86_ARCH_LAST; ++i)
4181 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4183 for (i = 0; i < pta_size; i++)
4184 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4186 ix86_schedule = processor_alias_table[i].schedule;
4187 ix86_tune = processor_alias_table[i].processor;
4188 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4190 if (!(processor_alias_table[i].flags & PTA_64BIT))
4192 if (ix86_tune_defaulted)
4194 opts->x_ix86_tune_string = "x86-64";
4195 for (i = 0; i < pta_size; i++)
4196 if (! strcmp (opts->x_ix86_tune_string,
4197 processor_alias_table[i].name))
4198 break;
4199 ix86_schedule = processor_alias_table[i].schedule;
4200 ix86_tune = processor_alias_table[i].processor;
4202 else
4203 error ("CPU you selected does not support x86-64 "
4204 "instruction set");
4207 /* Intel CPUs have always interpreted SSE prefetch instructions as
4208 NOPs; so, we can enable SSE prefetch instructions even when
4209 -mtune (rather than -march) points us to a processor that has them.
4210 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4211 higher processors. */
4212 if (TARGET_CMOV
4213 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4214 x86_prefetch_sse = true;
4215 break;
4218 if (ix86_tune_specified && i == pta_size)
4220 error (main_args_p
4221 ? G_("bad value (%qs) for %<-mtune=%> switch")
4222 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4223 opts->x_ix86_tune_string);
4225 auto_vec <const char *> candidates;
4226 for (i = 0; i < pta_size; i++)
4227 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4228 || (processor_alias_table[i].flags & PTA_64BIT))
4229 candidates.safe_push (processor_alias_table[i].name);
4231 char *s;
4232 const char *hint
4233 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4234 if (hint)
4235 inform (input_location,
4236 main_args_p
4237 ? G_("valid arguments to %<-mtune=%> switch are: "
4238 "%s; did you mean %qs?")
4239 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4240 "%s; did you mean %qs?"), s, hint);
4241 else
4242 inform (input_location,
4243 main_args_p
4244 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4245 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4246 "are: %s"), s);
4247 XDELETEVEC (s);
4250 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4252 #ifndef USE_IX86_FRAME_POINTER
4253 #define USE_IX86_FRAME_POINTER 0
4254 #endif
4256 #ifndef USE_X86_64_FRAME_POINTER
4257 #define USE_X86_64_FRAME_POINTER 0
4258 #endif
4260 /* Set the default values for switches whose default depends on TARGET_64BIT
4261 in case they weren't overwritten by command line options. */
4262 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4264 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4265 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4266 if (opts->x_flag_asynchronous_unwind_tables
4267 && !opts_set->x_flag_unwind_tables
4268 && TARGET_64BIT_MS_ABI)
4269 opts->x_flag_unwind_tables = 1;
4270 if (opts->x_flag_asynchronous_unwind_tables == 2)
4271 opts->x_flag_unwind_tables
4272 = opts->x_flag_asynchronous_unwind_tables = 1;
4273 if (opts->x_flag_pcc_struct_return == 2)
4274 opts->x_flag_pcc_struct_return = 0;
4276 else
4278 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4279 opts->x_flag_omit_frame_pointer
4280 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4281 if (opts->x_flag_asynchronous_unwind_tables == 2)
4282 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4283 if (opts->x_flag_pcc_struct_return == 2)
4285 /* Intel MCU psABI specifies that -freg-struct-return should
4286 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4287 we check -miamcu so that -freg-struct-return is always
4288 turned on if -miamcu is used. */
4289 if (TARGET_IAMCU_P (opts->x_target_flags))
4290 opts->x_flag_pcc_struct_return = 0;
4291 else
4292 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4296 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4297 /* TODO: ix86_cost should be chosen at instruction or function granularity
4298 so that for cold code we use size_cost even in !optimize_size compilation. */
4299 if (opts->x_optimize_size)
4300 ix86_cost = &ix86_size_cost;
4301 else
4302 ix86_cost = ix86_tune_cost;
4304 /* Arrange to set up i386_stack_locals for all functions. */
4305 init_machine_status = ix86_init_machine_status;
4307 /* Validate -mregparm= value. */
4308 if (opts_set->x_ix86_regparm)
4310 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4311 warning (0, "-mregparm is ignored in 64-bit mode");
4312 else if (TARGET_IAMCU_P (opts->x_target_flags))
4313 warning (0, "-mregparm is ignored for Intel MCU psABI");
4314 if (opts->x_ix86_regparm > REGPARM_MAX)
4316 error ("-mregparm=%d is not between 0 and %d",
4317 opts->x_ix86_regparm, REGPARM_MAX);
4318 opts->x_ix86_regparm = 0;
4321 if (TARGET_IAMCU_P (opts->x_target_flags)
4322 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4323 opts->x_ix86_regparm = REGPARM_MAX;
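  /* Example: -m32 -mregparm=3 requests up to three integer arguments in
     registers (REGPARM_MAX is 3 for 32-bit code); for 64-bit and IAMCU
     targets the value is forced to REGPARM_MAX above, since the
     register-passing convention there is fixed by the ABI.  */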
4325 /* Default align_* from the processor table. */
4326 ix86_default_align (opts);
4328 /* Provide default for -mbranch-cost= value. */
4329 if (!opts_set->x_ix86_branch_cost)
4330 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4332 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4334 opts->x_target_flags
4335 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4337 /* Enable by default the SSE and MMX builtins. Do allow the user to
4338 explicitly disable any of these. In particular, disabling SSE and
4339 MMX for kernel code is extremely useful. */
4340 if (!ix86_arch_specified)
4341 opts->x_ix86_isa_flags
4342 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4343 | TARGET_SUBTARGET64_ISA_DEFAULT)
4344 & ~opts->x_ix86_isa_flags_explicit);
4346 if (TARGET_RTD_P (opts->x_target_flags))
4347 warning (0,
4348 main_args_p
4349 ? G_("%<-mrtd%> is ignored in 64bit mode")
4350 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4352 else
4354 opts->x_target_flags
4355 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4357 if (!ix86_arch_specified)
4358 opts->x_ix86_isa_flags
4359 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4361 /* The i386 ABI does not specify a red zone. It still makes sense to use
4362 one when the programmer takes care to keep the stack from being destroyed. */
4363 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4364 opts->x_target_flags |= MASK_NO_RED_ZONE;
4367 /* Keep nonleaf frame pointers. */
4368 if (opts->x_flag_omit_frame_pointer)
4369 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4370 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4371 opts->x_flag_omit_frame_pointer = 1;
4373 /* If we're doing fast math, we don't care about comparison order
4374 wrt NaNs. This lets us use a shorter comparison sequence. */
4375 if (opts->x_flag_finite_math_only)
4376 opts->x_target_flags &= ~MASK_IEEE_FP;
4378 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4379 since the insns won't need emulation. */
4380 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4381 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4383 /* Likewise, if the target doesn't have a 387, or we've specified
4384 software floating point, don't use 387 inline intrinsics. */
4385 if (!TARGET_80387_P (opts->x_target_flags))
4386 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4388 /* Turn on MMX builtins for -msse. */
4389 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4390 opts->x_ix86_isa_flags
4391 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4393 /* Enable SSE prefetch. */
4394 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4395 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4396 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4397 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4398 x86_prefetch_sse = true;
4400 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4401 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4402 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4403 opts->x_ix86_isa_flags
4404 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4406 /* Enable lzcnt instruction for -mabm. */
4407 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
4408 opts->x_ix86_isa_flags
4409 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4411 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4412 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
4413 opts->x_ix86_isa_flags
4414 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4415 & ~opts->x_ix86_isa_flags_explicit);
4417 /* Validate -mpreferred-stack-boundary= value or default it to
4418 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4419 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4420 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4422 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4423 int max = TARGET_SEH ? 4 : 12;
4425 if (opts->x_ix86_preferred_stack_boundary_arg < min
4426 || opts->x_ix86_preferred_stack_boundary_arg > max)
4428 if (min == max)
4429 error ("-mpreferred-stack-boundary is not supported "
4430 "for this target");
4431 else
4432 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4433 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4435 else
4436 ix86_preferred_stack_boundary
4437 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
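  /* The argument is a power-of-two exponent in bytes, so e.g.
     -mpreferred-stack-boundary=4 gives (1 << 4) * BITS_PER_UNIT
     = 16 * 8 = 128 bits, i.e. a 16-byte preferred stack alignment.  */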
4440 /* Set the default value for -mstackrealign. */
4441 if (!opts_set->x_ix86_force_align_arg_pointer)
4442 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4444 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4446 /* Validate -mincoming-stack-boundary= value or default it to
4447 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4448 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4449 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4451 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4453 if (opts->x_ix86_incoming_stack_boundary_arg < min
4454 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4455 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4456 opts->x_ix86_incoming_stack_boundary_arg, min);
4457 else
4459 ix86_user_incoming_stack_boundary
4460 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4461 ix86_incoming_stack_boundary
4462 = ix86_user_incoming_stack_boundary;
4466 #ifndef NO_PROFILE_COUNTERS
4467 if (flag_nop_mcount)
4468 error ("-mnop-mcount is not compatible with this target");
4469 #endif
4470 if (flag_nop_mcount && flag_pic)
4471 error ("-mnop-mcount is not implemented for -fPIC");
4473 /* Accept -msseregparm only if at least SSE support is enabled. */
4474 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4475 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4476 error (main_args_p
4477 ? G_("%<-msseregparm%> used without SSE enabled")
4478 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4480 if (opts_set->x_ix86_fpmath)
4482 if (opts->x_ix86_fpmath & FPMATH_SSE)
4484 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4486 if (TARGET_80387_P (opts->x_target_flags))
4488 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4489 opts->x_ix86_fpmath = FPMATH_387;
4492 else if ((opts->x_ix86_fpmath & FPMATH_387)
4493 && !TARGET_80387_P (opts->x_target_flags))
4495 warning (0, "387 instruction set disabled, using SSE arithmetics");
4496 opts->x_ix86_fpmath = FPMATH_SSE;
4500 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4501 -mfpmath=387. The latter is nevertheless the default on many targets,
4502 since the extra 80-bit precision of temporaries is considered part of
4503 the ABI. Override the default at least for -ffast-math.
4504 TODO: -mfpmath=both seems to produce similarly performing code with
4505 slightly smaller binaries. It is however not clear whether register
4506 allocation is ready for this setting.
4507 Also, -mfpmath=387 codegen is overall a lot more compact (about 4-5%)
4508 than SSE codegen. We may switch to 387 with -ffast-math for size
4509 optimized functions. */
4510 else if (fast_math_flags_set_p (&global_options)
4511 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4512 opts->x_ix86_fpmath = FPMATH_SSE;
4513 else
4514 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4516 /* Use external vectorized library in vectorizing intrinsics. */
4517 if (opts_set->x_ix86_veclibabi_type)
4518 switch (opts->x_ix86_veclibabi_type)
4520 case ix86_veclibabi_type_svml:
4521 ix86_veclib_handler = ix86_veclibabi_svml;
4522 break;
4524 case ix86_veclibabi_type_acml:
4525 ix86_veclib_handler = ix86_veclibabi_acml;
4526 break;
4528 default:
4529 gcc_unreachable ();
4532 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4533 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4534 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4536 /* If stack probes are required, the space used for large function
4537 arguments on the stack must also be probed, so enable
4538 -maccumulate-outgoing-args so this happens in the prologue. */
4539 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4540 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4542 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4543 warning (0,
4544 main_args_p
4545 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4546 "for correctness")
4547 : G_("stack probing requires "
4548 "%<target(\"accumulate-outgoing-args\")%> for "
4549 "correctness"));
4550 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4553 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4554 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4555 if (fixed_regs[BP_REG]
4556 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4558 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4559 warning (0,
4560 main_args_p
4561 ? G_("fixed ebp register requires "
4562 "%<-maccumulate-outgoing-args%>")
4563 : G_("fixed ebp register requires "
4564 "%<target(\"accumulate-outgoing-args\")%>"));
4565 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4568 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4570 char *p;
4571 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4572 p = strchr (internal_label_prefix, 'X');
4573 internal_label_prefix_len = p - internal_label_prefix;
4574 *p = '\0';
4577 /* When the scheduling description is not available, disable the scheduler
4578 pass so it won't slow down compilation and make x87 code slower. */
4579 if (!TARGET_SCHEDULE)
4580 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4582 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4583 ix86_tune_cost->simultaneous_prefetches,
4584 opts->x_param_values,
4585 opts_set->x_param_values);
4586 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4587 ix86_tune_cost->prefetch_block,
4588 opts->x_param_values,
4589 opts_set->x_param_values);
4590 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4591 ix86_tune_cost->l1_cache_size,
4592 opts->x_param_values,
4593 opts_set->x_param_values);
4594 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4595 ix86_tune_cost->l2_cache_size,
4596 opts->x_param_values,
4597 opts_set->x_param_values);
4599 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4600 if (opts->x_flag_prefetch_loop_arrays < 0
4601 && HAVE_prefetch
4602 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4603 && !opts->x_optimize_size
4604 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4605 opts->x_flag_prefetch_loop_arrays = 1;
4607 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4608 can be optimized to ap = __builtin_next_arg (0). */
4609 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4610 targetm.expand_builtin_va_start = NULL;
4612 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4614 ix86_gen_leave = gen_leave_rex64;
4615 if (Pmode == DImode)
4617 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4618 ix86_gen_tls_local_dynamic_base_64
4619 = gen_tls_local_dynamic_base_64_di;
4621 else
4623 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4624 ix86_gen_tls_local_dynamic_base_64
4625 = gen_tls_local_dynamic_base_64_si;
4628 else
4629 ix86_gen_leave = gen_leave;
4631 if (Pmode == DImode)
4633 ix86_gen_add3 = gen_adddi3;
4634 ix86_gen_sub3 = gen_subdi3;
4635 ix86_gen_sub3_carry = gen_subdi3_carry;
4636 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4637 ix86_gen_andsp = gen_anddi3;
4638 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4639 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4640 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4641 ix86_gen_monitor = gen_sse3_monitor_di;
4642 ix86_gen_monitorx = gen_monitorx_di;
4643 ix86_gen_clzero = gen_clzero_di;
4645 else
4647 ix86_gen_add3 = gen_addsi3;
4648 ix86_gen_sub3 = gen_subsi3;
4649 ix86_gen_sub3_carry = gen_subsi3_carry;
4650 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4651 ix86_gen_andsp = gen_andsi3;
4652 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4653 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4654 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4655 ix86_gen_monitor = gen_sse3_monitor_si;
4656 ix86_gen_monitorx = gen_monitorx_si;
4657 ix86_gen_clzero = gen_clzero_si;
4660 #ifdef USE_IX86_CLD
4661 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4662 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4663 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4664 #endif
4666 /* Set the default value for -mfentry. */
4667 if (!opts_set->x_flag_fentry)
4668 opts->x_flag_fentry = TARGET_SEH;
4669 else
4671 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4672 && opts->x_flag_fentry)
4673 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4674 "with -fpic");
4675 else if (TARGET_SEH && !opts->x_flag_fentry)
4676 sorry ("-mno-fentry isn%'t compatible with SEH");
4679 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4680 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4682 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4683 && TARGET_EMIT_VZEROUPPER)
4684 opts->x_target_flags |= MASK_VZEROUPPER;
4685 if (!(opts_set->x_target_flags & MASK_STV))
4686 opts->x_target_flags |= MASK_STV;
4687 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4688 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4689 stack realignment is an extra cost the pass doesn't take into
4690 account, and the pass can't realign the stack. */
4691 if (ix86_preferred_stack_boundary < 128
4692 || ix86_incoming_stack_boundary < 128
4693 || opts->x_ix86_force_align_arg_pointer)
4694 opts->x_target_flags &= ~MASK_STV;
4695 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4696 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4697 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4698 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4699 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4700 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4702 /* Enable 128-bit AVX instruction generation
4703 for the auto-vectorizer. */
4704 if (TARGET_AVX128_OPTIMAL
4705 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4706 opts->x_prefer_vector_width_type = PVW_AVX128;
4708 /* Use 256-bit AVX instruction generation
4709 in the auto-vectorizer. */
4710 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4711 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4712 opts->x_prefer_vector_width_type = PVW_AVX256;
4714 if (opts->x_ix86_recip_name)
4716 char *p = ASTRDUP (opts->x_ix86_recip_name);
4717 char *q;
4718 unsigned int mask, i;
4719 bool invert;
4721 while ((q = strtok (p, ",")) != NULL)
4723 p = NULL;
4724 if (*q == '!')
4726 invert = true;
4727 q++;
4729 else
4730 invert = false;
4732 if (!strcmp (q, "default"))
4733 mask = RECIP_MASK_ALL;
4734 else
4736 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4737 if (!strcmp (q, recip_options[i].string))
4739 mask = recip_options[i].mask;
4740 break;
4743 if (i == ARRAY_SIZE (recip_options))
4745 error ("unknown option for -mrecip=%s", q);
4746 invert = false;
4747 mask = RECIP_MASK_NONE;
4751 opts->x_recip_mask_explicit |= mask;
4752 if (invert)
4753 opts->x_recip_mask &= ~mask;
4754 else
4755 opts->x_recip_mask |= mask;
4759 if (TARGET_RECIP_P (opts->x_target_flags))
4760 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4761 else if (opts_set->x_target_flags & MASK_RECIP)
4762 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
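  /* -mrecip= takes a comma-separated list: "default" means RECIP_MASK_ALL
     and a leading '!' clears bits instead of setting them.  So, assuming
     "sqrt" is one of the recip_options entries, -mrecip=default,!sqrt
     enables every reciprocal approximation except the square-root ones;
     -mrecip / -mno-recip above only affect bits not mentioned explicitly.  */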
4764 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4765 for 64-bit Bionic. Also default long double to 64-bit for Intel
4766 MCU psABI. */
4767 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4768 && !(opts_set->x_target_flags
4769 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4770 opts->x_target_flags |= (TARGET_64BIT
4771 ? MASK_LONG_DOUBLE_128
4772 : MASK_LONG_DOUBLE_64);
4774 /* Only one of them can be active. */
4775 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4776 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4778 /* Handle stack protector */
4779 if (!opts_set->x_ix86_stack_protector_guard)
4780 opts->x_ix86_stack_protector_guard
4781 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4783 #ifdef TARGET_THREAD_SSP_OFFSET
4784 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4785 #endif
4787 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4789 char *endp;
4790 const char *str = ix86_stack_protector_guard_offset_str;
4792 errno = 0;
4793 int64_t offset;
4795 #if defined(INT64_T_IS_LONG)
4796 offset = strtol (str, &endp, 0);
4797 #else
4798 offset = strtoll (str, &endp, 0);
4799 #endif
4801 if (!*str || *endp || errno)
4802 error ("%qs is not a valid number "
4803 "in -mstack-protector-guard-offset=", str);
4805 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4806 HOST_WIDE_INT_C (0x7fffffff)))
4807 error ("%qs is not a valid offset "
4808 "in -mstack-protector-guard-offset=", str);
4810 ix86_stack_protector_guard_offset = offset;
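  /* Example: -mstack-protector-guard-offset=0x28 is parsed with base 0,
     yielding 40; values outside the signed 32-bit range or strings with
     trailing junk are rejected by the checks above.  */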
4813 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4815 /* The kernel uses a different segment register for performance
4816 reasons; a system call would not have to trash the userspace
4817 segment register, which would be expensive. */
4818 if (ix86_cmodel == CM_KERNEL)
4819 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4821 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4823 const char *str = ix86_stack_protector_guard_reg_str;
4824 addr_space_t seg = ADDR_SPACE_GENERIC;
4826 /* Discard optional register prefix. */
4827 if (str[0] == '%')
4828 str++;
4830 if (strlen (str) == 2 && str[1] == 's')
4832 if (str[0] == 'f')
4833 seg = ADDR_SPACE_SEG_FS;
4834 else if (str[0] == 'g')
4835 seg = ADDR_SPACE_SEG_GS;
4838 if (seg == ADDR_SPACE_GENERIC)
4839 error ("%qs is not a valid base register "
4840 "in -mstack-protector-guard-reg=",
4841 ix86_stack_protector_guard_reg_str);
4843 ix86_stack_protector_guard_reg = seg;
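  /* The accepted spellings are "fs", "gs", "%fs" and "%gs"; anything else,
     e.g. -mstack-protector-guard-reg=es, leaves seg as ADDR_SPACE_GENERIC
     and is diagnosed as an invalid base register above.  */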
4846 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4847 if (opts->x_ix86_tune_memcpy_strategy)
4849 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4850 ix86_parse_stringop_strategy_string (str, false);
4851 free (str);
4854 if (opts->x_ix86_tune_memset_strategy)
4856 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4857 ix86_parse_stringop_strategy_string (str, true);
4858 free (str);
4861 /* Save the initial options in case the user does function specific
4862 options. */
4863 if (main_args_p)
4864 target_option_default_node = target_option_current_node
4865 = build_target_option_node (opts);
4867 /* Do not support control flow instrumentation if CET is not enabled. */
4868 if (opts->x_flag_cf_protection != CF_NONE)
4870 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4871 || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
4873 if (flag_cf_protection == CF_FULL)
4875 error ("%<-fcf-protection=full%> requires CET support "
4876 "on this target. Use -mcet or one of -mibt, "
4877 "-mshstk options to enable CET");
4879 else if (flag_cf_protection == CF_BRANCH)
4881 error ("%<-fcf-protection=branch%> requires CET support "
4882 "on this target. Use -mcet or one of -mibt, "
4883 "-mshstk options to enable CET");
4885 else if (flag_cf_protection == CF_RETURN)
4887 error ("%<-fcf-protection=return%> requires CET support "
4888 "on this target. Use -mcet or one of -mibt, "
4889 "-mshstk options to enable CET");
4891 flag_cf_protection = CF_NONE;
4892 return false;
4894 opts->x_flag_cf_protection =
4895 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4898 return true;
4901 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4903 static void
4904 ix86_option_override (void)
4906 ix86_option_override_internal (true, &global_options, &global_options_set);
4909 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4910 static char *
4911 ix86_offload_options (void)
4913 if (TARGET_LP64)
4914 return xstrdup ("-foffload-abi=lp64");
4915 return xstrdup ("-foffload-abi=ilp32");
4918 /* Update register usage after having seen the compiler flags. */
4920 static void
4921 ix86_conditional_register_usage (void)
4923 int i, c_mask;
4925 /* If there are no caller-saved registers, preserve all registers
4926 except fixed_regs and the registers used for the function return value,
4927 since aggregate_value_p checks call_used_regs[regno] on the return
4928 value. */
4929 if (cfun && cfun->machine->no_caller_saved_registers)
4930 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4931 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4932 call_used_regs[i] = 0;
4934 /* For 32-bit targets, squash the REX registers. */
4935 if (! TARGET_64BIT)
4937 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4938 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4939 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4940 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4941 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4942 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4945 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4946 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4948 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4950 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4952 /* Set/reset conditionally defined registers from
4953 CALL_USED_REGISTERS initializer. */
4954 if (call_used_regs[i] > 1)
4955 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4957 /* Calculate registers of CLOBBERED_REGS register set
4958 as call used registers from GENERAL_REGS register set. */
4959 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4960 && call_used_regs[i])
4961 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4964 /* If MMX is disabled, squash the registers. */
4965 if (! TARGET_MMX)
4966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4968 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4970 /* If SSE is disabled, squash the registers. */
4971 if (! TARGET_SSE)
4972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4973 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4974 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4976 /* If the FPU is disabled, squash the registers. */
4977 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4978 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4979 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4982 /* If AVX512F is disabled, squash the registers. */
4983 if (! TARGET_AVX512F)
4985 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4986 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4988 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4989 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4992 /* If MPX is disabled, squash the registers. */
4993 if (! TARGET_MPX)
4994 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4995 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4998 /* Canonicalize a comparison from one we don't have to one we do have. */
5000 static void
5001 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5002 bool op0_preserve_value)
5004 /* The order of operands in x87 ficom compare is forced by combine in
5005 simplify_comparison () function. Float operator is treated as RTX_OBJ
5006 with a precedence over other operators and is always put in the first
5007 place. Swap condition and operands to match ficom instruction. */
5008 if (!op0_preserve_value
5009 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5011 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5013 /* We are called only for compares that are split to SAHF instruction.
5014 Ensure that we have setcc/jcc insn for the swapped condition. */
5015 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5017 std::swap (*op0, *op1);
5018 *code = (int) scode;
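  /* For instance, (lt (float (mem)) (reg)) can be rewritten here as
     (gt (reg) (float (mem))), putting the memory operand where the ficom
     pattern expects it, provided the swapped condition is one the
     SAHF/setcc path can handle.  */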
5023 /* Save the current options */
5025 static void
5026 ix86_function_specific_save (struct cl_target_option *ptr,
5027 struct gcc_options *opts)
5029 ptr->arch = ix86_arch;
5030 ptr->schedule = ix86_schedule;
5031 ptr->prefetch_sse = x86_prefetch_sse;
5032 ptr->tune = ix86_tune;
5033 ptr->branch_cost = ix86_branch_cost;
5034 ptr->tune_defaulted = ix86_tune_defaulted;
5035 ptr->arch_specified = ix86_arch_specified;
5036 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5037 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5038 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5039 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5040 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5041 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5042 ptr->x_ix86_abi = opts->x_ix86_abi;
5043 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5044 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5045 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5046 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5047 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5048 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5049 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5050 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5051 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5052 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5053 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5054 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5055 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5056 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5057 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5058 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5059 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5060 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5061 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5062 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5064 /* The fields are char but the variables are not; make sure the
5065 values fit in the fields. */
5066 gcc_assert (ptr->arch == ix86_arch);
5067 gcc_assert (ptr->schedule == ix86_schedule);
5068 gcc_assert (ptr->tune == ix86_tune);
5069 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5072 /* Restore the current options */
5074 static void
5075 ix86_function_specific_restore (struct gcc_options *opts,
5076 struct cl_target_option *ptr)
5078 enum processor_type old_tune = ix86_tune;
5079 enum processor_type old_arch = ix86_arch;
5080 unsigned int ix86_arch_mask;
5081 int i;
5083 /* We don't change -fPIC. */
5084 opts->x_flag_pic = flag_pic;
5086 ix86_arch = (enum processor_type) ptr->arch;
5087 ix86_schedule = (enum attr_cpu) ptr->schedule;
5088 ix86_tune = (enum processor_type) ptr->tune;
5089 x86_prefetch_sse = ptr->prefetch_sse;
5090 opts->x_ix86_branch_cost = ptr->branch_cost;
5091 ix86_tune_defaulted = ptr->tune_defaulted;
5092 ix86_arch_specified = ptr->arch_specified;
5093 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5094 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5095 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5096 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5097 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5098 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5099 opts->x_ix86_abi = ptr->x_ix86_abi;
5100 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5101 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5102 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5103 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5104 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5105 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5106 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5107 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5108 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5109 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5110 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5111 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5112 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5113 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5114 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5115 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5116 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5117 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5118 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5119 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5120 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5121 /* TODO: ix86_cost should be chosen at instruction or function granularity
5122 so that for cold code we use size_cost even in !optimize_size compilation. */
5123 if (opts->x_optimize_size)
5124 ix86_cost = &ix86_size_cost;
5125 else
5126 ix86_cost = ix86_tune_cost;
5128 /* Recreate the arch feature tests if the arch changed */
5129 if (old_arch != ix86_arch)
5131 ix86_arch_mask = 1u << ix86_arch;
5132 for (i = 0; i < X86_ARCH_LAST; ++i)
5133 ix86_arch_features[i]
5134 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5137 /* Recreate the tune optimization tests */
5138 if (old_tune != ix86_tune)
5139 set_ix86_tune_features (ix86_tune, false);
5142 /* Adjust target options after streaming them in. This is mainly about
5143 reconciling them with global options. */
5145 static void
5146 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5148 /* flag_pic is a global option, but ix86_cmodel is a target saved option
5149 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5150 for PIC, or error out. */
5151 if (flag_pic)
5152 switch (ptr->x_ix86_cmodel)
5154 case CM_SMALL:
5155 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5156 break;
5158 case CM_MEDIUM:
5159 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5160 break;
5162 case CM_LARGE:
5163 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5164 break;
5166 case CM_KERNEL:
5167 error ("code model %s does not support PIC mode", "kernel");
5168 break;
5170 default:
5171 break;
5173 else
5174 switch (ptr->x_ix86_cmodel)
5176 case CM_SMALL_PIC:
5177 ptr->x_ix86_cmodel = CM_SMALL;
5178 break;
5180 case CM_MEDIUM_PIC:
5181 ptr->x_ix86_cmodel = CM_MEDIUM;
5182 break;
5184 case CM_LARGE_PIC:
5185 ptr->x_ix86_cmodel = CM_LARGE;
5186 break;
5188 default:
5189 break;
5193 /* Print the current options */
5195 static void
5196 ix86_function_specific_print (FILE *file, int indent,
5197 struct cl_target_option *ptr)
5199 char *target_string
5200 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5201 ptr->x_target_flags, ptr->x_ix86_target_flags,
5202 NULL, NULL, ptr->x_ix86_fpmath, false);
5204 gcc_assert (ptr->arch < PROCESSOR_max);
5205 fprintf (file, "%*sarch = %d (%s)\n",
5206 indent, "",
5207 ptr->arch, processor_target_table[ptr->arch].name);
5209 gcc_assert (ptr->tune < PROCESSOR_max);
5210 fprintf (file, "%*stune = %d (%s)\n",
5211 indent, "",
5212 ptr->tune, processor_target_table[ptr->tune].name);
5214 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5216 if (target_string)
5218 fprintf (file, "%*s%s\n", indent, "", target_string);
5219 free (target_string);
5224 /* Inner function to process the attribute((target(...))): take an argument
5225 and set the current options from that argument. If we have a list,
5226 recursively go over the list. */
5228 static bool
5229 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5230 struct gcc_options *opts,
5231 struct gcc_options *opts_set,
5232 struct gcc_options *enum_opts_set)
5234 char *next_optstr;
5235 bool ret = true;
5237 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5238 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5239 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5240 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5241 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5243 enum ix86_opt_type
5245 ix86_opt_unknown,
5246 ix86_opt_yes,
5247 ix86_opt_no,
5248 ix86_opt_str,
5249 ix86_opt_enum,
5250 ix86_opt_isa
5253 static const struct
5255 const char *string;
5256 size_t len;
5257 enum ix86_opt_type type;
5258 int opt;
5259 int mask;
5260 } attrs[] = {
5261 /* isa options */
5262 IX86_ATTR_ISA ("sgx", OPT_msgx),
5263 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5264 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5265 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5266 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5267 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5269 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5270 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5271 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5272 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5273 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5274 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5275 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5276 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5277 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5278 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5279 IX86_ATTR_ISA ("fma", OPT_mfma),
5280 IX86_ATTR_ISA ("xop", OPT_mxop),
5281 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5282 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5283 IX86_ATTR_ISA ("avx", OPT_mavx),
5284 IX86_ATTR_ISA ("sse4", OPT_msse4),
5285 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5286 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5287 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5288 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5289 IX86_ATTR_ISA ("sse3", OPT_msse3),
5290 IX86_ATTR_ISA ("aes", OPT_maes),
5291 IX86_ATTR_ISA ("sha", OPT_msha),
5292 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5293 IX86_ATTR_ISA ("sse2", OPT_msse2),
5294 IX86_ATTR_ISA ("sse", OPT_msse),
5295 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5296 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5297 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5298 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5299 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5300 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5301 IX86_ATTR_ISA ("adx", OPT_madx),
5302 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5303 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5304 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5305 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5306 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5307 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5308 IX86_ATTR_ISA ("abm", OPT_mabm),
5309 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5310 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5311 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5312 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5313 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5314 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5315 IX86_ATTR_ISA ("sahf", OPT_msahf),
5316 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5317 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5318 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5319 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5320 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5321 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5322 IX86_ATTR_ISA ("pku", OPT_mpku),
5323 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5324 IX86_ATTR_ISA ("hle", OPT_mhle),
5325 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5326 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5327 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5328 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5329 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5330 IX86_ATTR_ISA ("ibt", OPT_mibt),
5331 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5332 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5334 /* enum options */
5335 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5337 /* string options */
5338 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5339 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5341 /* flag options */
5342 IX86_ATTR_YES ("cld",
5343 OPT_mcld,
5344 MASK_CLD),
5346 IX86_ATTR_NO ("fancy-math-387",
5347 OPT_mfancy_math_387,
5348 MASK_NO_FANCY_MATH_387),
5350 IX86_ATTR_YES ("ieee-fp",
5351 OPT_mieee_fp,
5352 MASK_IEEE_FP),
5354 IX86_ATTR_YES ("inline-all-stringops",
5355 OPT_minline_all_stringops,
5356 MASK_INLINE_ALL_STRINGOPS),
5358 IX86_ATTR_YES ("inline-stringops-dynamically",
5359 OPT_minline_stringops_dynamically,
5360 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5362 IX86_ATTR_NO ("align-stringops",
5363 OPT_mno_align_stringops,
5364 MASK_NO_ALIGN_STRINGOPS),
5366 IX86_ATTR_YES ("recip",
5367 OPT_mrecip,
5368 MASK_RECIP),
5372 /* If this is a list, recurse to get the options. */
5373 if (TREE_CODE (args) == TREE_LIST)
5375 bool ret = true;
5377 for (; args; args = TREE_CHAIN (args))
5378 if (TREE_VALUE (args)
5379 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5380 p_strings, opts, opts_set,
5381 enum_opts_set))
5382 ret = false;
5384 return ret;
5387 else if (TREE_CODE (args) != STRING_CST)
5389 error ("attribute %<target%> argument not a string");
5390 return false;
5393 /* Handle multiple arguments separated by commas. */
5394 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5396 while (next_optstr && *next_optstr != '\0')
5398 char *p = next_optstr;
5399 char *orig_p = p;
5400 char *comma = strchr (next_optstr, ',');
5401 const char *opt_string;
5402 size_t len, opt_len;
5403 int opt;
5404 bool opt_set_p;
5405 char ch;
5406 unsigned i;
5407 enum ix86_opt_type type = ix86_opt_unknown;
5408 int mask = 0;
5410 if (comma)
5412 *comma = '\0';
5413 len = comma - next_optstr;
5414 next_optstr = comma + 1;
5416 else
5418 len = strlen (p);
5419 next_optstr = NULL;
5422 /* Recognize no-xxx. */
5423 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5425 opt_set_p = false;
5426 p += 3;
5427 len -= 3;
5429 else
5430 opt_set_p = true;
5432 /* Find the option. */
5433 ch = *p;
5434 opt = N_OPTS;
5435 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5437 type = attrs[i].type;
5438 opt_len = attrs[i].len;
5439 if (ch == attrs[i].string[0]
5440 && ((type != ix86_opt_str && type != ix86_opt_enum)
5441 ? len == opt_len
5442 : len > opt_len)
5443 && memcmp (p, attrs[i].string, opt_len) == 0)
5445 opt = attrs[i].opt;
5446 mask = attrs[i].mask;
5447 opt_string = attrs[i].string;
5448 break;
5452 /* Process the option. */
5453 if (opt == N_OPTS)
5455 error ("attribute(target(\"%s\")) is unknown", orig_p);
5456 ret = false;
5459 else if (type == ix86_opt_isa)
5461 struct cl_decoded_option decoded;
5463 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5464 ix86_handle_option (opts, opts_set,
5465 &decoded, input_location);
5468 else if (type == ix86_opt_yes || type == ix86_opt_no)
5470 if (type == ix86_opt_no)
5471 opt_set_p = !opt_set_p;
5473 if (opt_set_p)
5474 opts->x_target_flags |= mask;
5475 else
5476 opts->x_target_flags &= ~mask;
5479 else if (type == ix86_opt_str)
5481 if (p_strings[opt])
5483 error ("option(\"%s\") was already specified", opt_string);
5484 ret = false;
5486 else
5487 p_strings[opt] = xstrdup (p + opt_len);
5490 else if (type == ix86_opt_enum)
5492 bool arg_ok;
5493 int value;
5495 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5496 if (arg_ok)
5497 set_option (opts, enum_opts_set, opt, value,
5498 p + opt_len, DK_UNSPECIFIED, input_location,
5499 global_dc);
5500 else
5502 error ("attribute(target(\"%s\")) is unknown", orig_p);
5503 ret = false;
5507 else
5508 gcc_unreachable ();
5511 return ret;
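/* Illustrative sketch (not part of the original file): the kind of attribute
   string the parser above accepts.  Options are comma separated, a "no-"
   prefix clears an ISA flag, and "arch="/"tune=" take a string argument.
   Function and variable names here are hypothetical.  */
#if 0
/* Enable AVX2 and FMA, explicitly disable SSE4a, and tune for Haswell,
   for this one function only.  */
__attribute__ ((target ("avx2,fma,no-sse4a,tune=haswell")))
static int
example_dot (const int *a, const int *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}
#endif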
5514 /* Release allocated strings. */
5515 static void
5516 release_options_strings (char **option_strings)
5518 /* Free up memory allocated to hold the strings */
5519 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5520 free (option_strings[i]);
5523 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5525 tree
5526 ix86_valid_target_attribute_tree (tree args,
5527 struct gcc_options *opts,
5528 struct gcc_options *opts_set)
5530 const char *orig_arch_string = opts->x_ix86_arch_string;
5531 const char *orig_tune_string = opts->x_ix86_tune_string;
5532 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5533 int orig_tune_defaulted = ix86_tune_defaulted;
5534 int orig_arch_specified = ix86_arch_specified;
5535 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5536 tree t = NULL_TREE;
5537 struct cl_target_option *def
5538 = TREE_TARGET_OPTION (target_option_default_node);
5539 struct gcc_options enum_opts_set;
5541 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5543 /* Process each of the options on the chain. */
5544 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5545 opts_set, &enum_opts_set))
5546 return error_mark_node;
5548 /* If the changed options are different from the default, rerun
5549 ix86_option_override_internal, and then save the options away.
5550 The string options are attribute options, and will be undone
5551 when we copy the save structure. */
5552 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5553 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5554 || opts->x_target_flags != def->x_target_flags
5555 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5556 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5557 || enum_opts_set.x_ix86_fpmath)
5559 /* If we are using the default tune= or arch=, undo the string assigned,
5560 and use the default. */
5561 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5563 opts->x_ix86_arch_string
5564 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5566 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5567 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5568 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5569 | OPTION_MASK_ABI_64
5570 | OPTION_MASK_ABI_X32
5571 | OPTION_MASK_CODE16);
5572 opts->x_ix86_isa_flags2 = 0;
5574 else if (!orig_arch_specified)
5575 opts->x_ix86_arch_string = NULL;
5577 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5578 opts->x_ix86_tune_string
5579 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5580 else if (orig_tune_defaulted)
5581 opts->x_ix86_tune_string = NULL;
5583 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5584 if (enum_opts_set.x_ix86_fpmath)
5585 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5587 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5588 bool r = ix86_option_override_internal (false, opts, opts_set);
5589 if (!r)
5591 release_options_strings (option_strings);
5592 return error_mark_node;
5595 /* Add any builtin functions with the new isa if any. */
5596 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5598 /* Save the current options unless we are validating options for
5599 #pragma. */
5600 t = build_target_option_node (opts);
5602 opts->x_ix86_arch_string = orig_arch_string;
5603 opts->x_ix86_tune_string = orig_tune_string;
5604 opts_set->x_ix86_fpmath = orig_fpmath_set;
5606 release_options_strings (option_strings);
5609 return t;
5612 /* Hook to validate attribute((target("string"))). */
5614 static bool
5615 ix86_valid_target_attribute_p (tree fndecl,
5616 tree ARG_UNUSED (name),
5617 tree args,
5618 int ARG_UNUSED (flags))
5620 struct gcc_options func_options;
5621 tree new_target, new_optimize;
5622 bool ret = true;
5624 /* attribute((target("default"))) does nothing, beyond
5625 affecting multi-versioning. */
5626 if (TREE_VALUE (args)
5627 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5628 && TREE_CHAIN (args) == NULL_TREE
5629 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5630 return true;
5632 tree old_optimize = build_optimization_node (&global_options);
5634 /* Get the optimization options of the current function. */
5635 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5637 if (!func_optimize)
5638 func_optimize = old_optimize;
5640 /* Init func_options. */
5641 memset (&func_options, 0, sizeof (func_options));
5642 init_options_struct (&func_options, NULL);
5643 lang_hooks.init_options_struct (&func_options);
5645 cl_optimization_restore (&func_options,
5646 TREE_OPTIMIZATION (func_optimize));
5648 /* Initialize func_options to the default before its target options can
5649 be set. */
5650 cl_target_option_restore (&func_options,
5651 TREE_TARGET_OPTION (target_option_default_node));
5653 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5654 &global_options_set);
5656 new_optimize = build_optimization_node (&func_options);
5658 if (new_target == error_mark_node)
5659 ret = false;
5661 else if (fndecl && new_target)
5663 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5665 if (old_optimize != new_optimize)
5666 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5669 finalize_options_struct (&func_options);
5671 return ret;
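/* Illustrative sketch (not part of the original file): the "default" special
   case above exists for function multi-versioning, e.g. in the C++ front
   end.  The function name is hypothetical.  */
#if 0
__attribute__ ((target ("default"))) int foo (void) { return 0; }
__attribute__ ((target ("avx2")))    int foo (void) { return 2; }
/* A resolver selects the AVX2 variant at load time on capable CPUs; the
   "default" variant itself does not change any target options, which is
   why the hook returns early for it.  */
#endif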
5675 /* Hook to determine if one function can safely inline another. */
5677 static bool
5678 ix86_can_inline_p (tree caller, tree callee)
5680 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5681 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5682 if (!callee_tree)
5683 callee_tree = target_option_default_node;
5684 if (!caller_tree)
5685 caller_tree = target_option_default_node;
5686 if (callee_tree == caller_tree)
5687 return true;
5689 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5690 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5691 bool ret = false;
5693 /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
5694 function can inline a SSE2 function but a SSE2 function can't inline
5695 a SSE4 function. */
5696 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5697 != callee_opts->x_ix86_isa_flags)
5698 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5699 != callee_opts->x_ix86_isa_flags2))
5700 ret = false;
5702 /* See if we have the same non-isa options. */
5703 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5704 ret = false;
5706 /* See if arch, tune, etc. are the same. */
5707 else if (caller_opts->arch != callee_opts->arch)
5708 ret = false;
5710 else if (caller_opts->tune != callee_opts->tune)
5711 ret = false;
5713 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5714 /* If the callee doesn't use FP expressions, differences in
5715 ix86_fpmath can be ignored. We are called from FEs
5716 for multi-versioning call optimization, so beware of
5717 ipa_fn_summaries not available. */
5718 && (! ipa_fn_summaries
5719 || ipa_fn_summaries->get
5720 (cgraph_node::get (callee))->fp_expressions))
5721 ret = false;
5723 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5724 ret = false;
5726 else
5727 ret = true;
5729 return ret;
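/* Illustrative sketch (not part of the original file) of the ISA-subset rule
   checked above; function names are hypothetical.  */
#if 0
__attribute__ ((target ("sse4.2")))
static int callee (int x) { return x + 1; }

/* OK: -mavx2 implies SSE4.2, so the caller's ISA set is a superset of the
   callee's and callee () may be inlined here.  */
__attribute__ ((target ("avx2")))
int caller_ok (int x) { return callee (x); }

/* Not inlinable: an SSE2-only caller must not inline an SSE4.2 callee.  */
__attribute__ ((target ("sse2")))
int caller_no (int x) { return callee (x); }
#endif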
5733 /* Remember the last target of ix86_set_current_function. */
5734 static GTY(()) tree ix86_previous_fndecl;
5736 /* Set targets globals to the default (or current #pragma GCC target
5737 if active). Invalidate ix86_previous_fndecl cache. */
5739 void
5740 ix86_reset_previous_fndecl (void)
5742 tree new_tree = target_option_current_node;
5743 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5744 if (TREE_TARGET_GLOBALS (new_tree))
5745 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5746 else if (new_tree == target_option_default_node)
5747 restore_target_globals (&default_target_globals);
5748 else
5749 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5750 ix86_previous_fndecl = NULL_TREE;
5753 /* Set the func_type field from the function FNDECL. */
5755 static void
5756 ix86_set_func_type (tree fndecl)
5758 if (cfun->machine->func_type == TYPE_UNKNOWN)
5760 if (lookup_attribute ("interrupt",
5761 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5763 if (ix86_function_naked (fndecl))
5764 error_at (DECL_SOURCE_LOCATION (fndecl),
5765 "interrupt and naked attributes are not compatible");
5767 int nargs = 0;
5768 for (tree arg = DECL_ARGUMENTS (fndecl);
5769 arg;
5770 arg = TREE_CHAIN (arg))
5771 nargs++;
5772 cfun->machine->no_caller_saved_registers = true;
5773 cfun->machine->func_type
5774 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5776 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5778 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5779 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5780 sorry ("Only DWARF debug format is supported for interrupt "
5781 "service routine.");
5783 else
5785 cfun->machine->func_type = TYPE_NORMAL;
5786 if (lookup_attribute ("no_caller_saved_registers",
5787 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5788 cfun->machine->no_caller_saved_registers = true;
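/* Illustrative sketch (not part of the original file) of the two handler
   shapes classified above: one argument gives TYPE_INTERRUPT, two give
   TYPE_EXCEPTION.  The uword_t typedef shown is an assumption for 64-bit
   code; the names are hypothetical.  */
#if 0
struct interrupt_frame;
typedef unsigned long long uword_t;

__attribute__ ((interrupt))
void isr_handler (struct interrupt_frame *frame)   /* -> TYPE_INTERRUPT */
{
}

__attribute__ ((interrupt))
void exc_handler (struct interrupt_frame *frame,
		  uword_t error_code)               /* -> TYPE_EXCEPTION */
{
}
#endif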
5793 /* Establish appropriate back-end context for processing the function
5794 FNDECL. The argument might be NULL to indicate processing at top
5795 level, outside of any function scope. */
5796 static void
5797 ix86_set_current_function (tree fndecl)
5799 /* Only change the context if the function changes. This hook is called
5800 several times in the course of compiling a function, and we don't want to
5801 slow things down too much or call target_reinit when it isn't safe. */
5802 if (fndecl == ix86_previous_fndecl)
5804 /* There may be 2 function bodies for the same function FNDECL,
5805 one is extern inline and one isn't. Call ix86_set_func_type
5806 to set the func_type field. */
5807 if (fndecl != NULL_TREE)
5808 ix86_set_func_type (fndecl);
5809 return;
5812 tree old_tree;
5813 if (ix86_previous_fndecl == NULL_TREE)
5814 old_tree = target_option_current_node;
5815 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5816 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5817 else
5818 old_tree = target_option_default_node;
5820 if (fndecl == NULL_TREE)
5822 if (old_tree != target_option_current_node)
5823 ix86_reset_previous_fndecl ();
5824 return;
5827 ix86_set_func_type (fndecl);
5829 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5830 if (new_tree == NULL_TREE)
5831 new_tree = target_option_default_node;
5833 if (old_tree != new_tree)
5835 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5836 if (TREE_TARGET_GLOBALS (new_tree))
5837 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5838 else if (new_tree == target_option_default_node)
5839 restore_target_globals (&default_target_globals);
5840 else
5841 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5843 ix86_previous_fndecl = fndecl;
5845 static bool prev_no_caller_saved_registers;
5847 /* 64-bit MS and SYSV ABI have different set of call used registers.
5848 Avoid expensive re-initialization of init_regs each time we switch
5849 function context. */
5850 if (TARGET_64BIT
5851 && (call_used_regs[SI_REG]
5852 == (cfun->machine->call_abi == MS_ABI)))
5853 reinit_regs ();
5854 /* Need to re-initialize init_regs if caller-saved registers are
5855 changed. */
5856 else if (prev_no_caller_saved_registers
5857 != cfun->machine->no_caller_saved_registers)
5858 reinit_regs ();
5860 if (cfun->machine->func_type != TYPE_NORMAL
5861 || cfun->machine->no_caller_saved_registers)
5863 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5864 may change processor state. */
5865 const char *isa;
5866 if (TARGET_MPX)
5867 isa = "MPX";
5868 else if (TARGET_SSE)
5869 isa = "SSE";
5870 else if (TARGET_MMX)
5871 isa = "MMX/3Dnow";
5872 else if (TARGET_80387)
5873 isa = "80387";
5874 else
5875 isa = NULL;
5876 if (isa != NULL)
5878 if (cfun->machine->func_type != TYPE_NORMAL)
5879 sorry ("%s instructions aren't allowed in %s service routine",
5880 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5881 ? "exception" : "interrupt"));
5882 else
5883 sorry ("%s instructions aren't allowed in function with "
5884 "no_caller_saved_registers attribute", isa);
5885 /* Don't issue the same error twice. */
5886 cfun->machine->func_type = TYPE_NORMAL;
5887 cfun->machine->no_caller_saved_registers = false;
5891 prev_no_caller_saved_registers
5892 = cfun->machine->no_caller_saved_registers;
5896 /* Return true if this goes in large data/bss. */
5898 static bool
5899 ix86_in_large_data_p (tree exp)
5901 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5902 return false;
5904 if (exp == NULL_TREE)
5905 return false;
5907 /* Functions are never large data. */
5908 if (TREE_CODE (exp) == FUNCTION_DECL)
5909 return false;
5911 /* Automatic variables are never large data. */
5912 if (VAR_P (exp) && !is_global_var (exp))
5913 return false;
5915 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5917 const char *section = DECL_SECTION_NAME (exp);
5918 if (strcmp (section, ".ldata") == 0
5919 || strcmp (section, ".lbss") == 0)
5920 return true;
5921 return false;
5923 else
5925 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5927 /* If this is an incomplete type with size 0, then we can't put it
5928 in data because it might be too big when completed. Also,
5929 int_size_in_bytes returns -1 if the size can vary or is larger than
5930 an integer, in which case it is also safer to assume that it goes in
5931 large data. */
5932 if (size <= 0 || size > ix86_section_threshold)
5933 return true;
5936 return false;
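/* Illustrative sketch (not part of the original file): with -mcmodel=medium,
   objects larger than the -mlarge-data-threshold value (ix86_section_threshold)
   are treated as large data by the test above and placed in .ldata/.lbss.
   The variable names are hypothetical.  */
#if 0
/* e.g.  gcc -mcmodel=medium -mlarge-data-threshold=65536 file.c  */
static char big_buffer[1 << 20];   /* above the threshold: goes to .lbss */
static int  small_counter;         /* below the threshold: stays in .bss */
#endif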
5939 /* i386-specific section flag to mark large sections. */
5940 #define SECTION_LARGE SECTION_MACH_DEP
5942 /* Switch to the appropriate section for output of DECL.
5943 DECL is either a `VAR_DECL' node or a constant of some sort.
5944 RELOC indicates whether forming the initial value of DECL requires
5945 link-time relocations. */
5947 ATTRIBUTE_UNUSED static section *
5948 x86_64_elf_select_section (tree decl, int reloc,
5949 unsigned HOST_WIDE_INT align)
5951 if (ix86_in_large_data_p (decl))
5953 const char *sname = NULL;
5954 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5955 switch (categorize_decl_for_section (decl, reloc))
5957 case SECCAT_DATA:
5958 sname = ".ldata";
5959 break;
5960 case SECCAT_DATA_REL:
5961 sname = ".ldata.rel";
5962 break;
5963 case SECCAT_DATA_REL_LOCAL:
5964 sname = ".ldata.rel.local";
5965 break;
5966 case SECCAT_DATA_REL_RO:
5967 sname = ".ldata.rel.ro";
5968 break;
5969 case SECCAT_DATA_REL_RO_LOCAL:
5970 sname = ".ldata.rel.ro.local";
5971 break;
5972 case SECCAT_BSS:
5973 sname = ".lbss";
5974 flags |= SECTION_BSS;
5975 break;
5976 case SECCAT_RODATA:
5977 case SECCAT_RODATA_MERGE_STR:
5978 case SECCAT_RODATA_MERGE_STR_INIT:
5979 case SECCAT_RODATA_MERGE_CONST:
5980 sname = ".lrodata";
5981 flags &= ~SECTION_WRITE;
5982 break;
5983 case SECCAT_SRODATA:
5984 case SECCAT_SDATA:
5985 case SECCAT_SBSS:
5986 gcc_unreachable ();
5987 case SECCAT_TEXT:
5988 case SECCAT_TDATA:
5989 case SECCAT_TBSS:
5990 /* We don't split these for the medium model. Place them into
5991 default sections and hope for the best. */
5992 break;
5994 if (sname)
5996 /* We might get called with string constants, but get_named_section
5997 doesn't like them as they are not DECLs. Also, we need to set
5998 flags in that case. */
5999 if (!DECL_P (decl))
6000 return get_section (sname, flags, NULL);
6001 return get_named_section (decl, sname, reloc);
6004 return default_elf_select_section (decl, reloc, align);
6007 /* Select a set of attributes for section NAME based on the properties
6008 of DECL and whether or not RELOC indicates that DECL's initializer
6009 might contain runtime relocations. */
6011 static unsigned int ATTRIBUTE_UNUSED
6012 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6014 unsigned int flags = default_section_type_flags (decl, name, reloc);
6016 if (ix86_in_large_data_p (decl))
6017 flags |= SECTION_LARGE;
6019 if (decl == NULL_TREE
6020 && (strcmp (name, ".ldata.rel.ro") == 0
6021 || strcmp (name, ".ldata.rel.ro.local") == 0))
6022 flags |= SECTION_RELRO;
6024 if (strcmp (name, ".lbss") == 0
6025 || strncmp (name, ".lbss.", 5) == 0
6026 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6027 flags |= SECTION_BSS;
6029 return flags;
6032 /* Build up a unique section name, expressed as a
6033 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6034 RELOC indicates whether the initial value of EXP requires
6035 link-time relocations. */
6037 static void ATTRIBUTE_UNUSED
6038 x86_64_elf_unique_section (tree decl, int reloc)
6040 if (ix86_in_large_data_p (decl))
6042 const char *prefix = NULL;
6043 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6044 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6046 switch (categorize_decl_for_section (decl, reloc))
6048 case SECCAT_DATA:
6049 case SECCAT_DATA_REL:
6050 case SECCAT_DATA_REL_LOCAL:
6051 case SECCAT_DATA_REL_RO:
6052 case SECCAT_DATA_REL_RO_LOCAL:
6053 prefix = one_only ? ".ld" : ".ldata";
6054 break;
6055 case SECCAT_BSS:
6056 prefix = one_only ? ".lb" : ".lbss";
6057 break;
6058 case SECCAT_RODATA:
6059 case SECCAT_RODATA_MERGE_STR:
6060 case SECCAT_RODATA_MERGE_STR_INIT:
6061 case SECCAT_RODATA_MERGE_CONST:
6062 prefix = one_only ? ".lr" : ".lrodata";
6063 break;
6064 case SECCAT_SRODATA:
6065 case SECCAT_SDATA:
6066 case SECCAT_SBSS:
6067 gcc_unreachable ();
6068 case SECCAT_TEXT:
6069 case SECCAT_TDATA:
6070 case SECCAT_TBSS:
6071 /* We don't split these for the medium model. Place them into
6072 default sections and hope for the best. */
6073 break;
6075 if (prefix)
6077 const char *name, *linkonce;
6078 char *string;
6080 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6081 name = targetm.strip_name_encoding (name);
6083 /* If we're using one_only, then there needs to be a .gnu.linkonce
6084 prefix to the section name. */
6085 linkonce = one_only ? ".gnu.linkonce" : "";
6087 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6089 set_decl_section_name (decl, string);
6090 return;
6093 default_unique_section (decl, reloc);
6096 #ifdef COMMON_ASM_OP
6098 #ifndef LARGECOMM_SECTION_ASM_OP
6099 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6100 #endif
6102 /* This says how to output assembler code to declare an
6103 uninitialized external linkage data object.
6105 For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for
6106 large objects. */
6107 void
6108 x86_elf_aligned_decl_common (FILE *file, tree decl,
6109 const char *name, unsigned HOST_WIDE_INT size,
6110 int align)
6112 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6113 && size > (unsigned int)ix86_section_threshold)
6115 switch_to_section (get_named_section (decl, ".lbss", 0));
6116 fputs (LARGECOMM_SECTION_ASM_OP, file);
6118 else
6119 fputs (COMMON_ASM_OP, file);
6120 assemble_name (file, name);
6121 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6122 size, align / BITS_PER_UNIT);
6124 #endif
6126 /* Utility function for targets to use in implementing
6127 ASM_OUTPUT_ALIGNED_BSS. */
6129 void
6130 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6131 unsigned HOST_WIDE_INT size, int align)
6133 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6134 && size > (unsigned int)ix86_section_threshold)
6135 switch_to_section (get_named_section (decl, ".lbss", 0));
6136 else
6137 switch_to_section (bss_section);
6138 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6139 #ifdef ASM_DECLARE_OBJECT_NAME
6140 last_assemble_variable_decl = decl;
6141 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6142 #else
6143 /* Standard thing is just output label for the object. */
6144 ASM_OUTPUT_LABEL (file, name);
6145 #endif /* ASM_DECLARE_OBJECT_NAME */
6146 ASM_OUTPUT_SKIP (file, size ? size : 1);
6149 /* Decide whether we must probe the stack before any space allocation
6150 on this target. It's essentially TARGET_STACK_PROBE except when
6151 -fstack-check causes the stack to be already probed differently. */
6153 bool
6154 ix86_target_stack_probe (void)
6156 /* Do not probe the stack twice if static stack checking is enabled. */
6157 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6158 return false;
6160 return TARGET_STACK_PROBE;
6163 /* Decide whether we can make a sibling call to a function. DECL is the
6164 declaration of the function being targeted by the call and EXP is the
6165 CALL_EXPR representing the call. */
6167 static bool
6168 ix86_function_ok_for_sibcall (tree decl, tree exp)
6170 tree type, decl_or_type;
6171 rtx a, b;
6172 bool bind_global = decl && !targetm.binds_local_p (decl);
6174 if (ix86_function_naked (current_function_decl))
6175 return false;
6177 /* Sibling call isn't OK if there are no caller-saved registers
6178 since all registers must be preserved before return. */
6179 if (cfun->machine->no_caller_saved_registers)
6180 return false;
6182 /* If we are generating position-independent code, we cannot sibcall
6183 optimize direct calls to global functions, as the PLT requires
6184 %ebx be live. (Darwin does not have a PLT.) */
6185 if (!TARGET_MACHO
6186 && !TARGET_64BIT
6187 && flag_pic
6188 && flag_plt
6189 && bind_global)
6190 return false;
6192 /* If we need to align the outgoing stack, then sibcalling would
6193 unalign the stack, which may break the called function. */
6194 if (ix86_minimum_incoming_stack_boundary (true)
6195 < PREFERRED_STACK_BOUNDARY)
6196 return false;
6198 if (decl)
6200 decl_or_type = decl;
6201 type = TREE_TYPE (decl);
6203 else
6205 /* We're looking at the CALL_EXPR, we need the type of the function. */
6206 type = CALL_EXPR_FN (exp); /* pointer expression */
6207 type = TREE_TYPE (type); /* pointer type */
6208 type = TREE_TYPE (type); /* function type */
6209 decl_or_type = type;
6212 /* Check that the return value locations are the same. Like
6213 if we are returning floats on the 80387 register stack, we cannot
6214 make a sibcall from a function that doesn't return a float to a
6215 function that does or, conversely, from a function that does return
6216 a float to a function that doesn't; the necessary stack adjustment
6217 would not be executed. This is also the place we notice
6218 differences in the return value ABI. Note that it is ok for one
6219 of the functions to have void return type as long as the return
6220 value of the other is passed in a register. */
6221 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6222 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6223 cfun->decl, false);
6224 if (STACK_REG_P (a) || STACK_REG_P (b))
6226 if (!rtx_equal_p (a, b))
6227 return false;
6229 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6231 else if (!rtx_equal_p (a, b))
6232 return false;
6234 if (TARGET_64BIT)
6236 /* The SYSV ABI has more call-clobbered registers;
6237 disallow sibcalls from MS to SYSV. */
6238 if (cfun->machine->call_abi == MS_ABI
6239 && ix86_function_type_abi (type) == SYSV_ABI)
6240 return false;
6242 else
6244 /* If this call is indirect, we'll need to be able to use a
6245 call-clobbered register for the address of the target function.
6246 Make sure that all such registers are not used for passing
6247 parameters. Note that DLLIMPORT functions and call to global
6248 function via GOT slot are indirect. */
6249 if (!decl
6250 || (bind_global && flag_pic && !flag_plt)
6251 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6253 /* Check if regparm >= 3 since arg_reg_available is set to
6254 false if regparm == 0. If regparm is 1 or 2, there is
6255 always a call-clobbered register available.
6257 ??? The symbol indirect call doesn't need a call-clobbered
6258 register. But we don't know if this is a symbol indirect
6259 call or not here. */
6260 if (ix86_function_regparm (type, NULL) >= 3
6261 && !cfun->machine->arg_reg_available)
6262 return false;
6266 /* Otherwise okay. That also includes certain types of indirect calls. */
6267 return true;
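/* Illustrative sketch (not part of the original file): a tail call that the
   test above may accept, assuming optimization with sibling-call conversion
   enabled (e.g. -O2).  The function names are hypothetical.  */
#if 0
int work (int x);

int
wrapper (int x)
{
  /* Same return-value ABI, no incoming stack realignment needed, and the
     caller is neither naked nor no_caller_saved_registers, so this call
     can be emitted as a plain "jmp work".  */
  return work (x + 1);
}
#endif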
6270 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6271 and "sseregparm" calling convention attributes;
6272 arguments as in struct attribute_spec.handler. */
6274 static tree
6275 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6276 bool *no_add_attrs)
6278 if (TREE_CODE (*node) != FUNCTION_TYPE
6279 && TREE_CODE (*node) != METHOD_TYPE
6280 && TREE_CODE (*node) != FIELD_DECL
6281 && TREE_CODE (*node) != TYPE_DECL)
6283 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6284 name);
6285 *no_add_attrs = true;
6286 return NULL_TREE;
6289 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6290 if (is_attribute_p ("regparm", name))
6292 tree cst;
6294 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6296 error ("fastcall and regparm attributes are not compatible");
6299 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6301 error ("regparam and thiscall attributes are not compatible");
6304 cst = TREE_VALUE (args);
6305 if (TREE_CODE (cst) != INTEGER_CST)
6307 warning (OPT_Wattributes,
6308 "%qE attribute requires an integer constant argument",
6309 name);
6310 *no_add_attrs = true;
6312 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6314 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6315 name, REGPARM_MAX);
6316 *no_add_attrs = true;
6319 return NULL_TREE;
6322 if (TARGET_64BIT)
6324 /* Do not warn when emulating the MS ABI. */
6325 if ((TREE_CODE (*node) != FUNCTION_TYPE
6326 && TREE_CODE (*node) != METHOD_TYPE)
6327 || ix86_function_type_abi (*node) != MS_ABI)
6328 warning (OPT_Wattributes, "%qE attribute ignored",
6329 name);
6330 *no_add_attrs = true;
6331 return NULL_TREE;
6334 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6335 if (is_attribute_p ("fastcall", name))
6337 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6339 error ("fastcall and cdecl attributes are not compatible");
6341 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6343 error ("fastcall and stdcall attributes are not compatible");
6345 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6347 error ("fastcall and regparm attributes are not compatible");
6349 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6351 error ("fastcall and thiscall attributes are not compatible");
6355 /* Can combine stdcall with fastcall (redundant), regparm and
6356 sseregparm. */
6357 else if (is_attribute_p ("stdcall", name))
6359 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6361 error ("stdcall and cdecl attributes are not compatible");
6363 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6365 error ("stdcall and fastcall attributes are not compatible");
6367 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6369 error ("stdcall and thiscall attributes are not compatible");
6373 /* Can combine cdecl with regparm and sseregparm. */
6374 else if (is_attribute_p ("cdecl", name))
6376 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6378 error ("stdcall and cdecl attributes are not compatible");
6380 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6382 error ("fastcall and cdecl attributes are not compatible");
6384 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6386 error ("cdecl and thiscall attributes are not compatible");
6389 else if (is_attribute_p ("thiscall", name))
6391 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6392 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6393 name);
6394 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6396 error ("stdcall and thiscall attributes are not compatible");
6398 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6400 error ("fastcall and thiscall attributes are not compatible");
6402 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6404 error ("cdecl and thiscall attributes are not compatible");
6408 /* Can combine sseregparm with all attributes. */
6410 return NULL_TREE;
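/* Illustrative sketch (not part of the original file): 32-bit declarations
   using the calling-convention attributes handled above; the last one is
   rejected by the compatibility checks in this handler.  Names are
   hypothetical.  */
#if 0
int __attribute__ ((stdcall))     s_fn (int a, int b);        /* callee pops args      */
int __attribute__ ((fastcall))    f_fn (int a, int b);        /* a in %ecx, b in %edx  */
int __attribute__ ((regparm (3))) r_fn (int a, int b, int c); /* %eax, %edx, %ecx      */

/* error: fastcall and regparm attributes are not compatible  */
int __attribute__ ((fastcall, regparm (2))) bad_fn (int a, int b);
#endif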
6413 /* The transactional memory builtins are implicitly regparm or fastcall
6414 depending on the ABI. Override the generic do-nothing attribute that
6415 these builtins were declared with, and replace it with one of the two
6416 attributes that we expect elsewhere. */
6418 static tree
6419 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6420 int flags, bool *no_add_attrs)
6422 tree alt;
6424 /* In no case do we want to add the placeholder attribute. */
6425 *no_add_attrs = true;
6427 /* The 64-bit ABI is unchanged for transactional memory. */
6428 if (TARGET_64BIT)
6429 return NULL_TREE;
6431 /* ??? Is there a better way to validate 32-bit windows? We have
6432 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6433 if (CHECK_STACK_LIMIT > 0)
6434 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6435 else
6437 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6438 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6440 decl_attributes (node, alt, flags);
6442 return NULL_TREE;
6445 /* This function determines from TYPE the calling-convention. */
6447 unsigned int
6448 ix86_get_callcvt (const_tree type)
6450 unsigned int ret = 0;
6451 bool is_stdarg;
6452 tree attrs;
6454 if (TARGET_64BIT)
6455 return IX86_CALLCVT_CDECL;
6457 attrs = TYPE_ATTRIBUTES (type);
6458 if (attrs != NULL_TREE)
6460 if (lookup_attribute ("cdecl", attrs))
6461 ret |= IX86_CALLCVT_CDECL;
6462 else if (lookup_attribute ("stdcall", attrs))
6463 ret |= IX86_CALLCVT_STDCALL;
6464 else if (lookup_attribute ("fastcall", attrs))
6465 ret |= IX86_CALLCVT_FASTCALL;
6466 else if (lookup_attribute ("thiscall", attrs))
6467 ret |= IX86_CALLCVT_THISCALL;
6469 /* Regparm isn't allowed for thiscall and fastcall. */
6470 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6472 if (lookup_attribute ("regparm", attrs))
6473 ret |= IX86_CALLCVT_REGPARM;
6474 if (lookup_attribute ("sseregparm", attrs))
6475 ret |= IX86_CALLCVT_SSEREGPARM;
6478 if (IX86_BASE_CALLCVT(ret) != 0)
6479 return ret;
6482 is_stdarg = stdarg_p (type);
6483 if (TARGET_RTD && !is_stdarg)
6484 return IX86_CALLCVT_STDCALL | ret;
6486 if (ret != 0
6487 || is_stdarg
6488 || TREE_CODE (type) != METHOD_TYPE
6489 || ix86_function_type_abi (type) != MS_ABI)
6490 return IX86_CALLCVT_CDECL | ret;
6492 return IX86_CALLCVT_THISCALL;
6495 /* Return 0 if the attributes for two types are incompatible, 1 if they
6496 are compatible, and 2 if they are nearly compatible (which causes a
6497 warning to be generated). */
6499 static int
6500 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6502 unsigned int ccvt1, ccvt2;
6504 if (TREE_CODE (type1) != FUNCTION_TYPE
6505 && TREE_CODE (type1) != METHOD_TYPE)
6506 return 1;
6508 ccvt1 = ix86_get_callcvt (type1);
6509 ccvt2 = ix86_get_callcvt (type2);
6510 if (ccvt1 != ccvt2)
6511 return 0;
6512 if (ix86_function_regparm (type1, NULL)
6513 != ix86_function_regparm (type2, NULL))
6514 return 0;
6516 return 1;
6519 /* Return the regparm value for a function with the indicated TYPE and DECL.
6520 DECL may be NULL when calling function indirectly
6521 or considering a libcall. */
6523 static int
6524 ix86_function_regparm (const_tree type, const_tree decl)
6526 tree attr;
6527 int regparm;
6528 unsigned int ccvt;
6530 if (TARGET_64BIT)
6531 return (ix86_function_type_abi (type) == SYSV_ABI
6532 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6533 ccvt = ix86_get_callcvt (type);
6534 regparm = ix86_regparm;
6536 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6538 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6539 if (attr)
6541 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6542 return regparm;
6545 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6546 return 2;
6547 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6548 return 1;
6550 /* Use register calling convention for local functions when possible. */
6551 if (decl
6552 && TREE_CODE (decl) == FUNCTION_DECL)
6554 cgraph_node *target = cgraph_node::get (decl);
6555 if (target)
6556 target = target->function_symbol ();
6558 /* Caller and callee must agree on the calling convention, so
6559 checking just `optimize' here would mean that with
6560 __attribute__((optimize (...))) the caller could use the regparm convention
6561 and the callee not, or vice versa. Instead look at whether the callee
6562 is optimized or not. */
6563 if (target && opt_for_fn (target->decl, optimize)
6564 && !(profile_flag && !flag_fentry))
6566 cgraph_local_info *i = &target->local;
6567 if (i && i->local && i->can_change_signature)
6569 int local_regparm, globals = 0, regno;
6571 /* Make sure no regparm register is taken by a
6572 fixed register variable. */
6573 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6574 local_regparm++)
6575 if (fixed_regs[local_regparm])
6576 break;
6578 /* We don't want to use regparm(3) for nested functions as
6579 these use a static chain pointer in the third argument. */
6580 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6581 local_regparm = 2;
6583 /* Save a register for the split stack. */
6584 if (flag_split_stack)
6586 if (local_regparm == 3)
6587 local_regparm = 2;
6588 else if (local_regparm == 2
6589 && DECL_STATIC_CHAIN (target->decl))
6590 local_regparm = 1;
6593 /* Each fixed register usage increases register pressure,
6594 so fewer registers should be used for argument passing.
6595 This functionality can be overridden by an explicit
6596 regparm value. */
6597 for (regno = AX_REG; regno <= DI_REG; regno++)
6598 if (fixed_regs[regno])
6599 globals++;
6601 local_regparm
6602 = globals < local_regparm ? local_regparm - globals : 0;
6604 if (local_regparm > regparm)
6605 regparm = local_regparm;
6610 return regparm;
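/* Illustrative sketch (not part of the original file) of the two paths
   above: an explicit regparm attribute, and the automatic promotion of
   purely local functions when optimizing.  Names are hypothetical.  */
#if 0
/* With an explicit regparm the first three integer arguments travel in
   %eax, %edx and %ecx.  */
__attribute__ ((regparm (3))) int add3 (int a, int b, int c);

/* A local (static, non-address-taken) function compiled with optimization
   may receive the same register-passing treatment automatically, since
   every caller is visible in this translation unit.  */
static int
helper (int a, int b)
{
  return a * b;
}
#endif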
6613 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6614 DFmode (2) arguments in SSE registers for a function with the
6615 indicated TYPE and DECL. DECL may be NULL when calling function
6616 indirectly or considering a libcall. Return -1 if any FP parameter
6617 should be rejected by error. This is used in situations where we imply the
6618 SSE calling convention but the function is called from another function with
6619 SSE disabled. Otherwise return 0. */
6621 static int
6622 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6624 gcc_assert (!TARGET_64BIT);
6626 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6627 by the sseregparm attribute. */
6628 if (TARGET_SSEREGPARM
6629 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6631 if (!TARGET_SSE)
6633 if (warn)
6635 if (decl)
6636 error ("calling %qD with attribute sseregparm without "
6637 "SSE/SSE2 enabled", decl);
6638 else
6639 error ("calling %qT with attribute sseregparm without "
6640 "SSE/SSE2 enabled", type);
6642 return 0;
6645 return 2;
6648 if (!decl)
6649 return 0;
6651 cgraph_node *target = cgraph_node::get (decl);
6652 if (target)
6653 target = target->function_symbol ();
6655 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6656 (and DFmode for SSE2) arguments in SSE registers. */
6657 if (target
6658 /* TARGET_SSE_MATH */
6659 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6660 && opt_for_fn (target->decl, optimize)
6661 && !(profile_flag && !flag_fentry))
6663 cgraph_local_info *i = &target->local;
6664 if (i && i->local && i->can_change_signature)
6666 /* Refuse to produce wrong code when local function with SSE enabled
6667 is called from SSE disabled function.
6668 FIXME: We need a way to detect these cases cross-ltrans partition
6669 and avoid using SSE calling conventions on local functions called
6670 from function with SSE disabled. For now at least delay the
6671 warning until we know we are going to produce wrong code.
6672 See PR66047 */
6673 if (!TARGET_SSE && warn)
6674 return -1;
6675 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6676 ->x_ix86_isa_flags) ? 2 : 1;
6680 return 0;
6683 /* Return true if EAX is live at the start of the function. Used by
6684 ix86_expand_prologue to determine if we need special help before
6685 calling allocate_stack_worker. */
6687 static bool
6688 ix86_eax_live_at_start_p (void)
6690 /* Cheat. Don't bother working forward from ix86_function_regparm
6691 to the function type to whether an actual argument is located in
6692 eax. Instead just look at cfg info, which is still close enough
6693 to correct at this point. This gives false positives for broken
6694 functions that might use uninitialized data that happens to be
6695 allocated in eax, but who cares? */
6696 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6699 static bool
6700 ix86_keep_aggregate_return_pointer (tree fntype)
6702 tree attr;
6704 if (!TARGET_64BIT)
6706 attr = lookup_attribute ("callee_pop_aggregate_return",
6707 TYPE_ATTRIBUTES (fntype));
6708 if (attr)
6709 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6711 /* For 32-bit MS-ABI the default is to keep aggregate
6712 return pointer. */
6713 if (ix86_function_type_abi (fntype) == MS_ABI)
6714 return true;
6716 return KEEP_AGGREGATE_RETURN_POINTER != 0;
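/* Illustrative sketch (not part of the original file): the attribute looked
   up above controls who pops the hidden aggregate-return pointer on 32-bit
   targets.  The struct and function names are hypothetical.  */
#if 0
struct big { int v[8]; };

/* Argument 1: the callee pops the hidden return-slot pointer (the
   traditional SysV i386 behavior).  */
struct big __attribute__ ((callee_pop_aggregate_return (1))) make_big (void);

/* Argument 0: the pointer is kept for the caller to pop, as the 32-bit
   MS ABI does by default.  */
struct big __attribute__ ((callee_pop_aggregate_return (0))) make_big2 (void);
#endif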
6719 /* Value is the number of bytes of arguments automatically
6720 popped when returning from a subroutine call.
6721 FUNDECL is the declaration node of the function (as a tree),
6722 FUNTYPE is the data type of the function (as a tree),
6723 or for a library call it is an identifier node for the subroutine name.
6724 SIZE is the number of bytes of arguments passed on the stack.
6726 On the 80386, the RTD insn may be used to pop them if the number
6727 of args is fixed, but if the number is variable then the caller
6728 must pop them all. RTD can't be used for library calls now
6729 because the library is compiled with the Unix compiler.
6730 Use of RTD is a selectable option, since it is incompatible with
6731 standard Unix calling sequences. If the option is not selected,
6732 the caller must always pop the args.
6734 The attribute stdcall is equivalent to RTD on a per module basis. */
6736 static int
6737 ix86_return_pops_args (tree fundecl, tree funtype, int size)
6739 unsigned int ccvt;
6741 /* None of the 64-bit ABIs pop arguments. */
6742 if (TARGET_64BIT)
6743 return 0;
6745 ccvt = ix86_get_callcvt (funtype);
6747 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6748 | IX86_CALLCVT_THISCALL)) != 0
6749 && ! stdarg_p (funtype))
6750 return size;
6752 /* Lose any fake structure return argument if it is passed on the stack. */
6753 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6754 && !ix86_keep_aggregate_return_pointer (funtype))
6756 int nregs = ix86_function_regparm (funtype, fundecl);
6757 if (nregs == 0)
6758 return GET_MODE_SIZE (Pmode);
6761 return 0;
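/* Illustrative sketch (not part of the original file): a worked example of
   the sizes reported above on 32-bit.  Names are hypothetical.  */
#if 0
/* Two int arguments occupy 8 bytes of stack; being stdcall and not
   variadic, the callee returns with "ret $8", so this hook reports 8.  */
int __attribute__ ((stdcall)) two_args (int a, int b);

/* A variadic stdcall function falls back to caller-pops, i.e. 0.  */
int __attribute__ ((stdcall)) log_like (const char *fmt, ...);
#endif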
6764 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6766 static bool
6767 ix86_legitimate_combined_insn (rtx_insn *insn)
6769 int i;
6771 /* Check operand constraints in case hard registers were propagated
6772 into insn pattern. This check prevents combine pass from
6773 generating insn patterns with invalid hard register operands.
6774 These invalid insns can eventually confuse reload to error out
6775 with a spill failure. See also PRs 46829 and 46843. */
6777 gcc_assert (INSN_CODE (insn) >= 0);
6779 extract_insn (insn);
6780 preprocess_constraints (insn);
6782 int n_operands = recog_data.n_operands;
6783 int n_alternatives = recog_data.n_alternatives;
6784 for (i = 0; i < n_operands; i++)
6786 rtx op = recog_data.operand[i];
6787 machine_mode mode = GET_MODE (op);
6788 const operand_alternative *op_alt;
6789 int offset = 0;
6790 bool win;
6791 int j;
6793 /* A unary operator may be accepted by the predicate, but it
6794 is irrelevant for matching constraints. */
6795 if (UNARY_P (op))
6796 op = XEXP (op, 0);
6798 if (SUBREG_P (op))
6800 if (REG_P (SUBREG_REG (op))
6801 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6802 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6803 GET_MODE (SUBREG_REG (op)),
6804 SUBREG_BYTE (op),
6805 GET_MODE (op));
6806 op = SUBREG_REG (op);
6809 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6810 continue;
6812 op_alt = recog_op_alt;
6814 /* Operand has no constraints, anything is OK. */
6815 win = !n_alternatives;
6817 alternative_mask preferred = get_preferred_alternatives (insn);
6818 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6820 if (!TEST_BIT (preferred, j))
6821 continue;
6822 if (op_alt[i].anything_ok
6823 || (op_alt[i].matches != -1
6824 && operands_match_p
6825 (recog_data.operand[i],
6826 recog_data.operand[op_alt[i].matches]))
6827 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6829 win = true;
6830 break;
6834 if (!win)
6835 return false;
6838 return true;
6841 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6843 static unsigned HOST_WIDE_INT
6844 ix86_asan_shadow_offset (void)
6846 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6847 : HOST_WIDE_INT_C (0x7fff8000))
6848 : (HOST_WIDE_INT_1 << 29);
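/* Illustrative arithmetic only (not part of the original file): ASan maps
   every 8 application bytes to one shadow byte, and the hook above supplies
   the constant added after the >> 3 scaling.  The helper name is
   hypothetical and shows the LP64 non-Mach-O case.  */
#if 0
static inline unsigned long
example_shadow_addr (unsigned long app_addr)
{
  return (app_addr >> 3) + 0x7fff8000UL;   /* 1 << 29 on 32-bit targets */
}
#endif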
6851 /* Argument support functions. */
6853 /* Return true when register may be used to pass function parameters. */
6854 bool
6855 ix86_function_arg_regno_p (int regno)
6857 int i;
6858 enum calling_abi call_abi;
6859 const int *parm_regs;
6861 if (TARGET_MPX && BND_REGNO_P (regno))
6862 return true;
6864 if (!TARGET_64BIT)
6866 if (TARGET_MACHO)
6867 return (regno < REGPARM_MAX
6868 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6869 else
6870 return (regno < REGPARM_MAX
6871 || (TARGET_MMX && MMX_REGNO_P (regno)
6872 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6873 || (TARGET_SSE && SSE_REGNO_P (regno)
6874 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6877 if (TARGET_SSE && SSE_REGNO_P (regno)
6878 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6879 return true;
6881 /* TODO: The function should depend on current function ABI but
6882 builtins.c would need updating then. Therefore we use the
6883 default ABI. */
6884 call_abi = ix86_cfun_abi ();
6886 /* RAX is used as hidden argument to va_arg functions. */
6887 if (call_abi == SYSV_ABI && regno == AX_REG)
6888 return true;
6890 if (call_abi == MS_ABI)
6891 parm_regs = x86_64_ms_abi_int_parameter_registers;
6892 else
6893 parm_regs = x86_64_int_parameter_registers;
6895 for (i = 0; i < (call_abi == MS_ABI
6896 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6897 if (regno == parm_regs[i])
6898 return true;
6899 return false;
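/* Illustrative summary (not part of the original file) of the 64-bit integer
   argument registers accepted above, per x86_64_int_parameter_registers and
   x86_64_ms_abi_int_parameter_registers.  */
#if 0
/* SYSV ABI: %rdi, %rsi, %rdx, %rcx, %r8, %r9 (plus %rax as the hidden
   argument to va_arg functions).
   MS ABI:   %rcx, %rdx, %r8, %r9.
   So for the SysV declaration below, a, b and c arrive in %rdi, %rsi
   and %rdx respectively.  */
extern void f (long a, long b, long c);
#endif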
6902 /* Return if we do not know how to pass TYPE solely in registers. */
6904 static bool
6905 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6907 if (must_pass_in_stack_var_size_or_pad (mode, type))
6908 return true;
6910 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6911 The layout_type routine is crafty and tries to trick us into passing
6912 currently unsupported vector types on the stack by using TImode. */
6913 return (!TARGET_64BIT && mode == TImode
6914 && type && TREE_CODE (type) != VECTOR_TYPE);
6917 /* Return the size, in bytes, of the area reserved for arguments passed
6918 in registers for the function represented by FNDECL, depending on the
6919 ABI format used. */
6921 ix86_reg_parm_stack_space (const_tree fndecl)
6923 enum calling_abi call_abi = SYSV_ABI;
6924 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6925 call_abi = ix86_function_abi (fndecl);
6926 else
6927 call_abi = ix86_function_type_abi (fndecl);
6928 if (TARGET_64BIT && call_abi == MS_ABI)
6929 return 32;
6930 return 0;
6933 /* We add this as a workaround in order to use libc_has_function
6934 hook in i386.md. */
6935 bool
6936 ix86_libc_has_function (enum function_class fn_class)
6938 return targetm.libc_has_function (fn_class);
6941 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
6942 specifying the call abi used. */
6943 enum calling_abi
6944 ix86_function_type_abi (const_tree fntype)
6946 enum calling_abi abi = ix86_abi;
6948 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6949 return abi;
6951 if (abi == SYSV_ABI
6952 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6954 static int warned;
6955 if (TARGET_X32 && !warned)
6957 error ("X32 does not support ms_abi attribute");
6958 warned = 1;
6961 abi = MS_ABI;
6963 else if (abi == MS_ABI
6964 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6965 abi = SYSV_ABI;
6967 return abi;
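/* Illustrative sketch (not part of the original file): declarations
   exercising the attribute lookup above.  Names are hypothetical.  */
#if 0
/* On a SysV x86-64 host, call this function with the Microsoft convention
   (arguments in %rcx/%rdx/%r8/%r9, 32-byte shadow space).  */
int __attribute__ ((ms_abi)) win_callback (int a, int b);

/* The reverse, for when the default ABI is MS (e.g. mingw-w64).  */
int __attribute__ ((sysv_abi)) unix_callback (int a, int b);
#endif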
6970 static enum calling_abi
6971 ix86_function_abi (const_tree fndecl)
6973 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6976 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
6977 specifying the call abi used. */
6978 enum calling_abi
6979 ix86_cfun_abi (void)
6981 return cfun ? cfun->machine->call_abi : ix86_abi;
6984 static bool
6985 ix86_function_ms_hook_prologue (const_tree fn)
6987 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6989 if (decl_function_context (fn) != NULL_TREE)
6990 error_at (DECL_SOURCE_LOCATION (fn),
6991 "ms_hook_prologue is not compatible with nested function");
6992 else
6993 return true;
6995 return false;
6998 static bool
6999 ix86_function_naked (const_tree fn)
7001 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7002 return true;
7004 return false;
7007 /* Write the extra assembler code needed to declare a function properly. */
7009 void
7010 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7011 tree decl)
7013 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7015 if (is_ms_hook)
7017 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7018 unsigned int filler_cc = 0xcccccccc;
7020 for (i = 0; i < filler_count; i += 4)
7021 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7024 #ifdef SUBTARGET_ASM_UNWIND_INIT
7025 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7026 #endif
7028 ASM_OUTPUT_LABEL (asm_out_file, fname);
7030 /* Output magic byte marker, if hot-patch attribute is set. */
7031 if (is_ms_hook)
7033 if (TARGET_64BIT)
7035 /* leaq [%rsp + 0], %rsp */
7036 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7037 asm_out_file);
7039 else
7041 /* movl.s %edi, %edi
7042 push %ebp
7043 movl.s %esp, %ebp */
7044 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
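/* Illustrative sketch (not part of the original file): a function carrying
   the attribute below gets the 0xCC filler before its label and the
   hot-patchable "movl.s %edi, %edi; push %ebp; movl.s %esp, %ebp" (or the
   64-bit lea) sequence emitted above.  The name is hypothetical.  */
#if 0
__attribute__ ((ms_hook_prologue))
int
patchable_entry (int x)
{
  return x;
}
#endif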
7049 /* Implementation of call abi switching target hook. Specific to FNDECL
7050 the specific call register sets are set. See also
7051 ix86_conditional_register_usage for more details. */
7052 void
7053 ix86_call_abi_override (const_tree fndecl)
7055 cfun->machine->call_abi = ix86_function_abi (fndecl);
7058 /* Return 1 if pseudo register should be created and used to hold
7059 GOT address for PIC code. */
7060 bool
7061 ix86_use_pseudo_pic_reg (void)
7063 if ((TARGET_64BIT
7064 && (ix86_cmodel == CM_SMALL_PIC
7065 || TARGET_PECOFF))
7066 || !flag_pic)
7067 return false;
7068 return true;
7071 /* Initialize large model PIC register. */
7073 static void
7074 ix86_init_large_pic_reg (unsigned int tmp_regno)
7076 rtx_code_label *label;
7077 rtx tmp_reg;
7079 gcc_assert (Pmode == DImode);
7080 label = gen_label_rtx ();
7081 emit_label (label);
7082 LABEL_PRESERVE_P (label) = 1;
7083 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7084 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7085 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7086 label));
7087 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7088 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7089 pic_offset_table_rtx, tmp_reg));
7090 const char *name = LABEL_NAME (label);
7091 PUT_CODE (label, NOTE);
7092 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7093 NOTE_DELETED_LABEL_NAME (label) = name;
7096 /* Create and initialize PIC register if required. */
7097 static void
7098 ix86_init_pic_reg (void)
7100 edge entry_edge;
7101 rtx_insn *seq;
7103 if (!ix86_use_pseudo_pic_reg ())
7104 return;
7106 start_sequence ();
7108 if (TARGET_64BIT)
7110 if (ix86_cmodel == CM_LARGE_PIC)
7111 ix86_init_large_pic_reg (R11_REG);
7112 else
7113 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7115 else
7117 /* If there is a future mcount call in the function, it is more profitable
7118 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7119 rtx reg = crtl->profile
7120 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7121 : pic_offset_table_rtx;
7122 rtx_insn *insn = emit_insn (gen_set_got (reg));
7123 RTX_FRAME_RELATED_P (insn) = 1;
7124 if (crtl->profile)
7125 emit_move_insn (pic_offset_table_rtx, reg);
7126 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7129 seq = get_insns ();
7130 end_sequence ();
7132 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7133 insert_insn_on_edge (seq, entry_edge);
7134 commit_one_edge_insertion (entry_edge);
7137 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7138 for a call to a function whose data type is FNTYPE.
7139 For a library call, FNTYPE is 0. */
7141 void
7142 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7143 tree fntype, /* tree ptr for function decl */
7144 rtx libname, /* SYMBOL_REF of library name or 0 */
7145 tree fndecl,
7146 int caller)
7148 struct cgraph_local_info *i = NULL;
7149 struct cgraph_node *target = NULL;
7151 memset (cum, 0, sizeof (*cum));
7153 if (fndecl)
7155 target = cgraph_node::get (fndecl);
7156 if (target)
7158 target = target->function_symbol ();
7159 i = cgraph_node::local_info (target->decl);
7160 cum->call_abi = ix86_function_abi (target->decl);
7162 else
7163 cum->call_abi = ix86_function_abi (fndecl);
7165 else
7166 cum->call_abi = ix86_function_type_abi (fntype);
7168 cum->caller = caller;
7170 /* Set up the number of registers to use for passing arguments. */
7171 cum->nregs = ix86_regparm;
7172 if (TARGET_64BIT)
7174 cum->nregs = (cum->call_abi == SYSV_ABI
7175 ? X86_64_REGPARM_MAX
7176 : X86_64_MS_REGPARM_MAX);
7178 if (TARGET_SSE)
7180 cum->sse_nregs = SSE_REGPARM_MAX;
7181 if (TARGET_64BIT)
7183 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7184 ? X86_64_SSE_REGPARM_MAX
7185 : X86_64_MS_SSE_REGPARM_MAX);
7188 if (TARGET_MMX)
7189 cum->mmx_nregs = MMX_REGPARM_MAX;
7190 cum->warn_avx512f = true;
7191 cum->warn_avx = true;
7192 cum->warn_sse = true;
7193 cum->warn_mmx = true;
7195 /* Because the type might mismatch between caller and callee, we need to
7196 use the actual function type for local calls.
7197 FIXME: cgraph_analyze can be told to actually record whether a function uses
7198 va_start, so for local functions maybe_vaarg can be made more aggressive,
7199 helping K&R code.
7200 FIXME: once the type system is fixed, we won't need this code anymore. */
7201 if (i && i->local && i->can_change_signature)
7202 fntype = TREE_TYPE (target->decl);
7203 cum->stdarg = stdarg_p (fntype);
7204 cum->maybe_vaarg = (fntype
7205 ? (!prototype_p (fntype) || stdarg_p (fntype))
7206 : !libname);
7208 cum->bnd_regno = FIRST_BND_REG;
7209 cum->bnds_in_bt = 0;
7210 cum->force_bnd_pass = 0;
7211 cum->decl = fndecl;
7213 cum->warn_empty = !warn_abi || cum->stdarg;
7214 if (!cum->warn_empty && fntype)
7216 function_args_iterator iter;
7217 tree argtype;
7218 bool seen_empty_type = false;
7219 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7221 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7222 break;
7223 if (TYPE_EMPTY_P (argtype))
7224 seen_empty_type = true;
7225 else if (seen_empty_type)
7227 cum->warn_empty = true;
7228 break;
7233 if (!TARGET_64BIT)
7235 /* If there are variable arguments, then we won't pass anything
7236 in registers in 32-bit mode. */
7237 if (stdarg_p (fntype))
7239 cum->nregs = 0;
7240 /* Since in 32-bit mode variable arguments are always passed on
7241 the stack, there is a scratch register available for an indirect
7242 sibcall. */
7243 cfun->machine->arg_reg_available = true;
7244 cum->sse_nregs = 0;
7245 cum->mmx_nregs = 0;
7246 cum->warn_avx512f = false;
7247 cum->warn_avx = false;
7248 cum->warn_sse = false;
7249 cum->warn_mmx = false;
7250 return;
7253 /* Use ecx and edx registers if function has fastcall attribute,
7254 else look for regparm information. */
7255 if (fntype)
7257 unsigned int ccvt = ix86_get_callcvt (fntype);
7258 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7260 cum->nregs = 1;
7261 cum->fastcall = 1; /* Same first register as in fastcall. */
7263 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7265 cum->nregs = 2;
7266 cum->fastcall = 1;
7268 else
7269 cum->nregs = ix86_function_regparm (fntype, fndecl);
7272 /* Set up the number of SSE registers used for passing SFmode
7273 and DFmode arguments. Warn for mismatching ABI. */
7274 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7277 cfun->machine->arg_reg_available = (cum->nregs > 0);
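/* Illustrative sketch (not part of the original file), relating to
   type_natural_mode below: passing a 256-bit vector while AVX is disabled
   changes the argument's ABI location, which is what the -Wpsabi
   diagnostics it emits warn about.  Names are hypothetical.  */
#if 0
typedef float v8sf __attribute__ ((vector_size (32)));

/* Compiled with -mno-avx this triggers:
   "warning: AVX vector argument without AVX enabled changes the ABI"  */
float
sum_first_two (v8sf v)
{
  return v[0] + v[1];
}
#endif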
7280 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7281 But in the case of vector types, it is some vector mode.
7283 When we have only some of our vector isa extensions enabled, then there
7284 are some modes for which vector_mode_supported_p is false. For these
7285 modes, the generic vector support in gcc will choose some non-vector mode
7286 in order to implement the type. By computing the natural mode, we'll
7287 select the proper ABI location for the operand and not depend on whatever
7288 the middle-end decides to do with these vector types.
7290 The middle-end can't deal with vector types larger than 16 bytes. In this
7291 case, we return the original mode and warn about the ABI change if CUM isn't
7292 NULL.
7294 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7295 available for the function return value. */
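/* For illustration (a sketch, not normative): a type declared as

     typedef int v4si __attribute__ ((vector_size (16)));

   has the natural mode V4SImode, and that mode is what selects the ABI
   location of the argument even when SSE is disabled and the generic
   vector support fell back to a non-vector TYPE_MODE; in that case one
   of the -Wpsabi warnings below is issued.  For 32-byte and 64-byte
   vectors without AVX/AVX512F the original TYPE_MODE is kept instead.  */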
7297 static machine_mode
7298 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7299 bool in_return)
7301 machine_mode mode = TYPE_MODE (type);
7303 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7305 HOST_WIDE_INT size = int_size_in_bytes (type);
7306 if ((size == 8 || size == 16 || size == 32 || size == 64)
7307 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7308 && TYPE_VECTOR_SUBPARTS (type) > 1)
7310 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7312 /* There are no XFmode vector modes. */
7313 if (innermode == XFmode)
7314 return mode;
7316 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7317 mode = MIN_MODE_VECTOR_FLOAT;
7318 else
7319 mode = MIN_MODE_VECTOR_INT;
7321 /* Get the mode which has this inner mode and number of units. */
7322 FOR_EACH_MODE_FROM (mode, mode)
7323 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7324 && GET_MODE_INNER (mode) == innermode)
7326 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7328 static bool warnedavx512f;
7329 static bool warnedavx512f_ret;
7331 if (cum && cum->warn_avx512f && !warnedavx512f)
7333 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7334 "without AVX512F enabled changes the ABI"))
7335 warnedavx512f = true;
7337 else if (in_return && !warnedavx512f_ret)
7339 if (warning (OPT_Wpsabi, "AVX512F vector return "
7340 "without AVX512F enabled changes the ABI"))
7341 warnedavx512f_ret = true;
7344 return TYPE_MODE (type);
7346 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7348 static bool warnedavx;
7349 static bool warnedavx_ret;
7351 if (cum && cum->warn_avx && !warnedavx)
7353 if (warning (OPT_Wpsabi, "AVX vector argument "
7354 "without AVX enabled changes the ABI"))
7355 warnedavx = true;
7357 else if (in_return && !warnedavx_ret)
7359 if (warning (OPT_Wpsabi, "AVX vector return "
7360 "without AVX enabled changes the ABI"))
7361 warnedavx_ret = true;
7364 return TYPE_MODE (type);
7366 else if (((size == 8 && TARGET_64BIT) || size == 16)
7367 && !TARGET_SSE
7368 && !TARGET_IAMCU)
7370 static bool warnedsse;
7371 static bool warnedsse_ret;
7373 if (cum && cum->warn_sse && !warnedsse)
7375 if (warning (OPT_Wpsabi, "SSE vector argument "
7376 "without SSE enabled changes the ABI"))
7377 warnedsse = true;
7379 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7381 if (warning (OPT_Wpsabi, "SSE vector return "
7382 "without SSE enabled changes the ABI"))
7383 warnedsse_ret = true;
7386 else if ((size == 8 && !TARGET_64BIT)
7387 && (!cfun
7388 || cfun->machine->func_type == TYPE_NORMAL)
7389 && !TARGET_MMX
7390 && !TARGET_IAMCU)
7392 static bool warnedmmx;
7393 static bool warnedmmx_ret;
7395 if (cum && cum->warn_mmx && !warnedmmx)
7397 if (warning (OPT_Wpsabi, "MMX vector argument "
7398 "without MMX enabled changes the ABI"))
7399 warnedmmx = true;
7401 else if (in_return && !warnedmmx_ret)
7403 if (warning (OPT_Wpsabi, "MMX vector return "
7404 "without MMX enabled changes the ABI"))
7405 warnedmmx_ret = true;
7408 return mode;
7411 gcc_unreachable ();
7415 return mode;
7418 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7419 this may not agree with the mode that the type system has chosen for the
7420 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7421 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7423 static rtx
7424 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7425 unsigned int regno)
7427 rtx tmp;
7429 if (orig_mode != BLKmode)
7430 tmp = gen_rtx_REG (orig_mode, regno);
7431 else
7433 tmp = gen_rtx_REG (mode, regno);
7434 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7435 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7438 return tmp;
7441 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7442 of this code is to classify each 8bytes of incoming argument by the register
7443 class and assign registers accordingly. */
7445 /* Return the union class of CLASS1 and CLASS2.
7446 See the x86-64 PS ABI for details. */
7448 static enum x86_64_reg_class
7449 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7451 /* Rule #1: If both classes are equal, this is the resulting class. */
7452 if (class1 == class2)
7453 return class1;
7455 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7456 the other class. */
7457 if (class1 == X86_64_NO_CLASS)
7458 return class2;
7459 if (class2 == X86_64_NO_CLASS)
7460 return class1;
7462 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7463 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7464 return X86_64_MEMORY_CLASS;
7466 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7467 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7468 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7469 return X86_64_INTEGERSI_CLASS;
7470 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7471 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7472 return X86_64_INTEGER_CLASS;
7474 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7475 MEMORY is used. */
7476 if (class1 == X86_64_X87_CLASS
7477 || class1 == X86_64_X87UP_CLASS
7478 || class1 == X86_64_COMPLEX_X87_CLASS
7479 || class2 == X86_64_X87_CLASS
7480 || class2 == X86_64_X87UP_CLASS
7481 || class2 == X86_64_COMPLEX_X87_CLASS)
7482 return X86_64_MEMORY_CLASS;
7484 /* Rule #6: Otherwise class SSE is used. */
7485 return X86_64_SSE_CLASS;
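/* For illustration, a few merges the rules above produce:

     merge_classes (X86_64_NO_CLASS, X86_64_SSESF_CLASS)        -> X86_64_SSESF_CLASS     (rule #2)
     merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS) -> X86_64_INTEGERSI_CLASS (rule #4)
     merge_classes (X86_64_X87_CLASS, X86_64_SSE_CLASS)         -> X86_64_MEMORY_CLASS    (rule #5)
     merge_classes (X86_64_SSESF_CLASS, X86_64_SSEDF_CLASS)     -> X86_64_SSE_CLASS       (rule #6)  */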
7488 /* Classify the argument of type TYPE and mode MODE.
7489 CLASSES will be filled by the register class used to pass each word
7490 of the operand. The number of words is returned. In case the parameter
7491 should be passed in memory, 0 is returned. As a special case for zero
7492 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7494 BIT_OFFSET is used internally for handling records and specifies the bit
7495 offset of the value within the containing object, modulo 512, to avoid overflow.
7497 See the x86-64 PS ABI for details. */
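/* A rough worked example: an argument of type

     struct s { double d; long l; };

   occupies two eightbytes; the first is classified X86_64_SSEDF_CLASS
   (an SSE class) and the second X86_64_INTEGER_CLASS, so the structure
   is passed in one SSE register and one general-purpose register.  */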
7500 static int
7501 classify_argument (machine_mode mode, const_tree type,
7502 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7504 HOST_WIDE_INT bytes =
7505 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7506 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7508 /* Variable sized entities are always passed/returned in memory. */
7509 if (bytes < 0)
7510 return 0;
7512 if (mode != VOIDmode
7513 && targetm.calls.must_pass_in_stack (mode, type))
7514 return 0;
7516 if (type && AGGREGATE_TYPE_P (type))
7518 int i;
7519 tree field;
7520 enum x86_64_reg_class subclasses[MAX_CLASSES];
7522 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7523 if (bytes > 64)
7524 return 0;
7526 for (i = 0; i < words; i++)
7527 classes[i] = X86_64_NO_CLASS;
7529 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
7530 signal the memory class, so handle this as a special case. */
7531 if (!words)
7533 classes[0] = X86_64_NO_CLASS;
7534 return 1;
7537 /* Classify each field of record and merge classes. */
7538 switch (TREE_CODE (type))
7540 case RECORD_TYPE:
7541 /* And now merge the fields of structure. */
7542 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7544 if (TREE_CODE (field) == FIELD_DECL)
7546 int num;
7548 if (TREE_TYPE (field) == error_mark_node)
7549 continue;
7551 /* Bitfields are always classified as integer. Handle them
7552 early, since later code would consider them to be
7553 misaligned integers. */
7554 if (DECL_BIT_FIELD (field))
7556 for (i = (int_bit_position (field)
7557 + (bit_offset % 64)) / 8 / 8;
7558 i < ((int_bit_position (field) + (bit_offset % 64))
7559 + tree_to_shwi (DECL_SIZE (field))
7560 + 63) / 8 / 8; i++)
7561 classes[i] =
7562 merge_classes (X86_64_INTEGER_CLASS,
7563 classes[i]);
7565 else
7567 int pos;
7569 type = TREE_TYPE (field);
7571 /* Flexible array member is ignored. */
7572 if (TYPE_MODE (type) == BLKmode
7573 && TREE_CODE (type) == ARRAY_TYPE
7574 && TYPE_SIZE (type) == NULL_TREE
7575 && TYPE_DOMAIN (type) != NULL_TREE
7576 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7577 == NULL_TREE))
7579 static bool warned;
7581 if (!warned && warn_psabi)
7583 warned = true;
7584 inform (input_location,
7585 "the ABI of passing struct with"
7586 " a flexible array member has"
7587 " changed in GCC 4.4");
7589 continue;
7591 num = classify_argument (TYPE_MODE (type), type,
7592 subclasses,
7593 (int_bit_position (field)
7594 + bit_offset) % 512);
7595 if (!num)
7596 return 0;
7597 pos = (int_bit_position (field)
7598 + (bit_offset % 64)) / 8 / 8;
7599 for (i = 0; i < num && (i + pos) < words; i++)
7600 classes[i + pos] =
7601 merge_classes (subclasses[i], classes[i + pos]);
7605 break;
7607 case ARRAY_TYPE:
7608 /* Arrays are handled as small records. */
7610 int num;
7611 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7612 TREE_TYPE (type), subclasses, bit_offset);
7613 if (!num)
7614 return 0;
7616 /* The partial classes are now full classes. */
7617 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7618 subclasses[0] = X86_64_SSE_CLASS;
7619 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7620 && !((bit_offset % 64) == 0 && bytes == 4))
7621 subclasses[0] = X86_64_INTEGER_CLASS;
7623 for (i = 0; i < words; i++)
7624 classes[i] = subclasses[i % num];
7626 break;
7628 case UNION_TYPE:
7629 case QUAL_UNION_TYPE:
7630 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7632 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7634 if (TREE_CODE (field) == FIELD_DECL)
7636 int num;
7638 if (TREE_TYPE (field) == error_mark_node)
7639 continue;
7641 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7642 TREE_TYPE (field), subclasses,
7643 bit_offset);
7644 if (!num)
7645 return 0;
7646 for (i = 0; i < num && i < words; i++)
7647 classes[i] = merge_classes (subclasses[i], classes[i]);
7650 break;
7652 default:
7653 gcc_unreachable ();
7656 if (words > 2)
7658 /* When size > 16 bytes, if the first eightbyte isn't
7659 X86_64_SSE_CLASS or any of the later ones isn't
7660 X86_64_SSEUP_CLASS, everything should be passed in
7661 memory. */
7662 if (classes[0] != X86_64_SSE_CLASS)
7663 return 0;
7665 for (i = 1; i < words; i++)
7666 if (classes[i] != X86_64_SSEUP_CLASS)
7667 return 0;
7670 /* Final merger cleanup. */
7671 for (i = 0; i < words; i++)
7673 /* If one class is MEMORY, everything should be passed in
7674 memory. */
7675 if (classes[i] == X86_64_MEMORY_CLASS)
7676 return 0;
7678 /* X86_64_SSEUP_CLASS should always be preceded by
7679 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7680 if (classes[i] == X86_64_SSEUP_CLASS
7681 && classes[i - 1] != X86_64_SSE_CLASS
7682 && classes[i - 1] != X86_64_SSEUP_CLASS)
7684 /* The first one should never be X86_64_SSEUP_CLASS. */
7685 gcc_assert (i != 0);
7686 classes[i] = X86_64_SSE_CLASS;
7689 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7690 everything should be passed in memory. */
7691 if (classes[i] == X86_64_X87UP_CLASS
7692 && (classes[i - 1] != X86_64_X87_CLASS))
7694 static bool warned;
7696 /* The first one should never be X86_64_X87UP_CLASS. */
7697 gcc_assert (i != 0);
7698 if (!warned && warn_psabi)
7700 warned = true;
7701 inform (input_location,
7702 "the ABI of passing union with long double"
7703 " has changed in GCC 4.4");
7705 return 0;
7708 return words;
7711 /* Compute the alignment needed. We align all types to their natural
7712 boundaries, with the exception of XFmode, which the code below checks against a 128-bit boundary. */
7713 if (mode != VOIDmode && mode != BLKmode)
7715 int mode_alignment = GET_MODE_BITSIZE (mode);
7717 if (mode == XFmode)
7718 mode_alignment = 128;
7719 else if (mode == XCmode)
7720 mode_alignment = 256;
7721 if (COMPLEX_MODE_P (mode))
7722 mode_alignment /= 2;
7723 /* Misaligned fields are always returned in memory. */
7724 if (bit_offset % mode_alignment)
7725 return 0;
7728 /* For V1xx modes, just use the base mode. */
7729 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7730 && GET_MODE_UNIT_SIZE (mode) == bytes)
7731 mode = GET_MODE_INNER (mode);
7733 /* Classification of atomic types. */
7734 switch (mode)
7736 case E_SDmode:
7737 case E_DDmode:
7738 classes[0] = X86_64_SSE_CLASS;
7739 return 1;
7740 case E_TDmode:
7741 classes[0] = X86_64_SSE_CLASS;
7742 classes[1] = X86_64_SSEUP_CLASS;
7743 return 2;
7744 case E_DImode:
7745 case E_SImode:
7746 case E_HImode:
7747 case E_QImode:
7748 case E_CSImode:
7749 case E_CHImode:
7750 case E_CQImode:
7752 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7754 /* Analyze last 128 bits only. */
7755 size = (size - 1) & 0x7f;
7757 if (size < 32)
7759 classes[0] = X86_64_INTEGERSI_CLASS;
7760 return 1;
7762 else if (size < 64)
7764 classes[0] = X86_64_INTEGER_CLASS;
7765 return 1;
7767 else if (size < 64+32)
7769 classes[0] = X86_64_INTEGER_CLASS;
7770 classes[1] = X86_64_INTEGERSI_CLASS;
7771 return 2;
7773 else if (size < 64+64)
7775 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7776 return 2;
7778 else
7779 gcc_unreachable ();
7781 case E_CDImode:
7782 case E_TImode:
7783 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7784 return 2;
7785 case E_COImode:
7786 case E_OImode:
7787 /* OImode shouldn't be used directly. */
7788 gcc_unreachable ();
7789 case E_CTImode:
7790 return 0;
7791 case E_SFmode:
7792 if (!(bit_offset % 64))
7793 classes[0] = X86_64_SSESF_CLASS;
7794 else
7795 classes[0] = X86_64_SSE_CLASS;
7796 return 1;
7797 case E_DFmode:
7798 classes[0] = X86_64_SSEDF_CLASS;
7799 return 1;
7800 case E_XFmode:
7801 classes[0] = X86_64_X87_CLASS;
7802 classes[1] = X86_64_X87UP_CLASS;
7803 return 2;
7804 case E_TFmode:
7805 classes[0] = X86_64_SSE_CLASS;
7806 classes[1] = X86_64_SSEUP_CLASS;
7807 return 2;
7808 case E_SCmode:
7809 classes[0] = X86_64_SSE_CLASS;
7810 if (!(bit_offset % 64))
7811 return 1;
7812 else
7814 static bool warned;
7816 if (!warned && warn_psabi)
7818 warned = true;
7819 inform (input_location,
7820 "the ABI of passing structure with complex float"
7821 " member has changed in GCC 4.4");
7823 classes[1] = X86_64_SSESF_CLASS;
7824 return 2;
7826 case E_DCmode:
7827 classes[0] = X86_64_SSEDF_CLASS;
7828 classes[1] = X86_64_SSEDF_CLASS;
7829 return 2;
7830 case E_XCmode:
7831 classes[0] = X86_64_COMPLEX_X87_CLASS;
7832 return 1;
7833 case E_TCmode:
7834 /* This mode is larger than 16 bytes. */
7835 return 0;
7836 case E_V8SFmode:
7837 case E_V8SImode:
7838 case E_V32QImode:
7839 case E_V16HImode:
7840 case E_V4DFmode:
7841 case E_V4DImode:
7842 classes[0] = X86_64_SSE_CLASS;
7843 classes[1] = X86_64_SSEUP_CLASS;
7844 classes[2] = X86_64_SSEUP_CLASS;
7845 classes[3] = X86_64_SSEUP_CLASS;
7846 return 4;
7847 case E_V8DFmode:
7848 case E_V16SFmode:
7849 case E_V8DImode:
7850 case E_V16SImode:
7851 case E_V32HImode:
7852 case E_V64QImode:
7853 classes[0] = X86_64_SSE_CLASS;
7854 classes[1] = X86_64_SSEUP_CLASS;
7855 classes[2] = X86_64_SSEUP_CLASS;
7856 classes[3] = X86_64_SSEUP_CLASS;
7857 classes[4] = X86_64_SSEUP_CLASS;
7858 classes[5] = X86_64_SSEUP_CLASS;
7859 classes[6] = X86_64_SSEUP_CLASS;
7860 classes[7] = X86_64_SSEUP_CLASS;
7861 return 8;
7862 case E_V4SFmode:
7863 case E_V4SImode:
7864 case E_V16QImode:
7865 case E_V8HImode:
7866 case E_V2DFmode:
7867 case E_V2DImode:
7868 classes[0] = X86_64_SSE_CLASS;
7869 classes[1] = X86_64_SSEUP_CLASS;
7870 return 2;
7871 case E_V1TImode:
7872 case E_V1DImode:
7873 case E_V2SFmode:
7874 case E_V2SImode:
7875 case E_V4HImode:
7876 case E_V8QImode:
7877 classes[0] = X86_64_SSE_CLASS;
7878 return 1;
7879 case E_BLKmode:
7880 case E_VOIDmode:
7881 return 0;
7882 default:
7883 gcc_assert (VECTOR_MODE_P (mode));
7885 if (bytes > 16)
7886 return 0;
7888 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7890 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7891 classes[0] = X86_64_INTEGERSI_CLASS;
7892 else
7893 classes[0] = X86_64_INTEGER_CLASS;
7894 classes[1] = X86_64_INTEGER_CLASS;
7895 return 1 + (bytes > 8);
7899 /* Examine the argument and set the number of registers required in each
7900 class. Return true iff the parameter should be passed in memory. */
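/* Sketch of two typical results: a long double argument (XFmode,
   classified X86_64_X87_CLASS + X86_64_X87UP_CLASS) makes this return
   true when IN_RETURN is 0, because the x87 classes force arguments
   into memory; for a return value it returns false, since long double
   is returned in %st(0).  */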
7902 static bool
7903 examine_argument (machine_mode mode, const_tree type, int in_return,
7904 int *int_nregs, int *sse_nregs)
7906 enum x86_64_reg_class regclass[MAX_CLASSES];
7907 int n = classify_argument (mode, type, regclass, 0);
7909 *int_nregs = 0;
7910 *sse_nregs = 0;
7912 if (!n)
7913 return true;
7914 for (n--; n >= 0; n--)
7915 switch (regclass[n])
7917 case X86_64_INTEGER_CLASS:
7918 case X86_64_INTEGERSI_CLASS:
7919 (*int_nregs)++;
7920 break;
7921 case X86_64_SSE_CLASS:
7922 case X86_64_SSESF_CLASS:
7923 case X86_64_SSEDF_CLASS:
7924 (*sse_nregs)++;
7925 break;
7926 case X86_64_NO_CLASS:
7927 case X86_64_SSEUP_CLASS:
7928 break;
7929 case X86_64_X87_CLASS:
7930 case X86_64_X87UP_CLASS:
7931 case X86_64_COMPLEX_X87_CLASS:
7932 if (!in_return)
7933 return true;
7934 break;
7935 case X86_64_MEMORY_CLASS:
7936 gcc_unreachable ();
7939 return false;
7942 /* Construct container for the argument used by GCC interface. See
7943 FUNCTION_ARG for the detailed description. */
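/* As a sketch, for a struct { double d; long l; } argument classified
   as {SSEDF, INTEGER}, with no earlier arguments consuming registers,
   the container built below is roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte in an SSE register and the second in a
   general-purpose register, each tagged with its byte offset.  */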
7945 static rtx
7946 construct_container (machine_mode mode, machine_mode orig_mode,
7947 const_tree type, int in_return, int nintregs, int nsseregs,
7948 const int *intreg, int sse_regno)
7950 /* The following variables hold the static issued_error state. */
7951 static bool issued_sse_arg_error;
7952 static bool issued_sse_ret_error;
7953 static bool issued_x87_ret_error;
7955 machine_mode tmpmode;
7956 int bytes =
7957 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7958 enum x86_64_reg_class regclass[MAX_CLASSES];
7959 int n;
7960 int i;
7961 int nexps = 0;
7962 int needed_sseregs, needed_intregs;
7963 rtx exp[MAX_CLASSES];
7964 rtx ret;
7966 n = classify_argument (mode, type, regclass, 0);
7967 if (!n)
7968 return NULL;
7969 if (examine_argument (mode, type, in_return, &needed_intregs,
7970 &needed_sseregs))
7971 return NULL;
7972 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7973 return NULL;
7975 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7976 some less clueful developer tries to use floating-point anyway. */
7977 if (needed_sseregs && !TARGET_SSE)
7979 if (in_return)
7981 if (!issued_sse_ret_error)
7983 error ("SSE register return with SSE disabled");
7984 issued_sse_ret_error = true;
7987 else if (!issued_sse_arg_error)
7989 error ("SSE register argument with SSE disabled");
7990 issued_sse_arg_error = true;
7992 return NULL;
7995 /* Likewise, error if the ABI requires us to return values in the
7996 x87 registers and the user specified -mno-80387. */
7997 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
7998 for (i = 0; i < n; i++)
7999 if (regclass[i] == X86_64_X87_CLASS
8000 || regclass[i] == X86_64_X87UP_CLASS
8001 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8003 if (!issued_x87_ret_error)
8005 error ("x87 register return with x87 disabled");
8006 issued_x87_ret_error = true;
8008 return NULL;
8011 /* First construct simple cases. Avoid SCmode, since we want to use
8012 single register to pass this type. */
8013 if (n == 1 && mode != SCmode)
8014 switch (regclass[0])
8016 case X86_64_INTEGER_CLASS:
8017 case X86_64_INTEGERSI_CLASS:
8018 return gen_rtx_REG (mode, intreg[0]);
8019 case X86_64_SSE_CLASS:
8020 case X86_64_SSESF_CLASS:
8021 case X86_64_SSEDF_CLASS:
8022 if (mode != BLKmode)
8023 return gen_reg_or_parallel (mode, orig_mode,
8024 SSE_REGNO (sse_regno));
8025 break;
8026 case X86_64_X87_CLASS:
8027 case X86_64_COMPLEX_X87_CLASS:
8028 return gen_rtx_REG (mode, FIRST_STACK_REG);
8029 case X86_64_NO_CLASS:
8030 /* Zero sized array, struct or class. */
8031 return NULL;
8032 default:
8033 gcc_unreachable ();
8035 if (n == 2
8036 && regclass[0] == X86_64_SSE_CLASS
8037 && regclass[1] == X86_64_SSEUP_CLASS
8038 && mode != BLKmode)
8039 return gen_reg_or_parallel (mode, orig_mode,
8040 SSE_REGNO (sse_regno));
8041 if (n == 4
8042 && regclass[0] == X86_64_SSE_CLASS
8043 && regclass[1] == X86_64_SSEUP_CLASS
8044 && regclass[2] == X86_64_SSEUP_CLASS
8045 && regclass[3] == X86_64_SSEUP_CLASS
8046 && mode != BLKmode)
8047 return gen_reg_or_parallel (mode, orig_mode,
8048 SSE_REGNO (sse_regno));
8049 if (n == 8
8050 && regclass[0] == X86_64_SSE_CLASS
8051 && regclass[1] == X86_64_SSEUP_CLASS
8052 && regclass[2] == X86_64_SSEUP_CLASS
8053 && regclass[3] == X86_64_SSEUP_CLASS
8054 && regclass[4] == X86_64_SSEUP_CLASS
8055 && regclass[5] == X86_64_SSEUP_CLASS
8056 && regclass[6] == X86_64_SSEUP_CLASS
8057 && regclass[7] == X86_64_SSEUP_CLASS
8058 && mode != BLKmode)
8059 return gen_reg_or_parallel (mode, orig_mode,
8060 SSE_REGNO (sse_regno));
8061 if (n == 2
8062 && regclass[0] == X86_64_X87_CLASS
8063 && regclass[1] == X86_64_X87UP_CLASS)
8064 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8066 if (n == 2
8067 && regclass[0] == X86_64_INTEGER_CLASS
8068 && regclass[1] == X86_64_INTEGER_CLASS
8069 && (mode == CDImode || mode == TImode)
8070 && intreg[0] + 1 == intreg[1])
8071 return gen_rtx_REG (mode, intreg[0]);
8073 /* Otherwise figure out the entries of the PARALLEL. */
8074 for (i = 0; i < n; i++)
8076 int pos;
8078 switch (regclass[i])
8080 case X86_64_NO_CLASS:
8081 break;
8082 case X86_64_INTEGER_CLASS:
8083 case X86_64_INTEGERSI_CLASS:
8084 /* Merge TImodes on aligned occasions here too. */
8085 if (i * 8 + 8 > bytes)
8087 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8088 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8089 /* We've requested 24 bytes we
8090 don't have mode for. Use DImode. */
8091 tmpmode = DImode;
8093 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8094 tmpmode = SImode;
8095 else
8096 tmpmode = DImode;
8097 exp [nexps++]
8098 = gen_rtx_EXPR_LIST (VOIDmode,
8099 gen_rtx_REG (tmpmode, *intreg),
8100 GEN_INT (i*8));
8101 intreg++;
8102 break;
8103 case X86_64_SSESF_CLASS:
8104 exp [nexps++]
8105 = gen_rtx_EXPR_LIST (VOIDmode,
8106 gen_rtx_REG (SFmode,
8107 SSE_REGNO (sse_regno)),
8108 GEN_INT (i*8));
8109 sse_regno++;
8110 break;
8111 case X86_64_SSEDF_CLASS:
8112 exp [nexps++]
8113 = gen_rtx_EXPR_LIST (VOIDmode,
8114 gen_rtx_REG (DFmode,
8115 SSE_REGNO (sse_regno)),
8116 GEN_INT (i*8));
8117 sse_regno++;
8118 break;
8119 case X86_64_SSE_CLASS:
8120 pos = i;
8121 switch (n)
8123 case 1:
8124 tmpmode = DImode;
8125 break;
8126 case 2:
8127 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8129 tmpmode = TImode;
8130 i++;
8132 else
8133 tmpmode = DImode;
8134 break;
8135 case 4:
8136 gcc_assert (i == 0
8137 && regclass[1] == X86_64_SSEUP_CLASS
8138 && regclass[2] == X86_64_SSEUP_CLASS
8139 && regclass[3] == X86_64_SSEUP_CLASS);
8140 tmpmode = OImode;
8141 i += 3;
8142 break;
8143 case 8:
8144 gcc_assert (i == 0
8145 && regclass[1] == X86_64_SSEUP_CLASS
8146 && regclass[2] == X86_64_SSEUP_CLASS
8147 && regclass[3] == X86_64_SSEUP_CLASS
8148 && regclass[4] == X86_64_SSEUP_CLASS
8149 && regclass[5] == X86_64_SSEUP_CLASS
8150 && regclass[6] == X86_64_SSEUP_CLASS
8151 && regclass[7] == X86_64_SSEUP_CLASS);
8152 tmpmode = XImode;
8153 i += 7;
8154 break;
8155 default:
8156 gcc_unreachable ();
8158 exp [nexps++]
8159 = gen_rtx_EXPR_LIST (VOIDmode,
8160 gen_rtx_REG (tmpmode,
8161 SSE_REGNO (sse_regno)),
8162 GEN_INT (pos*8));
8163 sse_regno++;
8164 break;
8165 default:
8166 gcc_unreachable ();
8170 /* Empty aligned struct, union or class. */
8171 if (nexps == 0)
8172 return NULL;
8174 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8175 for (i = 0; i < nexps; i++)
8176 XVECEXP (ret, 0, i) = exp [i];
8177 return ret;
8180 /* Update the data in CUM to advance over an argument of mode MODE
8181 and data type TYPE. (TYPE is null for libcalls where that information
8182 may not be available.)
8184 Return the number of integer registers advanced over. */
8186 static int
8187 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8188 const_tree type, HOST_WIDE_INT bytes,
8189 HOST_WIDE_INT words)
8191 int res = 0;
8192 bool error_p = false;
8194 if (TARGET_IAMCU)
8196 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8197 bytes in registers. */
8198 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8199 goto pass_in_reg;
8200 return res;
8203 switch (mode)
8205 default:
8206 break;
8208 case E_BLKmode:
8209 if (bytes < 0)
8210 break;
8211 /* FALLTHRU */
8213 case E_DImode:
8214 case E_SImode:
8215 case E_HImode:
8216 case E_QImode:
8217 pass_in_reg:
8218 cum->words += words;
8219 cum->nregs -= words;
8220 cum->regno += words;
8221 if (cum->nregs >= 0)
8222 res = words;
8223 if (cum->nregs <= 0)
8225 cum->nregs = 0;
8226 cfun->machine->arg_reg_available = false;
8227 cum->regno = 0;
8229 break;
8231 case E_OImode:
8232 /* OImode shouldn't be used directly. */
8233 gcc_unreachable ();
8235 case E_DFmode:
8236 if (cum->float_in_sse == -1)
8237 error_p = true;
8238 if (cum->float_in_sse < 2)
8239 break;
8240 /* FALLTHRU */
8241 case E_SFmode:
8242 if (cum->float_in_sse == -1)
8243 error_p = true;
8244 if (cum->float_in_sse < 1)
8245 break;
8246 /* FALLTHRU */
8248 case E_V8SFmode:
8249 case E_V8SImode:
8250 case E_V64QImode:
8251 case E_V32HImode:
8252 case E_V16SImode:
8253 case E_V8DImode:
8254 case E_V16SFmode:
8255 case E_V8DFmode:
8256 case E_V32QImode:
8257 case E_V16HImode:
8258 case E_V4DFmode:
8259 case E_V4DImode:
8260 case E_TImode:
8261 case E_V16QImode:
8262 case E_V8HImode:
8263 case E_V4SImode:
8264 case E_V2DImode:
8265 case E_V4SFmode:
8266 case E_V2DFmode:
8267 if (!type || !AGGREGATE_TYPE_P (type))
8269 cum->sse_words += words;
8270 cum->sse_nregs -= 1;
8271 cum->sse_regno += 1;
8272 if (cum->sse_nregs <= 0)
8274 cum->sse_nregs = 0;
8275 cum->sse_regno = 0;
8278 break;
8280 case E_V8QImode:
8281 case E_V4HImode:
8282 case E_V2SImode:
8283 case E_V2SFmode:
8284 case E_V1TImode:
8285 case E_V1DImode:
8286 if (!type || !AGGREGATE_TYPE_P (type))
8288 cum->mmx_words += words;
8289 cum->mmx_nregs -= 1;
8290 cum->mmx_regno += 1;
8291 if (cum->mmx_nregs <= 0)
8293 cum->mmx_nregs = 0;
8294 cum->mmx_regno = 0;
8297 break;
8299 if (error_p)
8301 cum->float_in_sse = 0;
8302 error ("calling %qD with SSE calling convention without "
8303 "SSE/SSE2 enabled", cum->decl);
8304 sorry ("this is a GCC bug that can be worked around by adding "
8305 "attribute used to function called");
8308 return res;
8311 static int
8312 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8313 const_tree type, HOST_WIDE_INT words, bool named)
8315 int int_nregs, sse_nregs;
8317 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8318 if (!named && (VALID_AVX512F_REG_MODE (mode)
8319 || VALID_AVX256_REG_MODE (mode)))
8320 return 0;
8322 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8323 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8325 cum->nregs -= int_nregs;
8326 cum->sse_nregs -= sse_nregs;
8327 cum->regno += int_nregs;
8328 cum->sse_regno += sse_nregs;
8329 return int_nregs;
8331 else
8333 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8334 cum->words = ROUND_UP (cum->words, align);
8335 cum->words += words;
8336 return 0;
8340 static int
8341 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8342 HOST_WIDE_INT words)
8344 /* Otherwise, this should be passed indirect. */
8345 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8347 cum->words += words;
8348 if (cum->nregs > 0)
8350 cum->nregs -= 1;
8351 cum->regno += 1;
8352 return 1;
8354 return 0;
8357 /* Update the data in CUM to advance over an argument of mode MODE and
8358 data type TYPE. (TYPE is null for libcalls where that information
8359 may not be available.) */
8361 static void
8362 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8363 const_tree type, bool named)
8365 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8366 HOST_WIDE_INT bytes, words;
8367 int nregs;
8369 /* The argument of interrupt handler is a special case and is
8370 handled in ix86_function_arg. */
8371 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8372 return;
8374 if (mode == BLKmode)
8375 bytes = int_size_in_bytes (type);
8376 else
8377 bytes = GET_MODE_SIZE (mode);
8378 words = CEIL (bytes, UNITS_PER_WORD);
8380 if (type)
8381 mode = type_natural_mode (type, NULL, false);
8383 if ((type && POINTER_BOUNDS_TYPE_P (type))
8384 || POINTER_BOUNDS_MODE_P (mode))
8386 /* If we pass bounds in BT then just update the remaining bounds count. */
8387 if (cum->bnds_in_bt)
8389 cum->bnds_in_bt--;
8390 return;
8393 /* Update the remaining number of bounds to force. */
8394 if (cum->force_bnd_pass)
8395 cum->force_bnd_pass--;
8397 cum->bnd_regno++;
8399 return;
8402 /* The first arg not going to Bounds Tables resets this counter. */
8403 cum->bnds_in_bt = 0;
8404 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8405 the passed and received types do not match. If bounds do not follow an
8406 unnamed arg, still pretend the required number of bounds were passed. */
8407 if (cum->force_bnd_pass)
8409 cum->bnd_regno += cum->force_bnd_pass;
8410 cum->force_bnd_pass = 0;
8413 if (TARGET_64BIT)
8415 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8417 if (call_abi == MS_ABI)
8418 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8419 else
8420 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8422 else
8423 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8425 /* For stdarg we expect bounds to be passed for each value passed
8426 in register. */
8427 if (cum->stdarg)
8428 cum->force_bnd_pass = nregs;
8429 /* For pointers passed in memory we expect bounds passed in Bounds
8430 Table. */
8431 if (!nregs)
8433 /* Track if there are outgoing arguments on stack. */
8434 if (cum->caller)
8435 cfun->machine->outgoing_args_on_stack = true;
8437 cum->bnds_in_bt = chkp_type_bounds_count (type);
8441 /* Define where to put the arguments to a function.
8442 Value is zero to push the argument on the stack,
8443 or a hard register in which to store the argument.
8445 MODE is the argument's machine mode.
8446 TYPE is the data type of the argument (as a tree).
8447 This is null for libcalls where that information may
8448 not be available.
8449 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8450 the preceding args and about the function being called.
8451 NAMED is nonzero if this argument is a named parameter
8452 (otherwise it is an extra parameter matching an ellipsis). */
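/* For illustration, with the fastcall convention a declaration such as

     int __attribute__ ((fastcall)) f (int a, int b, int c);

   places A in %ecx and B in %edx, while C goes on the stack; aggregates
   and DImode values are never passed in registers by fastcall.  */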
8454 static rtx
8455 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8456 machine_mode orig_mode, const_tree type,
8457 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8459 bool error_p = false;
8461 /* Avoid the AL settings for the Unix64 ABI. */
8462 if (mode == VOIDmode)
8463 return constm1_rtx;
8465 if (TARGET_IAMCU)
8467 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8468 bytes in registers. */
8469 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8470 goto pass_in_reg;
8471 return NULL_RTX;
8474 switch (mode)
8476 default:
8477 break;
8479 case E_BLKmode:
8480 if (bytes < 0)
8481 break;
8482 /* FALLTHRU */
8483 case E_DImode:
8484 case E_SImode:
8485 case E_HImode:
8486 case E_QImode:
8487 pass_in_reg:
8488 if (words <= cum->nregs)
8490 int regno = cum->regno;
8492 /* Fastcall allocates the first two DWORD (SImode) or
8493 smaller arguments to ECX and EDX if the argument isn't an
8494 aggregate type. */
8495 if (cum->fastcall)
8497 if (mode == BLKmode
8498 || mode == DImode
8499 || (type && AGGREGATE_TYPE_P (type)))
8500 break;
8502 /* ECX, not EAX, is the first allocated register. */
8503 if (regno == AX_REG)
8504 regno = CX_REG;
8506 return gen_rtx_REG (mode, regno);
8508 break;
8510 case E_DFmode:
8511 if (cum->float_in_sse == -1)
8512 error_p = true;
8513 if (cum->float_in_sse < 2)
8514 break;
8515 /* FALLTHRU */
8516 case E_SFmode:
8517 if (cum->float_in_sse == -1)
8518 error_p = true;
8519 if (cum->float_in_sse < 1)
8520 break;
8521 /* FALLTHRU */
8522 case E_TImode:
8523 /* In 32bit, we pass TImode in xmm registers. */
8524 case E_V16QImode:
8525 case E_V8HImode:
8526 case E_V4SImode:
8527 case E_V2DImode:
8528 case E_V4SFmode:
8529 case E_V2DFmode:
8530 if (!type || !AGGREGATE_TYPE_P (type))
8532 if (cum->sse_nregs)
8533 return gen_reg_or_parallel (mode, orig_mode,
8534 cum->sse_regno + FIRST_SSE_REG);
8536 break;
8538 case E_OImode:
8539 case E_XImode:
8540 /* OImode and XImode shouldn't be used directly. */
8541 gcc_unreachable ();
8543 case E_V64QImode:
8544 case E_V32HImode:
8545 case E_V16SImode:
8546 case E_V8DImode:
8547 case E_V16SFmode:
8548 case E_V8DFmode:
8549 case E_V8SFmode:
8550 case E_V8SImode:
8551 case E_V32QImode:
8552 case E_V16HImode:
8553 case E_V4DFmode:
8554 case E_V4DImode:
8555 if (!type || !AGGREGATE_TYPE_P (type))
8557 if (cum->sse_nregs)
8558 return gen_reg_or_parallel (mode, orig_mode,
8559 cum->sse_regno + FIRST_SSE_REG);
8561 break;
8563 case E_V8QImode:
8564 case E_V4HImode:
8565 case E_V2SImode:
8566 case E_V2SFmode:
8567 case E_V1TImode:
8568 case E_V1DImode:
8569 if (!type || !AGGREGATE_TYPE_P (type))
8571 if (cum->mmx_nregs)
8572 return gen_reg_or_parallel (mode, orig_mode,
8573 cum->mmx_regno + FIRST_MMX_REG);
8575 break;
8577 if (error_p)
8579 cum->float_in_sse = 0;
8580 error ("calling %qD with SSE calling convention without "
8581 "SSE/SSE2 enabled", cum->decl);
8582 sorry ("this is a GCC bug that can be worked around by adding "
8583 "attribute used to function called");
8586 return NULL_RTX;
8589 static rtx
8590 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8591 machine_mode orig_mode, const_tree type, bool named)
8593 /* Handle a hidden AL argument containing number of registers
8594 for varargs x86-64 functions. */
8595 if (mode == VOIDmode)
8596 return GEN_INT (cum->maybe_vaarg
8597 ? (cum->sse_nregs < 0
8598 ? X86_64_SSE_REGPARM_MAX
8599 : cum->sse_regno)
8600 : -1);
8602 switch (mode)
8604 default:
8605 break;
8607 case E_V8SFmode:
8608 case E_V8SImode:
8609 case E_V32QImode:
8610 case E_V16HImode:
8611 case E_V4DFmode:
8612 case E_V4DImode:
8613 case E_V16SFmode:
8614 case E_V16SImode:
8615 case E_V64QImode:
8616 case E_V32HImode:
8617 case E_V8DFmode:
8618 case E_V8DImode:
8619 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8620 if (!named)
8621 return NULL;
8622 break;
8625 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8626 cum->sse_nregs,
8627 &x86_64_int_parameter_registers [cum->regno],
8628 cum->sse_regno);
8631 static rtx
8632 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8633 machine_mode orig_mode, bool named,
8634 HOST_WIDE_INT bytes)
8636 unsigned int regno;
8638 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8639 We use the value -2 to specify that the current function call is MSABI. */
8640 if (mode == VOIDmode)
8641 return GEN_INT (-2);
8643 /* If we've run out of registers, it goes on the stack. */
8644 if (cum->nregs == 0)
8645 return NULL_RTX;
8647 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8649 /* Only floating point modes are passed in anything but integer regs. */
8650 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8652 if (named)
8653 regno = cum->regno + FIRST_SSE_REG;
8654 else
8656 rtx t1, t2;
8658 /* Unnamed floating parameters are passed in both the
8659 SSE and integer registers. */
8660 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8661 t2 = gen_rtx_REG (mode, regno);
8662 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8663 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8664 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8667 /* Handle aggregate types passed in a register. */
8668 if (orig_mode == BLKmode)
8670 if (bytes > 0 && bytes <= 8)
8671 mode = (bytes > 4 ? DImode : SImode);
8672 if (mode == BLKmode)
8673 mode = DImode;
8676 return gen_reg_or_parallel (mode, orig_mode, regno);
8679 /* Return where to put the arguments to a function.
8680 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8682 MODE is the argument's machine mode. TYPE is the data type of the
8683 argument. It is null for libcalls where that information may not be
8684 available. CUM gives information about the preceding args and about
8685 the function being called. NAMED is nonzero if this argument is a
8686 named parameter (otherwise it is an extra parameter matching an
8687 ellipsis). */
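/* A rough illustration of the interrupt/exception handler case handled
   in the function below: for a handler declared along the lines of

     void __attribute__ ((interrupt))
     handler (struct interrupt_frame *frame, uword_t error_code);

   FRAME is found at -WORD(AP) and the word-mode ERROR_CODE at
   -2*WORD(AP) in the handler's frame, instead of being passed in
   registers.  */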
8689 static rtx
8690 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8691 const_tree type, bool named)
8693 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8694 machine_mode mode = omode;
8695 HOST_WIDE_INT bytes, words;
8696 rtx arg;
8698 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8700 gcc_assert (type != NULL_TREE);
8701 if (POINTER_TYPE_P (type))
8703 /* This is the pointer argument. */
8704 gcc_assert (TYPE_MODE (type) == Pmode);
8705 /* It is at -WORD(AP) in the current frame in interrupt and
8706 exception handlers. */
8707 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8709 else
8711 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8712 && TREE_CODE (type) == INTEGER_TYPE
8713 && TYPE_MODE (type) == word_mode);
8714 /* The error code is the word-mode integer argument at
8715 -2 * WORD(AP) in the current frame of the exception
8716 handler. */
8717 arg = gen_rtx_MEM (word_mode,
8718 plus_constant (Pmode,
8719 arg_pointer_rtx,
8720 -2 * UNITS_PER_WORD));
8722 return arg;
8725 /* All pointer bounds arguments are handled separately here. */
8726 if ((type && POINTER_BOUNDS_TYPE_P (type))
8727 || POINTER_BOUNDS_MODE_P (mode))
8729 /* Return NULL if bounds are forced to go in Bounds Table. */
8730 if (cum->bnds_in_bt)
8731 arg = NULL;
8732 /* Return the next available bound reg if any. */
8733 else if (cum->bnd_regno <= LAST_BND_REG)
8734 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8735 /* Return the next special slot number otherwise. */
8736 else
8737 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8739 return arg;
8742 if (mode == BLKmode)
8743 bytes = int_size_in_bytes (type);
8744 else
8745 bytes = GET_MODE_SIZE (mode);
8746 words = CEIL (bytes, UNITS_PER_WORD);
8748 /* To simplify the code below, represent vector types with a vector mode
8749 even if MMX/SSE are not active. */
8750 if (type && TREE_CODE (type) == VECTOR_TYPE)
8751 mode = type_natural_mode (type, cum, false);
8753 if (TARGET_64BIT)
8755 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8757 if (call_abi == MS_ABI)
8758 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8759 else
8760 arg = function_arg_64 (cum, mode, omode, type, named);
8762 else
8763 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8765 /* Track if there are outgoing arguments on stack. */
8766 if (arg == NULL_RTX && cum->caller)
8767 cfun->machine->outgoing_args_on_stack = true;
8769 return arg;
8772 /* A C expression that indicates when an argument must be passed by
8773 reference. If nonzero for an argument, a copy of that argument is
8774 made in memory and a pointer to the argument is passed instead of
8775 the argument itself. The pointer is passed in whatever way is
8776 appropriate for passing a pointer to that type. */
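/* A couple of illustrative cases (sketch): under the Windows x64 ABI a
   24-byte struct or an __m128 argument is passed by reference because
   its size is not 1, 2, 4 or 8 bytes, whereas a plain int or an 8-byte
   struct is passed by value.  The SYSV x86-64 ABI passes large
   aggregates on the stack instead, so this hook returns false for them;
   only variable-sized types are forced by reference there.  */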
8778 static bool
8779 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8780 const_tree type, bool)
8782 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8784 /* Bounds are never passed by reference. */
8785 if ((type && POINTER_BOUNDS_TYPE_P (type))
8786 || POINTER_BOUNDS_MODE_P (mode))
8787 return false;
8789 if (TARGET_64BIT)
8791 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8793 /* See Windows x64 Software Convention. */
8794 if (call_abi == MS_ABI)
8796 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8798 if (type)
8800 /* Arrays are passed by reference. */
8801 if (TREE_CODE (type) == ARRAY_TYPE)
8802 return true;
8804 if (RECORD_OR_UNION_TYPE_P (type))
8806 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8807 are passed by reference. */
8808 msize = int_size_in_bytes (type);
8812 /* __m128 is passed by reference. */
8813 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8815 else if (type && int_size_in_bytes (type) == -1)
8816 return true;
8819 return false;
8822 /* Return true when TYPE should be 128bit aligned for 32bit argument
8823 passing ABI. XXX: This function is obsolete and is only used for
8824 checking psABI compatibility with previous versions of GCC. */
8826 static bool
8827 ix86_compat_aligned_value_p (const_tree type)
8829 machine_mode mode = TYPE_MODE (type);
8830 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8831 || mode == TDmode
8832 || mode == TFmode
8833 || mode == TCmode)
8834 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8835 return true;
8836 if (TYPE_ALIGN (type) < 128)
8837 return false;
8839 if (AGGREGATE_TYPE_P (type))
8841 /* Walk the aggregates recursively. */
8842 switch (TREE_CODE (type))
8844 case RECORD_TYPE:
8845 case UNION_TYPE:
8846 case QUAL_UNION_TYPE:
8848 tree field;
8850 /* Walk all the structure fields. */
8851 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8853 if (TREE_CODE (field) == FIELD_DECL
8854 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8855 return true;
8857 break;
8860 case ARRAY_TYPE:
8861 /* Just for use if some language passes arrays by value. */
8862 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8863 return true;
8864 break;
8866 default:
8867 gcc_unreachable ();
8870 return false;
8873 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8874 XXX: This function is obsolete and is only used for checking psABI
8875 compatibility with previous versions of GCC. */
8877 static unsigned int
8878 ix86_compat_function_arg_boundary (machine_mode mode,
8879 const_tree type, unsigned int align)
8881 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8882 natural boundaries. */
8883 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8885 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8886 make an exception for SSE modes since these require 128bit
8887 alignment.
8889 The handling here differs from field_alignment. ICC aligns MMX
8890 arguments to 4 byte boundaries, while structure fields are aligned
8891 to 8 byte boundaries. */
8892 if (!type)
8894 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8895 align = PARM_BOUNDARY;
8897 else
8899 if (!ix86_compat_aligned_value_p (type))
8900 align = PARM_BOUNDARY;
8903 if (align > BIGGEST_ALIGNMENT)
8904 align = BIGGEST_ALIGNMENT;
8905 return align;
8908 /* Return true when TYPE should be 128bit aligned for 32bit argument
8909 passing ABI. */
8911 static bool
8912 ix86_contains_aligned_value_p (const_tree type)
8914 machine_mode mode = TYPE_MODE (type);
8916 if (mode == XFmode || mode == XCmode)
8917 return false;
8919 if (TYPE_ALIGN (type) < 128)
8920 return false;
8922 if (AGGREGATE_TYPE_P (type))
8924 /* Walk the aggregates recursively. */
8925 switch (TREE_CODE (type))
8927 case RECORD_TYPE:
8928 case UNION_TYPE:
8929 case QUAL_UNION_TYPE:
8931 tree field;
8933 /* Walk all the structure fields. */
8934 for (field = TYPE_FIELDS (type);
8935 field;
8936 field = DECL_CHAIN (field))
8938 if (TREE_CODE (field) == FIELD_DECL
8939 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8940 return true;
8942 break;
8945 case ARRAY_TYPE:
8946 /* Just for use if some language passes arrays by value. */
8947 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8948 return true;
8949 break;
8951 default:
8952 gcc_unreachable ();
8955 else
8956 return TYPE_ALIGN (type) >= 128;
8958 return false;
8961 /* Gives the alignment boundary, in bits, of an argument with the
8962 specified mode and type. */
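/* A few illustrative results (sketch): on ia32 a double argument gets
   PARM_BOUNDARY (32-bit) alignment, while an __m128 or other
   128-bit-aligned SSE type keeps its 128-bit boundary; on x86-64 the
   type's natural alignment is used, but never less than PARM_BOUNDARY.  */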
8964 static unsigned int
8965 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8967 unsigned int align;
8968 if (type)
8970 /* Since the main variant type is used for the call, convert the
8971 type to its main variant. */
8972 type = TYPE_MAIN_VARIANT (type);
8973 align = TYPE_ALIGN (type);
8975 else
8976 align = GET_MODE_ALIGNMENT (mode);
8977 if (align < PARM_BOUNDARY)
8978 align = PARM_BOUNDARY;
8979 else
8981 static bool warned;
8982 unsigned int saved_align = align;
8984 if (!TARGET_64BIT)
8986 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8987 if (!type)
8989 if (mode == XFmode || mode == XCmode)
8990 align = PARM_BOUNDARY;
8992 else if (!ix86_contains_aligned_value_p (type))
8993 align = PARM_BOUNDARY;
8995 if (align < 128)
8996 align = PARM_BOUNDARY;
8999 if (warn_psabi
9000 && !warned
9001 && align != ix86_compat_function_arg_boundary (mode, type,
9002 saved_align))
9004 warned = true;
9005 inform (input_location,
9006 "The ABI for passing parameters with %d-byte"
9007 " alignment has changed in GCC 4.6",
9008 align / BITS_PER_UNIT);
9012 return align;
9015 /* Return true if N is a possible register number of function value. */
9017 static bool
9018 ix86_function_value_regno_p (const unsigned int regno)
9020 switch (regno)
9022 case AX_REG:
9023 return true;
9024 case DX_REG:
9025 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9026 case DI_REG:
9027 case SI_REG:
9028 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9030 case BND0_REG:
9031 case BND1_REG:
9032 return chkp_function_instrumented_p (current_function_decl);
9034 /* Complex values are returned in %st(0)/%st(1) pair. */
9035 case ST0_REG:
9036 case ST1_REG:
9037 /* TODO: The function should depend on current function ABI but
9038 builtins.c would need updating then. Therefore we use the
9039 default ABI. */
9040 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9041 return false;
9042 return TARGET_FLOAT_RETURNS_IN_80387;
9044 /* Complex values are returned in %xmm0/%xmm1 pair. */
9045 case XMM0_REG:
9046 case XMM1_REG:
9047 return TARGET_SSE;
9049 case MM0_REG:
9050 if (TARGET_MACHO || TARGET_64BIT)
9051 return false;
9052 return TARGET_MMX;
9055 return false;
9058 /* Define how to find the value returned by a function.
9059 VALTYPE is the data type of the value (as a tree).
9060 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9061 otherwise, FUNC is 0. */
9063 static rtx
9064 function_value_32 (machine_mode orig_mode, machine_mode mode,
9065 const_tree fntype, const_tree fn)
9067 unsigned int regno;
9069 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9070 we normally prevent this case when mmx is not available. However
9071 some ABIs may require the result to be returned like DImode. */
9072 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9073 regno = FIRST_MMX_REG;
9075 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9076 we prevent this case when sse is not available. However some ABIs
9077 may require the result to be returned like integer TImode. */
9078 else if (mode == TImode
9079 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9080 regno = FIRST_SSE_REG;
9082 /* 32-byte vector modes in %ymm0. */
9083 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9084 regno = FIRST_SSE_REG;
9086 /* 64-byte vector modes in %zmm0. */
9087 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9088 regno = FIRST_SSE_REG;
9090 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9091 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9092 regno = FIRST_FLOAT_REG;
9093 else
9094 /* Most things go in %eax. */
9095 regno = AX_REG;
9097 /* Override FP return register with %xmm0 for local functions when
9098 SSE math is enabled or for functions with sseregparm attribute. */
9099 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9101 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9102 if (sse_level == -1)
9104 error ("calling %qD with SSE calling convention without "
9105 "SSE/SSE2 enabled", fn);
9106 sorry ("this is a GCC bug that can be worked around by adding "
9107 "attribute used to function called");
9109 else if ((sse_level >= 1 && mode == SFmode)
9110 || (sse_level == 2 && mode == DFmode))
9111 regno = FIRST_SSE_REG;
9114 /* OImode shouldn't be used directly. */
9115 gcc_assert (mode != OImode);
9117 return gen_rtx_REG (orig_mode, regno);
9120 static rtx
9121 function_value_64 (machine_mode orig_mode, machine_mode mode,
9122 const_tree valtype)
9124 rtx ret;
9126 /* Handle libcalls, which don't provide a type node. */
9127 if (valtype == NULL)
9129 unsigned int regno;
9131 switch (mode)
9133 case E_SFmode:
9134 case E_SCmode:
9135 case E_DFmode:
9136 case E_DCmode:
9137 case E_TFmode:
9138 case E_SDmode:
9139 case E_DDmode:
9140 case E_TDmode:
9141 regno = FIRST_SSE_REG;
9142 break;
9143 case E_XFmode:
9144 case E_XCmode:
9145 regno = FIRST_FLOAT_REG;
9146 break;
9147 case E_TCmode:
9148 return NULL;
9149 default:
9150 regno = AX_REG;
9153 return gen_rtx_REG (mode, regno);
9155 else if (POINTER_TYPE_P (valtype))
9157 /* Pointers are always returned in word_mode. */
9158 mode = word_mode;
9161 ret = construct_container (mode, orig_mode, valtype, 1,
9162 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9163 x86_64_int_return_registers, 0);
9165 /* For zero-sized structures, construct_container returns NULL, but we
9166 need to keep the rest of the compiler happy by returning a meaningful value. */
9167 if (!ret)
9168 ret = gen_rtx_REG (orig_mode, AX_REG);
9170 return ret;
9173 static rtx
9174 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9175 const_tree valtype)
9177 unsigned int regno = AX_REG;
9179 if (TARGET_SSE)
9181 switch (GET_MODE_SIZE (mode))
9183 case 16:
9184 if (valtype != NULL_TREE
9185 && !VECTOR_INTEGER_TYPE_P (valtype)
9187 && !INTEGRAL_TYPE_P (valtype)
9188 && !VECTOR_FLOAT_TYPE_P (valtype))
9189 break;
9190 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9191 && !COMPLEX_MODE_P (mode))
9192 regno = FIRST_SSE_REG;
9193 break;
9194 case 8:
9195 case 4:
9196 if (mode == SFmode || mode == DFmode)
9197 regno = FIRST_SSE_REG;
9198 break;
9199 default:
9200 break;
9203 return gen_rtx_REG (orig_mode, regno);
9206 static rtx
9207 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9208 machine_mode orig_mode, machine_mode mode)
9210 const_tree fn, fntype;
9212 fn = NULL_TREE;
9213 if (fntype_or_decl && DECL_P (fntype_or_decl))
9214 fn = fntype_or_decl;
9215 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9217 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9218 || POINTER_BOUNDS_MODE_P (mode))
9219 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9220 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9221 return function_value_ms_64 (orig_mode, mode, valtype);
9222 else if (TARGET_64BIT)
9223 return function_value_64 (orig_mode, mode, valtype);
9224 else
9225 return function_value_32 (orig_mode, mode, fntype, fn);
9228 static rtx
9229 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9231 machine_mode mode, orig_mode;
9233 orig_mode = TYPE_MODE (valtype);
9234 mode = type_natural_mode (valtype, NULL, true);
9235 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9238 /* Return an RTX representing a place where a function returns
9239 or receives pointer bounds, or NULL if no bounds are returned.
9241 VALTYPE is a data type of a value returned by the function.
9243 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9244 or FUNCTION_TYPE of the function.
9246 If OUTGOING is false, return a place in which the caller will
9247 see the return value. Otherwise, return a place where a
9248 function returns a value. */
9250 static rtx
9251 ix86_function_value_bounds (const_tree valtype,
9252 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9253 bool outgoing ATTRIBUTE_UNUSED)
9255 rtx res = NULL_RTX;
9257 if (BOUNDED_TYPE_P (valtype))
9258 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9259 else if (chkp_type_has_pointer (valtype))
9261 bitmap slots;
9262 rtx bounds[2];
9263 bitmap_iterator bi;
9264 unsigned i, bnd_no = 0;
9266 bitmap_obstack_initialize (NULL);
9267 slots = BITMAP_ALLOC (NULL);
9268 chkp_find_bound_slots (valtype, slots);
9270 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9272 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9273 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9274 gcc_assert (bnd_no < 2);
9275 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9278 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9280 BITMAP_FREE (slots);
9281 bitmap_obstack_release (NULL);
9283 else
9284 res = NULL_RTX;
9286 return res;
9289 /* Pointer function arguments and return values are promoted to
9290 word_mode for normal functions. */
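/* E.g. with -mx32, where pointers are 32 bits but word_mode is DImode,
   a pointer argument or return value of a normal function is
   zero-extended and passed or returned in a full 64-bit register;
   interrupt and exception handlers keep the original pointer mode.  */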
9292 static machine_mode
9293 ix86_promote_function_mode (const_tree type, machine_mode mode,
9294 int *punsignedp, const_tree fntype,
9295 int for_return)
9297 if (cfun->machine->func_type == TYPE_NORMAL
9298 && type != NULL_TREE
9299 && POINTER_TYPE_P (type))
9301 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9302 return word_mode;
9304 return default_promote_function_mode (type, mode, punsignedp, fntype,
9305 for_return);
9308 /* Return true if a structure, union or array with MODE containing FIELD
9309 should be accessed using BLKmode. */
9311 static bool
9312 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9314 /* Union with XFmode must be in BLKmode. */
9315 return (mode == XFmode
9316 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9317 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9321 ix86_libcall_value (machine_mode mode)
9323 return ix86_function_value_1 (NULL, NULL, mode, mode);
9326 /* Return true iff type is returned in memory. */
9328 static bool
9329 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9331 #ifdef SUBTARGET_RETURN_IN_MEMORY
9332 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9333 #else
9334 const machine_mode mode = type_natural_mode (type, NULL, true);
9335 HOST_WIDE_INT size;
9337 if (POINTER_BOUNDS_TYPE_P (type))
9338 return false;
9340 if (TARGET_64BIT)
9342 if (ix86_function_type_abi (fntype) == MS_ABI)
9344 size = int_size_in_bytes (type);
9346 /* __m128 is returned in xmm0. */
9347 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9348 || INTEGRAL_TYPE_P (type)
9349 || VECTOR_FLOAT_TYPE_P (type))
9350 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9351 && !COMPLEX_MODE_P (mode)
9352 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9353 return false;
9355 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9356 return size != 1 && size != 2 && size != 4 && size != 8;
9358 else
9360 int needed_intregs, needed_sseregs;
9362 return examine_argument (mode, type, 1,
9363 &needed_intregs, &needed_sseregs);
9366 else
9368 size = int_size_in_bytes (type);
9370 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9371 bytes in registers. */
9372 if (TARGET_IAMCU)
9373 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9375 if (mode == BLKmode)
9376 return true;
9378 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9379 return false;
9381 if (VECTOR_MODE_P (mode) || mode == TImode)
9383 /* User-created vectors small enough to fit in EAX. */
9384 if (size < 8)
9385 return false;
9387 /* Unless the ABI prescribes otherwise,
9388 MMX/3dNow values are returned in MM0 if available. */
9390 if (size == 8)
9391 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9393 /* SSE values are returned in XMM0 if available. */
9394 if (size == 16)
9395 return !TARGET_SSE;
9397 /* AVX values are returned in YMM0 if available. */
9398 if (size == 32)
9399 return !TARGET_AVX;
9401 /* AVX512F values are returned in ZMM0 if available. */
9402 if (size == 64)
9403 return !TARGET_AVX512F;
9406 if (mode == XFmode)
9407 return false;
9409 if (size > 12)
9410 return true;
9412 /* OImode shouldn't be used directly. */
9413 gcc_assert (mode != OImode);
9415 return false;
9417 #endif
9421 /* Create the va_list data type. */
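/* The 64-bit SYSV record built below corresponds roughly to this C
   declaration:

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

   and __builtin_va_list is an array of one such record.  */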
9423 static tree
9424 ix86_build_builtin_va_list_64 (void)
9426 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9428 record = lang_hooks.types.make_type (RECORD_TYPE);
9429 type_decl = build_decl (BUILTINS_LOCATION,
9430 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9432 f_gpr = build_decl (BUILTINS_LOCATION,
9433 FIELD_DECL, get_identifier ("gp_offset"),
9434 unsigned_type_node);
9435 f_fpr = build_decl (BUILTINS_LOCATION,
9436 FIELD_DECL, get_identifier ("fp_offset"),
9437 unsigned_type_node);
9438 f_ovf = build_decl (BUILTINS_LOCATION,
9439 FIELD_DECL, get_identifier ("overflow_arg_area"),
9440 ptr_type_node);
9441 f_sav = build_decl (BUILTINS_LOCATION,
9442 FIELD_DECL, get_identifier ("reg_save_area"),
9443 ptr_type_node);
9445 va_list_gpr_counter_field = f_gpr;
9446 va_list_fpr_counter_field = f_fpr;
9448 DECL_FIELD_CONTEXT (f_gpr) = record;
9449 DECL_FIELD_CONTEXT (f_fpr) = record;
9450 DECL_FIELD_CONTEXT (f_ovf) = record;
9451 DECL_FIELD_CONTEXT (f_sav) = record;
9453 TYPE_STUB_DECL (record) = type_decl;
9454 TYPE_NAME (record) = type_decl;
9455 TYPE_FIELDS (record) = f_gpr;
9456 DECL_CHAIN (f_gpr) = f_fpr;
9457 DECL_CHAIN (f_fpr) = f_ovf;
9458 DECL_CHAIN (f_ovf) = f_sav;
9460 layout_type (record);
9462 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9463 NULL_TREE, TYPE_ATTRIBUTES (record));
9465 /* The correct type is an array type of one element. */
9466 return build_array_type (record, build_index_type (size_zero_node));
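/* For reference, the record built above corresponds to the va_list layout
   of the x86-64 SysV psABI; written as C it is roughly (a sketch, not part
   of the original sources):

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];  */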
9469 /* Setup the builtin va_list data type and for 64-bit the additional
9470 calling convention specific va_list data types. */
9472 static tree
9473 ix86_build_builtin_va_list (void)
9475 if (TARGET_64BIT)
9477 /* Initialize ABI specific va_list builtin types.
9479 In lto1, we can encounter two va_list types:
9480 - one as a result of the type-merge across TUs, and
9481 - the one constructed here.
9482 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9483 a type identity check in canonical_va_list_type based on
9484 TYPE_MAIN_VARIANT (which we used to have) will not work.
9485 Instead, we tag each va_list_type_node with its unique attribute, and
9486 look for the attribute in the type identity check in
9487 canonical_va_list_type.
9489 Tagging sysv_va_list_type_node directly with the attribute is
9490 problematic since it's an array of one record, which will degrade into a
9491 pointer to record when used as parameter (see build_va_arg comments for
9492 an example), dropping the attribute in the process. So we tag the
9493 record instead. */
9495 /* For SYSV_ABI we use an array of one record. */
9496 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9498 /* For MS_ABI we use plain pointer to argument area. */
9499 tree char_ptr_type = build_pointer_type (char_type_node);
9500 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9501 TYPE_ATTRIBUTES (char_ptr_type));
9502 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9504 return ((ix86_abi == MS_ABI)
9505 ? ms_va_list_type_node
9506 : sysv_va_list_type_node);
9508 else
9510 /* For i386 we use plain pointer to argument area. */
9511 return build_pointer_type (char_type_node);
9515 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9517 static void
9518 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9520 rtx save_area, mem;
9521 alias_set_type set;
9522 int i, max;
9524 /* GPR size of varargs save area. */
9525 if (cfun->va_list_gpr_size)
9526 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9527 else
9528 ix86_varargs_gpr_size = 0;
9530 /* FPR size of varargs save area. We don't need it if we don't pass
9531 anything in SSE registers. */
9532 if (TARGET_SSE && cfun->va_list_fpr_size)
9533 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9534 else
9535 ix86_varargs_fpr_size = 0;
9537 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9538 return;
9540 save_area = frame_pointer_rtx;
9541 set = get_varargs_alias_set ();
9543 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9544 if (max > X86_64_REGPARM_MAX)
9545 max = X86_64_REGPARM_MAX;
9547 for (i = cum->regno; i < max; i++)
9549 mem = gen_rtx_MEM (word_mode,
9550 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9551 MEM_NOTRAP_P (mem) = 1;
9552 set_mem_alias_set (mem, set);
9553 emit_move_insn (mem,
9554 gen_rtx_REG (word_mode,
9555 x86_64_int_parameter_registers[i]));
9558 if (ix86_varargs_fpr_size)
9560 machine_mode smode;
9561 rtx_code_label *label;
9562 rtx test;
9564 /* Now emit code to save SSE registers. The AX parameter contains the number
9565 of SSE parameter registers used to call this function, though all we
9566 actually check here is the zero/non-zero status. */
9568 label = gen_label_rtx ();
9569 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9570 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9571 label));
9573 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9574 we used movdqa (i.e. TImode) instead? Perhaps even better would
9575 be if we could determine the real mode of the data, via a hook
9576 into pass_stdarg. Ignore all that for now. */
9577 smode = V4SFmode;
9578 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9579 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9581 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9582 if (max > X86_64_SSE_REGPARM_MAX)
9583 max = X86_64_SSE_REGPARM_MAX;
9585 for (i = cum->sse_regno; i < max; ++i)
9587 mem = plus_constant (Pmode, save_area,
9588 i * 16 + ix86_varargs_gpr_size);
9589 mem = gen_rtx_MEM (smode, mem);
9590 MEM_NOTRAP_P (mem) = 1;
9591 set_mem_alias_set (mem, set);
9592 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9594 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9597 emit_label (label);
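/* Sketch of the varargs register save area set up above, assuming the
   usual limits X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8
   (offsets are relative to the start of the save area):

     [  0 ..  47]   %rdi %rsi %rdx %rcx %r8 %r9   (8 bytes each)
     [ 48 .. 175]   %xmm0 .. %xmm7                (16 bytes each; written
                    only when %al is non-zero on entry)  */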
9601 static void
9602 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9604 alias_set_type set = get_varargs_alias_set ();
9605 int i;
9607 /* Reset to zero, as a sysv va_arg might have been
9608 used before. */
9609 ix86_varargs_gpr_size = 0;
9610 ix86_varargs_fpr_size = 0;
9612 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9614 rtx reg, mem;
9616 mem = gen_rtx_MEM (Pmode,
9617 plus_constant (Pmode, virtual_incoming_args_rtx,
9618 i * UNITS_PER_WORD));
9619 MEM_NOTRAP_P (mem) = 1;
9620 set_mem_alias_set (mem, set);
9622 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9623 emit_move_insn (mem, reg);
9627 static void
9628 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9629 tree type, int *, int no_rtl)
9631 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9632 CUMULATIVE_ARGS next_cum;
9633 tree fntype;
9635 /* This argument doesn't appear to be used anymore. Which is good,
9636 because the old code here didn't suppress rtl generation. */
9637 gcc_assert (!no_rtl);
9639 if (!TARGET_64BIT)
9640 return;
9642 fntype = TREE_TYPE (current_function_decl);
9644 /* For varargs, we do not want to skip the dummy va_dcl argument.
9645 For stdargs, we do want to skip the last named argument. */
9646 next_cum = *cum;
9647 if (stdarg_p (fntype))
9648 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9649 true);
9651 if (cum->call_abi == MS_ABI)
9652 setup_incoming_varargs_ms_64 (&next_cum);
9653 else
9654 setup_incoming_varargs_64 (&next_cum);
9657 static void
9658 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9659 machine_mode mode,
9660 tree type,
9661 int *pretend_size ATTRIBUTE_UNUSED,
9662 int no_rtl)
9664 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9665 CUMULATIVE_ARGS next_cum;
9666 tree fntype;
9667 rtx save_area;
9668 int bnd_reg, i, max;
9670 gcc_assert (!no_rtl);
9672 /* Do nothing if we use plain pointer to argument area. */
9673 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9674 return;
9676 fntype = TREE_TYPE (current_function_decl);
9678 /* For varargs, we do not want to skip the dummy va_dcl argument.
9679 For stdargs, we do want to skip the last named argument. */
9680 next_cum = *cum;
9681 if (stdarg_p (fntype))
9682 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9683 true);
9684 save_area = frame_pointer_rtx;
9686 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9687 if (max > X86_64_REGPARM_MAX)
9688 max = X86_64_REGPARM_MAX;
9690 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9691 if (chkp_function_instrumented_p (current_function_decl))
9692 for (i = cum->regno; i < max; i++)
9694 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9695 rtx ptr = gen_rtx_REG (Pmode,
9696 x86_64_int_parameter_registers[i]);
9697 rtx bounds;
9699 if (bnd_reg <= LAST_BND_REG)
9700 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9701 else
9703 rtx ldx_addr =
9704 plus_constant (Pmode, arg_pointer_rtx,
9705 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9706 bounds = gen_reg_rtx (BNDmode);
9707 emit_insn (BNDmode == BND64mode
9708 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9709 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9712 emit_insn (BNDmode == BND64mode
9713 ? gen_bnd64_stx (addr, ptr, bounds)
9714 : gen_bnd32_stx (addr, ptr, bounds));
9716 bnd_reg++;
9721 /* Checks if TYPE is of kind va_list char *. */
9723 static bool
9724 is_va_list_char_pointer (tree type)
9726 tree canonic;
9728 /* For 32-bit it is always true. */
9729 if (!TARGET_64BIT)
9730 return true;
9731 canonic = ix86_canonical_va_list_type (type);
9732 return (canonic == ms_va_list_type_node
9733 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9736 /* Implement va_start. */
9738 static void
9739 ix86_va_start (tree valist, rtx nextarg)
9741 HOST_WIDE_INT words, n_gpr, n_fpr;
9742 tree f_gpr, f_fpr, f_ovf, f_sav;
9743 tree gpr, fpr, ovf, sav, t;
9744 tree type;
9745 rtx ovf_rtx;
9747 if (flag_split_stack
9748 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9750 unsigned int scratch_regno;
9752 /* When we are splitting the stack, we can't refer to the stack
9753 arguments using internal_arg_pointer, because they may be on
9754 the old stack. The split stack prologue will arrange to
9755 leave a pointer to the old stack arguments in a scratch
9756 register, which we here copy to a pseudo-register. The split
9757 stack prologue can't set the pseudo-register directly because
9758 it (the prologue) runs before any registers have been saved. */
9760 scratch_regno = split_stack_prologue_scratch_regno ();
9761 if (scratch_regno != INVALID_REGNUM)
9763 rtx reg;
9764 rtx_insn *seq;
9766 reg = gen_reg_rtx (Pmode);
9767 cfun->machine->split_stack_varargs_pointer = reg;
9769 start_sequence ();
9770 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9771 seq = get_insns ();
9772 end_sequence ();
9774 push_topmost_sequence ();
9775 emit_insn_after (seq, entry_of_function ());
9776 pop_topmost_sequence ();
9780 /* Only 64-bit targets need something special. */
9781 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9783 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9784 std_expand_builtin_va_start (valist, nextarg);
9785 else
9787 rtx va_r, next;
9789 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9790 next = expand_binop (ptr_mode, add_optab,
9791 cfun->machine->split_stack_varargs_pointer,
9792 crtl->args.arg_offset_rtx,
9793 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9794 convert_move (va_r, next, 0);
9796 /* Store zero bounds for va_list. */
9797 if (chkp_function_instrumented_p (current_function_decl))
9798 chkp_expand_bounds_reset_for_mem (valist,
9799 make_tree (TREE_TYPE (valist),
9800 next));
9803 return;
9806 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9807 f_fpr = DECL_CHAIN (f_gpr);
9808 f_ovf = DECL_CHAIN (f_fpr);
9809 f_sav = DECL_CHAIN (f_ovf);
9811 valist = build_simple_mem_ref (valist);
9812 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9813 /* The following should be folded into the MEM_REF offset. */
9814 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9815 f_gpr, NULL_TREE);
9816 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9817 f_fpr, NULL_TREE);
9818 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9819 f_ovf, NULL_TREE);
9820 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9821 f_sav, NULL_TREE);
9823 /* Count number of gp and fp argument registers used. */
9824 words = crtl->args.info.words;
9825 n_gpr = crtl->args.info.regno;
9826 n_fpr = crtl->args.info.sse_regno;
9828 if (cfun->va_list_gpr_size)
9830 type = TREE_TYPE (gpr);
9831 t = build2 (MODIFY_EXPR, type,
9832 gpr, build_int_cst (type, n_gpr * 8));
9833 TREE_SIDE_EFFECTS (t) = 1;
9834 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9837 if (TARGET_SSE && cfun->va_list_fpr_size)
9839 type = TREE_TYPE (fpr);
9840 t = build2 (MODIFY_EXPR, type, fpr,
9841 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9842 TREE_SIDE_EFFECTS (t) = 1;
9843 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9846 /* Find the overflow area. */
9847 type = TREE_TYPE (ovf);
9848 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9849 ovf_rtx = crtl->args.internal_arg_pointer;
9850 else
9851 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9852 t = make_tree (type, ovf_rtx);
9853 if (words != 0)
9854 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9856 /* Store zero bounds for overflow area pointer. */
9857 if (chkp_function_instrumented_p (current_function_decl))
9858 chkp_expand_bounds_reset_for_mem (ovf, t);
9860 t = build2 (MODIFY_EXPR, type, ovf, t);
9861 TREE_SIDE_EFFECTS (t) = 1;
9862 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9864 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9866 /* Find the register save area.
9867 The function prologue saves it right above the stack frame. */
9868 type = TREE_TYPE (sav);
9869 t = make_tree (type, frame_pointer_rtx);
9870 if (!ix86_varargs_gpr_size)
9871 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9873 /* Store zero bounds for save area pointer. */
9874 if (chkp_function_instrumented_p (current_function_decl))
9875 chkp_expand_bounds_reset_for_mem (sav, t);
9877 t = build2 (MODIFY_EXPR, type, sav, t);
9878 TREE_SIDE_EFFECTS (t) = 1;
9879 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
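/* Illustrative effect of the expansion above for a 64-bit SysV function
   whose named arguments consumed two integer registers and one SSE
   register (a sketch, assuming the usual register limits):

     ap->gp_offset = 2 * 8;              == 16
     ap->fp_offset = 6 * 8 + 1 * 16;     == 64
     ap->overflow_arg_area = <incoming argument pointer> + words * 8;
     ap->reg_save_area = <register save area>;  */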
9883 /* Implement va_arg. */
9885 static tree
9886 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9887 gimple_seq *post_p)
9889 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9890 tree f_gpr, f_fpr, f_ovf, f_sav;
9891 tree gpr, fpr, ovf, sav, t;
9892 int size, rsize;
9893 tree lab_false, lab_over = NULL_TREE;
9894 tree addr, t2;
9895 rtx container;
9896 int indirect_p = 0;
9897 tree ptrtype;
9898 machine_mode nat_mode;
9899 unsigned int arg_boundary;
9901 /* Only 64-bit targets need something special. */
9902 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9903 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9905 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9906 f_fpr = DECL_CHAIN (f_gpr);
9907 f_ovf = DECL_CHAIN (f_fpr);
9908 f_sav = DECL_CHAIN (f_ovf);
9910 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9911 valist, f_gpr, NULL_TREE);
9913 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9914 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9915 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9917 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9918 if (indirect_p)
9919 type = build_pointer_type (type);
9920 size = arg_int_size_in_bytes (type);
9921 rsize = CEIL (size, UNITS_PER_WORD);
9923 nat_mode = type_natural_mode (type, NULL, false);
9924 switch (nat_mode)
9926 case E_V8SFmode:
9927 case E_V8SImode:
9928 case E_V32QImode:
9929 case E_V16HImode:
9930 case E_V4DFmode:
9931 case E_V4DImode:
9932 case E_V16SFmode:
9933 case E_V16SImode:
9934 case E_V64QImode:
9935 case E_V32HImode:
9936 case E_V8DFmode:
9937 case E_V8DImode:
9938 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
9939 if (!TARGET_64BIT_MS_ABI)
9941 container = NULL;
9942 break;
9944 /* FALLTHRU */
9946 default:
9947 container = construct_container (nat_mode, TYPE_MODE (type),
9948 type, 0, X86_64_REGPARM_MAX,
9949 X86_64_SSE_REGPARM_MAX, intreg,
9951 break;
9954 /* Pull the value out of the saved registers. */
9956 addr = create_tmp_var (ptr_type_node, "addr");
9958 if (container)
9960 int needed_intregs, needed_sseregs;
9961 bool need_temp;
9962 tree int_addr, sse_addr;
9964 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9965 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9967 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9969 need_temp = (!REG_P (container)
9970 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9971 || TYPE_ALIGN (type) > 128));
9973 /* In case we are passing a structure, verify that it is a consecutive
9974 block in the register save area. If not, we need to do moves. */
9975 if (!need_temp && !REG_P (container))
9977 /* Verify that all registers are strictly consecutive */
9978 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9980 int i;
9982 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9984 rtx slot = XVECEXP (container, 0, i);
9985 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9986 || INTVAL (XEXP (slot, 1)) != i * 16)
9987 need_temp = true;
9990 else
9992 int i;
9994 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9996 rtx slot = XVECEXP (container, 0, i);
9997 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
9998 || INTVAL (XEXP (slot, 1)) != i * 8)
9999 need_temp = true;
10003 if (!need_temp)
10005 int_addr = addr;
10006 sse_addr = addr;
10008 else
10010 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10011 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10014 /* First ensure that we fit completely in registers. */
10015 if (needed_intregs)
10017 t = build_int_cst (TREE_TYPE (gpr),
10018 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10019 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10020 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10021 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10022 gimplify_and_add (t, pre_p);
10024 if (needed_sseregs)
10026 t = build_int_cst (TREE_TYPE (fpr),
10027 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10028 + X86_64_REGPARM_MAX * 8);
10029 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10030 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10031 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10032 gimplify_and_add (t, pre_p);
10035 /* Compute index to start of area used for integer regs. */
10036 if (needed_intregs)
10038 /* int_addr = gpr + sav; */
10039 t = fold_build_pointer_plus (sav, gpr);
10040 gimplify_assign (int_addr, t, pre_p);
10042 if (needed_sseregs)
10044 /* sse_addr = fpr + sav; */
10045 t = fold_build_pointer_plus (sav, fpr);
10046 gimplify_assign (sse_addr, t, pre_p);
10048 if (need_temp)
10050 int i, prev_size = 0;
10051 tree temp = create_tmp_var (type, "va_arg_tmp");
10053 /* addr = &temp; */
10054 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10055 gimplify_assign (addr, t, pre_p);
10057 for (i = 0; i < XVECLEN (container, 0); i++)
10059 rtx slot = XVECEXP (container, 0, i);
10060 rtx reg = XEXP (slot, 0);
10061 machine_mode mode = GET_MODE (reg);
10062 tree piece_type;
10063 tree addr_type;
10064 tree daddr_type;
10065 tree src_addr, src;
10066 int src_offset;
10067 tree dest_addr, dest;
10068 int cur_size = GET_MODE_SIZE (mode);
10070 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10071 prev_size = INTVAL (XEXP (slot, 1));
10072 if (prev_size + cur_size > size)
10074 cur_size = size - prev_size;
10075 unsigned int nbits = cur_size * BITS_PER_UNIT;
10076 if (!int_mode_for_size (nbits, 1).exists (&mode))
10077 mode = QImode;
10079 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10080 if (mode == GET_MODE (reg))
10081 addr_type = build_pointer_type (piece_type);
10082 else
10083 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10084 true);
10085 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10086 true);
10088 if (SSE_REGNO_P (REGNO (reg)))
10090 src_addr = sse_addr;
10091 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10093 else
10095 src_addr = int_addr;
10096 src_offset = REGNO (reg) * 8;
10098 src_addr = fold_convert (addr_type, src_addr);
10099 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10101 dest_addr = fold_convert (daddr_type, addr);
10102 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10103 if (cur_size == GET_MODE_SIZE (mode))
10105 src = build_va_arg_indirect_ref (src_addr);
10106 dest = build_va_arg_indirect_ref (dest_addr);
10108 gimplify_assign (dest, src, pre_p);
10110 else
10112 tree copy
10113 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10114 3, dest_addr, src_addr,
10115 size_int (cur_size));
10116 gimplify_and_add (copy, pre_p);
10118 prev_size += cur_size;
10122 if (needed_intregs)
10124 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10125 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10126 gimplify_assign (gpr, t, pre_p);
10129 if (needed_sseregs)
10131 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10132 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10133 gimplify_assign (unshare_expr (fpr), t, pre_p);
10136 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10138 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10141 /* ... otherwise out of the overflow area. */
10143 /* When the caller aligns a parameter on the stack, an alignment
10144 beyond MAX_SUPPORTED_STACK_ALIGNMENT is reduced to
10145 MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee with the
10146 caller here. */
10147 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10148 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10149 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10151 /* Care for on-stack alignment if needed. */
10152 if (arg_boundary <= 64 || size == 0)
10153 t = ovf;
10154 else
10156 HOST_WIDE_INT align = arg_boundary / 8;
10157 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10158 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10159 build_int_cst (TREE_TYPE (t), -align));
10162 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10163 gimplify_assign (addr, t, pre_p);
10165 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10166 gimplify_assign (unshare_expr (ovf), t, pre_p);
10168 if (container)
10169 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10171 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10172 addr = fold_convert (ptrtype, addr);
10174 if (indirect_p)
10175 addr = build_va_arg_indirect_ref (addr);
10176 return build_va_arg_indirect_ref (addr);
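/* Rough shape of the GIMPLE produced above for `va_arg (ap, int)' on
   64-bit SysV (a sketch; labels and temporaries are invented names):

     if (ap->gp_offset >= 6 * 8) goto lab_false;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto lab_over;
   lab_false:
     addr = ap->overflow_arg_area;        possibly realigned first
     ap->overflow_arg_area = addr + 8;
   lab_over:
     result = *(int *) addr;  */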
10179 /* Return true if OPNUM's MEM should be matched
10180 in movabs* patterns. */
10182 bool
10183 ix86_check_movabs (rtx insn, int opnum)
10185 rtx set, mem;
10187 set = PATTERN (insn);
10188 if (GET_CODE (set) == PARALLEL)
10189 set = XVECEXP (set, 0, 0);
10190 gcc_assert (GET_CODE (set) == SET);
10191 mem = XEXP (set, opnum);
10192 while (SUBREG_P (mem))
10193 mem = SUBREG_REG (mem);
10194 gcc_assert (MEM_P (mem));
10195 return volatile_ok || !MEM_VOLATILE_P (mem);
10198 /* Return false if INSN contains a MEM with a non-default address space. */
10199 bool
10200 ix86_check_no_addr_space (rtx insn)
10202 subrtx_var_iterator::array_type array;
10203 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10205 rtx x = *iter;
10206 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10207 return false;
10209 return true;
10212 /* Initialize the table of extra 80387 mathematical constants. */
10214 static void
10215 init_ext_80387_constants (void)
10217 static const char * cst[5] =
10219 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10220 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10221 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10222 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10223 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10225 int i;
10227 for (i = 0; i < 5; i++)
10229 real_from_string (&ext_80387_constants_table[i], cst[i]);
10230 /* Ensure each constant is rounded to XFmode precision. */
10231 real_convert (&ext_80387_constants_table[i],
10232 XFmode, &ext_80387_constants_table[i]);
10235 ext_80387_constants_init = 1;
10238 /* Return non-zero if the constant is something that
10239 can be loaded with a special instruction. */
10242 standard_80387_constant_p (rtx x)
10244 machine_mode mode = GET_MODE (x);
10246 const REAL_VALUE_TYPE *r;
10248 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10249 return -1;
10251 if (x == CONST0_RTX (mode))
10252 return 1;
10253 if (x == CONST1_RTX (mode))
10254 return 2;
10256 r = CONST_DOUBLE_REAL_VALUE (x);
10258 /* For XFmode constants, try to find a special 80387 instruction when
10259 optimizing for size or on those CPUs that benefit from them. */
10260 if (mode == XFmode
10261 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10263 int i;
10265 if (! ext_80387_constants_init)
10266 init_ext_80387_constants ();
10268 for (i = 0; i < 5; i++)
10269 if (real_identical (r, &ext_80387_constants_table[i]))
10270 return i + 3;
10273 /* Load of the constant -0.0 or -1.0 will be split as
10274 fldz;fchs or fld1;fchs sequence. */
10275 if (real_isnegzero (r))
10276 return 8;
10277 if (real_identical (r, &dconstm1))
10278 return 9;
10280 return 0;
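/* Summary of the return values above, matching the opcode table that
   follows:  1 -> fldz, 2 -> fld1, 3 -> fldlg2, 4 -> fldln2, 5 -> fldl2e,
   6 -> fldl2t, 7 -> fldpi, and 8/9 -> a negated load that is later split
   into fldz;fchs or fld1;fchs.  */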
10283 /* Return the opcode of the special instruction to be used to load
10284 the constant X. */
10286 const char *
10287 standard_80387_constant_opcode (rtx x)
10289 switch (standard_80387_constant_p (x))
10291 case 1:
10292 return "fldz";
10293 case 2:
10294 return "fld1";
10295 case 3:
10296 return "fldlg2";
10297 case 4:
10298 return "fldln2";
10299 case 5:
10300 return "fldl2e";
10301 case 6:
10302 return "fldl2t";
10303 case 7:
10304 return "fldpi";
10305 case 8:
10306 case 9:
10307 return "#";
10308 default:
10309 gcc_unreachable ();
10313 /* Return the CONST_DOUBLE representing the 80387 constant that is
10314 loaded by the specified special instruction. The argument IDX
10315 matches the return value from standard_80387_constant_p. */
10318 standard_80387_constant_rtx (int idx)
10320 int i;
10322 if (! ext_80387_constants_init)
10323 init_ext_80387_constants ();
10325 switch (idx)
10327 case 3:
10328 case 4:
10329 case 5:
10330 case 6:
10331 case 7:
10332 i = idx - 3;
10333 break;
10335 default:
10336 gcc_unreachable ();
10339 return const_double_from_real_value (ext_80387_constants_table[i],
10340 XFmode);
10343 /* Return 1 if X is all bits 0, and 2 if X is all bits 1,
10344 in a supported SSE/AVX vector mode. */
10347 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10349 machine_mode mode;
10351 if (!TARGET_SSE)
10352 return 0;
10354 mode = GET_MODE (x);
10356 if (x == const0_rtx || const0_operand (x, mode))
10357 return 1;
10359 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10361 /* VOIDmode integer constant, get mode from the predicate. */
10362 if (mode == VOIDmode)
10363 mode = pred_mode;
10365 switch (GET_MODE_SIZE (mode))
10367 case 64:
10368 if (TARGET_AVX512F)
10369 return 2;
10370 break;
10371 case 32:
10372 if (TARGET_AVX2)
10373 return 2;
10374 break;
10375 case 16:
10376 if (TARGET_SSE2)
10377 return 2;
10378 break;
10379 case 0:
10380 /* VOIDmode */
10381 gcc_unreachable ();
10382 default:
10383 break;
10387 return 0;
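/* Example of how the classification above is used (an illustration, not
   taken from the sources): a V4SImode all-zeros constant yields 1 and is
   loaded with pxor/vpxor, while an all-ones constant yields 2 and is
   loaded with pcmpeqd (or vpternlogd for 512-bit registers); see
   standard_sse_constant_opcode below.  */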
10390 /* Return the opcode of the special instruction to be used to load
10391 the constant operands[1] into operands[0]. */
10393 const char *
10394 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10396 machine_mode mode;
10397 rtx x = operands[1];
10399 gcc_assert (TARGET_SSE);
10401 mode = GET_MODE (x);
10403 if (x == const0_rtx || const0_operand (x, mode))
10405 switch (get_attr_mode (insn))
10407 case MODE_TI:
10408 if (!EXT_REX_SSE_REG_P (operands[0]))
10409 return "%vpxor\t%0, %d0";
10410 /* FALLTHRU */
10411 case MODE_XI:
10412 case MODE_OI:
10413 if (EXT_REX_SSE_REG_P (operands[0]))
10414 return (TARGET_AVX512VL
10415 ? "vpxord\t%x0, %x0, %x0"
10416 : "vpxord\t%g0, %g0, %g0");
10417 return "vpxor\t%x0, %x0, %x0";
10419 case MODE_V2DF:
10420 if (!EXT_REX_SSE_REG_P (operands[0]))
10421 return "%vxorpd\t%0, %d0";
10422 /* FALLTHRU */
10423 case MODE_V8DF:
10424 case MODE_V4DF:
10425 if (!EXT_REX_SSE_REG_P (operands[0]))
10426 return "vxorpd\t%x0, %x0, %x0";
10427 else if (TARGET_AVX512DQ)
10428 return (TARGET_AVX512VL
10429 ? "vxorpd\t%x0, %x0, %x0"
10430 : "vxorpd\t%g0, %g0, %g0");
10431 else
10432 return (TARGET_AVX512VL
10433 ? "vpxorq\t%x0, %x0, %x0"
10434 : "vpxorq\t%g0, %g0, %g0");
10436 case MODE_V4SF:
10437 if (!EXT_REX_SSE_REG_P (operands[0]))
10438 return "%vxorps\t%0, %d0";
10439 /* FALLTHRU */
10440 case MODE_V16SF:
10441 case MODE_V8SF:
10442 if (!EXT_REX_SSE_REG_P (operands[0]))
10443 return "vxorps\t%x0, %x0, %x0";
10444 else if (TARGET_AVX512DQ)
10445 return (TARGET_AVX512VL
10446 ? "vxorps\t%x0, %x0, %x0"
10447 : "vxorps\t%g0, %g0, %g0");
10448 else
10449 return (TARGET_AVX512VL
10450 ? "vpxord\t%x0, %x0, %x0"
10451 : "vpxord\t%g0, %g0, %g0");
10453 default:
10454 gcc_unreachable ();
10457 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10459 enum attr_mode insn_mode = get_attr_mode (insn);
10461 switch (insn_mode)
10463 case MODE_XI:
10464 case MODE_V8DF:
10465 case MODE_V16SF:
10466 gcc_assert (TARGET_AVX512F);
10467 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10469 case MODE_OI:
10470 case MODE_V4DF:
10471 case MODE_V8SF:
10472 gcc_assert (TARGET_AVX2);
10473 /* FALLTHRU */
10474 case MODE_TI:
10475 case MODE_V2DF:
10476 case MODE_V4SF:
10477 gcc_assert (TARGET_SSE2);
10478 if (!EXT_REX_SSE_REG_P (operands[0]))
10479 return (TARGET_AVX
10480 ? "vpcmpeqd\t%0, %0, %0"
10481 : "pcmpeqd\t%0, %0");
10482 else if (TARGET_AVX512VL)
10483 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10484 else
10485 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10487 default:
10488 gcc_unreachable ();
10492 gcc_unreachable ();
10495 /* Returns true if INSN can be transformed from a memory load
10496 to a supported FP constant load. */
10498 bool
10499 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10501 rtx src = find_constant_src (insn);
10503 gcc_assert (REG_P (dst));
10505 if (src == NULL
10506 || (SSE_REGNO_P (REGNO (dst))
10507 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10508 || (STACK_REGNO_P (REGNO (dst))
10509 && standard_80387_constant_p (src) < 1))
10510 return false;
10512 return true;
10515 /* Returns true if OP contains a symbol reference */
10517 bool
10518 symbolic_reference_mentioned_p (rtx op)
10520 const char *fmt;
10521 int i;
10523 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10524 return true;
10526 fmt = GET_RTX_FORMAT (GET_CODE (op));
10527 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10529 if (fmt[i] == 'E')
10531 int j;
10533 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10534 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10535 return true;
10538 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10539 return true;
10542 return false;
10545 /* Return true if it is appropriate to emit `ret' instructions in the
10546 body of a function. Do this only if the epilogue is simple, needing a
10547 couple of insns. Prior to reloading, we can't tell how many registers
10548 must be saved, so return false then. Return false if there is no frame
10549 marker to de-allocate. */
10551 bool
10552 ix86_can_use_return_insn_p (void)
10554 if (ix86_function_naked (current_function_decl))
10555 return false;
10557 /* Don't use `ret' instruction in interrupt handler. */
10558 if (! reload_completed
10559 || frame_pointer_needed
10560 || cfun->machine->func_type != TYPE_NORMAL)
10561 return 0;
10563 /* Don't allow more than 32k pop, since that's all we can do
10564 with one instruction. */
10565 if (crtl->args.pops_args && crtl->args.size >= 32768)
10566 return 0;
10568 struct ix86_frame &frame = cfun->machine->frame;
10569 return (frame.stack_pointer_offset == UNITS_PER_WORD
10570 && (frame.nregs + frame.nsseregs) == 0);
10573 /* Value should be nonzero if functions must have frame pointers.
10574 Zero means the frame pointer need not be set up (and parms may
10575 be accessed via the stack pointer) in functions that seem suitable. */
10577 static bool
10578 ix86_frame_pointer_required (void)
10580 /* If we accessed previous frames, then the generated code expects
10581 to be able to access the saved ebp value in our frame. */
10582 if (cfun->machine->accesses_prev_frame)
10583 return true;
10585 /* Several x86 OSes need a frame pointer for other reasons,
10586 usually pertaining to setjmp. */
10587 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10588 return true;
10590 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10591 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10592 return true;
10594 /* Under Win64 SEH, very large frames need a frame pointer, as the
10595 maximum stack allocation is 4GB. */
10596 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10597 return true;
10599 /* SSE saves require frame-pointer when stack is misaligned. */
10600 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10601 return true;
10603 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10604 turns off the frame pointer by default. Turn it back on now if
10605 we've not got a leaf function. */
10606 if (TARGET_OMIT_LEAF_FRAME_POINTER
10607 && (!crtl->is_leaf
10608 || ix86_current_function_calls_tls_descriptor))
10609 return true;
10611 if (crtl->profile && !flag_fentry)
10612 return true;
10614 return false;
10617 /* Record that the current function accesses previous call frames. */
10619 void
10620 ix86_setup_frame_addresses (void)
10622 cfun->machine->accesses_prev_frame = 1;
10625 #ifndef USE_HIDDEN_LINKONCE
10626 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10627 # define USE_HIDDEN_LINKONCE 1
10628 # else
10629 # define USE_HIDDEN_LINKONCE 0
10630 # endif
10631 #endif
10633 static int pic_labels_used;
10635 /* Fills in the label name that should be used for a pc thunk for
10636 the given register. */
10638 static void
10639 get_pc_thunk_name (char name[32], unsigned int regno)
10641 gcc_assert (!TARGET_64BIT);
10643 if (USE_HIDDEN_LINKONCE)
10644 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10645 else
10646 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10650 /* This function generates the pc thunks used for -fpic; each thunk
10651 loads its register with the return address of the caller and then returns. */
10653 static void
10654 ix86_code_end (void)
10656 rtx xops[2];
10657 int regno;
10659 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10661 char name[32];
10662 tree decl;
10664 if (!(pic_labels_used & (1 << regno)))
10665 continue;
10667 get_pc_thunk_name (name, regno);
10669 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10670 get_identifier (name),
10671 build_function_type_list (void_type_node, NULL_TREE));
10672 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10673 NULL_TREE, void_type_node);
10674 TREE_PUBLIC (decl) = 1;
10675 TREE_STATIC (decl) = 1;
10676 DECL_IGNORED_P (decl) = 1;
10678 #if TARGET_MACHO
10679 if (TARGET_MACHO)
10681 switch_to_section (darwin_sections[picbase_thunk_section]);
10682 fputs ("\t.weak_definition\t", asm_out_file);
10683 assemble_name (asm_out_file, name);
10684 fputs ("\n\t.private_extern\t", asm_out_file);
10685 assemble_name (asm_out_file, name);
10686 putc ('\n', asm_out_file);
10687 ASM_OUTPUT_LABEL (asm_out_file, name);
10688 DECL_WEAK (decl) = 1;
10690 else
10691 #endif
10692 if (USE_HIDDEN_LINKONCE)
10694 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10696 targetm.asm_out.unique_section (decl, 0);
10697 switch_to_section (get_named_section (decl, NULL, 0));
10699 targetm.asm_out.globalize_label (asm_out_file, name);
10700 fputs ("\t.hidden\t", asm_out_file);
10701 assemble_name (asm_out_file, name);
10702 putc ('\n', asm_out_file);
10703 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10705 else
10707 switch_to_section (text_section);
10708 ASM_OUTPUT_LABEL (asm_out_file, name);
10711 DECL_INITIAL (decl) = make_node (BLOCK);
10712 current_function_decl = decl;
10713 allocate_struct_function (decl, false);
10714 init_function_start (decl);
10715 /* We're about to hide the function body from callees of final_* by
10716 emitting it directly; tell them we're a thunk, if they care. */
10717 cfun->is_thunk = true;
10718 first_function_block_is_cold = false;
10719 /* Make sure unwind info is emitted for the thunk if needed. */
10720 final_start_function (emit_barrier (), asm_out_file, 1);
10722 /* Pad stack IP move with 4 instructions (two NOPs count
10723 as one instruction). */
10724 if (TARGET_PAD_SHORT_FUNCTION)
10726 int i = 8;
10728 while (i--)
10729 fputs ("\tnop\n", asm_out_file);
10732 xops[0] = gen_rtx_REG (Pmode, regno);
10733 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10734 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10735 output_asm_insn ("%!ret", NULL);
10736 final_end_function ();
10737 init_insn_lengths ();
10738 free_after_compilation (cfun);
10739 set_cfun (NULL);
10740 current_function_decl = NULL;
10743 if (flag_split_stack)
10744 file_end_indicate_split_stack ();
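/* The pc thunk emitted above looks roughly like this for %ebx with -fpic
   on ELF targets (an illustration; the exact directives vary by target):

   __x86.get_pc_thunk.bx:
           movl    (%esp), %ebx
           ret  */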
10747 /* Emit code for the SET_GOT patterns. */
10749 const char *
10750 output_set_got (rtx dest, rtx label)
10752 rtx xops[3];
10754 xops[0] = dest;
10756 if (TARGET_VXWORKS_RTP && flag_pic)
10758 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10759 xops[2] = gen_rtx_MEM (Pmode,
10760 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10761 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10763 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10764 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10765 an unadorned address. */
10766 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10767 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10768 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10769 return "";
10772 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10774 if (flag_pic)
10776 char name[32];
10777 get_pc_thunk_name (name, REGNO (dest));
10778 pic_labels_used |= 1 << REGNO (dest);
10780 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10781 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10782 output_asm_insn ("%!call\t%X2", xops);
10784 #if TARGET_MACHO
10785 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10786 This is what will be referenced by the Mach-O PIC subsystem. */
10787 if (machopic_should_output_picbase_label () || !label)
10788 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10790 /* When we are restoring the pic base at the site of a nonlocal label,
10791 and we decided to emit the pic base above, we will still output a
10792 local label used for calculating the correction offset (even though
10793 the offset will be 0 in that case). */
10794 if (label)
10795 targetm.asm_out.internal_label (asm_out_file, "L",
10796 CODE_LABEL_NUMBER (label));
10797 #endif
10799 else
10801 if (TARGET_MACHO)
10802 /* We don't need a pic base; we're not producing pic. */
10803 gcc_unreachable ();
10805 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10806 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10807 targetm.asm_out.internal_label (asm_out_file, "L",
10808 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10811 if (!TARGET_MACHO)
10812 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10814 return "";
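/* Typical sequence produced by output_set_got for the non-VxWorks -fpic
   case, assuming DEST is %ebx (an illustration):

           call    __x86.get_pc_thunk.bx
           addl    $_GLOBAL_OFFSET_TABLE_, %ebx  */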
10817 /* Generate a "push" pattern for input ARG. */
10819 static rtx
10820 gen_push (rtx arg)
10822 struct machine_function *m = cfun->machine;
10824 if (m->fs.cfa_reg == stack_pointer_rtx)
10825 m->fs.cfa_offset += UNITS_PER_WORD;
10826 m->fs.sp_offset += UNITS_PER_WORD;
10828 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10829 arg = gen_rtx_REG (word_mode, REGNO (arg));
10831 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10832 gen_rtx_PRE_DEC (Pmode,
10833 stack_pointer_rtx)),
10834 arg);
10837 /* Generate a "pop" pattern for input ARG. */
10839 static rtx
10840 gen_pop (rtx arg)
10842 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10843 arg = gen_rtx_REG (word_mode, REGNO (arg));
10845 return gen_rtx_SET (arg,
10846 gen_rtx_MEM (word_mode,
10847 gen_rtx_POST_INC (Pmode,
10848 stack_pointer_rtx)));
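/* For reference, the patterns built by gen_push and gen_pop are plain RTL
   sets; e.g. a 64-bit push of %rbx has roughly the form (a sketch, not a
   literal dump):

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

   and the matching pop uses post_inc of the stack pointer instead.  */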
10851 /* Return the number of an unused call-clobbered register that is available
10852 for the entire function, or INVALID_REGNUM if there is none. */
10854 static unsigned int
10855 ix86_select_alt_pic_regnum (void)
10857 if (ix86_use_pseudo_pic_reg ())
10858 return INVALID_REGNUM;
10860 if (crtl->is_leaf
10861 && !crtl->profile
10862 && !ix86_current_function_calls_tls_descriptor)
10864 int i, drap;
10865 /* Can't use the same register for both PIC and DRAP. */
10866 if (crtl->drap_reg)
10867 drap = REGNO (crtl->drap_reg);
10868 else
10869 drap = -1;
10870 for (i = 2; i >= 0; --i)
10871 if (i != drap && !df_regs_ever_live_p (i))
10872 return i;
10875 return INVALID_REGNUM;
10878 /* Return true if REGNO is used by the epilogue. */
10880 bool
10881 ix86_epilogue_uses (int regno)
10883 /* If there are no caller-saved registers, we preserve all registers,
10884 except for MMX and x87 registers which aren't supported when saving
10885 and restoring registers. Don't explicitly save SP register since
10886 it is always preserved. */
10887 return (epilogue_completed
10888 && cfun->machine->no_caller_saved_registers
10889 && !fixed_regs[regno]
10890 && !STACK_REGNO_P (regno)
10891 && !MMX_REGNO_P (regno));
10894 /* Return nonzero if register REGNO can be used as a scratch register
10895 in peephole2. */
10897 static bool
10898 ix86_hard_regno_scratch_ok (unsigned int regno)
10900 /* If there are no caller-saved registers, we can't use any register
10901 as a scratch register after epilogue and use REGNO as scratch
10902 register only if it has been used before to avoid saving and
10903 restoring it. */
10904 return (!cfun->machine->no_caller_saved_registers
10905 || (!epilogue_completed
10906 && df_regs_ever_live_p (regno)));
10909 /* Return true if register class CL should be an additional allocno
10910 class. */
10912 static bool
10913 ix86_additional_allocno_class_p (reg_class_t cl)
10915 return cl == MOD4_SSE_REGS;
10918 /* Return TRUE if we need to save REGNO. */
10920 static bool
10921 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10923 /* If there are no caller-saved registers, we preserve all registers,
10924 except for MMX and x87 registers which aren't supported when saving
10925 and restoring registers. Don't explicitly save SP register since
10926 it is always preserved. */
10927 if (cfun->machine->no_caller_saved_registers)
10929 /* Don't preserve registers used for function return value. */
10930 rtx reg = crtl->return_rtx;
10931 if (reg)
10933 unsigned int i = REGNO (reg);
10934 unsigned int nregs = REG_NREGS (reg);
10935 while (nregs-- > 0)
10936 if ((i + nregs) == regno)
10937 return false;
10939 reg = crtl->return_bnd;
10940 if (reg)
10942 i = REGNO (reg);
10943 nregs = REG_NREGS (reg);
10944 while (nregs-- > 0)
10945 if ((i + nregs) == regno)
10946 return false;
10950 return (df_regs_ever_live_p (regno)
10951 && !fixed_regs[regno]
10952 && !STACK_REGNO_P (regno)
10953 && !MMX_REGNO_P (regno)
10954 && (regno != HARD_FRAME_POINTER_REGNUM
10955 || !frame_pointer_needed));
10958 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10959 && pic_offset_table_rtx)
10961 if (ix86_use_pseudo_pic_reg ())
10963 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10964 _mcount in prologue. */
10965 if (!TARGET_64BIT && flag_pic && crtl->profile)
10966 return true;
10968 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10969 || crtl->profile
10970 || crtl->calls_eh_return
10971 || crtl->uses_const_pool
10972 || cfun->has_nonlocal_label)
10973 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10976 if (crtl->calls_eh_return && maybe_eh_return)
10978 unsigned i;
10979 for (i = 0; ; i++)
10981 unsigned test = EH_RETURN_DATA_REGNO (i);
10982 if (test == INVALID_REGNUM)
10983 break;
10984 if (test == regno)
10985 return true;
10989 if (ignore_outlined && cfun->machine->call_ms2sysv)
10991 unsigned count = cfun->machine->call_ms2sysv_extra_regs
10992 + xlogue_layout::MIN_REGS;
10993 if (xlogue_layout::is_stub_managed_reg (regno, count))
10994 return false;
10997 if (crtl->drap_reg
10998 && regno == REGNO (crtl->drap_reg)
10999 && !cfun->machine->no_drap_save_restore)
11000 return true;
11002 return (df_regs_ever_live_p (regno)
11003 && !call_used_regs[regno]
11004 && !fixed_regs[regno]
11005 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11008 /* Return the number of saved general purpose registers. */
11010 static int
11011 ix86_nsaved_regs (void)
11013 int nregs = 0;
11014 int regno;
11016 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11017 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11018 nregs ++;
11019 return nregs;
11022 /* Return number of saved SSE registers. */
11024 static int
11025 ix86_nsaved_sseregs (void)
11027 int nregs = 0;
11028 int regno;
11030 if (!TARGET_64BIT_MS_ABI)
11031 return 0;
11032 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11033 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11034 nregs ++;
11035 return nregs;
11038 /* Given FROM and TO register numbers, say whether this elimination is
11039 allowed. If stack alignment is needed, we can only replace argument
11040 pointer with hard frame pointer, or replace frame pointer with stack
11041 pointer. Otherwise, frame pointer elimination is automatically
11042 handled and all other eliminations are valid. */
11044 static bool
11045 ix86_can_eliminate (const int from, const int to)
11047 if (stack_realign_fp)
11048 return ((from == ARG_POINTER_REGNUM
11049 && to == HARD_FRAME_POINTER_REGNUM)
11050 || (from == FRAME_POINTER_REGNUM
11051 && to == STACK_POINTER_REGNUM));
11052 else
11053 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11056 /* Return the offset between two registers, one to be eliminated, and the other
11057 its replacement, at the start of a routine. */
11059 HOST_WIDE_INT
11060 ix86_initial_elimination_offset (int from, int to)
11062 struct ix86_frame &frame = cfun->machine->frame;
11064 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11065 return frame.hard_frame_pointer_offset;
11066 else if (from == FRAME_POINTER_REGNUM
11067 && to == HARD_FRAME_POINTER_REGNUM)
11068 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11069 else
11071 gcc_assert (to == STACK_POINTER_REGNUM);
11073 if (from == ARG_POINTER_REGNUM)
11074 return frame.stack_pointer_offset;
11076 gcc_assert (from == FRAME_POINTER_REGNUM);
11077 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11081 /* In a dynamically-aligned function, we can't know the offset from
11082 stack pointer to frame pointer, so we must ensure that setjmp
11083 eliminates fp against the hard fp (%ebp) rather than trying to
11084 index from %esp up to the top of the frame across a gap that is
11085 of unknown (at compile-time) size. */
11086 static rtx
11087 ix86_builtin_setjmp_frame_value (void)
11089 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11092 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11093 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11095 static bool warned_once = false;
11096 if (!warned_once)
11098 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11099 feature);
11100 warned_once = true;
11104 /* When using -fsplit-stack, the allocation routines set a field in
11105 the TCB to the bottom of the stack plus this much space, measured
11106 in bytes. */
11108 #define SPLIT_STACK_AVAILABLE 256
11110 /* Fill structure ix86_frame about frame of currently computed function. */
11112 static void
11113 ix86_compute_frame_layout (void)
11115 struct ix86_frame *frame = &cfun->machine->frame;
11116 struct machine_function *m = cfun->machine;
11117 unsigned HOST_WIDE_INT stack_alignment_needed;
11118 HOST_WIDE_INT offset;
11119 unsigned HOST_WIDE_INT preferred_alignment;
11120 HOST_WIDE_INT size = get_frame_size ();
11121 HOST_WIDE_INT to_allocate;
11123 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11124 * ms_abi functions that call a sysv function. We now need to prune away
11125 * cases where it should be disabled. */
11126 if (TARGET_64BIT && m->call_ms2sysv)
11128 gcc_assert (TARGET_64BIT_MS_ABI);
11129 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11130 gcc_assert (!TARGET_SEH);
11131 gcc_assert (TARGET_SSE);
11132 gcc_assert (!ix86_using_red_zone ());
11134 if (crtl->calls_eh_return)
11136 gcc_assert (!reload_completed);
11137 m->call_ms2sysv = false;
11138 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11141 else if (ix86_static_chain_on_stack)
11143 gcc_assert (!reload_completed);
11144 m->call_ms2sysv = false;
11145 warn_once_call_ms2sysv_xlogues ("static call chains");
11148 /* Finally, compute which registers the stub will manage. */
11149 else
11151 unsigned count = xlogue_layout::count_stub_managed_regs ();
11152 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11153 m->call_ms2sysv_pad_in = 0;
11157 frame->nregs = ix86_nsaved_regs ();
11158 frame->nsseregs = ix86_nsaved_sseregs ();
11160 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11161 except for function prologues, leaf functions and when the default
11162 incoming stack boundary is overridden on the command line or via the
11163 force_align_arg_pointer attribute. */
11164 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11165 && (!crtl->is_leaf || cfun->calls_alloca != 0
11166 || ix86_current_function_calls_tls_descriptor
11167 || ix86_incoming_stack_boundary < 128))
11169 crtl->preferred_stack_boundary = 128;
11170 crtl->stack_alignment_needed = 128;
11173 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11174 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11176 gcc_assert (!size || stack_alignment_needed);
11177 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11178 gcc_assert (preferred_alignment <= stack_alignment_needed);
11180 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11181 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11182 if (TARGET_64BIT && m->call_ms2sysv)
11184 gcc_assert (stack_alignment_needed >= 16);
11185 gcc_assert (!frame->nsseregs);
11188 /* For SEH we have to limit the amount of code movement into the prologue.
11189 At present we do this via a BLOCKAGE, at which point there's very little
11190 scheduling that can be done, which means that there's very little point
11191 in doing anything except PUSHs. */
11192 if (TARGET_SEH)
11193 m->use_fast_prologue_epilogue = false;
11194 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11196 int count = frame->nregs;
11197 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11199 /* The fast prologue uses move instead of push to save registers. This
11200 is significantly longer, but also executes faster as modern hardware
11201 can execute the moves in parallel, but can't do that for push/pop.
11203 Be careful about choosing which prologue to emit: when the function
11204 takes many instructions to execute, we may use the slow version, as
11205 well as when the function is known to be outside a hot spot (this is
11206 known only with feedback). Weight the size of the function by the
11207 number of registers to save, as it is cheap to use one or two push
11208 instructions but very slow to use many of them. */
11209 if (count)
11210 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11211 if (node->frequency < NODE_FREQUENCY_NORMAL
11212 || (flag_branch_probabilities
11213 && node->frequency < NODE_FREQUENCY_HOT))
11214 m->use_fast_prologue_epilogue = false;
11215 else
11216 m->use_fast_prologue_epilogue
11217 = !expensive_function_p (count);
11220 frame->save_regs_using_mov
11221 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11222 /* If static stack checking is enabled and done with probes,
11223 the registers need to be saved before allocating the frame. */
11224 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11226 /* Skip return address and error code in exception handler. */
11227 offset = INCOMING_FRAME_SP_OFFSET;
11229 /* Skip pushed static chain. */
11230 if (ix86_static_chain_on_stack)
11231 offset += UNITS_PER_WORD;
11233 /* Skip saved base pointer. */
11234 if (frame_pointer_needed)
11235 offset += UNITS_PER_WORD;
11236 frame->hfp_save_offset = offset;
11238 /* The traditional frame pointer location is at the top of the frame. */
11239 frame->hard_frame_pointer_offset = offset;
11241 /* Register save area */
11242 offset += frame->nregs * UNITS_PER_WORD;
11243 frame->reg_save_offset = offset;
11245 /* On SEH target, registers are pushed just before the frame pointer
11246 location. */
11247 if (TARGET_SEH)
11248 frame->hard_frame_pointer_offset = offset;
11250 /* Calculate the size of the va-arg area (not including padding, if any). */
11251 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11253 if (stack_realign_fp)
11255 /* We may need a 16-byte aligned stack for the remainder of the
11256 register save area, but the stack frame for the local function
11257 may require a greater alignment if using AVX/2/512. In order
11258 to avoid wasting space, we first calculate the space needed for
11259 the rest of the register saves, add that to the stack pointer,
11260 and then realign the stack to the boundary of the start of the
11261 frame for the local function. */
11262 HOST_WIDE_INT space_needed = 0;
11263 HOST_WIDE_INT sse_reg_space_needed = 0;
11265 if (TARGET_64BIT)
11267 if (m->call_ms2sysv)
11269 m->call_ms2sysv_pad_in = 0;
11270 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11273 else if (frame->nsseregs)
11274 /* The only ABI that has saved SSE registers (Win64) also has a
11275 16-byte aligned default stack. However, many programs violate
11276 the ABI, and Wine64 forces stack realignment to compensate. */
11277 space_needed = frame->nsseregs * 16;
11279 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11281 /* On 64-bit targets frame->va_arg_size should always be a multiple of 16,
11282 but round anyway to be pedantic. */
11283 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11285 else
11286 space_needed = frame->va_arg_size;
11288 /* Record the allocation size required prior to the realignment AND. */
11289 frame->stack_realign_allocate = space_needed;
11291 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11292 before this point are not directly comparable with values below
11293 this point. Use sp_valid_at to determine if the stack pointer is
11294 valid for a given offset, fp_valid_at for the frame pointer, or
11295 choose_baseaddr to have a base register chosen for you.
11297 Note that the result of (frame->stack_realign_offset
11298 & (stack_alignment_needed - 1)) may not equal zero. */
11299 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11300 frame->stack_realign_offset = offset - space_needed;
11301 frame->sse_reg_save_offset = frame->stack_realign_offset
11302 + sse_reg_space_needed;
11304 else
11306 frame->stack_realign_offset = offset;
11308 if (TARGET_64BIT && m->call_ms2sysv)
11310 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11311 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11314 /* Align and set SSE register save area. */
11315 else if (frame->nsseregs)
11317 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11318 required and the DRAP re-alignment boundary is at least 16 bytes,
11319 then we want the SSE register save area properly aligned. */
11320 if (ix86_incoming_stack_boundary >= 128
11321 || (stack_realign_drap && stack_alignment_needed >= 16))
11322 offset = ROUND_UP (offset, 16);
11323 offset += frame->nsseregs * 16;
11325 frame->sse_reg_save_offset = offset;
11326 offset += frame->va_arg_size;
11329 /* Align start of frame for local function. */
11330 if (m->call_ms2sysv
11331 || frame->va_arg_size != 0
11332 || size != 0
11333 || !crtl->is_leaf
11334 || cfun->calls_alloca
11335 || ix86_current_function_calls_tls_descriptor)
11336 offset = ROUND_UP (offset, stack_alignment_needed);
11338 /* Frame pointer points here. */
11339 frame->frame_pointer_offset = offset;
11341 offset += size;
11343 /* Add outgoing arguments area. Can be skipped if we eliminated
11344 all the function calls as dead code.
11345 Skipping is, however, impossible when the function calls alloca: the alloca
11346 expander assumes that the last crtl->outgoing_args_size bytes
11347 of the stack frame are unused. */
11348 if (ACCUMULATE_OUTGOING_ARGS
11349 && (!crtl->is_leaf || cfun->calls_alloca
11350 || ix86_current_function_calls_tls_descriptor))
11352 offset += crtl->outgoing_args_size;
11353 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11355 else
11356 frame->outgoing_arguments_size = 0;
11358 /* Align stack boundary. Only needed if we're calling another function
11359 or using alloca. */
11360 if (!crtl->is_leaf || cfun->calls_alloca
11361 || ix86_current_function_calls_tls_descriptor)
11362 offset = ROUND_UP (offset, preferred_alignment);
11364 /* We've reached end of stack frame. */
11365 frame->stack_pointer_offset = offset;
11367 /* Size prologue needs to allocate. */
11368 to_allocate = offset - frame->sse_reg_save_offset;
11370 if ((!to_allocate && frame->nregs <= 1)
11371 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11372 frame->save_regs_using_mov = false;
11374 if (ix86_using_red_zone ()
11375 && crtl->sp_is_unchanging
11376 && crtl->is_leaf
11377 && !ix86_pc_thunk_call_expanded
11378 && !ix86_current_function_calls_tls_descriptor)
11380 frame->red_zone_size = to_allocate;
11381 if (frame->save_regs_using_mov)
11382 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11383 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11384 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11386 else
11387 frame->red_zone_size = 0;
11388 frame->stack_pointer_offset -= frame->red_zone_size;
11390 /* The SEH frame pointer location is near the bottom of the frame.
11391 This is enforced by the fact that the difference between the
11392 stack pointer and the frame pointer is limited to 240 bytes in
11393 the unwind data structure. */
11394 if (TARGET_SEH)
11396 HOST_WIDE_INT diff;
11398 /* If we can leave the frame pointer where it is, do so; this also returns
11399 the establisher frame for __builtin_frame_address (0). */
11400 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11401 if (diff <= SEH_MAX_FRAME_SIZE
11402 && (diff > 240 || (diff & 15) != 0)
11403 && !crtl->accesses_prior_frames)
11405 /* Ideally we'd determine what portion of the local stack frame
11406 (within the constraint of the lowest 240) is most heavily used.
11407 But without that complication, simply bias the frame pointer
11408 by 128 bytes so as to maximize the amount of the local stack
11409 frame that is addressable with 8-bit offsets. */
11410 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
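/* A self-contained sketch (not part of i386.c) of the ROUND_UP-style offset
   arithmetic used by the frame layout code above.  The figures below (eight
   8-byte register saves, 40 bytes of locals, 16-byte alignment) are made up
   purely for the example, and EX_ROUND_UP is a local stand-in for GCC's
   ROUND_UP macro.  */
#include <stdio.h>

#define EX_ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int
main (void)
{
  long offset = 8;                    /* return address already pushed */
  offset += 8;                        /* saved frame pointer */
  offset += 8 * 8;                    /* eight 8-byte register saves */
  long reg_save_offset = offset;
  offset = EX_ROUND_UP (offset, 16);  /* align start of the local frame */
  long frame_pointer_offset = offset;
  offset += 40;                       /* local variables */
  offset = EX_ROUND_UP (offset, 16);  /* align the outgoing stack boundary */
  printf ("reg saves end at %ld, locals start at %ld, frame size %ld\n",
	  reg_save_offset, frame_pointer_offset, offset);
  return 0;
}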
11415 /* This is semi-inlined memory_address_length, but simplified
11416 since we know that we're always dealing with reg+offset, and
11417 to avoid having to create and discard all that rtl. */
11419 static inline int
11420 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11422 int len = 4;
11424 if (offset == 0)
11426 /* EBP and R13 cannot be encoded without an offset. */
11427 len = (regno == BP_REG || regno == R13_REG);
11429 else if (IN_RANGE (offset, -128, 127))
11430 len = 1;
11432 /* ESP and R12 must be encoded with a SIB byte. */
11433 if (regno == SP_REG || regno == R12_REG)
11434 len++;
11436 return len;
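/* Illustrative example, not from the GCC sources: a standalone model of the
   displacement-size heuristic in choose_baseaddr_len above.  The enum values
   are local names, not GCC's hard register numbers.  */
#include <stdio.h>

enum ex_reg { EX_SP, EX_BP, EX_R12, EX_R13, EX_OTHER };

static int
ex_addr_len (enum ex_reg reg, long offset)
{
  int len = 4;                                /* assume a 32-bit displacement */
  if (offset == 0)
    len = (reg == EX_BP || reg == EX_R13);    /* rBP/r13 still need a disp8 */
  else if (offset >= -128 && offset <= 127)
    len = 1;                                  /* disp8 */
  if (reg == EX_SP || reg == EX_R12)
    len++;                                    /* rSP/r12 need a SIB byte */
  return len;
}

int
main (void)
{
  printf ("%d %d %d\n",
	  ex_addr_len (EX_BP, 0),        /* 1: disp8 of zero */
	  ex_addr_len (EX_SP, -64),      /* 2: disp8 plus SIB */
	  ex_addr_len (EX_OTHER, 4096)); /* 4: disp32 */
  return 0;
}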
11439 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11440 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11442 static bool
11443 sp_valid_at (HOST_WIDE_INT cfa_offset)
11445 const struct machine_frame_state &fs = cfun->machine->fs;
11446 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11448 /* Validate that the cfa_offset isn't in a "no-man's land". */
11449 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11450 return false;
11452 return fs.sp_valid;
11455 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11456 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11458 static inline bool
11459 fp_valid_at (HOST_WIDE_INT cfa_offset)
11461 const struct machine_frame_state &fs = cfun->machine->fs;
11462 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11464 /* Validate that the cfa_offset isn't in a "no-man's land". */
11465 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11466 return false;
11468 return fs.fp_valid;
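/* A toy model (not part of i386.c) of the sp_valid_at/fp_valid_at split
   above, with made-up boundary values.  Offsets at or below
   sp_realigned_offset must be addressed from the frame pointer, offsets
   above sp_realigned_fp_last from the realigned stack pointer, and the
   asserts reject offsets that neither base can reach.  */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct ex_frame_state
{
  bool sp_valid, fp_valid, sp_realigned;
  long sp_realigned_offset;
  long sp_realigned_fp_last;
};

static bool
ex_sp_valid_at (const struct ex_frame_state *fs, long cfa_offset)
{
  if (fs->sp_realigned && cfa_offset <= fs->sp_realigned_offset)
    {
      assert (cfa_offset <= fs->sp_realigned_fp_last);  /* FP must reach it */
      return false;
    }
  return fs->sp_valid;
}

static bool
ex_fp_valid_at (const struct ex_frame_state *fs, long cfa_offset)
{
  if (fs->sp_realigned && cfa_offset > fs->sp_realigned_fp_last)
    {
      assert (cfa_offset >= fs->sp_realigned_offset);   /* SP must reach it */
      return false;
    }
  return fs->fp_valid;
}

int
main (void)
{
  struct ex_frame_state fs = { true, true, true, 48, 64 };
  for (long off = 32; off <= 80; off += 24)
    printf ("CFA-%ld: sp %d fp %d\n", off,
	    ex_sp_valid_at (&fs, off), ex_fp_valid_at (&fs, off));
  return 0;
}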
11471 /* Choose a base register based upon alignment requested, speed and/or
11472 size. */
11474 static void
11475 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11476 HOST_WIDE_INT &base_offset,
11477 unsigned int align_requested, unsigned int *align)
11479 const struct machine_function *m = cfun->machine;
11480 unsigned int hfp_align;
11481 unsigned int drap_align;
11482 unsigned int sp_align;
11483 bool hfp_ok = fp_valid_at (cfa_offset);
11484 bool drap_ok = m->fs.drap_valid;
11485 bool sp_ok = sp_valid_at (cfa_offset);
11487 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11489 /* Filter out any registers that don't meet the requested alignment
11490 criteria. */
11491 if (align_requested)
11493 if (m->fs.realigned)
11494 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11495 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11496 notes (which we would need to use a realigned stack pointer),
11497 so disable on SEH targets. */
11498 else if (m->fs.sp_realigned)
11499 sp_align = crtl->stack_alignment_needed;
11501 hfp_ok = hfp_ok && hfp_align >= align_requested;
11502 drap_ok = drap_ok && drap_align >= align_requested;
11503 sp_ok = sp_ok && sp_align >= align_requested;
11506 if (m->use_fast_prologue_epilogue)
11508 /* Choose the base register most likely to allow the most scheduling
11509 opportunities. Generally FP is valid throughout the function,
11510 while DRAP must be reloaded within the epilogue. But choose either
11511 over the SP due to increased encoding size. */
11513 if (hfp_ok)
11515 base_reg = hard_frame_pointer_rtx;
11516 base_offset = m->fs.fp_offset - cfa_offset;
11518 else if (drap_ok)
11520 base_reg = crtl->drap_reg;
11521 base_offset = 0 - cfa_offset;
11523 else if (sp_ok)
11525 base_reg = stack_pointer_rtx;
11526 base_offset = m->fs.sp_offset - cfa_offset;
11529 else
11531 HOST_WIDE_INT toffset;
11532 int len = 16, tlen;
11534 /* Choose the base register with the smallest address encoding.
11535 With a tie, choose FP > DRAP > SP. */
11536 if (sp_ok)
11538 base_reg = stack_pointer_rtx;
11539 base_offset = m->fs.sp_offset - cfa_offset;
11540 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11542 if (drap_ok)
11544 toffset = 0 - cfa_offset;
11545 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11546 if (tlen <= len)
11548 base_reg = crtl->drap_reg;
11549 base_offset = toffset;
11550 len = tlen;
11553 if (hfp_ok)
11555 toffset = m->fs.fp_offset - cfa_offset;
11556 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11557 if (tlen <= len)
11559 base_reg = hard_frame_pointer_rtx;
11560 base_offset = toffset;
11561 len = tlen;
11566 /* Set the align return value. */
11567 if (align)
11569 if (base_reg == stack_pointer_rtx)
11570 *align = sp_align;
11571 else if (base_reg == crtl->drap_reg)
11572 *align = drap_align;
11573 else if (base_reg == hard_frame_pointer_rtx)
11574 *align = hfp_align;
11578 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11579 the alignment of address. If ALIGN is non-null, it should point to
11580 an alignment value (in bits) that is preferred or zero and will
11581 receive the alignment of the base register that was selected,
11582 irrespective of whether or not CFA_OFFSET is a multiple of that
11583 alignment value. If it is possible for the base register offset to be
11584 non-immediate then SCRATCH_REGNO should specify a scratch register to
11585 use.
11587 The valid base registers are taken from CFUN->MACHINE->FS. */
11589 static rtx
11590 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11591 unsigned int scratch_regno = INVALID_REGNUM)
11593 rtx base_reg = NULL;
11594 HOST_WIDE_INT base_offset = 0;
11596 /* If a specific alignment is requested, try to get a base register
11597 with that alignment first. */
11598 if (align && *align)
11599 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11601 if (!base_reg)
11602 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11604 gcc_assert (base_reg != NULL);
11606 rtx base_offset_rtx = GEN_INT (base_offset);
11608 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11610 gcc_assert (scratch_regno != INVALID_REGNUM);
11612 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11613 emit_move_insn (scratch_reg, base_offset_rtx);
11615 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11618 return plus_constant (Pmode, base_reg, base_offset);
11621 /* Emit code to save registers in the prologue. */
11623 static void
11624 ix86_emit_save_regs (void)
11626 unsigned int regno;
11627 rtx_insn *insn;
11629 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11630 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11632 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11633 RTX_FRAME_RELATED_P (insn) = 1;
11637 /* Emit a single register save at CFA - CFA_OFFSET. */
11639 static void
11640 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11641 HOST_WIDE_INT cfa_offset)
11643 struct machine_function *m = cfun->machine;
11644 rtx reg = gen_rtx_REG (mode, regno);
11645 rtx mem, addr, base, insn;
11646 unsigned int align = GET_MODE_ALIGNMENT (mode);
11648 addr = choose_baseaddr (cfa_offset, &align);
11649 mem = gen_frame_mem (mode, addr);
11651 /* The location alignment depends upon the base register. */
11652 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11653 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11654 set_mem_align (mem, align);
11656 insn = emit_insn (gen_rtx_SET (mem, reg));
11657 RTX_FRAME_RELATED_P (insn) = 1;
11659 base = addr;
11660 if (GET_CODE (base) == PLUS)
11661 base = XEXP (base, 0);
11662 gcc_checking_assert (REG_P (base));
11664 /* When saving registers into a re-aligned local stack frame, avoid
11665 any tricky guessing by dwarf2out. */
11666 if (m->fs.realigned)
11668 gcc_checking_assert (stack_realign_drap);
11670 if (regno == REGNO (crtl->drap_reg))
11672 /* A bit of a hack. We force the DRAP register to be saved in
11673 the re-aligned stack frame, which provides us with a copy
11674 of the CFA that will last past the prologue. Install it. */
11675 gcc_checking_assert (cfun->machine->fs.fp_valid);
11676 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11677 cfun->machine->fs.fp_offset - cfa_offset);
11678 mem = gen_rtx_MEM (mode, addr);
11679 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11681 else
11683 /* The frame pointer is a stable reference within the
11684 aligned frame. Use it. */
11685 gcc_checking_assert (cfun->machine->fs.fp_valid);
11686 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11687 cfun->machine->fs.fp_offset - cfa_offset);
11688 mem = gen_rtx_MEM (mode, addr);
11689 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11693 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11694 && cfa_offset >= m->fs.sp_realigned_offset)
11696 gcc_checking_assert (stack_realign_fp);
11697 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11700 /* The memory may not be relative to the current CFA register,
11701 which means that we may need to generate a new pattern for
11702 use by the unwind info. */
11703 else if (base != m->fs.cfa_reg)
11705 addr = plus_constant (Pmode, m->fs.cfa_reg,
11706 m->fs.cfa_offset - cfa_offset);
11707 mem = gen_rtx_MEM (mode, addr);
11708 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11712 /* Emit code to save registers using MOV insns.
11713 First register is stored at CFA - CFA_OFFSET. */
11714 static void
11715 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11717 unsigned int regno;
11719 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11720 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11722 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11723 cfa_offset -= UNITS_PER_WORD;
11727 /* Emit code to save SSE registers using MOV insns.
11728 First register is stored at CFA - CFA_OFFSET. */
11729 static void
11730 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11732 unsigned int regno;
11734 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11735 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11737 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11738 cfa_offset -= GET_MODE_SIZE (V4SFmode);
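/* Illustrative example (not from the GCC sources) of how the two save loops
   above walk their save areas: general registers descend from the starting
   CFA offset in word-sized steps (8 bytes on x86-64), SSE registers in
   16-byte steps.  The register counts and starting offsets are made up.  */
#include <stdio.h>

int
main (void)
{
  long cfa_offset = 56;                /* example frame.reg_save_offset */
  for (int i = 0; i < 3; i++)          /* three example GPR saves */
    {
      printf ("GPR save at CFA - %ld\n", cfa_offset);
      cfa_offset -= 8;                 /* UNITS_PER_WORD on x86-64 */
    }
  cfa_offset = 160;                    /* example frame.sse_reg_save_offset */
  for (int i = 0; i < 2; i++)          /* two example SSE saves */
    {
      printf ("SSE save at CFA - %ld\n", cfa_offset);
      cfa_offset -= 16;                /* GET_MODE_SIZE (V4SFmode) */
    }
  return 0;
}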
11742 static GTY(()) rtx queued_cfa_restores;
11744 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
11745 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11746 Don't add the note if the previously saved value will be left untouched
11747 within stack red-zone till return, as unwinders can find the same value
11748 in the register and on the stack. */
11750 static void
11751 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11753 if (!crtl->shrink_wrapped
11754 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11755 return;
11757 if (insn)
11759 add_reg_note (insn, REG_CFA_RESTORE, reg);
11760 RTX_FRAME_RELATED_P (insn) = 1;
11762 else
11763 queued_cfa_restores
11764 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11767 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11769 static void
11770 ix86_add_queued_cfa_restore_notes (rtx insn)
11772 rtx last;
11773 if (!queued_cfa_restores)
11774 return;
11775 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11777 XEXP (last, 1) = REG_NOTES (insn);
11778 REG_NOTES (insn) = queued_cfa_restores;
11779 queued_cfa_restores = NULL_RTX;
11780 RTX_FRAME_RELATED_P (insn) = 1;
11783 /* Expand prologue or epilogue stack adjustment.
11784 The pattern exists to put a dependency on all ebp-based memory accesses.
11785 STYLE should be negative if instructions should be marked as frame related,
11786 zero if %r11 register is live and cannot be freely used and positive
11787 otherwise. */
11789 static rtx
11790 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11791 int style, bool set_cfa)
11793 struct machine_function *m = cfun->machine;
11794 rtx insn;
11795 bool add_frame_related_expr = false;
11797 if (Pmode == SImode)
11798 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11799 else if (x86_64_immediate_operand (offset, DImode))
11800 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11801 else
11803 rtx tmp;
11804 /* r11 is used by indirect sibcall return as well, set before the
11805 epilogue and used after the epilogue. */
11806 if (style)
11807 tmp = gen_rtx_REG (DImode, R11_REG);
11808 else
11810 gcc_assert (src != hard_frame_pointer_rtx
11811 && dest != hard_frame_pointer_rtx);
11812 tmp = hard_frame_pointer_rtx;
11814 insn = emit_insn (gen_rtx_SET (tmp, offset));
11815 if (style < 0)
11816 add_frame_related_expr = true;
11818 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11821 insn = emit_insn (insn);
11822 if (style >= 0)
11823 ix86_add_queued_cfa_restore_notes (insn);
11825 if (set_cfa)
11827 rtx r;
11829 gcc_assert (m->fs.cfa_reg == src);
11830 m->fs.cfa_offset += INTVAL (offset);
11831 m->fs.cfa_reg = dest;
11833 r = gen_rtx_PLUS (Pmode, src, offset);
11834 r = gen_rtx_SET (dest, r);
11835 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11836 RTX_FRAME_RELATED_P (insn) = 1;
11838 else if (style < 0)
11840 RTX_FRAME_RELATED_P (insn) = 1;
11841 if (add_frame_related_expr)
11843 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11844 r = gen_rtx_SET (dest, r);
11845 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11849 if (dest == stack_pointer_rtx)
11851 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11852 bool valid = m->fs.sp_valid;
11853 bool realigned = m->fs.sp_realigned;
11855 if (src == hard_frame_pointer_rtx)
11857 valid = m->fs.fp_valid;
11858 realigned = false;
11859 ooffset = m->fs.fp_offset;
11861 else if (src == crtl->drap_reg)
11863 valid = m->fs.drap_valid;
11864 realigned = false;
11865 ooffset = 0;
11867 else
11869 /* Else there are two possibilities: SP itself, which we set
11870 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11871 taken care of by hand along the eh_return path. */
11872 gcc_checking_assert (src == stack_pointer_rtx
11873 || offset == const0_rtx);
11876 m->fs.sp_offset = ooffset - INTVAL (offset);
11877 m->fs.sp_valid = valid;
11878 m->fs.sp_realigned = realigned;
11880 return insn;
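/* A minimal sketch (not part of i386.c) of the sp_offset bookkeeping at the
   end of pro_epilogue_adjust_stack.  sp_offset tracks CFA - SP, so allocating
   stack with a negative adjustment makes it grow.  The values are made up.  */
#include <stdio.h>

int
main (void)
{
  long sp_offset = 8;             /* just the pushed return address */
  long adjust = -32;              /* e.g. "sub $32, %rsp" in the prologue */
  sp_offset = sp_offset - adjust; /* mirrors ooffset - INTVAL (offset) */
  printf ("CFA - SP is now %ld bytes\n", sp_offset);  /* prints 40 */
  return 0;
}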
11883 /* Find an available register to be used as dynamic realign argument
11884 pointer register. Such a register will be written in the prologue and
11885 used at the beginning of the body, so it must not be
11886 1. parameter passing register.
11887 2. GOT pointer.
11888 We reuse static-chain register if it is available. Otherwise, we
11889 use DI for i386 and R13 for x86-64. We chose R13 since it has
11890 shorter encoding.
11892 Return: the regno of chosen register. */
11894 static unsigned int
11895 find_drap_reg (void)
11897 tree decl = cfun->decl;
11899 /* Always use callee-saved register if there are no caller-saved
11900 registers. */
11901 if (TARGET_64BIT)
11903 /* Use R13 for a nested function or a function that needs a static chain.
11904 Since a function with a tail call may use any caller-saved
11905 register in the epilogue, DRAP must not use a caller-saved
11906 register in that case. */
11907 if (DECL_STATIC_CHAIN (decl)
11908 || cfun->machine->no_caller_saved_registers
11909 || crtl->tail_call_emit)
11910 return R13_REG;
11912 return R10_REG;
11914 else
11916 /* Use DI for a nested function or a function that needs a static chain.
11917 Since a function with a tail call may use any caller-saved
11918 register in the epilogue, DRAP must not use a caller-saved
11919 register in that case. */
11920 if (DECL_STATIC_CHAIN (decl)
11921 || cfun->machine->no_caller_saved_registers
11922 || crtl->tail_call_emit)
11923 return DI_REG;
11925 /* Reuse static chain register if it isn't used for parameter
11926 passing. */
11927 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11929 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11930 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11931 return CX_REG;
11933 return DI_REG;
11937 /* Handle a "force_align_arg_pointer" attribute. */
11939 static tree
11940 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11941 tree, int, bool *no_add_attrs)
11943 if (TREE_CODE (*node) != FUNCTION_TYPE
11944 && TREE_CODE (*node) != METHOD_TYPE
11945 && TREE_CODE (*node) != FIELD_DECL
11946 && TREE_CODE (*node) != TYPE_DECL)
11948 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11949 name);
11950 *no_add_attrs = true;
11953 return NULL_TREE;
11956 /* Return minimum incoming stack alignment. */
11958 static unsigned int
11959 ix86_minimum_incoming_stack_boundary (bool sibcall)
11961 unsigned int incoming_stack_boundary;
11963 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
11964 if (cfun->machine->func_type != TYPE_NORMAL)
11965 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11966 /* Prefer the one specified at command line. */
11967 else if (ix86_user_incoming_stack_boundary)
11968 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11969 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11970 if -mstackrealign is used, this isn't a sibcall check, and the
11971 estimated stack alignment is 128 bits. */
11972 else if (!sibcall
11973 && ix86_force_align_arg_pointer
11974 && crtl->stack_alignment_estimated == 128)
11975 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11976 else
11977 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11979 /* Incoming stack alignment can be changed on individual functions
11980 via force_align_arg_pointer attribute. We use the smallest
11981 incoming stack boundary. */
11982 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11983 && lookup_attribute (ix86_force_align_arg_pointer_string,
11984 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11985 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11987 /* The incoming stack frame has to be aligned at least at
11988 parm_stack_boundary. */
11989 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11990 incoming_stack_boundary = crtl->parm_stack_boundary;
11992 /* The stack at the entry of main is aligned by the runtime. We use the
11993 smallest incoming stack boundary. */
11994 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
11995 && DECL_NAME (current_function_decl)
11996 && MAIN_NAME_P (DECL_NAME (current_function_decl))
11997 && DECL_FILE_SCOPE_P (current_function_decl))
11998 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12000 return incoming_stack_boundary;
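/* Illustrative example, not from the GCC sources: the clamping order used by
   ix86_minimum_incoming_stack_boundary above.  The bit values passed in below
   merely stand in for MIN_STACK_BOUNDARY, MAIN_STACK_BOUNDARY and the
   parameter stack boundary.  */
#include <stdbool.h>
#include <stdio.h>

static unsigned int
ex_min_incoming_boundary (unsigned int chosen, bool forced_realign,
			  unsigned int parm_boundary, bool is_main,
			  unsigned int min_boundary, unsigned int main_boundary)
{
  if (forced_realign && chosen > min_boundary)
    chosen = min_boundary;    /* force_align_arg_pointer drops to the minimum */
  if (chosen < parm_boundary)
    chosen = parm_boundary;   /* never below the parameter boundary */
  if (is_main && chosen > main_boundary)
    chosen = main_boundary;   /* main's stack is aligned by the runtime */
  return chosen;
}

int
main (void)
{
  printf ("%u\n", ex_min_incoming_boundary (128, true, 32, false, 32, 128));
  printf ("%u\n", ex_min_incoming_boundary (256, false, 32, true, 32, 128));
  return 0;
}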
12003 /* Update incoming stack boundary and estimated stack alignment. */
12005 static void
12006 ix86_update_stack_boundary (void)
12008 ix86_incoming_stack_boundary
12009 = ix86_minimum_incoming_stack_boundary (false);
12011 /* x86_64 vararg needs 16byte stack alignment for register save
12012 area. */
12013 if (TARGET_64BIT
12014 && cfun->stdarg
12015 && crtl->stack_alignment_estimated < 128)
12016 crtl->stack_alignment_estimated = 128;
12018 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12019 if (ix86_tls_descriptor_calls_expanded_in_cfun
12020 && crtl->preferred_stack_boundary < 128)
12021 crtl->preferred_stack_boundary = 128;
12024 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12025 needed or an rtx for DRAP otherwise. */
12027 static rtx
12028 ix86_get_drap_rtx (void)
12030 /* We must use DRAP if there are outgoing arguments on stack and
12031 ACCUMULATE_OUTGOING_ARGS is false. */
12032 if (ix86_force_drap
12033 || (cfun->machine->outgoing_args_on_stack
12034 && !ACCUMULATE_OUTGOING_ARGS))
12035 crtl->need_drap = true;
12037 if (stack_realign_drap)
12039 /* Assign DRAP to vDRAP and return vDRAP. */
12040 unsigned int regno = find_drap_reg ();
12041 rtx drap_vreg;
12042 rtx arg_ptr;
12043 rtx_insn *seq, *insn;
12045 arg_ptr = gen_rtx_REG (Pmode, regno);
12046 crtl->drap_reg = arg_ptr;
12048 start_sequence ();
12049 drap_vreg = copy_to_reg (arg_ptr);
12050 seq = get_insns ();
12051 end_sequence ();
12053 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12054 if (!optimize)
12056 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12057 RTX_FRAME_RELATED_P (insn) = 1;
12059 return drap_vreg;
12061 else
12062 return NULL;
12065 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12067 static rtx
12068 ix86_internal_arg_pointer (void)
12070 return virtual_incoming_args_rtx;
12073 struct scratch_reg {
12074 rtx reg;
12075 bool saved;
12078 /* Return a short-lived scratch register for use on function entry.
12079 In 32-bit mode, it is valid only after the registers are saved
12080 in the prologue. This register must be released by means of
12081 release_scratch_register_on_entry once it is dead. */
12083 static void
12084 get_scratch_register_on_entry (struct scratch_reg *sr)
12086 int regno;
12088 sr->saved = false;
12090 if (TARGET_64BIT)
12092 /* We always use R11 in 64-bit mode. */
12093 regno = R11_REG;
12095 else
12097 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12098 bool fastcall_p
12099 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12100 bool thiscall_p
12101 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12102 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12103 int regparm = ix86_function_regparm (fntype, decl);
12104 int drap_regno
12105 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12107 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12108 for the static chain register. */
12109 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12110 && drap_regno != AX_REG)
12111 regno = AX_REG;
12112 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12113 for the static chain register. */
12114 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12115 regno = AX_REG;
12116 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12117 regno = DX_REG;
12118 /* ecx is the static chain register. */
12119 else if (regparm < 3 && !fastcall_p && !thiscall_p
12120 && !static_chain_p
12121 && drap_regno != CX_REG)
12122 regno = CX_REG;
12123 else if (ix86_save_reg (BX_REG, true, false))
12124 regno = BX_REG;
12125 /* esi is the static chain register. */
12126 else if (!(regparm == 3 && static_chain_p)
12127 && ix86_save_reg (SI_REG, true, false))
12128 regno = SI_REG;
12129 else if (ix86_save_reg (DI_REG, true, false))
12130 regno = DI_REG;
12131 else
12133 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12134 sr->saved = true;
12138 sr->reg = gen_rtx_REG (Pmode, regno);
12139 if (sr->saved)
12141 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12142 RTX_FRAME_RELATED_P (insn) = 1;
12146 /* Release a scratch register obtained from the preceding function. */
12148 static void
12149 release_scratch_register_on_entry (struct scratch_reg *sr)
12151 if (sr->saved)
12153 struct machine_function *m = cfun->machine;
12154 rtx x, insn = emit_insn (gen_pop (sr->reg));
12156 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12157 RTX_FRAME_RELATED_P (insn) = 1;
12158 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12159 x = gen_rtx_SET (stack_pointer_rtx, x);
12160 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12161 m->fs.sp_offset -= UNITS_PER_WORD;
12165 /* Return the probing interval for -fstack-clash-protection. */
12167 static HOST_WIDE_INT
12168 get_probe_interval (void)
12170 if (flag_stack_clash_protection)
12171 return (HOST_WIDE_INT_1U
12172 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12173 else
12174 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
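/* A minimal sketch (not part of i386.c): the probe interval is simply a power
   of two derived from an exponent parameter.  The exponent 12 used here (a
   4096-byte interval) is only an example value; the real one comes from the
   stack-clash param or STACK_CHECK_PROBE_INTERVAL_EXP.  */
#include <stdio.h>

int
main (void)
{
  unsigned long long exp = 12;                /* example exponent */
  unsigned long long interval = 1ULL << exp;  /* 4096-byte probe interval */
  printf ("probe every %llu bytes\n", interval);
  return 0;
}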
12177 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12179 This differs from the next routine in that it tries hard to prevent
12180 attacks that jump the stack guard. Thus it is never allowed to allocate
12181 more than PROBE_INTERVAL bytes of stack space without a suitable
12182 probe. */
12184 static void
12185 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12187 struct machine_function *m = cfun->machine;
12189 /* If this function does not statically allocate stack space, then
12190 no probes are needed. */
12191 if (!size)
12193 /* However, the allocation of space via pushes for register
12194 saves could be viewed as allocating space, but without the
12195 need to probe. */
12196 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12197 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12198 else
12199 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12200 return;
12203 /* If we are a noreturn function, then we have to consider the
12204 possibility that we're called via a jump rather than a call.
12206 Thus we don't have the implicit probe generated by saving the
12207 return address into the stack at the call. Thus, the stack
12208 pointer could be anywhere in the guard page. The safe thing
12209 to do is emit a probe now.
12211 ?!? This should be revamped to work like aarch64 and s390 where
12212 we track the offset from the most recent probe. Normally that
12213 offset would be zero. For a noreturn function we would reset
12214 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12215 we just probe when we cross PROBE_INTERVAL. */
12216 if (TREE_THIS_VOLATILE (cfun->decl))
12218 /* We can safely use any register here since we're just going to push
12219 its value and immediately pop it back. But we do try and avoid
12220 argument passing registers so as not to introduce dependencies in
12221 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12222 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12223 rtx_insn *insn = emit_insn (gen_push (dummy_reg));
12224 RTX_FRAME_RELATED_P (insn) = 1;
12225 ix86_emit_restore_reg_using_pop (dummy_reg);
12226 emit_insn (gen_blockage ());
12229 /* If we allocate less than the size of the guard statically,
12230 then no probing is necessary, but we do need to allocate
12231 the stack. */
12232 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12234 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12235 GEN_INT (-size), -1,
12236 m->fs.cfa_reg == stack_pointer_rtx);
12237 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12238 return;
12241 /* We're allocating a large enough stack frame that we need to
12242 emit probes. Either emit them inline or in a loop depending
12243 on the size. */
12244 HOST_WIDE_INT probe_interval = get_probe_interval ();
12245 if (size <= 4 * probe_interval)
12247 HOST_WIDE_INT i;
12248 for (i = probe_interval; i <= size; i += probe_interval)
12250 /* Allocate PROBE_INTERVAL bytes. */
12251 rtx insn
12252 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12253 GEN_INT (-probe_interval), -1,
12254 m->fs.cfa_reg == stack_pointer_rtx);
12255 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12257 /* And probe at *sp. */
12258 emit_stack_probe (stack_pointer_rtx);
12259 emit_insn (gen_blockage ());
12262 /* We need to allocate space for the residual, but we do not need
12263 to probe the residual. */
12264 HOST_WIDE_INT residual = (i - probe_interval - size);
12265 if (residual)
12266 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12267 GEN_INT (residual), -1,
12268 m->fs.cfa_reg == stack_pointer_rtx);
12269 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12271 else
12273 struct scratch_reg sr;
12274 get_scratch_register_on_entry (&sr);
12276 /* Step 1: round SIZE down to a multiple of the interval. */
12277 HOST_WIDE_INT rounded_size = size & -probe_interval;
12279 /* Step 2: compute final value of the loop counter. Use lea if
12280 possible. */
12281 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12282 rtx insn;
12283 if (address_no_seg_operand (addr, Pmode))
12284 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12285 else
12287 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12288 insn = emit_insn (gen_rtx_SET (sr.reg,
12289 gen_rtx_PLUS (Pmode, sr.reg,
12290 stack_pointer_rtx)));
12292 if (m->fs.cfa_reg == stack_pointer_rtx)
12294 add_reg_note (insn, REG_CFA_DEF_CFA,
12295 plus_constant (Pmode, sr.reg,
12296 m->fs.cfa_offset + rounded_size));
12297 RTX_FRAME_RELATED_P (insn) = 1;
12300 /* Step 3: the loop. */
12301 rtx size_rtx = GEN_INT (rounded_size);
12302 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12303 size_rtx));
12304 if (m->fs.cfa_reg == stack_pointer_rtx)
12306 m->fs.cfa_offset += rounded_size;
12307 add_reg_note (insn, REG_CFA_DEF_CFA,
12308 plus_constant (Pmode, stack_pointer_rtx,
12309 m->fs.cfa_offset));
12310 RTX_FRAME_RELATED_P (insn) = 1;
12312 m->fs.sp_offset += rounded_size;
12313 emit_insn (gen_blockage ());
12315 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12316 is equal to ROUNDED_SIZE. */
12318 if (size != rounded_size)
12319 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12320 GEN_INT (rounded_size - size), -1,
12321 m->fs.cfa_reg == stack_pointer_rtx);
12322 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12324 release_scratch_register_on_entry (&sr);
12327 /* Make sure nothing is scheduled before we are done. */
12328 emit_insn (gen_blockage ());
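/* Illustrative example (not from the GCC sources): the decision structure of
   ix86_adjust_stack_and_probe_stack_clash above, reduced to plain integer
   arithmetic.  The frame size, guard size and probe interval are made-up
   example values.  */
#include <stdio.h>

int
main (void)
{
  long size = 70000;           /* static frame size to allocate */
  long guard_size = 4096;      /* example guard size in bytes */
  long probe_interval = 4096;  /* example probe interval in bytes */

  if (size == 0)
    printf ("no allocation, no probes\n");
  else if (size < guard_size)
    printf ("allocate %ld bytes, no probe needed\n", size);
  else if (size <= 4 * probe_interval)
    printf ("emit inline probes every %ld bytes\n", probe_interval);
  else
    {
      long rounded_size = size & -probe_interval;  /* multiple of interval */
      long residual = size - rounded_size;         /* allocated, not probed */
      printf ("probe loop over %ld bytes, then allocate %ld residual bytes\n",
	      rounded_size, residual);
    }
  return 0;
}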
12331 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12333 static void
12334 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12336 /* We skip the probe for the first interval + a small dope of 4 words and
12337 probe that many bytes past the specified size to maintain a protection
12338 area at the bottom of the stack. */
12339 const int dope = 4 * UNITS_PER_WORD;
12340 rtx size_rtx = GEN_INT (size), last;
12342 /* See if we have a constant small number of probes to generate. If so,
12343 that's the easy case. The run-time loop is made up of 9 insns in the
12344 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12345 for n # of intervals. */
12346 if (size <= 4 * get_probe_interval ())
12348 HOST_WIDE_INT i, adjust;
12349 bool first_probe = true;
12351 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12352 values of N from 1 until it exceeds SIZE. If only one probe is
12353 needed, this will not generate any code. Then adjust and probe
12354 to PROBE_INTERVAL + SIZE. */
12355 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12357 if (first_probe)
12359 adjust = 2 * get_probe_interval () + dope;
12360 first_probe = false;
12362 else
12363 adjust = get_probe_interval ();
12365 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12366 plus_constant (Pmode, stack_pointer_rtx,
12367 -adjust)));
12368 emit_stack_probe (stack_pointer_rtx);
12371 if (first_probe)
12372 adjust = size + get_probe_interval () + dope;
12373 else
12374 adjust = size + get_probe_interval () - i;
12376 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12377 plus_constant (Pmode, stack_pointer_rtx,
12378 -adjust)));
12379 emit_stack_probe (stack_pointer_rtx);
12381 /* Adjust back to account for the additional first interval. */
12382 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12383 plus_constant (Pmode, stack_pointer_rtx,
12384 (get_probe_interval ()
12385 + dope))));
12388 /* Otherwise, do the same as above, but in a loop. Note that we must be
12389 extra careful with variables wrapping around because we might be at
12390 the very top (or the very bottom) of the address space and we have
12391 to be able to handle this case properly; in particular, we use an
12392 equality test for the loop condition. */
12393 else
12395 HOST_WIDE_INT rounded_size;
12396 struct scratch_reg sr;
12398 get_scratch_register_on_entry (&sr);
12401 /* Step 1: round SIZE to the previous multiple of the interval. */
12403 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12406 /* Step 2: compute initial and final value of the loop counter. */
12408 /* SP = SP_0 + PROBE_INTERVAL. */
12409 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12410 plus_constant (Pmode, stack_pointer_rtx,
12411 - (get_probe_interval () + dope))));
12413 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12414 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12415 emit_insn (gen_rtx_SET (sr.reg,
12416 plus_constant (Pmode, stack_pointer_rtx,
12417 -rounded_size)));
12418 else
12420 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12421 emit_insn (gen_rtx_SET (sr.reg,
12422 gen_rtx_PLUS (Pmode, sr.reg,
12423 stack_pointer_rtx)));
12427 /* Step 3: the loop
12431 SP = SP + PROBE_INTERVAL
12432 probe at SP
12434 while (SP != LAST_ADDR)
12436 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12437 values of N from 1 until it is equal to ROUNDED_SIZE. */
12439 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12442 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12443 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12445 if (size != rounded_size)
12447 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12448 plus_constant (Pmode, stack_pointer_rtx,
12449 rounded_size - size)));
12450 emit_stack_probe (stack_pointer_rtx);
12453 /* Adjust back to account for the additional first interval. */
12454 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12455 plus_constant (Pmode, stack_pointer_rtx,
12456 (get_probe_interval ()
12457 + dope))));
12459 release_scratch_register_on_entry (&sr);
12462 /* Even if the stack pointer isn't the CFA register, we need to correctly
12463 describe the adjustments made to it, in particular differentiate the
12464 frame-related ones from the frame-unrelated ones. */
12465 if (size > 0)
12467 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12468 XVECEXP (expr, 0, 0)
12469 = gen_rtx_SET (stack_pointer_rtx,
12470 plus_constant (Pmode, stack_pointer_rtx, -size));
12471 XVECEXP (expr, 0, 1)
12472 = gen_rtx_SET (stack_pointer_rtx,
12473 plus_constant (Pmode, stack_pointer_rtx,
12474 get_probe_interval () + dope + size));
12475 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12476 RTX_FRAME_RELATED_P (last) = 1;
12478 cfun->machine->fs.sp_offset += size;
12481 /* Make sure nothing is scheduled before we are done. */
12482 emit_insn (gen_blockage ());
12485 /* Adjust the stack pointer up to REG while probing it. */
12487 const char *
12488 output_adjust_stack_and_probe (rtx reg)
12490 static int labelno = 0;
12491 char loop_lab[32];
12492 rtx xops[2];
12494 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12496 /* Loop. */
12497 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12499 /* SP = SP + PROBE_INTERVAL. */
12500 xops[0] = stack_pointer_rtx;
12501 xops[1] = GEN_INT (get_probe_interval ());
12502 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12504 /* Probe at SP. */
12505 xops[1] = const0_rtx;
12506 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12508 /* Test if SP == LAST_ADDR. */
12509 xops[0] = stack_pointer_rtx;
12510 xops[1] = reg;
12511 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12513 /* Branch. */
12514 fputs ("\tjne\t", asm_out_file);
12515 assemble_name_raw (asm_out_file, loop_lab);
12516 fputc ('\n', asm_out_file);
12518 return "";
12521 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12522 inclusive. These are offsets from the current stack pointer. */
12524 static void
12525 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12527 /* See if we have a constant small number of probes to generate. If so,
12528 that's the easy case. The run-time loop is made up of 6 insns in the
12529 generic case while the compile-time loop is made up of n insns for n #
12530 of intervals. */
12531 if (size <= 6 * get_probe_interval ())
12533 HOST_WIDE_INT i;
12535 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12536 it exceeds SIZE. If only one probe is needed, this will not
12537 generate any code. Then probe at FIRST + SIZE. */
12538 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12539 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12540 -(first + i)));
12542 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12543 -(first + size)));
12546 /* Otherwise, do the same as above, but in a loop. Note that we must be
12547 extra careful with variables wrapping around because we might be at
12548 the very top (or the very bottom) of the address space and we have
12549 to be able to handle this case properly; in particular, we use an
12550 equality test for the loop condition. */
12551 else
12553 HOST_WIDE_INT rounded_size, last;
12554 struct scratch_reg sr;
12556 get_scratch_register_on_entry (&sr);
12559 /* Step 1: round SIZE to the previous multiple of the interval. */
12561 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12564 /* Step 2: compute initial and final value of the loop counter. */
12566 /* TEST_OFFSET = FIRST. */
12567 emit_move_insn (sr.reg, GEN_INT (-first));
12569 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12570 last = first + rounded_size;
12573 /* Step 3: the loop
12577 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12578 probe at TEST_ADDR
12580 while (TEST_ADDR != LAST_ADDR)
12582 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12583 until it is equal to ROUNDED_SIZE. */
12585 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12588 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12589 that SIZE is equal to ROUNDED_SIZE. */
12591 if (size != rounded_size)
12592 emit_stack_probe (plus_constant (Pmode,
12593 gen_rtx_PLUS (Pmode,
12594 stack_pointer_rtx,
12595 sr.reg),
12596 rounded_size - size));
12598 release_scratch_register_on_entry (&sr);
12601 /* Make sure nothing is scheduled before we are done. */
12602 emit_insn (gen_blockage ());
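/* A self-contained sketch (not part of i386.c) of the probe offsets generated
   by ix86_emit_probe_stack_range in the small, fully unrolled case: probes at
   FIRST + N * interval for N = 1.. while that stays below SIZE, then one
   final probe at FIRST + SIZE.  The sizes below are example values only.  */
#include <stdio.h>

int
main (void)
{
  long first = 0, size = 10000, interval = 4096;
  for (long i = interval; i < size; i += interval)
    printf ("probe at sp - %ld\n", first + i);   /* 4096, 8192 */
  printf ("probe at sp - %ld\n", first + size);  /* 10000 */
  return 0;
}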
12605 /* Probe a range of stack addresses from REG to END, inclusive. These are
12606 offsets from the current stack pointer. */
12608 const char *
12609 output_probe_stack_range (rtx reg, rtx end)
12611 static int labelno = 0;
12612 char loop_lab[32];
12613 rtx xops[3];
12615 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12617 /* Loop. */
12618 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12620 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12621 xops[0] = reg;
12622 xops[1] = GEN_INT (get_probe_interval ());
12623 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12625 /* Probe at TEST_ADDR. */
12626 xops[0] = stack_pointer_rtx;
12627 xops[1] = reg;
12628 xops[2] = const0_rtx;
12629 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12631 /* Test if TEST_ADDR == LAST_ADDR. */
12632 xops[0] = reg;
12633 xops[1] = end;
12634 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12636 /* Branch. */
12637 fputs ("\tjne\t", asm_out_file);
12638 assemble_name_raw (asm_out_file, loop_lab);
12639 fputc ('\n', asm_out_file);
12641 return "";
12644 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12645 will guide prologue/epilogue to be generated in correct form. */
12647 static void
12648 ix86_finalize_stack_frame_flags (void)
12650 /* Check if stack realign is really needed after reload, and
12651 store the result in cfun. */
12652 unsigned int incoming_stack_boundary
12653 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12654 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12655 unsigned int stack_alignment
12656 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12657 ? crtl->max_used_stack_slot_alignment
12658 : crtl->stack_alignment_needed);
12659 unsigned int stack_realign
12660 = (incoming_stack_boundary < stack_alignment);
12661 bool recompute_frame_layout_p = false;
12663 if (crtl->stack_realign_finalized)
12665 /* After stack_realign_needed is finalized, we can no longer
12666 change it. */
12667 gcc_assert (crtl->stack_realign_needed == stack_realign);
12668 return;
12671 /* If the only reason for frame_pointer_needed is that we conservatively
12672 assumed stack realignment might be needed or -fno-omit-frame-pointer
12673 is used, but in the end nothing that needed the stack alignment was
12674 spilled and there is no stack access, clear frame_pointer_needed and say we
12675 don't need stack realignment. */
12676 if ((stack_realign || !flag_omit_frame_pointer)
12677 && frame_pointer_needed
12678 && crtl->is_leaf
12679 && crtl->sp_is_unchanging
12680 && !ix86_current_function_calls_tls_descriptor
12681 && !crtl->accesses_prior_frames
12682 && !cfun->calls_alloca
12683 && !crtl->calls_eh_return
12684 /* See ira_setup_eliminable_regset for the rationale. */
12685 && !(STACK_CHECK_MOVING_SP
12686 && flag_stack_check
12687 && flag_exceptions
12688 && cfun->can_throw_non_call_exceptions)
12689 && !ix86_frame_pointer_required ()
12690 && get_frame_size () == 0
12691 && ix86_nsaved_sseregs () == 0
12692 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12694 HARD_REG_SET set_up_by_prologue, prologue_used;
12695 basic_block bb;
12697 CLEAR_HARD_REG_SET (prologue_used);
12698 CLEAR_HARD_REG_SET (set_up_by_prologue);
12699 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12700 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12701 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12702 HARD_FRAME_POINTER_REGNUM);
12704 /* The preferred stack alignment is the minimum stack alignment. */
12705 if (stack_alignment > crtl->preferred_stack_boundary)
12706 stack_alignment = crtl->preferred_stack_boundary;
12708 bool require_stack_frame = false;
12710 FOR_EACH_BB_FN (bb, cfun)
12712 rtx_insn *insn;
12713 FOR_BB_INSNS (bb, insn)
12714 if (NONDEBUG_INSN_P (insn)
12715 && requires_stack_frame_p (insn, prologue_used,
12716 set_up_by_prologue))
12718 require_stack_frame = true;
12720 if (stack_realign)
12722 /* Find the maximum stack alignment. */
12723 subrtx_iterator::array_type array;
12724 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12725 if (MEM_P (*iter)
12726 && (reg_mentioned_p (stack_pointer_rtx,
12727 *iter)
12728 || reg_mentioned_p (frame_pointer_rtx,
12729 *iter)))
12731 unsigned int alignment = MEM_ALIGN (*iter);
12732 if (alignment > stack_alignment)
12733 stack_alignment = alignment;
12739 if (require_stack_frame)
12741 /* Stack frame is required. If stack alignment needed is less
12742 than incoming stack boundary, don't realign stack. */
12743 stack_realign = incoming_stack_boundary < stack_alignment;
12744 if (!stack_realign)
12746 crtl->max_used_stack_slot_alignment
12747 = incoming_stack_boundary;
12748 crtl->stack_alignment_needed
12749 = incoming_stack_boundary;
12750 /* Also update preferred_stack_boundary for leaf
12751 functions. */
12752 crtl->preferred_stack_boundary
12753 = incoming_stack_boundary;
12756 else
12758 /* If drap has been set, but it actually isn't live at the
12759 start of the function, there is no reason to set it up. */
12760 if (crtl->drap_reg)
12762 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12763 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12764 REGNO (crtl->drap_reg)))
12766 crtl->drap_reg = NULL_RTX;
12767 crtl->need_drap = false;
12770 else
12771 cfun->machine->no_drap_save_restore = true;
12773 frame_pointer_needed = false;
12774 stack_realign = false;
12775 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12776 crtl->stack_alignment_needed = incoming_stack_boundary;
12777 crtl->stack_alignment_estimated = incoming_stack_boundary;
12778 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12779 crtl->preferred_stack_boundary = incoming_stack_boundary;
12780 df_finish_pass (true);
12781 df_scan_alloc (NULL);
12782 df_scan_blocks ();
12783 df_compute_regs_ever_live (true);
12784 df_analyze ();
12786 if (flag_var_tracking)
12788 /* Since frame pointer is no longer available, replace it with
12789 stack pointer - UNITS_PER_WORD in debug insns. */
12790 df_ref ref, next;
12791 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12792 ref; ref = next)
12794 next = DF_REF_NEXT_REG (ref);
12795 if (!DF_REF_INSN_INFO (ref))
12796 continue;
12798 /* Make sure the next ref is for a different instruction,
12799 so that we're not affected by the rescan. */
12800 rtx_insn *insn = DF_REF_INSN (ref);
12801 while (next && DF_REF_INSN (next) == insn)
12802 next = DF_REF_NEXT_REG (next);
12804 if (DEBUG_INSN_P (insn))
12806 bool changed = false;
12807 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12809 rtx *loc = DF_REF_LOC (ref);
12810 if (*loc == hard_frame_pointer_rtx)
12812 *loc = plus_constant (Pmode,
12813 stack_pointer_rtx,
12814 -UNITS_PER_WORD);
12815 changed = true;
12818 if (changed)
12819 df_insn_rescan (insn);
12824 recompute_frame_layout_p = true;
12828 if (crtl->stack_realign_needed != stack_realign)
12829 recompute_frame_layout_p = true;
12830 crtl->stack_realign_needed = stack_realign;
12831 crtl->stack_realign_finalized = true;
12832 if (recompute_frame_layout_p)
12833 ix86_compute_frame_layout ();
12836 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12838 static void
12839 ix86_elim_entry_set_got (rtx reg)
12841 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12842 rtx_insn *c_insn = BB_HEAD (bb);
12843 if (!NONDEBUG_INSN_P (c_insn))
12844 c_insn = next_nonnote_nondebug_insn (c_insn);
12845 if (c_insn && NONJUMP_INSN_P (c_insn))
12847 rtx pat = PATTERN (c_insn);
12848 if (GET_CODE (pat) == PARALLEL)
12850 rtx vec = XVECEXP (pat, 0, 0);
12851 if (GET_CODE (vec) == SET
12852 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12853 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12854 delete_insn (c_insn);
12859 static rtx
12860 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12862 rtx addr, mem;
12864 if (offset)
12865 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12866 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12867 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12870 static inline rtx
12871 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12873 return gen_frame_set (reg, frame_reg, offset, false);
12876 static inline rtx
12877 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12879 return gen_frame_set (reg, frame_reg, offset, true);
12882 static void
12883 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12885 struct machine_function *m = cfun->machine;
12886 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12887 + m->call_ms2sysv_extra_regs;
12888 rtvec v = rtvec_alloc (ncregs + 1);
12889 unsigned int align, i, vi = 0;
12890 rtx_insn *insn;
12891 rtx sym, addr;
12892 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12893 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12895 /* AL should only be live with sysv_abi. */
12896 gcc_assert (!ix86_eax_live_at_start_p ());
12897 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12899 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
12900 regardless of whether we've actually realigned the stack or not. */
12901 align = GET_MODE_ALIGNMENT (V4SFmode);
12902 addr = choose_baseaddr (frame.stack_realign_offset
12903 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12904 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12906 emit_insn (gen_rtx_SET (rax, addr));
12908 /* Get the stub symbol. */
12909 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12910 : XLOGUE_STUB_SAVE);
12911 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12913 for (i = 0; i < ncregs; ++i)
12915 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12916 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12917 r.regno);
12918 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12921 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12923 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12924 RTX_FRAME_RELATED_P (insn) = true;
12927 /* Expand the prologue into a bunch of separate insns. */
12929 void
12930 ix86_expand_prologue (void)
12932 struct machine_function *m = cfun->machine;
12933 rtx insn, t;
12934 struct ix86_frame frame;
12935 HOST_WIDE_INT allocate;
12936 bool int_registers_saved;
12937 bool sse_registers_saved;
12938 bool save_stub_call_needed;
12939 rtx static_chain = NULL_RTX;
12941 if (ix86_function_naked (current_function_decl))
12942 return;
12944 ix86_finalize_stack_frame_flags ();
12946 /* DRAP should not coexist with stack_realign_fp */
12947 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12949 memset (&m->fs, 0, sizeof (m->fs));
12951 /* Initialize CFA state for before the prologue. */
12952 m->fs.cfa_reg = stack_pointer_rtx;
12953 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12955 /* Track SP offset to the CFA. We continue tracking this after we've
12956 swapped the CFA register away from SP. In the case of re-alignment
12957 this is fudged; we're interested in offsets within the local frame. */
12958 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12959 m->fs.sp_valid = true;
12960 m->fs.sp_realigned = false;
12962 frame = m->frame;
12964 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12966 /* We should have already generated an error for any use of
12967 ms_hook on a nested function. */
12968 gcc_checking_assert (!ix86_static_chain_on_stack);
12970 /* Check if profiling is active and whether we shall use the profiling-
12971 before-prologue variant. If so, sorry. */
12972 if (crtl->profile && flag_fentry != 0)
12973 sorry ("ms_hook_prologue attribute isn%'t compatible "
12974 "with -mfentry for 32-bit");
12976 /* In ix86_asm_output_function_label we emitted:
12977 8b ff movl.s %edi,%edi
12978 55 push %ebp
12979 8b ec movl.s %esp,%ebp
12981 This matches the hookable function prologue in Win32 API
12982 functions in Microsoft Windows XP Service Pack 2 and newer.
12983 Wine uses this to enable Windows apps to hook the Win32 API
12984 functions provided by Wine.
12986 What that means is that we've already set up the frame pointer. */
12988 if (frame_pointer_needed
12989 && !(crtl->drap_reg && crtl->stack_realign_needed))
12991 rtx push, mov;
12993 /* We've decided to use the frame pointer already set up.
12994 Describe this to the unwinder by pretending that both
12995 push and mov insns happen right here.
12997 Putting the unwind info here at the end of the ms_hook
12998 is done so that we can make absolutely certain we get
12999 the required byte sequence at the start of the function,
13000 rather than relying on an assembler that can produce
13001 the exact encoding required.
13003 However it does mean (in the unpatched case) that we have
13004 a 1 insn window where the asynchronous unwind info is
13005 incorrect. However, if we placed the unwind info at
13006 its correct location we would have incorrect unwind info
13007 in the patched case. Which is probably all moot since
13008 I don't expect Wine generates dwarf2 unwind info for the
13009 system libraries that use this feature. */
13011 insn = emit_insn (gen_blockage ());
13013 push = gen_push (hard_frame_pointer_rtx);
13014 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13015 stack_pointer_rtx);
13016 RTX_FRAME_RELATED_P (push) = 1;
13017 RTX_FRAME_RELATED_P (mov) = 1;
13019 RTX_FRAME_RELATED_P (insn) = 1;
13020 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13021 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13023 /* Note that gen_push incremented m->fs.cfa_offset, even
13024 though we didn't emit the push insn here. */
13025 m->fs.cfa_reg = hard_frame_pointer_rtx;
13026 m->fs.fp_offset = m->fs.cfa_offset;
13027 m->fs.fp_valid = true;
13029 else
13031 /* The frame pointer is not needed so pop %ebp again.
13032 This leaves us with a pristine state. */
13033 emit_insn (gen_pop (hard_frame_pointer_rtx));
13037 /* The first insn of a function that accepts its static chain on the
13038 stack is to push the register that would be filled in by a direct
13039 call. This insn will be skipped by the trampoline. */
13040 else if (ix86_static_chain_on_stack)
13042 static_chain = ix86_static_chain (cfun->decl, false);
13043 insn = emit_insn (gen_push (static_chain));
13044 emit_insn (gen_blockage ());
13046 /* We don't want to interpret this push insn as a register save,
13047 only as a stack adjustment. The real copy of the register as
13048 a save will be done later, if needed. */
13049 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13050 t = gen_rtx_SET (stack_pointer_rtx, t);
13051 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13052 RTX_FRAME_RELATED_P (insn) = 1;
13055 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13056 DRAP is needed and stack realignment is really needed after reload. */
13057 if (stack_realign_drap)
13059 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13061 /* Can't use DRAP in interrupt function. */
13062 if (cfun->machine->func_type != TYPE_NORMAL)
13063 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13064 "in interrupt service routine. This may be worked "
13065 "around by avoiding functions with aggregate return.");
13067 /* Only need to push parameter pointer reg if it is caller saved. */
13068 if (!call_used_regs[REGNO (crtl->drap_reg)])
13070 /* Push arg pointer reg */
13071 insn = emit_insn (gen_push (crtl->drap_reg));
13072 RTX_FRAME_RELATED_P (insn) = 1;
13075 /* Grab the argument pointer. */
13076 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13077 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13078 RTX_FRAME_RELATED_P (insn) = 1;
13079 m->fs.cfa_reg = crtl->drap_reg;
13080 m->fs.cfa_offset = 0;
13082 /* Align the stack. */
13083 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13084 stack_pointer_rtx,
13085 GEN_INT (-align_bytes)));
13086 RTX_FRAME_RELATED_P (insn) = 1;
13088 /* Replicate the return address on the stack so that return
13089 address can be reached via (argp - 1) slot. This is needed
13090 to implement macro RETURN_ADDR_RTX and intrinsic function
13091 expand_builtin_return_addr etc. */
13092 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13093 t = gen_frame_mem (word_mode, t);
13094 insn = emit_insn (gen_push (t));
13095 RTX_FRAME_RELATED_P (insn) = 1;
13097 /* For the purposes of frame and register save area addressing,
13098 we've started over with a new frame. */
13099 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13100 m->fs.realigned = true;
13102 if (static_chain)
13104 /* Replicate static chain on the stack so that static chain
13105 can be reached via (argp - 2) slot. This is needed for
13106 nested function with stack realignment. */
13107 insn = emit_insn (gen_push (static_chain));
13108 RTX_FRAME_RELATED_P (insn) = 1;
13112 int_registers_saved = (frame.nregs == 0);
13113 sse_registers_saved = (frame.nsseregs == 0);
13114 save_stub_call_needed = (m->call_ms2sysv);
13115 gcc_assert (sse_registers_saved || !save_stub_call_needed);
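/* Each of these flags flips to true as the corresponding saves are emitted
   below, so the catch-all save code near the end of the prologue does not
   emit them a second time.  */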
13117 if (frame_pointer_needed && !m->fs.fp_valid)
13119 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13120 slower on all targets. Also sdb didn't like it. */
13121 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13122 RTX_FRAME_RELATED_P (insn) = 1;
13124 /* Push registers now, before setting the frame pointer
13125 on SEH target. */
13126 if (!int_registers_saved
13127 && TARGET_SEH
13128 && !frame.save_regs_using_mov)
13130 ix86_emit_save_regs ();
13131 int_registers_saved = true;
13132 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13135 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13137 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13138 RTX_FRAME_RELATED_P (insn) = 1;
13140 if (m->fs.cfa_reg == stack_pointer_rtx)
13141 m->fs.cfa_reg = hard_frame_pointer_rtx;
13142 m->fs.fp_offset = m->fs.sp_offset;
13143 m->fs.fp_valid = true;
13147 if (!int_registers_saved)
13149 /* If saving registers via PUSH, do so now. */
13150 if (!frame.save_regs_using_mov)
13152 ix86_emit_save_regs ();
13153 int_registers_saved = true;
13154 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13157 /* When using the red zone we may start register saving before allocating
13158 the stack frame, saving one cycle of the prologue. However, avoid
13159 doing this if we have to probe the stack; at least on x86_64 the
13160 stack probe can turn into a call that clobbers a red zone location. */
13161 else if (ix86_using_red_zone ()
13162 && (! TARGET_STACK_PROBE
13163 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13165 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13166 int_registers_saved = true;
13170 if (stack_realign_fp)
13172 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13173 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13175 /* Record last valid frame pointer offset. */
13176 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13178 /* The computation of the size of the re-aligned stack frame means
13179 that we must allocate the size of the register save area before
13180 performing the actual alignment. Otherwise we cannot guarantee
13181 that there's enough storage above the realignment point. */
13182 allocate = frame.reg_save_offset - m->fs.sp_offset
13183 + frame.stack_realign_allocate;
13184 if (allocate)
13185 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13186 GEN_INT (-allocate), -1, false);
13188 /* Align the stack. */
13189 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13190 stack_pointer_rtx,
13191 GEN_INT (-align_bytes)));
13192 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13193 m->fs.sp_realigned_offset = m->fs.sp_offset
13194 - frame.stack_realign_allocate;
13195 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13196 Beyond this point, stack access should be done via choose_baseaddr or
13197 by using sp_valid_at and fp_valid_at to determine the correct base
13198 register. Henceforth, any CFA offset should be thought of as logical
13199 and not physical. */
13200 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13201 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13202 m->fs.sp_realigned = true;
13204 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13205 is needed to describe where a register is saved using a realigned
13206 stack pointer, so we need to invalidate the stack pointer for that
13207 target. */
13208 if (TARGET_SEH)
13209 m->fs.sp_valid = false;
13211 /* If SP offset is non-immediate after allocation of the stack frame,
13212 then emit SSE saves or stub call prior to allocating the rest of the
13213 stack frame. This is less efficient for the out-of-line stub because
13214 we can't combine allocations across the call barrier, but it's better
13215 than using a scratch register. */
13216 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13217 - m->fs.sp_realigned_offset),
13218 Pmode))
13220 if (!sse_registers_saved)
13222 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13223 sse_registers_saved = true;
13225 else if (save_stub_call_needed)
13227 ix86_emit_outlined_ms2sysv_save (frame);
13228 save_stub_call_needed = false;
13233 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
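/* ALLOCATE now holds the stack space still to be allocated to reach the
   final stack pointer position; the probing paths below may perform the
   allocation themselves and reset it to zero.  */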
13235 if (flag_stack_usage_info)
13237 /* We start to count from ARG_POINTER. */
13238 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13240 /* If it was realigned, take into account the fake frame. */
13241 if (stack_realign_drap)
13243 if (ix86_static_chain_on_stack)
13244 stack_size += UNITS_PER_WORD;
13246 if (!call_used_regs[REGNO (crtl->drap_reg)])
13247 stack_size += UNITS_PER_WORD;
13249 /* This over-estimates by 1 minimal-stack-alignment-unit but
13250 mitigates that by counting in the new return address slot. */
13251 current_function_dynamic_stack_size
13252 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13255 current_function_static_stack_size = stack_size;
13258 /* On SEH target with very large frame size, allocate an area to save
13259 SSE registers (as the very large allocation won't be described). */
13260 if (TARGET_SEH
13261 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13262 && !sse_registers_saved)
13264 HOST_WIDE_INT sse_size =
13265 frame.sse_reg_save_offset - frame.reg_save_offset;
13267 gcc_assert (int_registers_saved);
13269 /* No need to do stack checking as the area will be immediately
13270 written. */
13271 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13272 GEN_INT (-sse_size), -1,
13273 m->fs.cfa_reg == stack_pointer_rtx);
13274 allocate -= sse_size;
13275 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13276 sse_registers_saved = true;
13279 /* The stack has already been decremented by the instruction calling us
13280 so probe if the size is non-negative to preserve the protection area. */
13281 if (allocate >= 0
13282 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13283 || flag_stack_clash_protection))
13285 /* This assert wants to verify that integer registers were saved
13286 prior to probing. This is necessary when probing may be implemented
13287 as a function call (Windows). It is not necessary for stack clash
13288 protection probing. */
13289 if (!flag_stack_clash_protection)
13290 gcc_assert (int_registers_saved);
13292 if (flag_stack_clash_protection)
13294 ix86_adjust_stack_and_probe_stack_clash (allocate);
13295 allocate = 0;
13297 else if (STACK_CHECK_MOVING_SP)
13299 if (!(crtl->is_leaf && !cfun->calls_alloca
13300 && allocate <= get_probe_interval ()))
13302 ix86_adjust_stack_and_probe (allocate);
13303 allocate = 0;
13306 else
13308 HOST_WIDE_INT size = allocate;
13310 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13311 size = 0x80000000 - get_stack_check_protect () - 1;
13313 if (TARGET_STACK_PROBE)
13315 if (crtl->is_leaf && !cfun->calls_alloca)
13317 if (size > get_probe_interval ())
13318 ix86_emit_probe_stack_range (0, size);
13320 else
13321 ix86_emit_probe_stack_range (0,
13322 size + get_stack_check_protect ());
13324 else
13326 if (crtl->is_leaf && !cfun->calls_alloca)
13328 if (size > get_probe_interval ()
13329 && size > get_stack_check_protect ())
13330 ix86_emit_probe_stack_range (get_stack_check_protect (),
13331 size - get_stack_check_protect ());
13333 else
13334 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
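/* Any required stack probing has been emitted.  Perform the remaining
   allocation: nothing if ALLOCATE is zero, a simple stack-pointer
   adjustment when no probing call is needed, otherwise the
   allocate_stack_worker path below, which passes the size in %eax and
   preserves %eax/%r10 around the call when they are live.  */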
13339 if (allocate == 0)
13341 else if (!ix86_target_stack_probe ()
13342 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13344 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13345 GEN_INT (-allocate), -1,
13346 m->fs.cfa_reg == stack_pointer_rtx);
13348 else
13350 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13351 rtx r10 = NULL;
13352 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13353 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13354 bool eax_live = ix86_eax_live_at_start_p ();
13355 bool r10_live = false;
13357 if (TARGET_64BIT)
13358 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13360 if (eax_live)
13362 insn = emit_insn (gen_push (eax));
13363 allocate -= UNITS_PER_WORD;
13364 /* Note that SEH directives need to continue tracking the stack
13365 pointer even after the frame pointer has been set up. */
13366 if (sp_is_cfa_reg || TARGET_SEH)
13368 if (sp_is_cfa_reg)
13369 m->fs.cfa_offset += UNITS_PER_WORD;
13370 RTX_FRAME_RELATED_P (insn) = 1;
13371 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13372 gen_rtx_SET (stack_pointer_rtx,
13373 plus_constant (Pmode, stack_pointer_rtx,
13374 -UNITS_PER_WORD)));
13378 if (r10_live)
13380 r10 = gen_rtx_REG (Pmode, R10_REG);
13381 insn = emit_insn (gen_push (r10));
13382 allocate -= UNITS_PER_WORD;
13383 if (sp_is_cfa_reg || TARGET_SEH)
13385 if (sp_is_cfa_reg)
13386 m->fs.cfa_offset += UNITS_PER_WORD;
13387 RTX_FRAME_RELATED_P (insn) = 1;
13388 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13389 gen_rtx_SET (stack_pointer_rtx,
13390 plus_constant (Pmode, stack_pointer_rtx,
13391 -UNITS_PER_WORD)));
13395 emit_move_insn (eax, GEN_INT (allocate));
13396 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13398 /* Use the fact that AX still contains ALLOCATE. */
13399 adjust_stack_insn = (Pmode == DImode
13400 ? gen_pro_epilogue_adjust_stack_di_sub
13401 : gen_pro_epilogue_adjust_stack_si_sub);
13403 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13404 stack_pointer_rtx, eax));
13406 if (sp_is_cfa_reg || TARGET_SEH)
13408 if (sp_is_cfa_reg)
13409 m->fs.cfa_offset += allocate;
13410 RTX_FRAME_RELATED_P (insn) = 1;
13411 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13412 gen_rtx_SET (stack_pointer_rtx,
13413 plus_constant (Pmode, stack_pointer_rtx,
13414 -allocate)));
13416 m->fs.sp_offset += allocate;
13418 /* Use stack_pointer_rtx for relative addressing so that code
13419 works for realigned stack, too. */
13420 if (r10_live && eax_live)
13422 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13423 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13424 gen_frame_mem (word_mode, t));
13425 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13426 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13427 gen_frame_mem (word_mode, t));
13429 else if (eax_live || r10_live)
13431 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13432 emit_move_insn (gen_rtx_REG (word_mode,
13433 (eax_live ? AX_REG : R10_REG)),
13434 gen_frame_mem (word_mode, t));
13437 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13439 /* If we haven't already set up the frame pointer, do so now. */
13440 if (frame_pointer_needed && !m->fs.fp_valid)
13442 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13443 GEN_INT (frame.stack_pointer_offset
13444 - frame.hard_frame_pointer_offset));
13445 insn = emit_insn (insn);
13446 RTX_FRAME_RELATED_P (insn) = 1;
13447 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13449 if (m->fs.cfa_reg == stack_pointer_rtx)
13450 m->fs.cfa_reg = hard_frame_pointer_rtx;
13451 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13452 m->fs.fp_valid = true;
13455 if (!int_registers_saved)
13456 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13457 if (!sse_registers_saved)
13458 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13459 else if (save_stub_call_needed)
13460 ix86_emit_outlined_ms2sysv_save (frame);
13462 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13463 in the prologue. */
13464 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13466 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13467 insn = emit_insn (gen_set_got (pic));
13468 RTX_FRAME_RELATED_P (insn) = 1;
13469 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13470 emit_insn (gen_prologue_use (pic));
13471 /* Delete any SET_GOT already emitted and allocated to
13472 REAL_PIC_OFFSET_TABLE_REGNUM. */
13473 ix86_elim_entry_set_got (pic);
13476 if (crtl->drap_reg && !crtl->stack_realign_needed)
13478 /* vDRAP is set up, but after reload it turns out stack realignment
13479 isn't necessary; emit the prologue to set up DRAP
13480 without the stack realignment adjustment. */
13481 t = choose_baseaddr (0, NULL);
13482 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13485 /* Prevent instructions from being scheduled into the register save push
13486 sequence when access to the redzone area is done through the frame pointer.
13487 The offset between the frame pointer and the stack pointer is calculated
13488 relative to the value of the stack pointer at the end of the function
13489 prologue, and moving instructions that access the redzone area via the frame
13490 pointer inside the push sequence violates this assumption. */
13491 if (frame_pointer_needed && frame.red_zone_size)
13492 emit_insn (gen_memory_blockage ());
13494 /* SEH requires that the prologue end within 256 bytes of the start of
13495 the function. Prevent instruction schedules that would extend that.
13496 Further, prevent alloca modifications to the stack pointer from being
13497 combined with prologue modifications. */
13498 if (TARGET_SEH)
13499 emit_insn (gen_prologue_use (stack_pointer_rtx));
13502 /* Emit code to restore REG using a POP insn. */
13504 static void
13505 ix86_emit_restore_reg_using_pop (rtx reg)
13507 struct machine_function *m = cfun->machine;
13508 rtx_insn *insn = emit_insn (gen_pop (reg));
13510 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13511 m->fs.sp_offset -= UNITS_PER_WORD;
13513 if (m->fs.cfa_reg == crtl->drap_reg
13514 && REGNO (reg) == REGNO (crtl->drap_reg))
13516 /* Previously we'd represented the CFA as an expression
13517 like *(%ebp - 8). We've just popped that value from
13518 the stack, which means we need to reset the CFA to
13519 the drap register. This will remain until we restore
13520 the stack pointer. */
13521 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13522 RTX_FRAME_RELATED_P (insn) = 1;
13524 /* This means that the DRAP register is valid for addressing too. */
13525 m->fs.drap_valid = true;
13526 return;
13529 if (m->fs.cfa_reg == stack_pointer_rtx)
13531 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13532 x = gen_rtx_SET (stack_pointer_rtx, x);
13533 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13534 RTX_FRAME_RELATED_P (insn) = 1;
13536 m->fs.cfa_offset -= UNITS_PER_WORD;
13539 /* When the frame pointer is the CFA, and we pop it, we are
13540 swapping back to the stack pointer as the CFA. This happens
13541 for stack frames that don't allocate other data, so we assume
13542 the stack pointer is now pointing at the return address, i.e.
13543 the function entry state, which makes the offset be 1 word. */
13544 if (reg == hard_frame_pointer_rtx)
13546 m->fs.fp_valid = false;
13547 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13549 m->fs.cfa_reg = stack_pointer_rtx;
13550 m->fs.cfa_offset -= UNITS_PER_WORD;
13552 add_reg_note (insn, REG_CFA_DEF_CFA,
13553 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13554 GEN_INT (m->fs.cfa_offset)));
13555 RTX_FRAME_RELATED_P (insn) = 1;
13560 /* Emit code to restore saved registers using POP insns. */
13562 static void
13563 ix86_emit_restore_regs_using_pop (void)
13565 unsigned int regno;
13567 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13568 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13569 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13572 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
13573 the emit is skipped and only the notes are attached. */
13575 static void
13576 ix86_emit_leave (rtx_insn *insn)
13578 struct machine_function *m = cfun->machine;
13579 if (!insn)
13580 insn = emit_insn (ix86_gen_leave ());
13582 ix86_add_queued_cfa_restore_notes (insn);
13584 gcc_assert (m->fs.fp_valid);
13585 m->fs.sp_valid = true;
13586 m->fs.sp_realigned = false;
13587 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13588 m->fs.fp_valid = false;
13590 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13592 m->fs.cfa_reg = stack_pointer_rtx;
13593 m->fs.cfa_offset = m->fs.sp_offset;
13595 add_reg_note (insn, REG_CFA_DEF_CFA,
13596 plus_constant (Pmode, stack_pointer_rtx,
13597 m->fs.sp_offset));
13598 RTX_FRAME_RELATED_P (insn) = 1;
13600 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13601 m->fs.fp_offset);
13604 /* Emit code to restore saved registers using MOV insns.
13605 First register is restored from CFA - CFA_OFFSET. */
13606 static void
13607 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13608 bool maybe_eh_return)
13610 struct machine_function *m = cfun->machine;
13611 unsigned int regno;
13613 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13614 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13616 rtx reg = gen_rtx_REG (word_mode, regno);
13617 rtx mem;
13618 rtx_insn *insn;
13620 mem = choose_baseaddr (cfa_offset, NULL);
13621 mem = gen_frame_mem (word_mode, mem);
13622 insn = emit_move_insn (reg, mem);
13624 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13626 /* Previously we'd represented the CFA as an expression
13627 like *(%ebp - 8). We've just restored that value from
13628 the stack, which means we need to reset the CFA to
13629 the drap register. This will remain until we restore
13630 the stack pointer. */
13631 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13632 RTX_FRAME_RELATED_P (insn) = 1;
13634 /* This means that the DRAP register is valid for addressing. */
13635 m->fs.drap_valid = true;
13637 else
13638 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13640 cfa_offset -= UNITS_PER_WORD;
13644 /* Emit code to restore saved SSE registers using MOV insns.
13645 First register is restored from CFA - CFA_OFFSET. */
13646 static void
13647 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13648 bool maybe_eh_return)
13650 unsigned int regno;
13652 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13653 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13655 rtx reg = gen_rtx_REG (V4SFmode, regno);
13656 rtx mem;
13657 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13659 mem = choose_baseaddr (cfa_offset, &align);
13660 mem = gen_rtx_MEM (V4SFmode, mem);
13663 /* The location alignment depends upon the base register. */
13663 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13664 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13665 set_mem_align (mem, align);
13666 emit_insn (gen_rtx_SET (reg, mem));
13668 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13670 cfa_offset -= GET_MODE_SIZE (V4SFmode);
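/* Emit the restore sequence for registers saved by the out-of-line ms2sysv
   stub.  FRAME describes the frame layout, USE_CALL chooses between calling
   the restore stub (and returning here afterwards) and tail-jumping to it so
   the stub also performs the return, and STYLE is the epilogue style passed
   down from ix86_expand_epilogue.  */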
13674 static void
13675 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13676 bool use_call, int style)
13678 struct machine_function *m = cfun->machine;
13679 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13680 + m->call_ms2sysv_extra_regs;
13681 rtvec v;
13682 unsigned int elems_needed, align, i, vi = 0;
13683 rtx_insn *insn;
13684 rtx sym, tmp;
13685 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13686 rtx r10 = NULL_RTX;
13687 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13688 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13689 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13690 rtx rsi_frame_load = NULL_RTX;
13691 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13692 enum xlogue_stub stub;
13694 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13696 /* If using a realigned stack, we should never start with padding. */
13697 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13699 /* Setup RSI as the stub's base pointer. */
13700 align = GET_MODE_ALIGNMENT (V4SFmode);
13701 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13702 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13704 emit_insn (gen_rtx_SET (rsi, tmp));
13706 /* Get a symbol for the stub. */
13707 if (frame_pointer_needed)
13708 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13709 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13710 else
13711 stub = use_call ? XLOGUE_STUB_RESTORE
13712 : XLOGUE_STUB_RESTORE_TAIL;
13713 sym = xlogue.get_stub_rtx (stub);
13715 elems_needed = ncregs;
13716 if (use_call)
13717 elems_needed += 1;
13718 else
13719 elems_needed += frame_pointer_needed ? 5 : 3;
13720 v = rtvec_alloc (elems_needed);
13722 /* We call the epilogue stub when we need to pop incoming args or we are
13723 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
13724 epilogue stub and it is the tail-call. */
13725 if (use_call)
13726 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13727 else
13729 RTVEC_ELT (v, vi++) = ret_rtx;
13730 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13731 if (frame_pointer_needed)
13733 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13734 gcc_assert (m->fs.fp_valid);
13735 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13737 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13738 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13739 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13740 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13741 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13743 else
13745 /* If no hard frame pointer, we set R10 to the SP restore value. */
13746 gcc_assert (!m->fs.fp_valid);
13747 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13748 gcc_assert (m->fs.sp_valid);
13750 r10 = gen_rtx_REG (DImode, R10_REG);
13751 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13752 emit_insn (gen_rtx_SET (r10, tmp));
13754 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13758 /* Generate frame load insns and restore notes. */
13759 for (i = 0; i < ncregs; ++i)
13761 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13762 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13763 rtx reg, frame_load;
13765 reg = gen_rtx_REG (mode, r.regno);
13766 frame_load = gen_frame_load (reg, rsi, r.offset);
13768 /* Save RSI frame load insn & note to add last. */
13769 if (r.regno == SI_REG)
13771 gcc_assert (!rsi_frame_load);
13772 rsi_frame_load = frame_load;
13773 rsi_restore_offset = r.offset;
13775 else
13777 RTVEC_ELT (v, vi++) = frame_load;
13778 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13782 /* Add RSI frame load & restore note at the end. */
13783 gcc_assert (rsi_frame_load);
13784 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13785 RTVEC_ELT (v, vi++) = rsi_frame_load;
13786 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13787 rsi_restore_offset);
13789 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13790 if (!use_call && !frame_pointer_needed)
13792 gcc_assert (m->fs.sp_valid);
13793 gcc_assert (!m->fs.sp_realigned);
13795 /* At this point, R10 should point to frame.stack_realign_offset. */
13796 if (m->fs.cfa_reg == stack_pointer_rtx)
13797 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13798 m->fs.sp_offset = frame.stack_realign_offset;
13801 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
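/* Wrap everything in a single PARALLEL and emit it: as an ordinary insn
   when calling the stub, or as a jump insn when the stub is the tail-call.  */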
13802 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13803 if (use_call)
13804 insn = emit_insn (tmp);
13805 else
13807 insn = emit_jump_insn (tmp);
13808 JUMP_LABEL (insn) = ret_rtx;
13810 if (frame_pointer_needed)
13811 ix86_emit_leave (insn);
13812 else
13814 /* Need CFA adjust note. */
13815 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13816 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13820 RTX_FRAME_RELATED_P (insn) = true;
13821 ix86_add_queued_cfa_restore_notes (insn);
13823 /* If we're not doing a tail-call, we need to adjust the stack. */
13824 if (use_call && m->fs.sp_valid)
13826 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13827 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13828 GEN_INT (dealloc), style,
13829 m->fs.cfa_reg == stack_pointer_rtx);
13833 /* Restore function stack, frame, and registers. */
13835 void
13836 ix86_expand_epilogue (int style)
13838 struct machine_function *m = cfun->machine;
13839 struct machine_frame_state frame_state_save = m->fs;
13840 struct ix86_frame frame;
13841 bool restore_regs_via_mov;
13842 bool using_drap;
13843 bool restore_stub_is_tail = false;
13845 if (ix86_function_naked (current_function_decl))
13847 /* The program should not reach this point. */
13848 emit_insn (gen_ud2 ());
13849 return;
13852 ix86_finalize_stack_frame_flags ();
13853 frame = m->frame;
13855 m->fs.sp_realigned = stack_realign_fp;
13856 m->fs.sp_valid = stack_realign_fp
13857 || !frame_pointer_needed
13858 || crtl->sp_is_unchanging;
13859 gcc_assert (!m->fs.sp_valid
13860 || m->fs.sp_offset == frame.stack_pointer_offset);
13862 /* The FP must be valid if the frame pointer is present. */
13863 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13864 gcc_assert (!m->fs.fp_valid
13865 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13867 /* We must have *some* valid pointer to the stack frame. */
13868 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13870 /* The DRAP is never valid at this point. */
13871 gcc_assert (!m->fs.drap_valid);
13873 /* See the comment about red zone and frame
13874 pointer usage in ix86_expand_prologue. */
13875 if (frame_pointer_needed && frame.red_zone_size)
13876 emit_insn (gen_memory_blockage ());
13878 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13879 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13881 /* Determine the CFA offset of the end of the red-zone. */
13882 m->fs.red_zone_offset = 0;
13883 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13885 /* The red-zone begins below the return address and error code in
13886 the exception handler. */
13887 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13889 /* When the register save area is in the aligned portion of
13890 the stack, determine the maximum runtime displacement that
13891 matches up with the aligned frame. */
13892 if (stack_realign_drap)
13893 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13894 + UNITS_PER_WORD);
13897 /* Special care must be taken for the normal return case of a function
13898 using eh_return: the eax and edx registers are marked as saved, but
13899 not restored along this path. Adjust the save location to match. */
13900 if (crtl->calls_eh_return && style != 2)
13901 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13903 /* EH_RETURN requires the use of moves to function properly. */
13904 if (crtl->calls_eh_return)
13905 restore_regs_via_mov = true;
13906 /* SEH requires the use of pops to identify the epilogue. */
13907 else if (TARGET_SEH)
13908 restore_regs_via_mov = false;
13909 /* If we're only restoring one register and sp cannot be used then
13910 use a move instruction to restore the register, since it's
13911 less work than reloading sp and popping the register. */
13912 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13913 restore_regs_via_mov = true;
13914 else if (TARGET_EPILOGUE_USING_MOVE
13915 && cfun->machine->use_fast_prologue_epilogue
13916 && (frame.nregs > 1
13917 || m->fs.sp_offset != frame.reg_save_offset))
13918 restore_regs_via_mov = true;
13919 else if (frame_pointer_needed
13920 && !frame.nregs
13921 && m->fs.sp_offset != frame.reg_save_offset)
13922 restore_regs_via_mov = true;
13923 else if (frame_pointer_needed
13924 && TARGET_USE_LEAVE
13925 && cfun->machine->use_fast_prologue_epilogue
13926 && frame.nregs == 1)
13927 restore_regs_via_mov = true;
13928 else
13929 restore_regs_via_mov = false;
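/* At this point restore_regs_via_mov says whether the integer registers
   will be restored below with MOV loads (true) or with POP insns (false).  */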
13931 if (restore_regs_via_mov || frame.nsseregs)
13933 /* Ensure that the entire register save area is addressable via
13934 the stack pointer, if we will restore SSE regs via sp. */
13935 if (TARGET_64BIT
13936 && m->fs.sp_offset > 0x7fffffff
13937 && sp_valid_at (frame.stack_realign_offset + 1)
13938 && (frame.nsseregs + frame.nregs) != 0)
13940 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13941 GEN_INT (m->fs.sp_offset
13942 - frame.sse_reg_save_offset),
13943 style,
13944 m->fs.cfa_reg == stack_pointer_rtx);
13948 /* If there are any SSE registers to restore, then we have to do it
13949 via moves, since there's obviously no pop for SSE regs. */
13950 if (frame.nsseregs)
13951 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13952 style == 2);
13954 if (m->call_ms2sysv)
13956 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13958 /* We cannot use a tail-call for the stub if:
13959 1. We have to pop incoming args,
13960 2. We have additional int regs to restore,
13961 3. A sibling call will be the tail-call, or
13962 4. We are emitting an eh_return_internal epilogue.
13964 TODO: Item 4 has not yet been tested!
13966 If any of the above are true, we will call the stub rather than
13967 jump to it. */
13968 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13969 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13972 /* If using out-of-line stub that is a tail-call, then...*/
13973 if (m->call_ms2sysv && restore_stub_is_tail)
13975 /* TODO: paranoid tests. (remove eventually) */
13976 gcc_assert (m->fs.sp_valid);
13977 gcc_assert (!m->fs.sp_realigned);
13978 gcc_assert (!m->fs.fp_valid);
13979 gcc_assert (!m->fs.realigned);
13980 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13981 gcc_assert (!crtl->drap_reg);
13982 gcc_assert (!frame.nregs);
13984 else if (restore_regs_via_mov)
13986 rtx t;
13988 if (frame.nregs)
13989 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
13991 /* eh_return epilogues need %ecx added to the stack pointer. */
13992 if (style == 2)
13994 rtx sa = EH_RETURN_STACKADJ_RTX;
13995 rtx_insn *insn;
13997 /* %ecx can't be used for both DRAP register and eh_return. */
13998 if (crtl->drap_reg)
13999 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14001 /* regparm nested functions don't work with eh_return. */
14002 gcc_assert (!ix86_static_chain_on_stack);
14004 if (frame_pointer_needed)
14006 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14007 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14008 emit_insn (gen_rtx_SET (sa, t));
14010 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14011 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14013 /* Note that we use SA as a temporary CFA, as the return
14014 address is at the proper place relative to it. We
14015 pretend this happens at the FP restore insn because
14016 prior to this insn the FP would be stored at the wrong
14017 offset relative to SA, and after this insn we have no
14018 other reasonable register to use for the CFA. We don't
14019 bother resetting the CFA to the SP for the duration of
14020 the return insn, unless the control flow instrumentation
14021 is done. In this case the SP is used later and we have
14022 to reset CFA to SP. */
14023 add_reg_note (insn, REG_CFA_DEF_CFA,
14024 plus_constant (Pmode, sa, UNITS_PER_WORD));
14025 ix86_add_queued_cfa_restore_notes (insn);
14026 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14027 RTX_FRAME_RELATED_P (insn) = 1;
14029 m->fs.cfa_reg = sa;
14030 m->fs.cfa_offset = UNITS_PER_WORD;
14031 m->fs.fp_valid = false;
14033 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14034 const0_rtx, style,
14035 flag_cf_protection);
14037 else
14039 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14040 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14041 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14042 ix86_add_queued_cfa_restore_notes (insn);
14044 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14045 if (m->fs.cfa_offset != UNITS_PER_WORD)
14047 m->fs.cfa_offset = UNITS_PER_WORD;
14048 add_reg_note (insn, REG_CFA_DEF_CFA,
14049 plus_constant (Pmode, stack_pointer_rtx,
14050 UNITS_PER_WORD));
14051 RTX_FRAME_RELATED_P (insn) = 1;
14054 m->fs.sp_offset = UNITS_PER_WORD;
14055 m->fs.sp_valid = true;
14056 m->fs.sp_realigned = false;
14059 else
14061 /* SEH requires that the function end with (1) a stack adjustment
14062 if necessary, (2) a sequence of pops, and (3) a return or
14063 jump instruction. Prevent insns from the function body from
14064 being scheduled into this sequence. */
14065 if (TARGET_SEH)
14067 /* Prevent a catch region from being adjacent to the standard
14068 epilogue sequence. Unfortunately crtl->uses_eh_lsda and
14069 several other flags that would be interesting to test are
14070 not yet set up. */
14071 if (flag_non_call_exceptions)
14072 emit_insn (gen_nops (const1_rtx));
14073 else
14074 emit_insn (gen_blockage ());
14077 /* The first step is to deallocate the stack frame so that we can
14078 pop the registers. If the stack pointer was realigned, it needs
14079 to be restored now. Also do it on the SEH target for a very large
14080 frame, as the emitted instructions aren't allowed by the ABI
14081 in epilogues. */
14082 if (!m->fs.sp_valid || m->fs.sp_realigned
14083 || (TARGET_SEH
14084 && (m->fs.sp_offset - frame.reg_save_offset
14085 >= SEH_MAX_FRAME_SIZE)))
14087 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14088 GEN_INT (m->fs.fp_offset
14089 - frame.reg_save_offset),
14090 style, false);
14092 else if (m->fs.sp_offset != frame.reg_save_offset)
14094 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14095 GEN_INT (m->fs.sp_offset
14096 - frame.reg_save_offset),
14097 style,
14098 m->fs.cfa_reg == stack_pointer_rtx);
14101 ix86_emit_restore_regs_using_pop ();
14104 /* If we used a frame pointer and haven't already got rid of it,
14105 then do so now. */
14106 if (m->fs.fp_valid)
14108 /* If the stack pointer is valid and pointing at the frame
14109 pointer store address, then we only need a pop. */
14110 if (sp_valid_at (frame.hfp_save_offset)
14111 && m->fs.sp_offset == frame.hfp_save_offset)
14112 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14113 /* Leave results in shorter dependency chains on CPUs that are
14114 able to grok it fast. */
14115 else if (TARGET_USE_LEAVE
14116 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14117 || !cfun->machine->use_fast_prologue_epilogue)
14118 ix86_emit_leave (NULL);
14119 else
14121 pro_epilogue_adjust_stack (stack_pointer_rtx,
14122 hard_frame_pointer_rtx,
14123 const0_rtx, style, !using_drap);
14124 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14128 if (using_drap)
14130 int param_ptr_offset = UNITS_PER_WORD;
14131 rtx_insn *insn;
14133 gcc_assert (stack_realign_drap);
14135 if (ix86_static_chain_on_stack)
14136 param_ptr_offset += UNITS_PER_WORD;
14137 if (!call_used_regs[REGNO (crtl->drap_reg)])
14138 param_ptr_offset += UNITS_PER_WORD;
14140 insn = emit_insn (gen_rtx_SET
14141 (stack_pointer_rtx,
14142 gen_rtx_PLUS (Pmode,
14143 crtl->drap_reg,
14144 GEN_INT (-param_ptr_offset))));
14145 m->fs.cfa_reg = stack_pointer_rtx;
14146 m->fs.cfa_offset = param_ptr_offset;
14147 m->fs.sp_offset = param_ptr_offset;
14148 m->fs.realigned = false;
14150 add_reg_note (insn, REG_CFA_DEF_CFA,
14151 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14152 GEN_INT (param_ptr_offset)));
14153 RTX_FRAME_RELATED_P (insn) = 1;
14155 if (!call_used_regs[REGNO (crtl->drap_reg)])
14156 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14159 /* At this point the stack pointer must be valid, and we must have
14160 restored all of the registers. We may not have deallocated the
14161 entire stack frame. We've delayed this until now because it may
14162 be possible to merge the local stack deallocation with the
14163 deallocation forced by ix86_static_chain_on_stack. */
14164 gcc_assert (m->fs.sp_valid);
14165 gcc_assert (!m->fs.sp_realigned);
14166 gcc_assert (!m->fs.fp_valid);
14167 gcc_assert (!m->fs.realigned);
14168 if (m->fs.sp_offset != UNITS_PER_WORD)
14170 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14171 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14172 style, true);
14174 else
14175 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14177 /* Sibcall epilogues don't want a return instruction. */
14178 if (style == 0)
14180 m->fs = frame_state_save;
14181 return;
14184 if (cfun->machine->func_type != TYPE_NORMAL)
14185 emit_jump_insn (gen_interrupt_return ());
14186 else if (crtl->args.pops_args && crtl->args.size)
14188 rtx popc = GEN_INT (crtl->args.pops_args);
14190 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14191 address, do explicit add, and jump indirectly to the caller. */
14193 if (crtl->args.pops_args >= 65536)
14195 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14196 rtx_insn *insn;
14198 /* There is no "pascal" calling convention in any 64bit ABI. */
14199 gcc_assert (!TARGET_64BIT);
14201 insn = emit_insn (gen_pop (ecx));
14202 m->fs.cfa_offset -= UNITS_PER_WORD;
14203 m->fs.sp_offset -= UNITS_PER_WORD;
14205 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14206 x = gen_rtx_SET (stack_pointer_rtx, x);
14207 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14208 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14209 RTX_FRAME_RELATED_P (insn) = 1;
14211 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14212 popc, -1, true);
14213 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14215 else
14216 emit_jump_insn (gen_simple_return_pop_internal (popc));
14218 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14220 /* In case of return from EH a simple return cannot be used
14221 as a return address will be compared with a shadow stack
14222 return address. Use indirect jump instead. */
14223 if (style == 2 && flag_cf_protection)
14225 /* Register used in indirect jump must be in word_mode. But
14226 Pmode may not be the same as word_mode for x32. */
14227 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14228 rtx_insn *insn;
14230 insn = emit_insn (gen_pop (ecx));
14231 m->fs.cfa_offset -= UNITS_PER_WORD;
14232 m->fs.sp_offset -= UNITS_PER_WORD;
14234 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14235 x = gen_rtx_SET (stack_pointer_rtx, x);
14236 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14237 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14238 RTX_FRAME_RELATED_P (insn) = 1;
14240 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14242 else
14243 emit_jump_insn (gen_simple_return_internal ());
14246 /* Restore the state back to the state from the prologue,
14247 so that it's correct for the next epilogue. */
14248 m->fs = frame_state_save;
14251 /* Reset from the function's potential modifications. */
14253 static void
14254 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14256 if (pic_offset_table_rtx
14257 && !ix86_use_pseudo_pic_reg ())
14258 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14260 if (TARGET_MACHO)
14262 rtx_insn *insn = get_last_insn ();
14263 rtx_insn *deleted_debug_label = NULL;
14265 /* Mach-O doesn't support labels at the end of objects, so if
14266 it looks like we might want one, take special action.
14267 First, collect any sequence of deleted debug labels. */
14268 while (insn
14269 && NOTE_P (insn)
14270 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14272 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14273 notes only, instead set their CODE_LABEL_NUMBER to -1,
14274 otherwise there would be code generation differences
14275 between -g and -g0. */
14276 if (NOTE_P (insn) && NOTE_KIND (insn)
14277 == NOTE_INSN_DELETED_DEBUG_LABEL)
14278 deleted_debug_label = insn;
14279 insn = PREV_INSN (insn);
14282 /* If we have:
14283 label:
14284 barrier
14285 then this needs to be detected, so skip past the barrier. */
14287 if (insn && BARRIER_P (insn))
14288 insn = PREV_INSN (insn);
14290 /* Up to now we've only seen notes or barriers. */
14291 if (insn)
14293 if (LABEL_P (insn)
14294 || (NOTE_P (insn)
14295 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14296 /* Trailing label. */
14297 fputs ("\tnop\n", file);
14298 else if (cfun && ! cfun->is_thunk)
14300 /* See if we have a completely empty function body, skipping
14301 the special case of the picbase thunk emitted as asm. */
14302 while (insn && ! INSN_P (insn))
14303 insn = PREV_INSN (insn);
14304 /* If we don't find any insns, we've got an empty function body;
14305 i.e. completely empty - without a return or branch. This is
14306 taken as the case where a function body has been removed
14307 because it contains an inline __builtin_unreachable(). GCC
14308 declares that reaching __builtin_unreachable() means UB so
14309 we're not obliged to do anything special; however, we want
14310 non-zero-sized function bodies. To meet this, and help the
14311 user out, let's trap the case. */
14312 if (insn == NULL)
14313 fputs ("\tud2\n", file);
14316 else if (deleted_debug_label)
14317 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14318 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14319 CODE_LABEL_NUMBER (insn) = -1;
14323 /* Return a scratch register to use in the split stack prologue. The
14324 split stack prologue is used for -fsplit-stack. It is the first
14325 instructions in the function, even before the regular prologue.
14326 The scratch register can be any caller-saved register which is not
14327 used for parameters or for the static chain. */
14329 static unsigned int
14330 split_stack_prologue_scratch_regno (void)
14332 if (TARGET_64BIT)
14333 return R11_REG;
14334 else
14336 bool is_fastcall, is_thiscall;
14337 int regparm;
14339 is_fastcall = (lookup_attribute ("fastcall",
14340 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14341 != NULL);
14342 is_thiscall = (lookup_attribute ("thiscall",
14343 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14344 != NULL);
14345 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14347 if (is_fastcall)
14349 if (DECL_STATIC_CHAIN (cfun->decl))
14351 sorry ("-fsplit-stack does not support fastcall with "
14352 "nested function");
14353 return INVALID_REGNUM;
14355 return AX_REG;
14357 else if (is_thiscall)
14359 if (!DECL_STATIC_CHAIN (cfun->decl))
14360 return DX_REG;
14361 return AX_REG;
14363 else if (regparm < 3)
14365 if (!DECL_STATIC_CHAIN (cfun->decl))
14366 return CX_REG;
14367 else
14369 if (regparm >= 2)
14371 sorry ("-fsplit-stack does not support 2 register "
14372 "parameters for a nested function");
14373 return INVALID_REGNUM;
14375 return DX_REG;
14378 else
14380 /* FIXME: We could make this work by pushing a register
14381 around the addition and comparison. */
14382 sorry ("-fsplit-stack does not support 3 register parameters");
14383 return INVALID_REGNUM;
14388 /* A SYMBOL_REF for the function which allocates new stack space for
14389 -fsplit-stack. */
14391 static GTY(()) rtx split_stack_fn;
14393 /* A SYMBOL_REF for the more stack function when using the large
14394 model. */
14396 static GTY(()) rtx split_stack_fn_large;
14398 /* Return location of the stack guard value in the TLS block. */
14401 ix86_split_stack_guard (void)
14403 int offset;
14404 addr_space_t as = DEFAULT_TLS_SEG_REG;
14405 rtx r;
14407 gcc_assert (flag_split_stack);
14409 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14410 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14411 #else
14412 gcc_unreachable ();
14413 #endif
14415 r = GEN_INT (offset);
14416 r = gen_const_mem (Pmode, r);
14417 set_mem_addr_space (r, as);
14419 return r;
14422 /* Handle -fsplit-stack. These are the first instructions in the
14423 function, even before the regular prologue. */
14425 void
14426 ix86_expand_split_stack_prologue (void)
14428 HOST_WIDE_INT allocate;
14429 unsigned HOST_WIDE_INT args_size;
14430 rtx_code_label *label;
14431 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14432 rtx scratch_reg = NULL_RTX;
14433 rtx_code_label *varargs_label = NULL;
14434 rtx fn;
14436 gcc_assert (flag_split_stack && reload_completed);
14438 ix86_finalize_stack_frame_flags ();
14439 struct ix86_frame &frame = cfun->machine->frame;
14440 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14442 /* This is the label we will branch to if we have enough stack
14443 space. We expect the basic block reordering pass to reverse this
14444 branch if optimizing, so that we branch in the unlikely case. */
14445 label = gen_label_rtx ();
14447 /* We need to compare the stack pointer minus the frame size with
14448 the stack boundary in the TCB. The stack boundary always gives
14449 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14450 can compare directly. Otherwise we need to do an addition. */
14452 limit = ix86_split_stack_guard ();
14454 if (allocate < SPLIT_STACK_AVAILABLE)
14455 current = stack_pointer_rtx;
14456 else
14458 unsigned int scratch_regno;
14459 rtx offset;
14461 /* We need a scratch register to hold the stack pointer minus
14462 the required frame size. Since this is the very start of the
14463 function, the scratch register can be any caller-saved
14464 register which is not used for parameters. */
14465 offset = GEN_INT (- allocate);
14466 scratch_regno = split_stack_prologue_scratch_regno ();
14467 if (scratch_regno == INVALID_REGNUM)
14468 return;
14469 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14470 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14472 /* We don't use ix86_gen_add3 in this case because it will
14473 want to split to lea, but when not optimizing the insn
14474 will not be split after this point. */
14475 emit_insn (gen_rtx_SET (scratch_reg,
14476 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14477 offset)));
14479 else
14481 emit_move_insn (scratch_reg, offset);
14482 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14483 stack_pointer_rtx));
14485 current = scratch_reg;
14488 ix86_expand_branch (GEU, current, limit, label);
14489 rtx_insn *jump_insn = get_last_insn ();
14490 JUMP_LABEL (jump_insn) = label;
14492 /* Mark the jump as very likely to be taken. */
14493 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14495 if (split_stack_fn == NULL_RTX)
14497 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14498 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14500 fn = split_stack_fn;
14502 /* Get more stack space. We pass in the desired stack space and the
14503 size of the arguments to copy to the new stack. In 32-bit mode
14504 we push the parameters; __morestack will return on a new stack
14505 anyhow. In 64-bit mode we pass the parameters in r10 and
14506 r11. */
14507 allocate_rtx = GEN_INT (allocate);
14508 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14509 call_fusage = NULL_RTX;
14510 rtx pop = NULL_RTX;
14511 if (TARGET_64BIT)
14513 rtx reg10, reg11;
14515 reg10 = gen_rtx_REG (Pmode, R10_REG);
14516 reg11 = gen_rtx_REG (Pmode, R11_REG);
14518 /* If this function uses a static chain, it will be in %r10.
14519 Preserve it across the call to __morestack. */
14520 if (DECL_STATIC_CHAIN (cfun->decl))
14522 rtx rax;
14524 rax = gen_rtx_REG (word_mode, AX_REG);
14525 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14526 use_reg (&call_fusage, rax);
14529 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14530 && !TARGET_PECOFF)
14532 HOST_WIDE_INT argval;
14534 gcc_assert (Pmode == DImode);
14535 /* When using the large model we need to load the address
14536 into a register, and we've run out of registers. So we
14537 switch to a different calling convention, and we call a
14538 different function: __morestack_large_model. We pass the
14539 argument size in the upper 32 bits of r10 and pass the
14540 frame size in the lower 32 bits. */
14541 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14542 gcc_assert ((args_size & 0xffffffff) == args_size);
14544 if (split_stack_fn_large == NULL_RTX)
14546 split_stack_fn_large =
14547 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14548 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14550 if (ix86_cmodel == CM_LARGE_PIC)
14552 rtx_code_label *label;
14553 rtx x;
14555 label = gen_label_rtx ();
14556 emit_label (label);
14557 LABEL_PRESERVE_P (label) = 1;
14558 emit_insn (gen_set_rip_rex64 (reg10, label));
14559 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14560 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14561 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14562 UNSPEC_GOT);
14563 x = gen_rtx_CONST (Pmode, x);
14564 emit_move_insn (reg11, x);
14565 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14566 x = gen_const_mem (Pmode, x);
14567 emit_move_insn (reg11, x);
14569 else
14570 emit_move_insn (reg11, split_stack_fn_large);
14572 fn = reg11;
14574 argval = ((args_size << 16) << 16) + allocate;
14575 emit_move_insn (reg10, GEN_INT (argval));
14577 else
14579 emit_move_insn (reg10, allocate_rtx);
14580 emit_move_insn (reg11, GEN_INT (args_size));
14581 use_reg (&call_fusage, reg11);
14584 use_reg (&call_fusage, reg10);
14586 else
14588 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14589 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14590 insn = emit_insn (gen_push (allocate_rtx));
14591 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14592 pop = GEN_INT (2 * UNITS_PER_WORD);
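/* Emit the call to the helper selected above: FN is either the plain
   __morestack symbol or a register holding the large-model variant.  */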
14594 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14595 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14596 pop, false);
14597 add_function_usage_to (call_insn, call_fusage);
14598 if (!TARGET_64BIT)
14599 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14600 /* Indicate that this function can't jump to non-local gotos. */
14601 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14603 /* In order to make call/return prediction work right, we now need
14604 to execute a return instruction. See
14605 libgcc/config/i386/morestack.S for the details on how this works.
14607 For flow purposes gcc must not see this as a return
14608 instruction--we need control flow to continue at the subsequent
14609 label. Therefore, we use an unspec. */
14610 gcc_assert (crtl->args.pops_args < 65536);
14611 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14613 /* If we are in 64-bit mode and this function uses a static chain,
14614 we saved %r10 in %rax before calling __morestack. */
14615 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14616 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14617 gen_rtx_REG (word_mode, AX_REG));
14619 /* If this function calls va_start, we need to store a pointer to
14620 the arguments on the old stack, because they may not have been
14621 all copied to the new stack. At this point the old stack can be
14622 found at the frame pointer value used by __morestack, because
14623 __morestack has set that up before calling back to us. Here we
14624 store that pointer in a scratch register, and in
14625 ix86_expand_prologue we store the scratch register in a stack
14626 slot. */
14627 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14629 unsigned int scratch_regno;
14630 rtx frame_reg;
14631 int words;
14633 scratch_regno = split_stack_prologue_scratch_regno ();
14634 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14635 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14637 /* 64-bit:
14638 fp -> old fp value
14639 return address within this function
14640 return address of caller of this function
14641 stack arguments
14642 So we add three words to get to the stack arguments.
14644 32-bit:
14645 fp -> old fp value
14646 return address within this function
14647 first argument to __morestack
14648 second argument to __morestack
14649 return address of caller of this function
14650 stack arguments
14651 So we add five words to get to the stack arguments.
14653 words = TARGET_64BIT ? 3 : 5;
14654 emit_insn (gen_rtx_SET (scratch_reg,
14655 gen_rtx_PLUS (Pmode, frame_reg,
14656 GEN_INT (words * UNITS_PER_WORD))));
14658 varargs_label = gen_label_rtx ();
14659 emit_jump_insn (gen_jump (varargs_label));
14660 JUMP_LABEL (get_last_insn ()) = varargs_label;
14662 emit_barrier ();
14665 emit_label (label);
14666 LABEL_NUSES (label) = 1;
14668 /* If this function calls va_start, we now have to set the scratch
14669 register for the case where we do not call __morestack. In this
14670 case we need to set it based on the stack pointer. */
14671 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14673 emit_insn (gen_rtx_SET (scratch_reg,
14674 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14675 GEN_INT (UNITS_PER_WORD))));
14677 emit_label (varargs_label);
14678 LABEL_NUSES (varargs_label) = 1;
14682 /* We may have to tell the dataflow pass that the split stack prologue
14683 is initializing a scratch register. */
14685 static void
14686 ix86_live_on_entry (bitmap regs)
14688 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14690 gcc_assert (flag_split_stack);
14691 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14695 /* Extract the parts of an RTL expression that is a valid memory address
14696 for an instruction. Return 0 if the structure of the address is
14697 grossly off. Return -1 if the address contains ASHIFT, so it is not
14698 strictly valid, but still used for computing the length of the lea instruction. */
14701 ix86_decompose_address (rtx addr, struct ix86_address *out)
14703 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14704 rtx base_reg, index_reg;
14705 HOST_WIDE_INT scale = 1;
14706 rtx scale_rtx = NULL_RTX;
14707 rtx tmp;
14708 int retval = 1;
14709 addr_space_t seg = ADDR_SPACE_GENERIC;
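/* On success the address is decomposed as out->base + out->index * out->scale
   + out->disp, with out->seg recording a segment override such as the TLS
   segment.  For example, (plus (plus (mult (reg B) (const_int 4)) (reg A))
   (const_int 12)) yields base = A, index = B, scale = 4, disp = 12.  */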
14711 /* Allow zero-extended SImode addresses,
14712 they will be emitted with addr32 prefix. */
14713 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14715 if (GET_CODE (addr) == ZERO_EXTEND
14716 && GET_MODE (XEXP (addr, 0)) == SImode)
14718 addr = XEXP (addr, 0);
14719 if (CONST_INT_P (addr))
14720 return 0;
14722 else if (GET_CODE (addr) == AND
14723 && const_32bit_mask (XEXP (addr, 1), DImode))
14725 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14726 if (addr == NULL_RTX)
14727 return 0;
14729 if (CONST_INT_P (addr))
14730 return 0;
14734 /* Allow SImode subregs of DImode addresses,
14735 they will be emitted with addr32 prefix. */
14736 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14738 if (SUBREG_P (addr)
14739 && GET_MODE (SUBREG_REG (addr)) == DImode)
14741 addr = SUBREG_REG (addr);
14742 if (CONST_INT_P (addr))
14743 return 0;
14747 if (REG_P (addr))
14748 base = addr;
14749 else if (SUBREG_P (addr))
14751 if (REG_P (SUBREG_REG (addr)))
14752 base = addr;
14753 else
14754 return 0;
14756 else if (GET_CODE (addr) == PLUS)
14758 rtx addends[4], op;
14759 int n = 0, i;
14761 op = addr;
14764 if (n >= 4)
14765 return 0;
14766 addends[n++] = XEXP (op, 1);
14767 op = XEXP (op, 0);
14769 while (GET_CODE (op) == PLUS);
14770 if (n >= 4)
14771 return 0;
14772 addends[n] = op;
14774 for (i = n; i >= 0; --i)
14776 op = addends[i];
14777 switch (GET_CODE (op))
14779 case MULT:
14780 if (index)
14781 return 0;
14782 index = XEXP (op, 0);
14783 scale_rtx = XEXP (op, 1);
14784 break;
14786 case ASHIFT:
14787 if (index)
14788 return 0;
14789 index = XEXP (op, 0);
14790 tmp = XEXP (op, 1);
14791 if (!CONST_INT_P (tmp))
14792 return 0;
14793 scale = INTVAL (tmp);
14794 if ((unsigned HOST_WIDE_INT) scale > 3)
14795 return 0;
14796 scale = 1 << scale;
14797 break;
14799 case ZERO_EXTEND:
14800 op = XEXP (op, 0);
14801 if (GET_CODE (op) != UNSPEC)
14802 return 0;
14803 /* FALLTHRU */
14805 case UNSPEC:
14806 if (XINT (op, 1) == UNSPEC_TP
14807 && TARGET_TLS_DIRECT_SEG_REFS
14808 && seg == ADDR_SPACE_GENERIC)
14809 seg = DEFAULT_TLS_SEG_REG;
14810 else
14811 return 0;
14812 break;
14814 case SUBREG:
14815 if (!REG_P (SUBREG_REG (op)))
14816 return 0;
14817 /* FALLTHRU */
14819 case REG:
14820 if (!base)
14821 base = op;
14822 else if (!index)
14823 index = op;
14824 else
14825 return 0;
14826 break;
14828 case CONST:
14829 case CONST_INT:
14830 case SYMBOL_REF:
14831 case LABEL_REF:
14832 if (disp)
14833 return 0;
14834 disp = op;
14835 break;
14837 default:
14838 return 0;
14842 else if (GET_CODE (addr) == MULT)
14844 index = XEXP (addr, 0); /* index*scale */
14845 scale_rtx = XEXP (addr, 1);
14847 else if (GET_CODE (addr) == ASHIFT)
14849 /* We're called for lea too, which implements ashift on occasion. */
14850 index = XEXP (addr, 0);
14851 tmp = XEXP (addr, 1);
14852 if (!CONST_INT_P (tmp))
14853 return 0;
14854 scale = INTVAL (tmp);
14855 if ((unsigned HOST_WIDE_INT) scale > 3)
14856 return 0;
14857 scale = 1 << scale;
14858 retval = -1;
14860 else
14861 disp = addr; /* displacement */
14863 if (index)
14865 if (REG_P (index))
14867 else if (SUBREG_P (index)
14868 && REG_P (SUBREG_REG (index)))
14870 else
14871 return 0;
14874 /* Extract the integral value of scale. */
14875 if (scale_rtx)
14877 if (!CONST_INT_P (scale_rtx))
14878 return 0;
14879 scale = INTVAL (scale_rtx);
14882 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14883 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14885 /* Avoid useless 0 displacement. */
14886 if (disp == const0_rtx && (base || index))
14887 disp = NULL_RTX;
14889 /* Allow arg pointer and stack pointer as index if there is no scaling. */
14890 if (base_reg && index_reg && scale == 1
14891 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14892 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14893 || REGNO (index_reg) == SP_REG))
14895 std::swap (base, index);
14896 std::swap (base_reg, index_reg);
14899 /* Special case: %ebp cannot be encoded as a base without a displacement.
14900 Similarly %r13. */
14901 if (!disp && base_reg
14902 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14903 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14904 || REGNO (base_reg) == BP_REG
14905 || REGNO (base_reg) == R13_REG))
14906 disp = const0_rtx;
14908 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
14909 Avoid this by transforming it to [%esi+0].
14910 Reload calls address legitimization without cfun defined, so we need
14911 to test cfun for being non-NULL. */
14912 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14913 && base_reg && !index_reg && !disp
14914 && REGNO (base_reg) == SI_REG)
14915 disp = const0_rtx;
14917 /* Special case: encode reg+reg instead of reg*2. */
14918 if (!base && index && scale == 2)
14919 base = index, base_reg = index_reg, scale = 1;
14921 /* Special case: scaling cannot be encoded without base or displacement. */
14922 if (!base && !disp && index && scale != 1)
14923 disp = const0_rtx;
14925 out->base = base;
14926 out->index = index;
14927 out->disp = disp;
14928 out->scale = scale;
14929 out->seg = seg;
14931 return retval;
14934 /* Return cost of the memory address x.
14935 For i386, it is better to use a complex address than let gcc copy
14936 the address into a reg and make a new pseudo. But not if the address
14937 requires two regs - that would mean more pseudos with longer
14938 lifetimes. */
14939 static int
14940 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14942 struct ix86_address parts;
14943 int cost = 1;
14944 int ok = ix86_decompose_address (x, &parts);
14946 gcc_assert (ok);
14948 if (parts.base && SUBREG_P (parts.base))
14949 parts.base = SUBREG_REG (parts.base);
14950 if (parts.index && SUBREG_P (parts.index))
14951 parts.index = SUBREG_REG (parts.index);
14953 /* Attempt to minimize number of registers in the address by increasing
14954 address cost for each used register. We don't increase address cost
14955 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
14956 is not invariant itself it most likely means that base or index is not
14957 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
14958 which is not profitable for x86. */
14959 if (parts.base
14960 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14961 && (current_pass->type == GIMPLE_PASS
14962 || !pic_offset_table_rtx
14963 || !REG_P (parts.base)
14964 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14965 cost++;
14967 if (parts.index
14968 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14969 && (current_pass->type == GIMPLE_PASS
14970 || !pic_offset_table_rtx
14971 || !REG_P (parts.index)
14972 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14973 cost++;
14975 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14976 since its predecode logic can't detect the length of such instructions
14977 and it degenerates to vector decoding. Increase the cost of such
14978 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
14979 to split such addresses or even refuse such addresses at all.
14981 The following addressing modes are affected:
14982 [base+scale*index]
14983 [scale*index+disp]
14984 [base+index]
14986 The first and last case may be avoidable by explicitly coding the zero into
14987 the memory address, but I don't have an AMD-K6 machine handy to check this
14988 theory. */
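/* In AT&T syntax the three modes above correspond roughly to operands
   such as (%eax,%ebx,4), 16(,%ebx,4) and (%eax,%ebx) respectively;
   the registers and displacement are only example values.  */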
14990 if (TARGET_K6
14991 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
14992 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
14993 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
14994 cost += 10;
14996 return cost;
14999 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15000 this is used to form addresses to local data when -fPIC is in
15001 use. */
15003 static bool
15004 darwin_local_data_pic (rtx disp)
15006 return (GET_CODE (disp) == UNSPEC
15007 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15010 /* True if operand X should be loaded from GOT. */
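/* For instance, with -fno-plt and without -fpic, a call to an external
   function foo is then expected to go through its GOT slot, roughly
   call *foo@GOTPCREL(%rip) on x86-64, instead of through the PLT.  */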
15012 bool
15013 ix86_force_load_from_GOT_p (rtx x)
15015 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15016 && !TARGET_PECOFF && !TARGET_MACHO
15017 && !flag_plt && !flag_pic
15018 && ix86_cmodel != CM_LARGE
15019 && GET_CODE (x) == SYMBOL_REF
15020 && SYMBOL_REF_FUNCTION_P (x)
15021 && !SYMBOL_REF_LOCAL_P (x));
15024 /* Determine if a given RTX is a valid constant. We already know this
15025 satisfies CONSTANT_P. */
15027 static bool
15028 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15030 /* Pointer bounds constants are not valid. */
15031 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15032 return false;
15034 switch (GET_CODE (x))
15036 case CONST:
15037 x = XEXP (x, 0);
15039 if (GET_CODE (x) == PLUS)
15041 if (!CONST_INT_P (XEXP (x, 1)))
15042 return false;
15043 x = XEXP (x, 0);
15046 if (TARGET_MACHO && darwin_local_data_pic (x))
15047 return true;
15049 /* Only some unspecs are valid as "constants". */
15050 if (GET_CODE (x) == UNSPEC)
15051 switch (XINT (x, 1))
15053 case UNSPEC_GOT:
15054 case UNSPEC_GOTOFF:
15055 case UNSPEC_PLTOFF:
15056 return TARGET_64BIT;
15057 case UNSPEC_TPOFF:
15058 case UNSPEC_NTPOFF:
15059 x = XVECEXP (x, 0, 0);
15060 return (GET_CODE (x) == SYMBOL_REF
15061 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15062 case UNSPEC_DTPOFF:
15063 x = XVECEXP (x, 0, 0);
15064 return (GET_CODE (x) == SYMBOL_REF
15065 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15066 default:
15067 return false;
15070 /* We must have drilled down to a symbol. */
15071 if (GET_CODE (x) == LABEL_REF)
15072 return true;
15073 if (GET_CODE (x) != SYMBOL_REF)
15074 return false;
15075 /* FALLTHRU */
15077 case SYMBOL_REF:
15078 /* TLS symbols are never valid. */
15079 if (SYMBOL_REF_TLS_MODEL (x))
15080 return false;
15082 /* DLLIMPORT symbols are never valid. */
15083 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15084 && SYMBOL_REF_DLLIMPORT_P (x))
15085 return false;
15087 #if TARGET_MACHO
15088 /* mdynamic-no-pic */
15089 if (MACHO_DYNAMIC_NO_PIC_P)
15090 return machopic_symbol_defined_p (x);
15091 #endif
15093 /* External function address should be loaded
15094 via the GOT slot to avoid PLT. */
15095 if (ix86_force_load_from_GOT_p (x))
15096 return false;
15098 break;
15100 CASE_CONST_SCALAR_INT:
15101 switch (mode)
15103 case E_TImode:
15104 if (TARGET_64BIT)
15105 return true;
15106 /* FALLTHRU */
15107 case E_OImode:
15108 case E_XImode:
15109 if (!standard_sse_constant_p (x, mode))
15110 return false;
15111 default:
15112 break;
15114 break;
15116 case CONST_VECTOR:
15117 if (!standard_sse_constant_p (x, mode))
15118 return false;
15120 default:
15121 break;
15124 /* Otherwise we handle everything else in the move patterns. */
15125 return true;
15128 /* Determine if it's legal to put X into the constant pool. This
15129 is not possible for the address of thread-local symbols, which
15130 is checked above. */
15132 static bool
15133 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15135 /* We can put any immediate constant in memory. */
15136 switch (GET_CODE (x))
15138 CASE_CONST_ANY:
15139 return false;
15141 default:
15142 break;
15145 return !ix86_legitimate_constant_p (mode, x);
15148 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15149 otherwise zero. */
15151 static bool
15152 is_imported_p (rtx x)
15154 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15155 || GET_CODE (x) != SYMBOL_REF)
15156 return false;
15158 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15162 /* Nonzero if the constant value X is a legitimate general operand
15163 when generating PIC code. It is given that flag_pic is on and
15164 that X satisfies CONSTANT_P. */
15166 bool
15167 legitimate_pic_operand_p (rtx x)
15169 rtx inner;
15171 switch (GET_CODE (x))
15173 case CONST:
15174 inner = XEXP (x, 0);
15175 if (GET_CODE (inner) == PLUS
15176 && CONST_INT_P (XEXP (inner, 1)))
15177 inner = XEXP (inner, 0);
15179 /* Only some unspecs are valid as "constants". */
15180 if (GET_CODE (inner) == UNSPEC)
15181 switch (XINT (inner, 1))
15183 case UNSPEC_GOT:
15184 case UNSPEC_GOTOFF:
15185 case UNSPEC_PLTOFF:
15186 return TARGET_64BIT;
15187 case UNSPEC_TPOFF:
15188 x = XVECEXP (inner, 0, 0);
15189 return (GET_CODE (x) == SYMBOL_REF
15190 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15191 case UNSPEC_MACHOPIC_OFFSET:
15192 return legitimate_pic_address_disp_p (x);
15193 default:
15194 return false;
15196 /* FALLTHRU */
15198 case SYMBOL_REF:
15199 case LABEL_REF:
15200 return legitimate_pic_address_disp_p (x);
15202 default:
15203 return true;
15207 /* Determine if a given CONST RTX is a valid memory displacement
15208 in PIC mode. */
15210 bool
15211 legitimate_pic_address_disp_p (rtx disp)
15213 bool saw_plus;
15215 /* In 64bit mode we can allow direct addresses of symbols and labels
15216 when they are not dynamic symbols. */
15217 if (TARGET_64BIT)
15219 rtx op0 = disp, op1;
15221 switch (GET_CODE (disp))
15223 case LABEL_REF:
15224 return true;
15226 case CONST:
15227 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15228 break;
15229 op0 = XEXP (XEXP (disp, 0), 0);
15230 op1 = XEXP (XEXP (disp, 0), 1);
15231 if (!CONST_INT_P (op1))
15232 break;
15233 if (GET_CODE (op0) == UNSPEC
15234 && (XINT (op0, 1) == UNSPEC_DTPOFF
15235 || XINT (op0, 1) == UNSPEC_NTPOFF)
15236 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15237 return true;
15238 if (INTVAL (op1) >= 16*1024*1024
15239 || INTVAL (op1) < -16*1024*1024)
15240 break;
15241 if (GET_CODE (op0) == LABEL_REF)
15242 return true;
15243 if (GET_CODE (op0) == CONST
15244 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15245 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15246 return true;
15247 if (GET_CODE (op0) == UNSPEC
15248 && XINT (op0, 1) == UNSPEC_PCREL)
15249 return true;
15250 if (GET_CODE (op0) != SYMBOL_REF)
15251 break;
15252 /* FALLTHRU */
15254 case SYMBOL_REF:
15255 /* TLS references should always be enclosed in UNSPEC.
15256 A dllimported symbol always needs to be resolved. */
15257 if (SYMBOL_REF_TLS_MODEL (op0)
15258 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15259 return false;
15261 if (TARGET_PECOFF)
15263 if (is_imported_p (op0))
15264 return true;
15266 if (SYMBOL_REF_FAR_ADDR_P (op0)
15267 || !SYMBOL_REF_LOCAL_P (op0))
15268 break;
15270 /* Function symbols need to be resolved only for
15271 the large model.
15272 For the small model we don't need to resolve anything
15273 here. */
15274 if ((ix86_cmodel != CM_LARGE_PIC
15275 && SYMBOL_REF_FUNCTION_P (op0))
15276 || ix86_cmodel == CM_SMALL_PIC)
15277 return true;
15278 /* Non-external symbols don't need to be resolved for
15279 the large and medium models. */
15280 if ((ix86_cmodel == CM_LARGE_PIC
15281 || ix86_cmodel == CM_MEDIUM_PIC)
15282 && !SYMBOL_REF_EXTERNAL_P (op0))
15283 return true;
15285 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15286 && (SYMBOL_REF_LOCAL_P (op0)
15287 || (HAVE_LD_PIE_COPYRELOC
15288 && flag_pie
15289 && !SYMBOL_REF_WEAK (op0)
15290 && !SYMBOL_REF_FUNCTION_P (op0)))
15291 && ix86_cmodel != CM_LARGE_PIC)
15292 return true;
15293 break;
15295 default:
15296 break;
15299 if (GET_CODE (disp) != CONST)
15300 return false;
15301 disp = XEXP (disp, 0);
15303 if (TARGET_64BIT)
15305 /* It is unsafe to allow PLUS expressions here. This limits the allowed
15306 distance of GOT tables. We should not need these anyway. */
15307 if (GET_CODE (disp) != UNSPEC
15308 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15309 && XINT (disp, 1) != UNSPEC_GOTOFF
15310 && XINT (disp, 1) != UNSPEC_PCREL
15311 && XINT (disp, 1) != UNSPEC_PLTOFF))
15312 return false;
15314 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15315 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15316 return false;
15317 return true;
15320 saw_plus = false;
15321 if (GET_CODE (disp) == PLUS)
15323 if (!CONST_INT_P (XEXP (disp, 1)))
15324 return false;
15325 disp = XEXP (disp, 0);
15326 saw_plus = true;
15329 if (TARGET_MACHO && darwin_local_data_pic (disp))
15330 return true;
15332 if (GET_CODE (disp) != UNSPEC)
15333 return false;
15335 switch (XINT (disp, 1))
15337 case UNSPEC_GOT:
15338 if (saw_plus)
15339 return false;
15340 /* We need to check for both symbols and labels because VxWorks loads
15341 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15342 details. */
15343 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15344 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15345 case UNSPEC_GOTOFF:
15346 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15347 While the ABI also specifies a 32bit relocation, we don't produce it in
15348 the small PIC model at all. */
15349 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15350 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15351 && !TARGET_64BIT)
15352 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15353 return false;
15354 case UNSPEC_GOTTPOFF:
15355 case UNSPEC_GOTNTPOFF:
15356 case UNSPEC_INDNTPOFF:
15357 if (saw_plus)
15358 return false;
15359 disp = XVECEXP (disp, 0, 0);
15360 return (GET_CODE (disp) == SYMBOL_REF
15361 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15362 case UNSPEC_NTPOFF:
15363 disp = XVECEXP (disp, 0, 0);
15364 return (GET_CODE (disp) == SYMBOL_REF
15365 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15366 case UNSPEC_DTPOFF:
15367 disp = XVECEXP (disp, 0, 0);
15368 return (GET_CODE (disp) == SYMBOL_REF
15369 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15372 return false;
15375 /* Determine if op is a suitable RTX for an address register.
15376 Return naked register if a register or a register subreg is
15377 found, otherwise return NULL_RTX. */
15379 static rtx
15380 ix86_validate_address_register (rtx op)
15382 machine_mode mode = GET_MODE (op);
15384 /* Only SImode or DImode registers can form the address. */
15385 if (mode != SImode && mode != DImode)
15386 return NULL_RTX;
15388 if (REG_P (op))
15389 return op;
15390 else if (SUBREG_P (op))
15392 rtx reg = SUBREG_REG (op);
15394 if (!REG_P (reg))
15395 return NULL_RTX;
15397 mode = GET_MODE (reg);
15399 /* Don't allow SUBREGs that span more than a word. It can
15400 lead to spill failures when the register is one word out
15401 of a two word structure. */
15402 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15403 return NULL_RTX;
15405 /* Allow only SUBREGs of non-eliminable hard registers. */
15406 if (register_no_elim_operand (reg, mode))
15407 return reg;
15410 /* Op is not a register. */
15411 return NULL_RTX;
15414 /* Recognizes RTL expressions that are valid memory addresses for an
15415 instruction. The MODE argument is the machine mode for the MEM
15416 expression that wants to use this address.
15418 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15419 convert common non-canonical forms to canonical form so that they will
15420 be recognized. */
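/* For instance, the canonical form of 4(%ebx,%ecx,8), i.e.
     (plus (plus (mult (reg %ecx) (const_int 8)) (reg %ebx)) (const_int 4)),
   is expected to be accepted, while a scale factor other than 1, 2, 4
   or 8 (say (mult (reg) (const_int 3))) is rejected.  */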
15422 static bool
15423 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15425 struct ix86_address parts;
15426 rtx base, index, disp;
15427 HOST_WIDE_INT scale;
15428 addr_space_t seg;
15430 if (ix86_decompose_address (addr, &parts) <= 0)
15431 /* Decomposition failed. */
15432 return false;
15434 base = parts.base;
15435 index = parts.index;
15436 disp = parts.disp;
15437 scale = parts.scale;
15438 seg = parts.seg;
15440 /* Validate base register. */
15441 if (base)
15443 rtx reg = ix86_validate_address_register (base);
15445 if (reg == NULL_RTX)
15446 return false;
15448 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15449 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15450 /* Base is not valid. */
15451 return false;
15454 /* Validate index register. */
15455 if (index)
15457 rtx reg = ix86_validate_address_register (index);
15459 if (reg == NULL_RTX)
15460 return false;
15462 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15463 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15464 /* Index is not valid. */
15465 return false;
15468 /* Index and base should have the same mode. */
15469 if (base && index
15470 && GET_MODE (base) != GET_MODE (index))
15471 return false;
15473 /* Address override works only on the (%reg) part of %fs:(%reg). */
15474 if (seg != ADDR_SPACE_GENERIC
15475 && ((base && GET_MODE (base) != word_mode)
15476 || (index && GET_MODE (index) != word_mode)))
15477 return false;
15479 /* Validate scale factor. */
15480 if (scale != 1)
15482 if (!index)
15483 /* Scale without index. */
15484 return false;
15486 if (scale != 2 && scale != 4 && scale != 8)
15487 /* Scale is not a valid multiplier. */
15488 return false;
15491 /* Validate displacement. */
15492 if (disp)
15494 if (GET_CODE (disp) == CONST
15495 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15496 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15497 switch (XINT (XEXP (disp, 0), 1))
15499 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15500 when used. While the ABI also specifies 32bit relocations, we
15501 don't produce them at all and use IP-relative addressing instead.
15502 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15503 should be loaded via the GOT. */
15504 case UNSPEC_GOT:
15505 if (!TARGET_64BIT
15506 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15507 goto is_legitimate_pic;
15508 /* FALLTHRU */
15509 case UNSPEC_GOTOFF:
15510 gcc_assert (flag_pic);
15511 if (!TARGET_64BIT)
15512 goto is_legitimate_pic;
15514 /* 64bit address unspec. */
15515 return false;
15517 case UNSPEC_GOTPCREL:
15518 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15519 goto is_legitimate_pic;
15520 /* FALLTHRU */
15521 case UNSPEC_PCREL:
15522 gcc_assert (flag_pic);
15523 goto is_legitimate_pic;
15525 case UNSPEC_GOTTPOFF:
15526 case UNSPEC_GOTNTPOFF:
15527 case UNSPEC_INDNTPOFF:
15528 case UNSPEC_NTPOFF:
15529 case UNSPEC_DTPOFF:
15530 break;
15532 default:
15533 /* Invalid address unspec. */
15534 return false;
15537 else if (SYMBOLIC_CONST (disp)
15538 && (flag_pic
15539 || (TARGET_MACHO
15540 #if TARGET_MACHO
15541 && MACHOPIC_INDIRECT
15542 && !machopic_operand_p (disp)
15543 #endif
15547 is_legitimate_pic:
15548 if (TARGET_64BIT && (index || base))
15550 /* foo@dtpoff(%rX) is ok. */
15551 if (GET_CODE (disp) != CONST
15552 || GET_CODE (XEXP (disp, 0)) != PLUS
15553 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15554 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15555 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15556 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15557 /* Non-constant pic memory reference. */
15558 return false;
15560 else if ((!TARGET_MACHO || flag_pic)
15561 && ! legitimate_pic_address_disp_p (disp))
15562 /* Displacement is an invalid pic construct. */
15563 return false;
15564 #if TARGET_MACHO
15565 else if (MACHO_DYNAMIC_NO_PIC_P
15566 && !ix86_legitimate_constant_p (Pmode, disp))
15567 /* displacement must be referenced via non_lazy_pointer */
15568 return false;
15569 #endif
15571 /* This code used to verify that a symbolic pic displacement
15572 includes the pic_offset_table_rtx register.
15574 While this is a good idea, unfortunately these constructs may
15575 be created by the "adds using lea" optimization for incorrect
15576 code like:
15578 int a;
15579 int foo(int i)
15581 return *(&a+i);
15584 This code is nonsensical, but results in addressing the
15585 GOT table with a pic_offset_table_rtx base. We can't
15586 just refuse it easily, since it gets matched by the
15587 "addsi3" pattern, which later gets split to lea when the
15588 output register differs from the input. While this
15589 could be handled by a separate addsi pattern for this case
15590 that never results in lea, disabling this test seems to be
15591 the easier and correct fix for the crash. */
15593 else if (GET_CODE (disp) != LABEL_REF
15594 && !CONST_INT_P (disp)
15595 && (GET_CODE (disp) != CONST
15596 || !ix86_legitimate_constant_p (Pmode, disp))
15597 && (GET_CODE (disp) != SYMBOL_REF
15598 || !ix86_legitimate_constant_p (Pmode, disp)))
15599 /* Displacement is not constant. */
15600 return false;
15601 else if (TARGET_64BIT
15602 && !x86_64_immediate_operand (disp, VOIDmode))
15603 /* Displacement is out of range. */
15604 return false;
15605 /* In x32 mode, constant addresses are sign extended to 64bit, so
15606 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15607 else if (TARGET_X32 && !(index || base)
15608 && CONST_INT_P (disp)
15609 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15610 return false;
15613 /* Everything looks valid. */
15614 return true;
15617 /* Determine if a given RTX is a valid constant address. */
15619 bool
15620 constant_address_p (rtx x)
15622 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15625 /* Return a unique alias set for the GOT. */
15627 static alias_set_type
15628 ix86_GOT_alias_set (void)
15630 static alias_set_type set = -1;
15631 if (set == -1)
15632 set = new_alias_set ();
15633 return set;
15636 /* Return a legitimate reference for ORIG (an address) using the
15637 register REG. If REG is 0, a new pseudo is generated.
15639 There are two types of references that must be handled:
15641 1. Global data references must load the address from the GOT, via
15642 the PIC reg. An insn is emitted to do this load, and the reg is
15643 returned.
15645 2. Static data references, constant pool addresses, and code labels
15646 compute the address as an offset from the GOT, whose base is in
15647 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15648 differentiate them from global data objects. The returned
15649 address is the PIC reg + an unspec constant.
15651 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15652 reg also appears in the address. */
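/* As a rough sketch of the two cases on ia32 with -fPIC: a global
   variable x (case 1) is typically reached via
     movl x@GOT(%ebx), %reg
   followed by a dereference of %reg, while a file-local static y
   (case 2) is addressed directly as y@GOTOFF(%ebx).  On x86-64 the
   GOT load instead looks like movq x@GOTPCREL(%rip), %reg.  */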
15654 static rtx
15655 legitimize_pic_address (rtx orig, rtx reg)
15657 rtx addr = orig;
15658 rtx new_rtx = orig;
15660 #if TARGET_MACHO
15661 if (TARGET_MACHO && !TARGET_64BIT)
15663 if (reg == 0)
15664 reg = gen_reg_rtx (Pmode);
15665 /* Use the generic Mach-O PIC machinery. */
15666 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15668 #endif
15670 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15672 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15673 if (tmp)
15674 return tmp;
15677 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15678 new_rtx = addr;
15679 else if ((!TARGET_64BIT
15680 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15681 && !TARGET_PECOFF
15682 && gotoff_operand (addr, Pmode))
15684 /* This symbol may be referenced via a displacement
15685 from the PIC base address (@GOTOFF). */
15686 if (GET_CODE (addr) == CONST)
15687 addr = XEXP (addr, 0);
15689 if (GET_CODE (addr) == PLUS)
15691 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15692 UNSPEC_GOTOFF);
15693 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15695 else
15696 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15698 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15700 if (TARGET_64BIT)
15701 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15703 if (reg != 0)
15705 gcc_assert (REG_P (reg));
15706 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15707 new_rtx, reg, 1, OPTAB_DIRECT);
15709 else
15710 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15712 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15713 /* We can't use @GOTOFF for text labels
15714 on VxWorks, see gotoff_operand. */
15715 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15717 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15718 if (tmp)
15719 return tmp;
15721 /* For x64 PE-COFF there is no GOT table,
15722 so we use address directly. */
15723 if (TARGET_64BIT && TARGET_PECOFF)
15725 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15726 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15728 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15730 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15731 UNSPEC_GOTPCREL);
15732 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15733 new_rtx = gen_const_mem (Pmode, new_rtx);
15734 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15736 else
15738 /* This symbol must be referenced via a load
15739 from the Global Offset Table (@GOT). */
15740 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15741 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15742 if (TARGET_64BIT)
15743 new_rtx = force_reg (Pmode, new_rtx);
15744 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15745 new_rtx = gen_const_mem (Pmode, new_rtx);
15746 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15749 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15751 else
15753 if (CONST_INT_P (addr)
15754 && !x86_64_immediate_operand (addr, VOIDmode))
15755 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15756 else if (GET_CODE (addr) == CONST)
15758 addr = XEXP (addr, 0);
15760 /* We must match stuff we generate before. Assume the only
15761 unspecs that can get here are ours. Not that we could do
15762 anything with them anyway.... */
15763 if (GET_CODE (addr) == UNSPEC
15764 || (GET_CODE (addr) == PLUS
15765 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15766 return orig;
15767 gcc_assert (GET_CODE (addr) == PLUS);
15770 if (GET_CODE (addr) == PLUS)
15772 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15774 /* Check first to see if this is a constant
15775 offset from a @GOTOFF symbol reference. */
15776 if (!TARGET_PECOFF
15777 && gotoff_operand (op0, Pmode)
15778 && CONST_INT_P (op1))
15780 if (!TARGET_64BIT)
15782 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15783 UNSPEC_GOTOFF);
15784 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15785 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15787 if (reg != 0)
15789 gcc_assert (REG_P (reg));
15790 new_rtx = expand_simple_binop (Pmode, PLUS,
15791 pic_offset_table_rtx,
15792 new_rtx, reg, 1,
15793 OPTAB_DIRECT);
15795 else
15796 new_rtx
15797 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15799 else
15801 if (INTVAL (op1) < -16*1024*1024
15802 || INTVAL (op1) >= 16*1024*1024)
15804 if (!x86_64_immediate_operand (op1, Pmode))
15805 op1 = force_reg (Pmode, op1);
15807 new_rtx
15808 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15812 else
15814 rtx base = legitimize_pic_address (op0, reg);
15815 machine_mode mode = GET_MODE (base);
15816 new_rtx
15817 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15819 if (CONST_INT_P (new_rtx))
15821 if (INTVAL (new_rtx) < -16*1024*1024
15822 || INTVAL (new_rtx) >= 16*1024*1024)
15824 if (!x86_64_immediate_operand (new_rtx, mode))
15825 new_rtx = force_reg (mode, new_rtx);
15827 new_rtx
15828 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15830 else
15831 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15833 else
15835 /* For %rip addressing, we have to use
15836 just disp32, not base nor index. */
15837 if (TARGET_64BIT
15838 && (GET_CODE (base) == SYMBOL_REF
15839 || GET_CODE (base) == LABEL_REF))
15840 base = force_reg (mode, base);
15841 if (GET_CODE (new_rtx) == PLUS
15842 && CONSTANT_P (XEXP (new_rtx, 1)))
15844 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15845 new_rtx = XEXP (new_rtx, 1);
15847 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15852 return new_rtx;
15855 /* Load the thread pointer. If TO_REG is true, force it into a register. */
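/* On GNU/Linux targets the thread pointer is roughly the %fs segment base
   in 64-bit mode and the %gs base in 32-bit mode; it is represented here
   as (unspec [(const_int 0)] UNSPEC_TP) so that later passes can turn
   accesses relative to it into segment-prefixed memory references.  */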
15857 static rtx
15858 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15860 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15862 if (GET_MODE (tp) != tp_mode)
15864 gcc_assert (GET_MODE (tp) == SImode);
15865 gcc_assert (tp_mode == DImode);
15867 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15870 if (to_reg)
15871 tp = copy_to_mode_reg (tp_mode, tp);
15873 return tp;
15876 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15878 static GTY(()) rtx ix86_tls_symbol;
15880 static rtx
15881 ix86_tls_get_addr (void)
15883 if (!ix86_tls_symbol)
15885 const char *sym
15886 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15887 ? "___tls_get_addr" : "__tls_get_addr");
15889 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15892 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15894 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15895 UNSPEC_PLTOFF);
15896 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15897 gen_rtx_CONST (Pmode, unspec));
15900 return ix86_tls_symbol;
15903 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15905 static GTY(()) rtx ix86_tls_module_base_symbol;
15908 ix86_tls_module_base (void)
15910 if (!ix86_tls_module_base_symbol)
15912 ix86_tls_module_base_symbol
15913 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15915 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15916 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15919 return ix86_tls_module_base_symbol;
15922 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15923 false if we expect this to be used for a memory address and true if
15924 we expect to load the address into a register. */
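/* Roughly, with the traditional GNU TLS dialect on x86-64 the generated
   access sequences look like:
     global dynamic: leaq x@tlsgd(%rip), %rdi; call __tls_get_addr@PLT
       (with padding prefixes for linker relaxation)
     initial exec:   movq x@gottpoff(%rip), %rax; movl %fs:(%rax), %eax
     local exec:     movl %fs:x@tpoff, %eax
   On ia32 the thread pointer lives in %gs and the @gotntpoff/@ntpoff
   relocations are used instead.  The exact sequences vary with the
   options handled below.  */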
15926 static rtx
15927 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15929 rtx dest, base, off;
15930 rtx pic = NULL_RTX, tp = NULL_RTX;
15931 machine_mode tp_mode = Pmode;
15932 int type;
15934 /* Fall back to global dynamic model if tool chain cannot support local
15935 dynamic. */
15936 if (TARGET_SUN_TLS && !TARGET_64BIT
15937 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15938 && model == TLS_MODEL_LOCAL_DYNAMIC)
15939 model = TLS_MODEL_GLOBAL_DYNAMIC;
15941 switch (model)
15943 case TLS_MODEL_GLOBAL_DYNAMIC:
15944 dest = gen_reg_rtx (Pmode);
15946 if (!TARGET_64BIT)
15948 if (flag_pic && !TARGET_PECOFF)
15949 pic = pic_offset_table_rtx;
15950 else
15952 pic = gen_reg_rtx (Pmode);
15953 emit_insn (gen_set_got (pic));
15957 if (TARGET_GNU2_TLS)
15959 if (TARGET_64BIT)
15960 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15961 else
15962 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15964 tp = get_thread_pointer (Pmode, true);
15965 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15967 if (GET_MODE (x) != Pmode)
15968 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15970 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15972 else
15974 rtx caddr = ix86_tls_get_addr ();
15976 if (TARGET_64BIT)
15978 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15979 rtx_insn *insns;
15981 start_sequence ();
15982 emit_call_insn
15983 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15984 insns = get_insns ();
15985 end_sequence ();
15987 if (GET_MODE (x) != Pmode)
15988 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15990 RTL_CONST_CALL_P (insns) = 1;
15991 emit_libcall_block (insns, dest, rax, x);
15993 else
15994 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
15996 break;
15998 case TLS_MODEL_LOCAL_DYNAMIC:
15999 base = gen_reg_rtx (Pmode);
16001 if (!TARGET_64BIT)
16003 if (flag_pic)
16004 pic = pic_offset_table_rtx;
16005 else
16007 pic = gen_reg_rtx (Pmode);
16008 emit_insn (gen_set_got (pic));
16012 if (TARGET_GNU2_TLS)
16014 rtx tmp = ix86_tls_module_base ();
16016 if (TARGET_64BIT)
16017 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16018 else
16019 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16021 tp = get_thread_pointer (Pmode, true);
16022 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16023 gen_rtx_MINUS (Pmode, tmp, tp));
16025 else
16027 rtx caddr = ix86_tls_get_addr ();
16029 if (TARGET_64BIT)
16031 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16032 rtx_insn *insns;
16033 rtx eqv;
16035 start_sequence ();
16036 emit_call_insn
16037 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16038 insns = get_insns ();
16039 end_sequence ();
16041 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16042 share the LD_BASE result with other LD model accesses. */
16043 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16044 UNSPEC_TLS_LD_BASE);
16046 RTL_CONST_CALL_P (insns) = 1;
16047 emit_libcall_block (insns, base, rax, eqv);
16049 else
16050 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16053 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16054 off = gen_rtx_CONST (Pmode, off);
16056 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16058 if (TARGET_GNU2_TLS)
16060 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16062 if (GET_MODE (x) != Pmode)
16063 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16065 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16067 break;
16069 case TLS_MODEL_INITIAL_EXEC:
16070 if (TARGET_64BIT)
16072 if (TARGET_SUN_TLS && !TARGET_X32)
16074 /* The Sun linker took the AMD64 TLS spec literally
16075 and can only handle %rax as destination of the
16076 initial executable code sequence. */
16078 dest = gen_reg_rtx (DImode);
16079 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16080 return dest;
16083 /* Generate DImode references to avoid %fs:(%reg32)
16084 problems and linker IE->LE relaxation bug. */
16085 tp_mode = DImode;
16086 pic = NULL;
16087 type = UNSPEC_GOTNTPOFF;
16089 else if (flag_pic)
16091 pic = pic_offset_table_rtx;
16092 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16094 else if (!TARGET_ANY_GNU_TLS)
16096 pic = gen_reg_rtx (Pmode);
16097 emit_insn (gen_set_got (pic));
16098 type = UNSPEC_GOTTPOFF;
16100 else
16102 pic = NULL;
16103 type = UNSPEC_INDNTPOFF;
16106 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16107 off = gen_rtx_CONST (tp_mode, off);
16108 if (pic)
16109 off = gen_rtx_PLUS (tp_mode, pic, off);
16110 off = gen_const_mem (tp_mode, off);
16111 set_mem_alias_set (off, ix86_GOT_alias_set ());
16113 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16115 base = get_thread_pointer (tp_mode,
16116 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16117 off = force_reg (tp_mode, off);
16118 dest = gen_rtx_PLUS (tp_mode, base, off);
16119 if (tp_mode != Pmode)
16120 dest = convert_to_mode (Pmode, dest, 1);
16122 else
16124 base = get_thread_pointer (Pmode, true);
16125 dest = gen_reg_rtx (Pmode);
16126 emit_insn (ix86_gen_sub3 (dest, base, off));
16128 break;
16130 case TLS_MODEL_LOCAL_EXEC:
16131 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16132 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16133 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16134 off = gen_rtx_CONST (Pmode, off);
16136 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16138 base = get_thread_pointer (Pmode,
16139 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16140 return gen_rtx_PLUS (Pmode, base, off);
16142 else
16144 base = get_thread_pointer (Pmode, true);
16145 dest = gen_reg_rtx (Pmode);
16146 emit_insn (ix86_gen_sub3 (dest, base, off));
16148 break;
16150 default:
16151 gcc_unreachable ();
16154 return dest;
16157 /* Return true if OP refers to a TLS address. */
16158 bool
16159 ix86_tls_address_pattern_p (rtx op)
16161 subrtx_var_iterator::array_type array;
16162 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16164 rtx op = *iter;
16165 if (MEM_P (op))
16167 rtx *x = &XEXP (op, 0);
16168 while (GET_CODE (*x) == PLUS)
16170 int i;
16171 for (i = 0; i < 2; i++)
16173 rtx u = XEXP (*x, i);
16174 if (GET_CODE (u) == ZERO_EXTEND)
16175 u = XEXP (u, 0);
16176 if (GET_CODE (u) == UNSPEC
16177 && XINT (u, 1) == UNSPEC_TP)
16178 return true;
16180 x = &XEXP (*x, 0);
16183 iter.skip_subrtxes ();
16187 return false;
16190 /* Rewrite *LOC so that it refers to a default TLS address space. */
16191 void
16192 ix86_rewrite_tls_address_1 (rtx *loc)
16194 subrtx_ptr_iterator::array_type array;
16195 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16197 rtx *loc = *iter;
16198 if (MEM_P (*loc))
16200 rtx addr = XEXP (*loc, 0);
16201 rtx *x = &addr;
16202 while (GET_CODE (*x) == PLUS)
16204 int i;
16205 for (i = 0; i < 2; i++)
16207 rtx u = XEXP (*x, i);
16208 if (GET_CODE (u) == ZERO_EXTEND)
16209 u = XEXP (u, 0);
16210 if (GET_CODE (u) == UNSPEC
16211 && XINT (u, 1) == UNSPEC_TP)
16213 addr_space_t as = DEFAULT_TLS_SEG_REG;
16215 *x = XEXP (*x, 1 - i);
16217 *loc = replace_equiv_address_nv (*loc, addr, true);
16218 set_mem_addr_space (*loc, as);
16219 return;
16222 x = &XEXP (*x, 0);
16225 iter.skip_subrtxes ();
16230 /* Rewrite an instruction pattern involving a TLS address
16231 so that it refers to a default TLS address space. */
16233 ix86_rewrite_tls_address (rtx pattern)
16235 pattern = copy_insn (pattern);
16236 ix86_rewrite_tls_address_1 (&pattern);
16237 return pattern;
16240 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16241 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16242 unique refptr-DECL symbol corresponding to symbol DECL. */
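/* For example, for __declspec(dllimport) int x, 32-bit mingw targets
   (where user_label_prefix is "_") are expected to reference the import
   pointer as __imp__x, while x86-64 (empty prefix) uses __imp_x; the
   refptr case produces a stub named along the lines of .refptr.x.  */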
16244 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16246 static inline hashval_t hash (tree_map *m) { return m->hash; }
16247 static inline bool
16248 equal (tree_map *a, tree_map *b)
16250 return a->base.from == b->base.from;
16253 static int
16254 keep_cache_entry (tree_map *&m)
16256 return ggc_marked_p (m->base.from);
16260 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16262 static tree
16263 get_dllimport_decl (tree decl, bool beimport)
16265 struct tree_map *h, in;
16266 const char *name;
16267 const char *prefix;
16268 size_t namelen, prefixlen;
16269 char *imp_name;
16270 tree to;
16271 rtx rtl;
16273 if (!dllimport_map)
16274 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16276 in.hash = htab_hash_pointer (decl);
16277 in.base.from = decl;
16278 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16279 h = *loc;
16280 if (h)
16281 return h->to;
16283 *loc = h = ggc_alloc<tree_map> ();
16284 h->hash = in.hash;
16285 h->base.from = decl;
16286 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16287 VAR_DECL, NULL, ptr_type_node);
16288 DECL_ARTIFICIAL (to) = 1;
16289 DECL_IGNORED_P (to) = 1;
16290 DECL_EXTERNAL (to) = 1;
16291 TREE_READONLY (to) = 1;
16293 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16294 name = targetm.strip_name_encoding (name);
16295 if (beimport)
16296 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16297 ? "*__imp_" : "*__imp__";
16298 else
16299 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16300 namelen = strlen (name);
16301 prefixlen = strlen (prefix);
16302 imp_name = (char *) alloca (namelen + prefixlen + 1);
16303 memcpy (imp_name, prefix, prefixlen);
16304 memcpy (imp_name + prefixlen, name, namelen + 1);
16306 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16307 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16308 SET_SYMBOL_REF_DECL (rtl, to);
16309 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16310 if (!beimport)
16312 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16313 #ifdef SUB_TARGET_RECORD_STUB
16314 SUB_TARGET_RECORD_STUB (name);
16315 #endif
16318 rtl = gen_const_mem (Pmode, rtl);
16319 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16321 SET_DECL_RTL (to, rtl);
16322 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16324 return to;
16327 /* Expand SYMBOL into its corresponding far-address symbol.
16328 WANT_REG is true if we require the result be a register. */
16330 static rtx
16331 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16333 tree imp_decl;
16334 rtx x;
16336 gcc_assert (SYMBOL_REF_DECL (symbol));
16337 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16339 x = DECL_RTL (imp_decl);
16340 if (want_reg)
16341 x = force_reg (Pmode, x);
16342 return x;
16345 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16346 true if we require the result be a register. */
16348 static rtx
16349 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16351 tree imp_decl;
16352 rtx x;
16354 gcc_assert (SYMBOL_REF_DECL (symbol));
16355 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16357 x = DECL_RTL (imp_decl);
16358 if (want_reg)
16359 x = force_reg (Pmode, x);
16360 return x;
16363 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16364 is true if we require the result be a register. */
16366 static rtx
16367 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16369 if (!TARGET_PECOFF)
16370 return NULL_RTX;
16372 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16374 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16375 return legitimize_dllimport_symbol (addr, inreg);
16376 if (GET_CODE (addr) == CONST
16377 && GET_CODE (XEXP (addr, 0)) == PLUS
16378 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16379 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16381 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16382 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16386 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16387 return NULL_RTX;
16388 if (GET_CODE (addr) == SYMBOL_REF
16389 && !is_imported_p (addr)
16390 && SYMBOL_REF_EXTERNAL_P (addr)
16391 && SYMBOL_REF_DECL (addr))
16392 return legitimize_pe_coff_extern_decl (addr, inreg);
16394 if (GET_CODE (addr) == CONST
16395 && GET_CODE (XEXP (addr, 0)) == PLUS
16396 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16397 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16398 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16399 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16401 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16402 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16404 return NULL_RTX;
16407 /* Try machine-dependent ways of modifying an illegitimate address
16408 to be legitimate. If we find one, return the new, valid address.
16409 This macro is used in only one place: `memory_address' in explow.c.
16411 OLDX is the address as it was before break_out_memory_refs was called.
16412 In some cases it is useful to look at this to decide what needs to be done.
16414 It is always safe for this macro to do nothing. It exists to recognize
16415 opportunities to optimize the output.
16417 For the 80386, we handle X+REG by loading X into a register R and
16418 using R+REG. R will go in a general reg and indexing will be used.
16419 However, if REG is a broken-out memory address or multiplication,
16420 nothing needs to be done because REG can certainly go in a general reg.
16422 When -fpic is used, special handling is needed for symbolic references.
16423 See comments by legitimize_pic_address in i386.c for details. */
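/* For example, the non-canonical address
     (plus (ashift (reg) (const_int 2)) (reg))
   is rewritten below into
     (plus (mult (reg) (const_int 4)) (reg)),
   which matches the scaled-index form (%reg1,%reg2,4).  */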
16425 static rtx
16426 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16428 bool changed = false;
16429 unsigned log;
16431 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16432 if (log)
16433 return legitimize_tls_address (x, (enum tls_model) log, false);
16434 if (GET_CODE (x) == CONST
16435 && GET_CODE (XEXP (x, 0)) == PLUS
16436 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16437 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16439 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16440 (enum tls_model) log, false);
16441 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16444 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16446 rtx tmp = legitimize_pe_coff_symbol (x, true);
16447 if (tmp)
16448 return tmp;
16451 if (flag_pic && SYMBOLIC_CONST (x))
16452 return legitimize_pic_address (x, 0);
16454 #if TARGET_MACHO
16455 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16456 return machopic_indirect_data_reference (x, 0);
16457 #endif
16459 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16460 if (GET_CODE (x) == ASHIFT
16461 && CONST_INT_P (XEXP (x, 1))
16462 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16464 changed = true;
16465 log = INTVAL (XEXP (x, 1));
16466 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16467 GEN_INT (1 << log));
16470 if (GET_CODE (x) == PLUS)
16472 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16474 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16475 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16476 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16478 changed = true;
16479 log = INTVAL (XEXP (XEXP (x, 0), 1));
16480 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16481 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16482 GEN_INT (1 << log));
16485 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16486 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16487 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16489 changed = true;
16490 log = INTVAL (XEXP (XEXP (x, 1), 1));
16491 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16492 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16493 GEN_INT (1 << log));
16496 /* Put multiply first if it isn't already. */
16497 if (GET_CODE (XEXP (x, 1)) == MULT)
16499 std::swap (XEXP (x, 0), XEXP (x, 1));
16500 changed = true;
16503 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16504 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16505 created by virtual register instantiation, register elimination, and
16506 similar optimizations. */
16507 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16509 changed = true;
16510 x = gen_rtx_PLUS (Pmode,
16511 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16512 XEXP (XEXP (x, 1), 0)),
16513 XEXP (XEXP (x, 1), 1));
16516 /* Canonicalize
16517 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16518 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16519 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16520 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16521 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16522 && CONSTANT_P (XEXP (x, 1)))
16524 rtx constant;
16525 rtx other = NULL_RTX;
16527 if (CONST_INT_P (XEXP (x, 1)))
16529 constant = XEXP (x, 1);
16530 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16532 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16534 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16535 other = XEXP (x, 1);
16537 else
16538 constant = 0;
16540 if (constant)
16542 changed = true;
16543 x = gen_rtx_PLUS (Pmode,
16544 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16545 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16546 plus_constant (Pmode, other,
16547 INTVAL (constant)));
16551 if (changed && ix86_legitimate_address_p (mode, x, false))
16552 return x;
16554 if (GET_CODE (XEXP (x, 0)) == MULT)
16556 changed = true;
16557 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16560 if (GET_CODE (XEXP (x, 1)) == MULT)
16562 changed = true;
16563 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16566 if (changed
16567 && REG_P (XEXP (x, 1))
16568 && REG_P (XEXP (x, 0)))
16569 return x;
16571 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16573 changed = true;
16574 x = legitimize_pic_address (x, 0);
16577 if (changed && ix86_legitimate_address_p (mode, x, false))
16578 return x;
16580 if (REG_P (XEXP (x, 0)))
16582 rtx temp = gen_reg_rtx (Pmode);
16583 rtx val = force_operand (XEXP (x, 1), temp);
16584 if (val != temp)
16586 val = convert_to_mode (Pmode, val, 1);
16587 emit_move_insn (temp, val);
16590 XEXP (x, 1) = temp;
16591 return x;
16594 else if (REG_P (XEXP (x, 1)))
16596 rtx temp = gen_reg_rtx (Pmode);
16597 rtx val = force_operand (XEXP (x, 0), temp);
16598 if (val != temp)
16600 val = convert_to_mode (Pmode, val, 1);
16601 emit_move_insn (temp, val);
16604 XEXP (x, 0) = temp;
16605 return x;
16609 return x;
16612 /* Print an integer constant expression in assembler syntax. Addition
16613 and subtraction are the only arithmetic that may appear in these
16614 expressions. FILE is the stdio stream to write to, X is the rtx, and
16615 CODE is the operand print code from the output string. */
16617 static void
16618 output_pic_addr_const (FILE *file, rtx x, int code)
16620 char buf[256];
16622 switch (GET_CODE (x))
16624 case PC:
16625 gcc_assert (flag_pic);
16626 putc ('.', file);
16627 break;
16629 case SYMBOL_REF:
16630 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16631 output_addr_const (file, x);
16632 else
16634 const char *name = XSTR (x, 0);
16636 /* Mark the decl as referenced so that cgraph will
16637 output the function. */
16638 if (SYMBOL_REF_DECL (x))
16639 mark_decl_referenced (SYMBOL_REF_DECL (x));
16641 #if TARGET_MACHO
16642 if (MACHOPIC_INDIRECT
16643 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16644 name = machopic_indirection_name (x, /*stub_p=*/true);
16645 #endif
16646 assemble_name (file, name);
16648 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16649 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16650 fputs ("@PLT", file);
16651 break;
16653 case LABEL_REF:
16654 x = XEXP (x, 0);
16655 /* FALLTHRU */
16656 case CODE_LABEL:
16657 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16658 assemble_name (asm_out_file, buf);
16659 break;
16661 case CONST_INT:
16662 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16663 break;
16665 case CONST:
16666 /* This used to output parentheses around the expression,
16667 but that does not work on the 386 (either ATT or BSD assembler). */
16668 output_pic_addr_const (file, XEXP (x, 0), code);
16669 break;
16671 case CONST_DOUBLE:
16672 /* We can't handle floating point constants;
16673 TARGET_PRINT_OPERAND must handle them. */
16674 output_operand_lossage ("floating constant misused");
16675 break;
16677 case PLUS:
16678 /* Some assemblers need integer constants to appear first. */
16679 if (CONST_INT_P (XEXP (x, 0)))
16681 output_pic_addr_const (file, XEXP (x, 0), code);
16682 putc ('+', file);
16683 output_pic_addr_const (file, XEXP (x, 1), code);
16685 else
16687 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16688 output_pic_addr_const (file, XEXP (x, 1), code);
16689 putc ('+', file);
16690 output_pic_addr_const (file, XEXP (x, 0), code);
16692 break;
16694 case MINUS:
16695 if (!TARGET_MACHO)
16696 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16697 output_pic_addr_const (file, XEXP (x, 0), code);
16698 putc ('-', file);
16699 output_pic_addr_const (file, XEXP (x, 1), code);
16700 if (!TARGET_MACHO)
16701 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16702 break;
16704 case UNSPEC:
16705 gcc_assert (XVECLEN (x, 0) == 1);
16706 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16707 switch (XINT (x, 1))
16709 case UNSPEC_GOT:
16710 fputs ("@GOT", file);
16711 break;
16712 case UNSPEC_GOTOFF:
16713 fputs ("@GOTOFF", file);
16714 break;
16715 case UNSPEC_PLTOFF:
16716 fputs ("@PLTOFF", file);
16717 break;
16718 case UNSPEC_PCREL:
16719 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16720 "(%rip)" : "[rip]", file);
16721 break;
16722 case UNSPEC_GOTPCREL:
16723 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16724 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16725 break;
16726 case UNSPEC_GOTTPOFF:
16727 /* FIXME: This might be @TPOFF in Sun ld too. */
16728 fputs ("@gottpoff", file);
16729 break;
16730 case UNSPEC_TPOFF:
16731 fputs ("@tpoff", file);
16732 break;
16733 case UNSPEC_NTPOFF:
16734 if (TARGET_64BIT)
16735 fputs ("@tpoff", file);
16736 else
16737 fputs ("@ntpoff", file);
16738 break;
16739 case UNSPEC_DTPOFF:
16740 fputs ("@dtpoff", file);
16741 break;
16742 case UNSPEC_GOTNTPOFF:
16743 if (TARGET_64BIT)
16744 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16745 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16746 else
16747 fputs ("@gotntpoff", file);
16748 break;
16749 case UNSPEC_INDNTPOFF:
16750 fputs ("@indntpoff", file);
16751 break;
16752 #if TARGET_MACHO
16753 case UNSPEC_MACHOPIC_OFFSET:
16754 putc ('-', file);
16755 machopic_output_function_base_name (file);
16756 break;
16757 #endif
16758 default:
16759 output_operand_lossage ("invalid UNSPEC as operand");
16760 break;
16762 break;
16764 default:
16765 output_operand_lossage ("invalid expression as operand");
16769 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16770 We need to emit DTP-relative relocations. */
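/* For a 4-byte entry this emits roughly ".long x@dtpoff"; for an 8-byte
   entry it appends ", 0" so the upper half of the value is zero.  */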
16772 static void ATTRIBUTE_UNUSED
16773 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16775 fputs (ASM_LONG, file);
16776 output_addr_const (file, x);
16777 fputs ("@dtpoff", file);
16778 switch (size)
16780 case 4:
16781 break;
16782 case 8:
16783 fputs (", 0", file);
16784 break;
16785 default:
16786 gcc_unreachable ();
16790 /* Return true if X is a representation of the PIC register. This copes
16791 with calls from ix86_find_base_term, where the register might have
16792 been replaced by a cselib value. */
16794 static bool
16795 ix86_pic_register_p (rtx x)
16797 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16798 return (pic_offset_table_rtx
16799 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16800 else if (!REG_P (x))
16801 return false;
16802 else if (pic_offset_table_rtx)
16804 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16805 return true;
16806 if (HARD_REGISTER_P (x)
16807 && !HARD_REGISTER_P (pic_offset_table_rtx)
16808 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16809 return true;
16810 return false;
16812 else
16813 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16816 /* Helper function for ix86_delegitimize_address.
16817 Attempt to delegitimize TLS local-exec accesses. */
16819 static rtx
16820 ix86_delegitimize_tls_address (rtx orig_x)
16822 rtx x = orig_x, unspec;
16823 struct ix86_address addr;
16825 if (!TARGET_TLS_DIRECT_SEG_REFS)
16826 return orig_x;
16827 if (MEM_P (x))
16828 x = XEXP (x, 0);
16829 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16830 return orig_x;
16831 if (ix86_decompose_address (x, &addr) == 0
16832 || addr.seg != DEFAULT_TLS_SEG_REG
16833 || addr.disp == NULL_RTX
16834 || GET_CODE (addr.disp) != CONST)
16835 return orig_x;
16836 unspec = XEXP (addr.disp, 0);
16837 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16838 unspec = XEXP (unspec, 0);
16839 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16840 return orig_x;
16841 x = XVECEXP (unspec, 0, 0);
16842 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16843 if (unspec != XEXP (addr.disp, 0))
16844 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16845 if (addr.index)
16847 rtx idx = addr.index;
16848 if (addr.scale != 1)
16849 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16850 x = gen_rtx_PLUS (Pmode, idx, x);
16852 if (addr.base)
16853 x = gen_rtx_PLUS (Pmode, addr.base, x);
16854 if (MEM_P (orig_x))
16855 x = replace_equiv_address_nv (orig_x, x);
16856 return x;
16859 /* In the name of slightly smaller debug output, and to cater to
16860 general assembler lossage, recognize PIC+GOTOFF and turn it back
16861 into a direct symbol reference.
16863 On Darwin, this is necessary to avoid a crash, because Darwin
16864 has a different PIC label for each routine but the DWARF debugging
16865 information is not associated with any particular routine, so it's
16866 necessary to remove references to the PIC label from RTL stored by
16867 the DWARF output code.
16869 This helper is used in the normal ix86_delegitimize_address
16870 entrypoint (e.g. used in the target delegitimization hook) and
16871 in ix86_find_base_term. As a compile-time memory optimization, we
16872 avoid allocating rtxes that will not change the outcome
16873 of the callers (find_base_value and find_base_term). */
16875 static inline rtx
16876 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16878 rtx orig_x = delegitimize_mem_from_attrs (x);
16879 /* addend is NULL or some rtx if x is something+GOTOFF where
16880 something doesn't include the PIC register. */
16881 rtx addend = NULL_RTX;
16882 /* reg_addend is NULL or a multiple of some register. */
16883 rtx reg_addend = NULL_RTX;
16884 /* const_addend is NULL or a const_int. */
16885 rtx const_addend = NULL_RTX;
16886 /* This is the result, or NULL. */
16887 rtx result = NULL_RTX;
16889 x = orig_x;
16891 if (MEM_P (x))
16892 x = XEXP (x, 0);
16894 if (TARGET_64BIT)
16896 if (GET_CODE (x) == CONST
16897 && GET_CODE (XEXP (x, 0)) == PLUS
16898 && GET_MODE (XEXP (x, 0)) == Pmode
16899 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16900 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16901 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16903 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16904 base. A CONST can't be arg_pointer_rtx based. */
16905 if (base_term_p && MEM_P (orig_x))
16906 return orig_x;
16907 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16908 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16909 if (MEM_P (orig_x))
16910 x = replace_equiv_address_nv (orig_x, x);
16911 return x;
16914 if (GET_CODE (x) == CONST
16915 && GET_CODE (XEXP (x, 0)) == UNSPEC
16916 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16917 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16918 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16920 x = XVECEXP (XEXP (x, 0), 0, 0);
16921 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16923 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16924 if (x == NULL_RTX)
16925 return orig_x;
16927 return x;
16930 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16931 return ix86_delegitimize_tls_address (orig_x);
16933 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16934 and -mcmodel=medium -fpic. */
16937 if (GET_CODE (x) != PLUS
16938 || GET_CODE (XEXP (x, 1)) != CONST)
16939 return ix86_delegitimize_tls_address (orig_x);
16941 if (ix86_pic_register_p (XEXP (x, 0)))
16942 /* %ebx + GOT/GOTOFF */
16944 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16946 /* %ebx + %reg * scale + GOT/GOTOFF */
16947 reg_addend = XEXP (x, 0);
16948 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16949 reg_addend = XEXP (reg_addend, 1);
16950 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16951 reg_addend = XEXP (reg_addend, 0);
16952 else
16954 reg_addend = NULL_RTX;
16955 addend = XEXP (x, 0);
16958 else
16959 addend = XEXP (x, 0);
16961 x = XEXP (XEXP (x, 1), 0);
16962 if (GET_CODE (x) == PLUS
16963 && CONST_INT_P (XEXP (x, 1)))
16965 const_addend = XEXP (x, 1);
16966 x = XEXP (x, 0);
16969 if (GET_CODE (x) == UNSPEC
16970 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16971 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16972 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16973 && !MEM_P (orig_x) && !addend)))
16974 result = XVECEXP (x, 0, 0);
16976 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16977 && !MEM_P (orig_x))
16978 result = XVECEXP (x, 0, 0);
16980 if (! result)
16981 return ix86_delegitimize_tls_address (orig_x);
16983 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16984 recurse on the first operand. */
16985 if (const_addend && !base_term_p)
16986 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16987 if (reg_addend)
16988 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16989 if (addend)
16991 /* If the rest of original X doesn't involve the PIC register, add
16992 addend and subtract pic_offset_table_rtx. This can happen e.g.
16993 for code like:
16994 leal (%ebx, %ecx, 4), %ecx
16996 movl foo@GOTOFF(%ecx), %edx
16997 in which case we return (%ecx - %ebx) + foo
16998 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
16999 and reload has completed. Don't do the latter for debug,
17000 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17001 if (pic_offset_table_rtx
17002 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17003 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17004 pic_offset_table_rtx),
17005 result);
17006 else if (base_term_p
17007 && pic_offset_table_rtx
17008 && !TARGET_MACHO
17009 && !TARGET_VXWORKS_RTP)
17011 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17012 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17013 result = gen_rtx_PLUS (Pmode, tmp, result);
17015 else
17016 return orig_x;
17018 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17020 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17021 if (result == NULL_RTX)
17022 return orig_x;
17024 return result;
17027 /* The normal instantiation of the above template. */
17029 static rtx
17030 ix86_delegitimize_address (rtx x)
17032 return ix86_delegitimize_address_1 (x, false);
17035 /* If X is a machine specific address (i.e. a symbol or label being
17036 referenced as a displacement from the GOT implemented using an
17037 UNSPEC), then return the base term. Otherwise return X. */
17040 ix86_find_base_term (rtx x)
17042 rtx term;
17044 if (TARGET_64BIT)
17046 if (GET_CODE (x) != CONST)
17047 return x;
17048 term = XEXP (x, 0);
17049 if (GET_CODE (term) == PLUS
17050 && CONST_INT_P (XEXP (term, 1)))
17051 term = XEXP (term, 0);
17052 if (GET_CODE (term) != UNSPEC
17053 || (XINT (term, 1) != UNSPEC_GOTPCREL
17054 && XINT (term, 1) != UNSPEC_PCREL))
17055 return x;
17057 return XVECEXP (term, 0, 0);
17060 return ix86_delegitimize_address_1 (x, true);
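/* For illustration (hypothetical symbol): on x86-64 a GOT-relative address of
   the form (const (unspec [(symbol_ref "foo")] UNSPEC_GOTPCREL)), possibly
   wrapped in (plus ... (const_int N)), yields the base term
   (symbol_ref "foo") via the XVECEXP above; anything else falls through to
   ix86_delegitimize_address_1.  */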
17063 /* Return true if X shouldn't be emitted into the debug info.
17064 Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
17065 symbol easily into the .debug_info section, so we do not
17066 delegitimize it, but instead assemble it as @gotoff.
17067 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17068 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17070 static bool
17071 ix86_const_not_ok_for_debug_p (rtx x)
17073 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17074 return true;
17076 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17077 return true;
17079 return false;
17082 static void
17083 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17084 bool fp, FILE *file)
17086 const char *suffix;
17088 if (mode == CCFPmode)
17090 code = ix86_fp_compare_code_to_integer (code);
17091 mode = CCmode;
17093 if (reverse)
17094 code = reverse_condition (code);
17096 switch (code)
17098 case EQ:
17099 gcc_assert (mode != CCGZmode);
17100 switch (mode)
17102 case E_CCAmode:
17103 suffix = "a";
17104 break;
17105 case E_CCCmode:
17106 suffix = "c";
17107 break;
17108 case E_CCOmode:
17109 suffix = "o";
17110 break;
17111 case E_CCPmode:
17112 suffix = "p";
17113 break;
17114 case E_CCSmode:
17115 suffix = "s";
17116 break;
17117 default:
17118 suffix = "e";
17119 break;
17121 break;
17122 case NE:
17123 gcc_assert (mode != CCGZmode);
17124 switch (mode)
17126 case E_CCAmode:
17127 suffix = "na";
17128 break;
17129 case E_CCCmode:
17130 suffix = "nc";
17131 break;
17132 case E_CCOmode:
17133 suffix = "no";
17134 break;
17135 case E_CCPmode:
17136 suffix = "np";
17137 break;
17138 case E_CCSmode:
17139 suffix = "ns";
17140 break;
17141 default:
17142 suffix = "ne";
17143 break;
17145 break;
17146 case GT:
17147 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17148 suffix = "g";
17149 break;
17150 case GTU:
17151 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17152 Those same assemblers have the same but opposite lossage on cmov. */
17153 if (mode == CCmode)
17154 suffix = fp ? "nbe" : "a";
17155 else
17156 gcc_unreachable ();
17157 break;
17158 case LT:
17159 switch (mode)
17161 case E_CCNOmode:
17162 case E_CCGOCmode:
17163 suffix = "s";
17164 break;
17166 case E_CCmode:
17167 case E_CCGCmode:
17168 case E_CCGZmode:
17169 suffix = "l";
17170 break;
17172 default:
17173 gcc_unreachable ();
17175 break;
17176 case LTU:
17177 if (mode == CCmode || mode == CCGZmode)
17178 suffix = "b";
17179 else if (mode == CCCmode)
17180 suffix = fp ? "b" : "c";
17181 else
17182 gcc_unreachable ();
17183 break;
17184 case GE:
17185 switch (mode)
17187 case E_CCNOmode:
17188 case E_CCGOCmode:
17189 suffix = "ns";
17190 break;
17192 case E_CCmode:
17193 case E_CCGCmode:
17194 case E_CCGZmode:
17195 suffix = "ge";
17196 break;
17198 default:
17199 gcc_unreachable ();
17201 break;
17202 case GEU:
17203 if (mode == CCmode || mode == CCGZmode)
17204 suffix = "nb";
17205 else if (mode == CCCmode)
17206 suffix = fp ? "nb" : "nc";
17207 else
17208 gcc_unreachable ();
17209 break;
17210 case LE:
17211 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17212 suffix = "le";
17213 break;
17214 case LEU:
17215 if (mode == CCmode)
17216 suffix = "be";
17217 else
17218 gcc_unreachable ();
17219 break;
17220 case UNORDERED:
17221 suffix = fp ? "u" : "p";
17222 break;
17223 case ORDERED:
17224 suffix = fp ? "nu" : "np";
17225 break;
17226 default:
17227 gcc_unreachable ();
17229 fputs (suffix, file);
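/* A few concrete cases of the mapping above, with REVERSE false
   (illustrative):
     (EQ,  CCZmode,   integer)  -> "e"    e.g. sete / je / cmove
     (LTU, CCmode,    integer)  -> "b"
     (GTU, CCmode,    fp cmov)  -> "nbe"
     (GE,  CCGOCmode, integer)  -> "ns"
   The FP flag only changes the handful of suffixes noted in the switch,
   such as "nbe" vs "a" and "u" vs "p".  */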
17232 /* Print the name of register X to FILE based on its machine mode and number.
17233 If CODE is 'w', pretend the mode is HImode.
17234 If CODE is 'b', pretend the mode is QImode.
17235 If CODE is 'k', pretend the mode is SImode.
17236 If CODE is 'q', pretend the mode is DImode.
17237 If CODE is 'x', pretend the mode is V4SFmode.
17238 If CODE is 't', pretend the mode is V8SFmode.
17239 If CODE is 'g', pretend the mode is V16SFmode.
17240 If CODE is 'h', pretend the reg is the 'high' byte register.
17241 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack operand.
17242 If CODE is 'd', duplicate the operand for an AVX instruction.
17245 void
17246 print_reg (rtx x, int code, FILE *file)
17248 const char *reg;
17249 int msize;
17250 unsigned int regno;
17251 bool duplicated;
17253 if (ASSEMBLER_DIALECT == ASM_ATT)
17254 putc ('%', file);
17256 if (x == pc_rtx)
17258 gcc_assert (TARGET_64BIT);
17259 fputs ("rip", file);
17260 return;
17263 if (code == 'y' && STACK_TOP_P (x))
17265 fputs ("st(0)", file);
17266 return;
17269 if (code == 'w')
17270 msize = 2;
17271 else if (code == 'b')
17272 msize = 1;
17273 else if (code == 'k')
17274 msize = 4;
17275 else if (code == 'q')
17276 msize = 8;
17277 else if (code == 'h')
17278 msize = 0;
17279 else if (code == 'x')
17280 msize = 16;
17281 else if (code == 't')
17282 msize = 32;
17283 else if (code == 'g')
17284 msize = 64;
17285 else
17286 msize = GET_MODE_SIZE (GET_MODE (x));
17288 regno = REGNO (x);
17290 if (regno == ARG_POINTER_REGNUM
17291 || regno == FRAME_POINTER_REGNUM
17292 || regno == FPSR_REG
17293 || regno == FPCR_REG)
17295 output_operand_lossage
17296 ("invalid use of register '%s'", reg_names[regno]);
17297 return;
17299 else if (regno == FLAGS_REG)
17301 output_operand_lossage ("invalid use of asm flag output");
17302 return;
17305 duplicated = code == 'd' && TARGET_AVX;
17307 switch (msize)
17309 case 16:
17310 case 12:
17311 case 8:
17312 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17313 warning (0, "unsupported size for integer register");
17314 /* FALLTHRU */
17315 case 4:
17316 if (LEGACY_INT_REGNO_P (regno))
17317 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17318 /* FALLTHRU */
17319 case 2:
17320 normal:
17321 reg = hi_reg_name[regno];
17322 break;
17323 case 1:
17324 if (regno >= ARRAY_SIZE (qi_reg_name))
17325 goto normal;
17326 if (!ANY_QI_REGNO_P (regno))
17327 error ("unsupported size for integer register");
17328 reg = qi_reg_name[regno];
17329 break;
17330 case 0:
17331 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17332 goto normal;
17333 reg = qi_high_reg_name[regno];
17334 break;
17335 case 32:
17336 case 64:
17337 if (SSE_REGNO_P (regno))
17339 gcc_assert (!duplicated);
17340 putc (msize == 32 ? 'y' : 'z', file);
17341 reg = hi_reg_name[regno] + 1;
17342 break;
17344 goto normal;
17345 default:
17346 gcc_unreachable ();
17349 fputs (reg, file);
17351 /* Irritatingly, AMD extended registers use
17352 a different naming convention: "r%d[bwd]". */
17353 if (REX_INT_REGNO_P (regno))
17355 gcc_assert (TARGET_64BIT);
17356 switch (msize)
17358 case 0:
17359 error ("extended registers have no high halves");
17360 break;
17361 case 1:
17362 putc ('b', file);
17363 break;
17364 case 2:
17365 putc ('w', file);
17366 break;
17367 case 4:
17368 putc ('d', file);
17369 break;
17370 case 8:
17371 /* no suffix */
17372 break;
17373 default:
17374 error ("unsupported operand size for extended register");
17375 break;
17377 return;
17380 if (duplicated)
17382 if (ASSEMBLER_DIALECT == ASM_ATT)
17383 fprintf (file, ", %%%s", reg);
17384 else
17385 fprintf (file, ", %s", reg);
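/* For example (AT&T dialect assumed, illustrative): printing hard register
   AX with CODE 'k' gives "%eax", with 'b' gives "%al" and with 'h' gives
   "%ah"; for an SSE register, 't' rewrites the name to "%ymmN" and 'g' to
   "%zmmN".  With CODE 'd' and TARGET_AVX the register name is emitted a
   second time after ", " to duplicate the operand.  */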
17389 /* Meaning of CODE:
17390 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17391 C -- print opcode suffix for set/cmov insn.
17392 c -- like C, but print reversed condition
17393 F,f -- likewise, but for floating-point.
17394 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17395 otherwise nothing
17396 R -- print embedded rounding and sae.
17397 r -- print only sae.
17398 z -- print the opcode suffix for the size of the current operand.
17399 Z -- likewise, with special suffixes for x87 instructions.
17400 * -- print a star (in certain assembler syntax)
17401 A -- print an absolute memory reference.
17402 E -- print address with DImode register names if TARGET_64BIT.
17403 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17404 s -- print a shift double count, followed by the assembler's argument
17405 delimiter.
17406 b -- print the QImode name of the register for the indicated operand.
17407 %b0 would print %al if operands[0] is reg 0.
17408 w -- likewise, print the HImode name of the register.
17409 k -- likewise, print the SImode name of the register.
17410 q -- likewise, print the DImode name of the register.
17411 x -- likewise, print the V4SFmode name of the register.
17412 t -- likewise, print the V8SFmode name of the register.
17413 g -- likewise, print the V16SFmode name of the register.
17414 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17415 y -- print "st(0)" instead of "st" as a register.
17416 d -- print duplicated register operand for AVX instruction.
17417 D -- print condition for SSE cmp instruction.
17418 P -- if PIC, print an @PLT suffix.
17419 p -- print raw symbol name.
17420 X -- don't print any sort of PIC '@' suffix for a symbol.
17421 & -- print some in-use local-dynamic symbol name.
17422 H -- print a memory address offset by 8; used for sse high-parts
17423 Y -- print condition for XOP pcom* instruction.
17424 + -- print a branch hint as 'cs' or 'ds' prefix
17425 ; -- print a semicolon (after prefixes due to bug in older gas).
17426 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17427 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17428 ! -- print MPX prefix for jxx/call/ret instructions if required.
17431 void
17432 ix86_print_operand (FILE *file, rtx x, int code)
17434 if (code)
17436 switch (code)
17438 case 'A':
17439 switch (ASSEMBLER_DIALECT)
17441 case ASM_ATT:
17442 putc ('*', file);
17443 break;
17445 case ASM_INTEL:
17446 /* Intel syntax. For absolute addresses, registers should not
17447 be surrounded by braces. */
17448 if (!REG_P (x))
17450 putc ('[', file);
17451 ix86_print_operand (file, x, 0);
17452 putc (']', file);
17453 return;
17455 break;
17457 default:
17458 gcc_unreachable ();
17461 ix86_print_operand (file, x, 0);
17462 return;
17464 case 'E':
17465 /* Wrap address in an UNSPEC to declare special handling. */
17466 if (TARGET_64BIT)
17467 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17469 output_address (VOIDmode, x);
17470 return;
17472 case 'L':
17473 if (ASSEMBLER_DIALECT == ASM_ATT)
17474 putc ('l', file);
17475 return;
17477 case 'W':
17478 if (ASSEMBLER_DIALECT == ASM_ATT)
17479 putc ('w', file);
17480 return;
17482 case 'B':
17483 if (ASSEMBLER_DIALECT == ASM_ATT)
17484 putc ('b', file);
17485 return;
17487 case 'Q':
17488 if (ASSEMBLER_DIALECT == ASM_ATT)
17489 putc ('l', file);
17490 return;
17492 case 'S':
17493 if (ASSEMBLER_DIALECT == ASM_ATT)
17494 putc ('s', file);
17495 return;
17497 case 'T':
17498 if (ASSEMBLER_DIALECT == ASM_ATT)
17499 putc ('t', file);
17500 return;
17502 case 'O':
17503 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17504 if (ASSEMBLER_DIALECT != ASM_ATT)
17505 return;
17507 switch (GET_MODE_SIZE (GET_MODE (x)))
17509 case 2:
17510 putc ('w', file);
17511 break;
17513 case 4:
17514 putc ('l', file);
17515 break;
17517 case 8:
17518 putc ('q', file);
17519 break;
17521 default:
17522 output_operand_lossage ("invalid operand size for operand "
17523 "code 'O'");
17524 return;
17527 putc ('.', file);
17528 #endif
17529 return;
17531 case 'z':
17532 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17534 /* Opcodes don't get size suffixes if using Intel opcodes. */
17535 if (ASSEMBLER_DIALECT == ASM_INTEL)
17536 return;
17538 switch (GET_MODE_SIZE (GET_MODE (x)))
17540 case 1:
17541 putc ('b', file);
17542 return;
17544 case 2:
17545 putc ('w', file);
17546 return;
17548 case 4:
17549 putc ('l', file);
17550 return;
17552 case 8:
17553 putc ('q', file);
17554 return;
17556 default:
17557 output_operand_lossage ("invalid operand size for operand "
17558 "code 'z'");
17559 return;
17563 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17564 warning (0, "non-integer operand used with operand code 'z'");
17565 /* FALLTHRU */
17567 case 'Z':
17568 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17569 if (ASSEMBLER_DIALECT == ASM_INTEL)
17570 return;
17572 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17574 switch (GET_MODE_SIZE (GET_MODE (x)))
17576 case 2:
17577 #ifdef HAVE_AS_IX86_FILDS
17578 putc ('s', file);
17579 #endif
17580 return;
17582 case 4:
17583 putc ('l', file);
17584 return;
17586 case 8:
17587 #ifdef HAVE_AS_IX86_FILDQ
17588 putc ('q', file);
17589 #else
17590 fputs ("ll", file);
17591 #endif
17592 return;
17594 default:
17595 break;
17598 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17600 /* 387 opcodes don't get size suffixes
17601 if the operands are registers. */
17602 if (STACK_REG_P (x))
17603 return;
17605 switch (GET_MODE_SIZE (GET_MODE (x)))
17607 case 4:
17608 putc ('s', file);
17609 return;
17611 case 8:
17612 putc ('l', file);
17613 return;
17615 case 12:
17616 case 16:
17617 putc ('t', file);
17618 return;
17620 default:
17621 break;
17624 else
17626 output_operand_lossage ("invalid operand type used with "
17627 "operand code 'Z'");
17628 return;
17631 output_operand_lossage ("invalid operand size for operand code 'Z'");
17632 return;
17634 case 'd':
17635 case 'b':
17636 case 'w':
17637 case 'k':
17638 case 'q':
17639 case 'h':
17640 case 't':
17641 case 'g':
17642 case 'y':
17643 case 'x':
17644 case 'X':
17645 case 'P':
17646 case 'p':
17647 break;
17649 case 's':
17650 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17652 ix86_print_operand (file, x, 0);
17653 fputs (", ", file);
17655 return;
17657 case 'Y':
17658 switch (GET_CODE (x))
17660 case NE:
17661 fputs ("neq", file);
17662 break;
17663 case EQ:
17664 fputs ("eq", file);
17665 break;
17666 case GE:
17667 case GEU:
17668 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17669 break;
17670 case GT:
17671 case GTU:
17672 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17673 break;
17674 case LE:
17675 case LEU:
17676 fputs ("le", file);
17677 break;
17678 case LT:
17679 case LTU:
17680 fputs ("lt", file);
17681 break;
17682 case UNORDERED:
17683 fputs ("unord", file);
17684 break;
17685 case ORDERED:
17686 fputs ("ord", file);
17687 break;
17688 case UNEQ:
17689 fputs ("ueq", file);
17690 break;
17691 case UNGE:
17692 fputs ("nlt", file);
17693 break;
17694 case UNGT:
17695 fputs ("nle", file);
17696 break;
17697 case UNLE:
17698 fputs ("ule", file);
17699 break;
17700 case UNLT:
17701 fputs ("ult", file);
17702 break;
17703 case LTGT:
17704 fputs ("une", file);
17705 break;
17706 default:
17707 output_operand_lossage ("operand is not a condition code, "
17708 "invalid operand code 'Y'");
17709 return;
17711 return;
17713 case 'D':
17714 /* A little bit of braindamage here.  The SSE compare instructions
17715 use completely different names for the comparisons than the
17716 fp conditional moves do. */
17717 switch (GET_CODE (x))
17719 case UNEQ:
17720 if (TARGET_AVX)
17722 fputs ("eq_us", file);
17723 break;
17725 /* FALLTHRU */
17726 case EQ:
17727 fputs ("eq", file);
17728 break;
17729 case UNLT:
17730 if (TARGET_AVX)
17732 fputs ("nge", file);
17733 break;
17735 /* FALLTHRU */
17736 case LT:
17737 fputs ("lt", file);
17738 break;
17739 case UNLE:
17740 if (TARGET_AVX)
17742 fputs ("ngt", file);
17743 break;
17745 /* FALLTHRU */
17746 case LE:
17747 fputs ("le", file);
17748 break;
17749 case UNORDERED:
17750 fputs ("unord", file);
17751 break;
17752 case LTGT:
17753 if (TARGET_AVX)
17755 fputs ("neq_oq", file);
17756 break;
17758 /* FALLTHRU */
17759 case NE:
17760 fputs ("neq", file);
17761 break;
17762 case GE:
17763 if (TARGET_AVX)
17765 fputs ("ge", file);
17766 break;
17768 /* FALLTHRU */
17769 case UNGE:
17770 fputs ("nlt", file);
17771 break;
17772 case GT:
17773 if (TARGET_AVX)
17775 fputs ("gt", file);
17776 break;
17778 /* FALLTHRU */
17779 case UNGT:
17780 fputs ("nle", file);
17781 break;
17782 case ORDERED:
17783 fputs ("ord", file);
17784 break;
17785 default:
17786 output_operand_lossage ("operand is not a condition code, "
17787 "invalid operand code 'D'");
17788 return;
17790 return;
17792 case 'F':
17793 case 'f':
17794 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17795 if (ASSEMBLER_DIALECT == ASM_ATT)
17796 putc ('.', file);
17797 gcc_fallthrough ();
17798 #endif
17800 case 'C':
17801 case 'c':
17802 if (!COMPARISON_P (x))
17804 output_operand_lossage ("operand is not a condition code, "
17805 "invalid operand code '%c'", code);
17806 return;
17808 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17809 code == 'c' || code == 'f',
17810 code == 'F' || code == 'f',
17811 file);
17812 return;
17814 case 'H':
17815 if (!offsettable_memref_p (x))
17817 output_operand_lossage ("operand is not an offsettable memory "
17818 "reference, invalid operand code 'H'");
17819 return;
17821 /* It doesn't actually matter what mode we use here, as we're
17822 only going to use this for printing. */
17823 x = adjust_address_nv (x, DImode, 8);
17824 /* Output 'qword ptr' for intel assembler dialect. */
17825 if (ASSEMBLER_DIALECT == ASM_INTEL)
17826 code = 'q';
17827 break;
17829 case 'K':
17830 if (!CONST_INT_P (x))
17832 output_operand_lossage ("operand is not an integer, invalid "
17833 "operand code 'K'");
17834 return;
17837 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17838 #ifdef HAVE_AS_IX86_HLE
17839 fputs ("xacquire ", file);
17840 #else
17841 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17842 #endif
17843 else if (INTVAL (x) & IX86_HLE_RELEASE)
17844 #ifdef HAVE_AS_IX86_HLE
17845 fputs ("xrelease ", file);
17846 #else
17847 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17848 #endif
17849 /* We do not want to print the value of the operand. */
17850 return;
17852 case 'N':
17853 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17854 fputs ("{z}", file);
17855 return;
17857 case 'r':
17858 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17860 output_operand_lossage ("operand is not a specific integer, "
17861 "invalid operand code 'r'");
17862 return;
17865 if (ASSEMBLER_DIALECT == ASM_INTEL)
17866 fputs (", ", file);
17868 fputs ("{sae}", file);
17870 if (ASSEMBLER_DIALECT == ASM_ATT)
17871 fputs (", ", file);
17873 return;
17875 case 'R':
17876 if (!CONST_INT_P (x))
17878 output_operand_lossage ("operand is not an integer, invalid "
17879 "operand code 'R'");
17880 return;
17883 if (ASSEMBLER_DIALECT == ASM_INTEL)
17884 fputs (", ", file);
17886 switch (INTVAL (x))
17888 case ROUND_NEAREST_INT | ROUND_SAE:
17889 fputs ("{rn-sae}", file);
17890 break;
17891 case ROUND_NEG_INF | ROUND_SAE:
17892 fputs ("{rd-sae}", file);
17893 break;
17894 case ROUND_POS_INF | ROUND_SAE:
17895 fputs ("{ru-sae}", file);
17896 break;
17897 case ROUND_ZERO | ROUND_SAE:
17898 fputs ("{rz-sae}", file);
17899 break;
17900 default:
17901 output_operand_lossage ("operand is not a specific integer, "
17902 "invalid operand code 'R'");
17905 if (ASSEMBLER_DIALECT == ASM_ATT)
17906 fputs (", ", file);
17908 return;
17910 case '*':
17911 if (ASSEMBLER_DIALECT == ASM_ATT)
17912 putc ('*', file);
17913 return;
17915 case '&':
17917 const char *name = get_some_local_dynamic_name ();
17918 if (name == NULL)
17919 output_operand_lossage ("'%%&' used without any "
17920 "local dynamic TLS references");
17921 else
17922 assemble_name (file, name);
17923 return;
17926 case '+':
17928 rtx x;
17930 if (!optimize
17931 || optimize_function_for_size_p (cfun)
17932 || !TARGET_BRANCH_PREDICTION_HINTS)
17933 return;
17935 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17936 if (x)
17938 int pred_val = profile_probability::from_reg_br_prob_note
17939 (XINT (x, 0)).to_reg_br_prob_base ();
17941 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17942 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17944 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17945 bool cputaken
17946 = final_forward_branch_p (current_output_insn) == 0;
17948 /* Emit hints only in the case where the default branch prediction
17949 heuristics would fail. */
17950 if (taken != cputaken)
17952 /* We use 3e (DS) prefix for taken branches and
17953 2e (CS) prefix for not taken branches. */
17954 if (taken)
17955 fputs ("ds ; ", file);
17956 else
17957 fputs ("cs ; ", file);
17961 return;
17964 case ';':
17965 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17966 putc (';', file);
17967 #endif
17968 return;
17970 case '~':
17971 putc (TARGET_AVX2 ? 'i' : 'f', file);
17972 return;
17974 case '^':
17975 if (TARGET_64BIT && Pmode != word_mode)
17976 fputs ("addr32 ", file);
17977 return;
17979 case '!':
17980 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17981 fputs ("bnd ", file);
17982 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17983 fputs ("notrack ", file);
17984 return;
17986 default:
17987 output_operand_lossage ("invalid operand code '%c'", code);
17991 if (REG_P (x))
17992 print_reg (x, code, file);
17994 else if (MEM_P (x))
17996 rtx addr = XEXP (x, 0);
17998 /* No `byte ptr' prefix for call instructions ... */
17999 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18001 machine_mode mode = GET_MODE (x);
18002 const char *size;
18004 /* Check for explicit size override codes. */
18005 if (code == 'b')
18006 size = "BYTE";
18007 else if (code == 'w')
18008 size = "WORD";
18009 else if (code == 'k')
18010 size = "DWORD";
18011 else if (code == 'q')
18012 size = "QWORD";
18013 else if (code == 'x')
18014 size = "XMMWORD";
18015 else if (code == 't')
18016 size = "YMMWORD";
18017 else if (code == 'g')
18018 size = "ZMMWORD";
18019 else if (mode == BLKmode)
18020 /* ... or BLKmode operands, when not overridden. */
18021 size = NULL;
18022 else
18023 switch (GET_MODE_SIZE (mode))
18025 case 1: size = "BYTE"; break;
18026 case 2: size = "WORD"; break;
18027 case 4: size = "DWORD"; break;
18028 case 8: size = "QWORD"; break;
18029 case 12: size = "TBYTE"; break;
18030 case 16:
18031 if (mode == XFmode)
18032 size = "TBYTE";
18033 else
18034 size = "XMMWORD";
18035 break;
18036 case 32: size = "YMMWORD"; break;
18037 case 64: size = "ZMMWORD"; break;
18038 default:
18039 gcc_unreachable ();
18041 if (size)
18043 fputs (size, file);
18044 fputs (" PTR ", file);
18048 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18049 output_operand_lossage ("invalid constraints for operand");
18050 else
18051 ix86_print_operand_address_as
18052 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18055 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18057 long l;
18059 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18061 if (ASSEMBLER_DIALECT == ASM_ATT)
18062 putc ('$', file);
18063 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18064 if (code == 'q')
18065 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18066 (unsigned long long) (int) l);
18067 else
18068 fprintf (file, "0x%08x", (unsigned int) l);
18071 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18073 long l[2];
18075 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18077 if (ASSEMBLER_DIALECT == ASM_ATT)
18078 putc ('$', file);
18079 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18082 /* These float cases don't actually occur as immediate operands. */
18083 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18085 char dstr[30];
18087 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18088 fputs (dstr, file);
18091 else
18093 /* We have patterns that allow zero sets of memory, for instance.
18094 In 64-bit mode, we should probably support all 8-byte vectors,
18095 since we can in fact encode that into an immediate. */
18096 if (GET_CODE (x) == CONST_VECTOR)
18098 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18099 x = const0_rtx;
18102 if (code != 'P' && code != 'p')
18104 if (CONST_INT_P (x))
18106 if (ASSEMBLER_DIALECT == ASM_ATT)
18107 putc ('$', file);
18109 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18110 || GET_CODE (x) == LABEL_REF)
18112 if (ASSEMBLER_DIALECT == ASM_ATT)
18113 putc ('$', file);
18114 else
18115 fputs ("OFFSET FLAT:", file);
18118 if (CONST_INT_P (x))
18119 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18120 else if (flag_pic || MACHOPIC_INDIRECT)
18121 output_pic_addr_const (file, x, code);
18122 else
18123 output_addr_const (file, x);
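/* The codes handled above are also the operand modifiers accepted in
   user-level extended asm.  A minimal sketch (hypothetical helper, x86 only):

     static inline unsigned char
     low_byte_of (unsigned int v)
     {
       unsigned char b;
       // %b1 prints the QImode name of operand 1, e.g. "%al".
       __asm__ ("mov %b1, %0" : "=q" (b) : "q" (v));
       return b;
     }
*/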
18127 static bool
18128 ix86_print_operand_punct_valid_p (unsigned char code)
18130 return (code == '*' || code == '+' || code == '&' || code == ';'
18131 || code == '~' || code == '^' || code == '!');
18134 /* Print a memory operand whose address is ADDR. */
18136 static void
18137 ix86_print_operand_address_as (FILE *file, rtx addr,
18138 addr_space_t as, bool no_rip)
18140 struct ix86_address parts;
18141 rtx base, index, disp;
18142 int scale;
18143 int ok;
18144 bool vsib = false;
18145 int code = 0;
18147 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18149 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18150 gcc_assert (parts.index == NULL_RTX);
18151 parts.index = XVECEXP (addr, 0, 1);
18152 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18153 addr = XVECEXP (addr, 0, 0);
18154 vsib = true;
18156 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18158 gcc_assert (TARGET_64BIT);
18159 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18160 code = 'q';
18162 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18164 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18165 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18166 if (parts.base != NULL_RTX)
18168 parts.index = parts.base;
18169 parts.scale = 1;
18171 parts.base = XVECEXP (addr, 0, 0);
18172 addr = XVECEXP (addr, 0, 0);
18174 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18176 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18177 gcc_assert (parts.index == NULL_RTX);
18178 parts.index = XVECEXP (addr, 0, 1);
18179 addr = XVECEXP (addr, 0, 0);
18181 else
18182 ok = ix86_decompose_address (addr, &parts);
18184 gcc_assert (ok);
18186 base = parts.base;
18187 index = parts.index;
18188 disp = parts.disp;
18189 scale = parts.scale;
18191 if (ADDR_SPACE_GENERIC_P (as))
18192 as = parts.seg;
18193 else
18194 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18196 if (!ADDR_SPACE_GENERIC_P (as))
18198 const char *string;
18200 if (as == ADDR_SPACE_SEG_FS)
18201 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18202 else if (as == ADDR_SPACE_SEG_GS)
18203 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18204 else
18205 gcc_unreachable ();
18206 fputs (string, file);
18209 /* Use the one-byte shorter RIP-relative addressing for 64-bit mode. */
18210 if (TARGET_64BIT && !base && !index && !no_rip)
18212 rtx symbol = disp;
18214 if (GET_CODE (disp) == CONST
18215 && GET_CODE (XEXP (disp, 0)) == PLUS
18216 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18217 symbol = XEXP (XEXP (disp, 0), 0);
18219 if (GET_CODE (symbol) == LABEL_REF
18220 || (GET_CODE (symbol) == SYMBOL_REF
18221 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18222 base = pc_rtx;
18225 if (!base && !index)
18227 /* Displacement only requires special attention. */
18228 if (CONST_INT_P (disp))
18230 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18231 fputs ("ds:", file);
18232 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18234 /* Load the external function address via the GOT slot to avoid PLT. */
18235 else if (GET_CODE (disp) == CONST
18236 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18237 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18238 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18239 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18240 output_pic_addr_const (file, disp, 0);
18241 else if (flag_pic)
18242 output_pic_addr_const (file, disp, 0);
18243 else
18244 output_addr_const (file, disp);
18246 else
18248 /* Print SImode register names to force addr32 prefix. */
18249 if (SImode_address_operand (addr, VOIDmode))
18251 if (flag_checking)
18253 gcc_assert (TARGET_64BIT);
18254 switch (GET_CODE (addr))
18256 case SUBREG:
18257 gcc_assert (GET_MODE (addr) == SImode);
18258 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18259 break;
18260 case ZERO_EXTEND:
18261 case AND:
18262 gcc_assert (GET_MODE (addr) == DImode);
18263 break;
18264 default:
18265 gcc_unreachable ();
18268 gcc_assert (!code);
18269 code = 'k';
18271 else if (code == 0
18272 && TARGET_X32
18273 && disp
18274 && CONST_INT_P (disp)
18275 && INTVAL (disp) < -16*1024*1024)
18277 /* X32 runs in 64-bit mode, where displacement, DISP, in
18278 address DISP(%r64), is encoded as 32-bit immediate sign-
18279 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18280 address is %r64 + 0xffffffffbffffd00. When %r64 <
18281 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18282 which is invalid for x32. The correct address is %r64
18283 - 0x40000300 == 0xf7ffdd64. To properly encode
18284 -0x40000300(%r64) for x32, we zero-extend negative
18285 displacement by forcing addr32 prefix which truncates
18286 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18287 zero-extend all negative displacements, including -1(%rsp).
18288 However, for small negative displacements, sign-extension
18289 won't cause overflow. We only zero-extend negative
18290 displacements if they are < -16*1024*1024, which is also the bound
18291 used to check legitimate address displacements for PIC. */
18292 code = 'k';
18295 /* Since the upper 32 bits of RSP are always zero for x32,
18296 we can encode %esp as %rsp to avoid 0x67 prefix if
18297 there is no index register. */
18298 if (TARGET_X32 && Pmode == SImode
18299 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18300 code = 'q';
18302 if (ASSEMBLER_DIALECT == ASM_ATT)
18304 if (disp)
18306 if (flag_pic)
18307 output_pic_addr_const (file, disp, 0);
18308 else if (GET_CODE (disp) == LABEL_REF)
18309 output_asm_label (disp);
18310 else
18311 output_addr_const (file, disp);
18314 putc ('(', file);
18315 if (base)
18316 print_reg (base, code, file);
18317 if (index)
18319 putc (',', file);
18320 print_reg (index, vsib ? 0 : code, file);
18321 if (scale != 1 || vsib)
18322 fprintf (file, ",%d", scale);
18324 putc (')', file);
18326 else
18328 rtx offset = NULL_RTX;
18330 if (disp)
18332 /* Pull out the offset of a symbol; print any symbol itself. */
18333 if (GET_CODE (disp) == CONST
18334 && GET_CODE (XEXP (disp, 0)) == PLUS
18335 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18337 offset = XEXP (XEXP (disp, 0), 1);
18338 disp = gen_rtx_CONST (VOIDmode,
18339 XEXP (XEXP (disp, 0), 0));
18342 if (flag_pic)
18343 output_pic_addr_const (file, disp, 0);
18344 else if (GET_CODE (disp) == LABEL_REF)
18345 output_asm_label (disp);
18346 else if (CONST_INT_P (disp))
18347 offset = disp;
18348 else
18349 output_addr_const (file, disp);
18352 putc ('[', file);
18353 if (base)
18355 print_reg (base, code, file);
18356 if (offset)
18358 if (INTVAL (offset) >= 0)
18359 putc ('+', file);
18360 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18363 else if (offset)
18364 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18365 else
18366 putc ('0', file);
18368 if (index)
18370 putc ('+', file);
18371 print_reg (index, vsib ? 0 : code, file);
18372 if (scale != 1 || vsib)
18373 fprintf (file, "*%d", scale);
18375 putc (']', file);
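/* For illustration: a decomposed address with base = %rbp, index = %rax,
   scale = 4 and disp = -4 is printed by the AT&T branch above as
   "-4(%rbp,%rax,4)" and by the Intel branch as "[rbp-4+rax*4]".  */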
18380 static void
18381 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18383 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18386 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18388 static bool
18389 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18391 rtx op;
18393 if (GET_CODE (x) != UNSPEC)
18394 return false;
18396 op = XVECEXP (x, 0, 0);
18397 switch (XINT (x, 1))
18399 case UNSPEC_GOTOFF:
18400 output_addr_const (file, op);
18401 fputs ("@gotoff", file);
18402 break;
18403 case UNSPEC_GOTTPOFF:
18404 output_addr_const (file, op);
18405 /* FIXME: This might be @TPOFF in Sun ld. */
18406 fputs ("@gottpoff", file);
18407 break;
18408 case UNSPEC_TPOFF:
18409 output_addr_const (file, op);
18410 fputs ("@tpoff", file);
18411 break;
18412 case UNSPEC_NTPOFF:
18413 output_addr_const (file, op);
18414 if (TARGET_64BIT)
18415 fputs ("@tpoff", file);
18416 else
18417 fputs ("@ntpoff", file);
18418 break;
18419 case UNSPEC_DTPOFF:
18420 output_addr_const (file, op);
18421 fputs ("@dtpoff", file);
18422 break;
18423 case UNSPEC_GOTNTPOFF:
18424 output_addr_const (file, op);
18425 if (TARGET_64BIT)
18426 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18427 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18428 else
18429 fputs ("@gotntpoff", file);
18430 break;
18431 case UNSPEC_INDNTPOFF:
18432 output_addr_const (file, op);
18433 fputs ("@indntpoff", file);
18434 break;
18435 #if TARGET_MACHO
18436 case UNSPEC_MACHOPIC_OFFSET:
18437 output_addr_const (file, op);
18438 putc ('-', file);
18439 machopic_output_function_base_name (file);
18440 break;
18441 #endif
18443 default:
18444 return false;
18447 return true;
18450 /* Split one or more double-mode RTL references into pairs of half-mode
18451 references. The RTL can be REG, offsettable MEM, integer constant, or
18452 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18453 split and "num" is its length. lo_half and hi_half are output arrays
18454 that parallel "operands". */
18456 void
18457 split_double_mode (machine_mode mode, rtx operands[],
18458 int num, rtx lo_half[], rtx hi_half[])
18460 machine_mode half_mode;
18461 unsigned int byte;
18463 switch (mode)
18465 case E_TImode:
18466 half_mode = DImode;
18467 break;
18468 case E_DImode:
18469 half_mode = SImode;
18470 break;
18471 default:
18472 gcc_unreachable ();
18475 byte = GET_MODE_SIZE (half_mode);
18477 while (num--)
18479 rtx op = operands[num];
18481 /* simplify_subreg refuses to split volatile memory addresses,
18482 but we still have to handle them. */
18483 if (MEM_P (op))
18485 lo_half[num] = adjust_address (op, half_mode, 0);
18486 hi_half[num] = adjust_address (op, half_mode, byte);
18488 else
18490 lo_half[num] = simplify_gen_subreg (half_mode, op,
18491 GET_MODE (op) == VOIDmode
18492 ? mode : GET_MODE (op), 0);
18493 hi_half[num] = simplify_gen_subreg (half_mode, op,
18494 GET_MODE (op) == VOIDmode
18495 ? mode : GET_MODE (op), byte);
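/* A minimal usage sketch (assumption: this mirrors how the double-mode
   splitters in i386.md call it):

     rtx lo[2], hi[2];
     split_double_mode (DImode, &operands[0], 2, lo, hi);
     // lo[i] and hi[i] now hold the SImode low and high halves of
     // operands[i]; MEMs go through adjust_address, everything else
     // through simplify_gen_subreg.
*/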
18500 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18501 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18502 is the expression of the binary operation. The output may either be
18503 emitted here, or returned to the caller, like all output_* functions.
18505 There is no guarantee that the operands are the same mode, as they
18506 might be within FLOAT or FLOAT_EXTEND expressions. */
18508 #ifndef SYSV386_COMPAT
18509 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18510 wants to fix the assemblers because that causes incompatibility
18511 with gcc. No-one wants to fix gcc because that causes
18512 incompatibility with assemblers... You can use the option of
18513 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18514 #define SYSV386_COMPAT 1
18515 #endif
18517 const char *
18518 output_387_binary_op (rtx_insn *insn, rtx *operands)
18520 static char buf[40];
18521 const char *p;
18522 bool is_sse
18523 = (SSE_REG_P (operands[0])
18524 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18526 if (is_sse)
18527 p = "%v";
18528 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18529 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18530 p = "fi";
18531 else
18532 p = "f";
18534 strcpy (buf, p);
18536 switch (GET_CODE (operands[3]))
18538 case PLUS:
18539 p = "add"; break;
18540 case MINUS:
18541 p = "sub"; break;
18542 case MULT:
18543 p = "mul"; break;
18544 case DIV:
18545 p = "div"; break;
18546 default:
18547 gcc_unreachable ();
18550 strcat (buf, p);
18552 if (is_sse)
18554 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18555 strcat (buf, p);
18557 if (TARGET_AVX)
18558 p = "\t{%2, %1, %0|%0, %1, %2}";
18559 else
18560 p = "\t{%2, %0|%0, %2}";
18562 strcat (buf, p);
18563 return buf;
18566 /* Even if we do not want to check the inputs, this documents the input
18567 constraints, which helps in understanding the following code. */
18568 if (flag_checking)
18570 if (STACK_REG_P (operands[0])
18571 && ((REG_P (operands[1])
18572 && REGNO (operands[0]) == REGNO (operands[1])
18573 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18574 || (REG_P (operands[2])
18575 && REGNO (operands[0]) == REGNO (operands[2])
18576 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18577 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18578 ; /* ok */
18579 else
18580 gcc_unreachable ();
18583 switch (GET_CODE (operands[3]))
18585 case MULT:
18586 case PLUS:
18587 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18588 std::swap (operands[1], operands[2]);
18590 /* We know operands[0] == operands[1]. */
18592 if (MEM_P (operands[2]))
18594 p = "%Z2\t%2";
18595 break;
18598 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18600 if (STACK_TOP_P (operands[0]))
18601 /* How is it that we are storing to a dead operand[2]?
18602 Well, presumably operands[1] is dead too. We can't
18603 store the result to st(0) as st(0) gets popped on this
18604 instruction. Instead store to operands[2] (which I
18605 think has to be st(1)). st(1) will be popped later.
18606 gcc <= 2.8.1 didn't have this check and generated
18607 assembly code that the Unixware assembler rejected. */
18608 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18609 else
18610 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18611 break;
18614 if (STACK_TOP_P (operands[0]))
18615 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18616 else
18617 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18618 break;
18620 case MINUS:
18621 case DIV:
18622 if (MEM_P (operands[1]))
18624 p = "r%Z1\t%1";
18625 break;
18628 if (MEM_P (operands[2]))
18630 p = "%Z2\t%2";
18631 break;
18634 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18636 #if SYSV386_COMPAT
18637 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18638 derived assemblers, confusingly reverse the direction of
18639 the operation for fsub{r} and fdiv{r} when the
18640 destination register is not st(0). The Intel assembler
18641 doesn't have this brain damage. Read !SYSV386_COMPAT to
18642 figure out what the hardware really does. */
18643 if (STACK_TOP_P (operands[0]))
18644 p = "{p\t%0, %2|rp\t%2, %0}";
18645 else
18646 p = "{rp\t%2, %0|p\t%0, %2}";
18647 #else
18648 if (STACK_TOP_P (operands[0]))
18649 /* As above for fmul/fadd, we can't store to st(0). */
18650 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18651 else
18652 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18653 #endif
18654 break;
18657 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18659 #if SYSV386_COMPAT
18660 if (STACK_TOP_P (operands[0]))
18661 p = "{rp\t%0, %1|p\t%1, %0}";
18662 else
18663 p = "{p\t%1, %0|rp\t%0, %1}";
18664 #else
18665 if (STACK_TOP_P (operands[0]))
18666 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18667 else
18668 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18669 #endif
18670 break;
18673 if (STACK_TOP_P (operands[0]))
18675 if (STACK_TOP_P (operands[1]))
18676 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18677 else
18678 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18679 break;
18681 else if (STACK_TOP_P (operands[1]))
18683 #if SYSV386_COMPAT
18684 p = "{\t%1, %0|r\t%0, %1}";
18685 #else
18686 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18687 #endif
18689 else
18691 #if SYSV386_COMPAT
18692 p = "{r\t%2, %0|\t%0, %2}";
18693 #else
18694 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18695 #endif
18697 break;
18699 default:
18700 gcc_unreachable ();
18703 strcat (buf, p);
18704 return buf;
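/* A worked example of the template construction above (illustrative): for an
   SSE SFmode add with TARGET_AVX, buf is assembled as
     "%v" + "add" + "ss" + "\t{%2, %1, %0|%0, %1, %2}"
   which is emitted as e.g. "vaddss %xmm2, %xmm1, %xmm0" in AT&T syntax.
   The x87 cases below instead pick between the plain, reversed and popping
   forms (fadd, fsubr, faddp, ...) depending on which operand is st(0) and
   which operands die.  */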
18707 /* Return needed mode for entity in optimize_mode_switching pass. */
18709 static int
18710 ix86_dirflag_mode_needed (rtx_insn *insn)
18712 if (CALL_P (insn))
18714 if (cfun->machine->func_type == TYPE_NORMAL)
18715 return X86_DIRFLAG_ANY;
18716 else
18717 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18718 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18721 if (recog_memoized (insn) < 0)
18722 return X86_DIRFLAG_ANY;
18724 if (get_attr_type (insn) == TYPE_STR)
18726 /* Emit cld instruction if stringops are used in the function. */
18727 if (cfun->machine->func_type == TYPE_NORMAL)
18728 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18729 else
18730 return X86_DIRFLAG_RESET;
18733 return X86_DIRFLAG_ANY;
18736 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
18738 static bool
18739 ix86_check_avx_upper_register (const_rtx exp)
18741 if (SUBREG_P (exp))
18742 exp = SUBREG_REG (exp);
18744 return (REG_P (exp)
18745 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18746 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18749 /* Return needed mode for entity in optimize_mode_switching pass. */
18751 static int
18752 ix86_avx_u128_mode_needed (rtx_insn *insn)
18754 if (CALL_P (insn))
18756 rtx link;
18758 /* Needed mode is set to AVX_U128_CLEAN if there are
18759 no 256bit or 512bit modes used in function arguments. */
18760 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18761 link;
18762 link = XEXP (link, 1))
18764 if (GET_CODE (XEXP (link, 0)) == USE)
18766 rtx arg = XEXP (XEXP (link, 0), 0);
18768 if (ix86_check_avx_upper_register (arg))
18769 return AVX_U128_DIRTY;
18773 return AVX_U128_CLEAN;
18776 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18777 Hardware changes state only when a 256bit register is written to,
18778 but we need to prevent the compiler from moving the optimal insertion
18779 point above an eventual read from a 256bit or 512bit register. */
18780 subrtx_iterator::array_type array;
18781 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18782 if (ix86_check_avx_upper_register (*iter))
18783 return AVX_U128_DIRTY;
18785 return AVX_U128_ANY;
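/* For example (illustrative): a call whose CALL_INSN_FUNCTION_USAGE contains
   a USE of a 256-bit or 512-bit vector argument register needs DIRTY mode,
   so no vzeroupper is placed before it; a call with no such arguments needs
   CLEAN mode, and the mode-switching pass will emit vzeroupper before it if
   the upper halves are currently dirty.  Any other insn that references a
   ymm/zmm register is likewise reported as DIRTY.  */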
18788 /* Return mode that i387 must be switched into
18789 prior to the execution of insn. */
18791 static int
18792 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18794 enum attr_i387_cw mode;
18796 /* The mode UNINITIALIZED is used to store the control word after a
18797 function call or ASM pattern.  The mode ANY specifies that the function
18798 has no requirements on the control word and makes no changes in the
18799 bits we are interested in. */
18801 if (CALL_P (insn)
18802 || (NONJUMP_INSN_P (insn)
18803 && (asm_noperands (PATTERN (insn)) >= 0
18804 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18805 return I387_CW_UNINITIALIZED;
18807 if (recog_memoized (insn) < 0)
18808 return I387_CW_ANY;
18810 mode = get_attr_i387_cw (insn);
18812 switch (entity)
18814 case I387_TRUNC:
18815 if (mode == I387_CW_TRUNC)
18816 return mode;
18817 break;
18819 case I387_FLOOR:
18820 if (mode == I387_CW_FLOOR)
18821 return mode;
18822 break;
18824 case I387_CEIL:
18825 if (mode == I387_CW_CEIL)
18826 return mode;
18827 break;
18829 case I387_MASK_PM:
18830 if (mode == I387_CW_MASK_PM)
18831 return mode;
18832 break;
18834 default:
18835 gcc_unreachable ();
18838 return I387_CW_ANY;
18841 /* Return mode that entity must be switched into
18842 prior to the execution of insn. */
18844 static int
18845 ix86_mode_needed (int entity, rtx_insn *insn)
18847 switch (entity)
18849 case X86_DIRFLAG:
18850 return ix86_dirflag_mode_needed (insn);
18851 case AVX_U128:
18852 return ix86_avx_u128_mode_needed (insn);
18853 case I387_TRUNC:
18854 case I387_FLOOR:
18855 case I387_CEIL:
18856 case I387_MASK_PM:
18857 return ix86_i387_mode_needed (entity, insn);
18858 default:
18859 gcc_unreachable ();
18861 return 0;
18864 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18866 static void
18867 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18869 if (ix86_check_avx_upper_register (dest))
18871 bool *used = (bool *) data;
18872 *used = true;
18876 /* Calculate mode of upper 128bit AVX registers after the insn. */
18878 static int
18879 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18881 rtx pat = PATTERN (insn);
18883 if (vzeroupper_operation (pat, VOIDmode)
18884 || vzeroall_operation (pat, VOIDmode))
18885 return AVX_U128_CLEAN;
18887 /* We know that the state is clean after a CALL insn if there are no
18888 256bit or 512bit registers used in the function return value. */
18889 if (CALL_P (insn))
18891 bool avx_upper_reg_found = false;
18892 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18894 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18897 /* Otherwise, return the current mode.  Remember that if the insn
18898 references AVX 256bit or 512bit registers, the mode was already
18899 changed to DIRTY from MODE_NEEDED. */
18900 return mode;
18903 /* Return the mode that an insn results in. */
18905 static int
18906 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18908 switch (entity)
18910 case X86_DIRFLAG:
18911 return mode;
18912 case AVX_U128:
18913 return ix86_avx_u128_mode_after (mode, insn);
18914 case I387_TRUNC:
18915 case I387_FLOOR:
18916 case I387_CEIL:
18917 case I387_MASK_PM:
18918 return mode;
18919 default:
18920 gcc_unreachable ();
18924 static int
18925 ix86_dirflag_mode_entry (void)
18927 /* For TARGET_CLD or in the interrupt handler we can't assume
18928 direction flag state at function entry. */
18929 if (TARGET_CLD
18930 || cfun->machine->func_type != TYPE_NORMAL)
18931 return X86_DIRFLAG_ANY;
18933 return X86_DIRFLAG_RESET;
18936 static int
18937 ix86_avx_u128_mode_entry (void)
18939 tree arg;
18941 /* Entry mode is set to AVX_U128_DIRTY if there are
18942 256bit or 512bit modes used in function arguments. */
18943 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18944 arg = TREE_CHAIN (arg))
18946 rtx incoming = DECL_INCOMING_RTL (arg);
18948 if (incoming && ix86_check_avx_upper_register (incoming))
18949 return AVX_U128_DIRTY;
18952 return AVX_U128_CLEAN;
18955 /* Return a mode that ENTITY is assumed to be
18956 switched to at function entry. */
18958 static int
18959 ix86_mode_entry (int entity)
18961 switch (entity)
18963 case X86_DIRFLAG:
18964 return ix86_dirflag_mode_entry ();
18965 case AVX_U128:
18966 return ix86_avx_u128_mode_entry ();
18967 case I387_TRUNC:
18968 case I387_FLOOR:
18969 case I387_CEIL:
18970 case I387_MASK_PM:
18971 return I387_CW_ANY;
18972 default:
18973 gcc_unreachable ();
18977 static int
18978 ix86_avx_u128_mode_exit (void)
18980 rtx reg = crtl->return_rtx;
18982 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
18983 or 512bit modes used in the function return register. */
18984 if (reg && ix86_check_avx_upper_register (reg))
18985 return AVX_U128_DIRTY;
18987 return AVX_U128_CLEAN;
18990 /* Return a mode that ENTITY is assumed to be
18991 switched to at function exit. */
18993 static int
18994 ix86_mode_exit (int entity)
18996 switch (entity)
18998 case X86_DIRFLAG:
18999 return X86_DIRFLAG_ANY;
19000 case AVX_U128:
19001 return ix86_avx_u128_mode_exit ();
19002 case I387_TRUNC:
19003 case I387_FLOOR:
19004 case I387_CEIL:
19005 case I387_MASK_PM:
19006 return I387_CW_ANY;
19007 default:
19008 gcc_unreachable ();
19012 static int
19013 ix86_mode_priority (int, int n)
19015 return n;
19018 /* Output code to initialize control word copies used by trunc?f?i and
19019 rounding patterns. CURRENT_MODE is set to current control word,
19020 while NEW_MODE is set to new control word. */
19022 static void
19023 emit_i387_cw_initialization (int mode)
19025 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19026 rtx new_mode;
19028 enum ix86_stack_slot slot;
19030 rtx reg = gen_reg_rtx (HImode);
19032 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19033 emit_move_insn (reg, copy_rtx (stored_mode));
19035 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19036 || optimize_insn_for_size_p ())
19038 switch (mode)
19040 case I387_CW_TRUNC:
19041 /* round toward zero (truncate) */
19042 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19043 slot = SLOT_CW_TRUNC;
19044 break;
19046 case I387_CW_FLOOR:
19047 /* round down toward -oo */
19048 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19049 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19050 slot = SLOT_CW_FLOOR;
19051 break;
19053 case I387_CW_CEIL:
19054 /* round up toward +oo */
19055 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19056 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19057 slot = SLOT_CW_CEIL;
19058 break;
19060 case I387_CW_MASK_PM:
19061 /* mask precision exception for nearbyint() */
19062 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19063 slot = SLOT_CW_MASK_PM;
19064 break;
19066 default:
19067 gcc_unreachable ();
19070 else
19072 switch (mode)
19074 case I387_CW_TRUNC:
19075 /* round toward zero (truncate) */
19076 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19077 slot = SLOT_CW_TRUNC;
19078 break;
19080 case I387_CW_FLOOR:
19081 /* round down toward -oo */
19082 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19083 slot = SLOT_CW_FLOOR;
19084 break;
19086 case I387_CW_CEIL:
19087 /* round up toward +oo */
19088 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19089 slot = SLOT_CW_CEIL;
19090 break;
19092 case I387_CW_MASK_PM:
19093 /* mask precision exception for nearbyint() */
19094 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19095 slot = SLOT_CW_MASK_PM;
19096 break;
19098 default:
19099 gcc_unreachable ();
19103 gcc_assert (slot < MAX_386_STACK_LOCALS);
19105 new_mode = assign_386_stack_local (HImode, slot);
19106 emit_move_insn (new_mode, reg);
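/* Background for the constants above (x87 architectural facts): bits 10-11
   of the control word form the rounding-control field (00 = nearest,
   01 = down, 10 = up, 11 = toward zero) and bit 5 is the precision-exception
   mask.  Hence OR 0x0c00 forces truncation, clearing the field and then
   ORing 0x0400 or 0x0800 selects floor or ceil, and OR 0x0020 masks the
   precision exception for nearbyint.  The insvsi_1 variant encodes the same
   rounding choices through the bit-insert pattern defined in i386.md.  */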
19109 /* Emit vzeroupper. */
19111 void
19112 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19114 int i;
19116 /* Cancel automatic vzeroupper insertion if there are
19117 live call-saved SSE registers at the insertion point. */
19119 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19120 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19121 return;
19123 if (TARGET_64BIT)
19124 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19125 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19126 return;
19128 emit_insn (gen_avx_vzeroupper ());
19133 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
19134 is the set of hard registers live at the point where the insn(s)
19135 are to be inserted. */
19137 static void
19138 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19139 HARD_REG_SET regs_live)
19141 switch (entity)
19143 case X86_DIRFLAG:
19144 if (mode == X86_DIRFLAG_RESET)
19145 emit_insn (gen_cld ());
19146 break;
19147 case AVX_U128:
19148 if (mode == AVX_U128_CLEAN)
19149 ix86_avx_emit_vzeroupper (regs_live);
19150 break;
19151 case I387_TRUNC:
19152 case I387_FLOOR:
19153 case I387_CEIL:
19154 case I387_MASK_PM:
19155 if (mode != I387_CW_ANY
19156 && mode != I387_CW_UNINITIALIZED)
19157 emit_i387_cw_initialization (mode);
19158 break;
19159 default:
19160 gcc_unreachable ();
19164 /* Output code for INSN to convert a float to a signed int. OPERANDS
19165 are the insn operands. The output may be [HSD]Imode and the input
19166 operand may be [SDX]Fmode. */
19168 const char *
19169 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19171 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19172 bool dimode_p = GET_MODE (operands[0]) == DImode;
19173 int round_mode = get_attr_i387_cw (insn);
19175 static char buf[40];
19176 const char *p;
19178 /* Jump through a hoop or two for DImode, since the hardware has no
19179 non-popping instruction. We used to do this a different way, but
19180 that was somewhat fragile and broke with post-reload splitters. */
19181 if ((dimode_p || fisttp) && !stack_top_dies)
19182 output_asm_insn ("fld\t%y1", operands);
19184 gcc_assert (STACK_TOP_P (operands[1]));
19185 gcc_assert (MEM_P (operands[0]));
19186 gcc_assert (GET_MODE (operands[1]) != TFmode);
19188 if (fisttp)
19189 return "fisttp%Z0\t%0";
19191 strcpy (buf, "fist");
19193 if (round_mode != I387_CW_ANY)
19194 output_asm_insn ("fldcw\t%3", operands);
19196 p = "p%Z0\t%0";
19197 strcat (buf, p + !(stack_top_dies || dimode_p));
19199 output_asm_insn (buf, operands);
19201 if (round_mode != I387_CW_ANY)
19202 output_asm_insn ("fldcw\t%2", operands);
19204 return "";
19207 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19208 have the values zero or one, indicates the ffreep insn's operand
19209 from the OPERANDS array. */
19211 static const char *
19212 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19214 if (TARGET_USE_FFREEP)
19215 #ifdef HAVE_AS_IX86_FFREEP
19216 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19217 #else
19219 static char retval[32];
19220 int regno = REGNO (operands[opno]);
19222 gcc_assert (STACK_REGNO_P (regno));
19224 regno -= FIRST_STACK_REG;
19226 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19227 return retval;
19229 #endif
19231 return opno ? "fstp\t%y1" : "fstp\t%y0";
19235 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19236 should be used. UNORDERED_P is true when fucom should be used. */
19238 const char *
19239 output_fp_compare (rtx_insn *insn, rtx *operands,
19240 bool eflags_p, bool unordered_p)
19242 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19243 bool stack_top_dies;
19245 static char buf[40];
19246 const char *p;
19248 gcc_assert (STACK_TOP_P (xops[0]));
19250 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19252 if (eflags_p)
19254 p = unordered_p ? "fucomi" : "fcomi";
19255 strcpy (buf, p);
19257 p = "p\t{%y1, %0|%0, %y1}";
19258 strcat (buf, p + !stack_top_dies);
19260 return buf;
19263 if (STACK_REG_P (xops[1])
19264 && stack_top_dies
19265 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19267 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19269 /* If the top of the 387 stack dies, and the other operand
19270 is also a stack register that dies, then this must be a
19271 `fcompp' float compare. */
19272 p = unordered_p ? "fucompp" : "fcompp";
19273 strcpy (buf, p);
19275 else if (const0_operand (xops[1], VOIDmode))
19277 gcc_assert (!unordered_p);
19278 strcpy (buf, "ftst");
19280 else
19282 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19284 gcc_assert (!unordered_p);
19285 p = "ficom";
19287 else
19288 p = unordered_p ? "fucom" : "fcom";
19290 strcpy (buf, p);
19292 p = "p%Z2\t%y2";
19293 strcat (buf, p + !stack_top_dies);
19296 output_asm_insn (buf, operands);
19297 return "fnstsw\t%0";
19300 void
19301 ix86_output_addr_vec_elt (FILE *file, int value)
19303 const char *directive = ASM_LONG;
19305 #ifdef ASM_QUAD
19306 if (TARGET_LP64)
19307 directive = ASM_QUAD;
19308 #else
19309 gcc_assert (!TARGET_64BIT);
19310 #endif
19312 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19315 void
19316 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19318 const char *directive = ASM_LONG;
19320 #ifdef ASM_QUAD
19321 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19322 directive = ASM_QUAD;
19323 #else
19324 gcc_assert (!TARGET_64BIT);
19325 #endif
19326 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19327 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19328 fprintf (file, "%s%s%d-%s%d\n",
19329 directive, LPREFIX, value, LPREFIX, rel);
19330 else if (HAVE_AS_GOTOFF_IN_DATA)
19331 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19332 #if TARGET_MACHO
19333 else if (TARGET_MACHO)
19335 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19336 machopic_output_function_base_name (file);
19337 putc ('\n', file);
19339 #endif
19340 else
19341 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19342 GOT_SYMBOL_NAME, LPREFIX, value);
19345 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19346 for the target. */
19348 void
19349 ix86_expand_clear (rtx dest)
19351 rtx tmp;
19353 /* We play register width games, which are only valid after reload. */
19354 gcc_assert (reload_completed);
19356 /* Avoid HImode and its attendant prefix byte. */
19357 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19358 dest = gen_rtx_REG (SImode, REGNO (dest));
19359 tmp = gen_rtx_SET (dest, const0_rtx);
19361 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19363 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19364 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19367 emit_insn (tmp);
19370 void
19371 ix86_expand_move (machine_mode mode, rtx operands[])
19373 rtx op0, op1;
19374 rtx tmp, addend = NULL_RTX;
19375 enum tls_model model;
19377 op0 = operands[0];
19378 op1 = operands[1];
19380 switch (GET_CODE (op1))
19382 case CONST:
19383 tmp = XEXP (op1, 0);
19385 if (GET_CODE (tmp) != PLUS
19386 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19387 break;
19389 op1 = XEXP (tmp, 0);
19390 addend = XEXP (tmp, 1);
19391 /* FALLTHRU */
19393 case SYMBOL_REF:
19394 model = SYMBOL_REF_TLS_MODEL (op1);
19396 if (model)
19397 op1 = legitimize_tls_address (op1, model, true);
19398 else if (ix86_force_load_from_GOT_p (op1))
19400 /* Load the external function address via GOT slot to avoid PLT. */
19401 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19402 (TARGET_64BIT
19403 ? UNSPEC_GOTPCREL
19404 : UNSPEC_GOT));
19405 op1 = gen_rtx_CONST (Pmode, op1);
19406 op1 = gen_const_mem (Pmode, op1);
19407 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19409 else
19411 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19412 if (tmp)
19414 op1 = tmp;
19415 if (!addend)
19416 break;
19418 else
19420 op1 = operands[1];
19421 break;
19425 if (addend)
19427 op1 = force_operand (op1, NULL_RTX);
19428 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19429 op0, 1, OPTAB_DIRECT);
19431 else
19432 op1 = force_operand (op1, op0);
19434 if (op1 == op0)
19435 return;
19437 op1 = convert_to_mode (mode, op1, 1);
19439 default:
19440 break;
19443 if ((flag_pic || MACHOPIC_INDIRECT)
19444 && symbolic_operand (op1, mode))
19446 if (TARGET_MACHO && !TARGET_64BIT)
19448 #if TARGET_MACHO
19449 /* dynamic-no-pic */
19450 if (MACHOPIC_INDIRECT)
19452 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19453 ? op0 : gen_reg_rtx (Pmode);
19454 op1 = machopic_indirect_data_reference (op1, temp);
19455 if (MACHOPIC_PURE)
19456 op1 = machopic_legitimize_pic_address (op1, mode,
19457 temp == op1 ? 0 : temp);
19459 if (op0 != op1 && GET_CODE (op0) != MEM)
19461 rtx insn = gen_rtx_SET (op0, op1);
19462 emit_insn (insn);
19463 return;
19465 if (GET_CODE (op0) == MEM)
19466 op1 = force_reg (Pmode, op1);
19467 else
19469 rtx temp = op0;
19470 if (GET_CODE (temp) != REG)
19471 temp = gen_reg_rtx (Pmode);
19472 temp = legitimize_pic_address (op1, temp);
19473 if (temp == op0)
19474 return;
19475 op1 = temp;
19477 /* dynamic-no-pic */
19478 #endif
19480 else
19482 if (MEM_P (op0))
19483 op1 = force_reg (mode, op1);
19484 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19486 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19487 op1 = legitimize_pic_address (op1, reg);
19488 if (op0 == op1)
19489 return;
19490 op1 = convert_to_mode (mode, op1, 1);
19494 else
19496 if (MEM_P (op0)
19497 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19498 || !push_operand (op0, mode))
19499 && MEM_P (op1))
19500 op1 = force_reg (mode, op1);
19502 if (push_operand (op0, mode)
19503 && ! general_no_elim_operand (op1, mode))
19504 op1 = copy_to_mode_reg (mode, op1);
19506 /* Force large constants in 64-bit compilation into a register
19507 to get them CSEd. */
19508 if (can_create_pseudo_p ()
19509 && (mode == DImode) && TARGET_64BIT
19510 && immediate_operand (op1, mode)
19511 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19512 && !register_operand (op0, mode)
19513 && optimize)
19514 op1 = copy_to_mode_reg (mode, op1);
19516 if (can_create_pseudo_p ()
19517 && CONST_DOUBLE_P (op1))
19519 /* If we are loading a floating point constant to a register,
19520 force the value to memory now, since we'll get better code
19521 out of the back end. */
19523 op1 = validize_mem (force_const_mem (mode, op1));
19524 if (!register_operand (op0, mode))
19526 rtx temp = gen_reg_rtx (mode);
19527 emit_insn (gen_rtx_SET (temp, op1));
19528 emit_move_insn (op0, temp);
19529 return;
19534 emit_insn (gen_rtx_SET (op0, op1));
19537 void
19538 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19540 rtx op0 = operands[0], op1 = operands[1];
19541 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19542 psABI, since its biggest alignment is only 4 bytes. */
19543 unsigned int align = (TARGET_IAMCU
19544 ? GET_MODE_BITSIZE (mode)
19545 : GET_MODE_ALIGNMENT (mode));
19547 if (push_operand (op0, VOIDmode))
19548 op0 = emit_move_resolve_push (mode, op0);
19550 /* Force constants other than zero into memory. We do not know how
19551 the instructions used to build constants modify the upper 64 bits
19552 of the register; once we have that information we may be able
19553 to handle some of them more efficiently. */
19554 if (can_create_pseudo_p ()
19555 && (CONSTANT_P (op1)
19556 || (SUBREG_P (op1)
19557 && CONSTANT_P (SUBREG_REG (op1))))
19558 && ((register_operand (op0, mode)
19559 && !standard_sse_constant_p (op1, mode))
19560 /* ix86_expand_vector_move_misalign() does not like constants. */
19561 || (SSE_REG_MODE_P (mode)
19562 && MEM_P (op0)
19563 && MEM_ALIGN (op0) < align)))
19565 if (SUBREG_P (op1))
19567 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19568 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19569 if (r)
19570 r = validize_mem (r);
19571 else
19572 r = force_reg (imode, SUBREG_REG (op1));
19573 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19575 else
19576 op1 = validize_mem (force_const_mem (mode, op1));
19579 /* We need to check memory alignment for SSE mode since an attribute
19580 can make operands unaligned. */
19581 if (can_create_pseudo_p ()
19582 && SSE_REG_MODE_P (mode)
19583 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19584 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19586 rtx tmp[2];
19588 /* ix86_expand_vector_move_misalign() does not like both
19589 arguments in memory. */
19590 if (!register_operand (op0, mode)
19591 && !register_operand (op1, mode))
19592 op1 = force_reg (mode, op1);
19594 tmp[0] = op0; tmp[1] = op1;
19595 ix86_expand_vector_move_misalign (mode, tmp);
19596 return;
19599 /* Make operand1 a register if it isn't already. */
19600 if (can_create_pseudo_p ()
19601 && !register_operand (op0, mode)
19602 && !register_operand (op1, mode))
19604 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19605 return;
19608 emit_insn (gen_rtx_SET (op0, op1));
19611 /* Split 32-byte AVX unaligned load and store if needed. */
19613 static void
19614 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19616 rtx m;
19617 rtx (*extract) (rtx, rtx, rtx);
19618 machine_mode mode;
19620 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19621 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19623 emit_insn (gen_rtx_SET (op0, op1));
19624 return;
19627 rtx orig_op0 = NULL_RTX;
19628 mode = GET_MODE (op0);
19629 switch (GET_MODE_CLASS (mode))
19631 case MODE_VECTOR_INT:
19632 case MODE_INT:
19633 if (mode != V32QImode)
19635 if (!MEM_P (op0))
19637 orig_op0 = op0;
19638 op0 = gen_reg_rtx (V32QImode);
19640 else
19641 op0 = gen_lowpart (V32QImode, op0);
19642 op1 = gen_lowpart (V32QImode, op1);
19643 mode = V32QImode;
19645 break;
19646 case MODE_VECTOR_FLOAT:
19647 break;
19648 default:
19649 gcc_unreachable ();
19652 switch (mode)
19654 default:
19655 gcc_unreachable ();
19656 case E_V32QImode:
19657 extract = gen_avx_vextractf128v32qi;
19658 mode = V16QImode;
19659 break;
19660 case E_V8SFmode:
19661 extract = gen_avx_vextractf128v8sf;
19662 mode = V4SFmode;
19663 break;
19664 case E_V4DFmode:
19665 extract = gen_avx_vextractf128v4df;
19666 mode = V2DFmode;
19667 break;
19670 if (MEM_P (op1))
19672 rtx r = gen_reg_rtx (mode);
19673 m = adjust_address (op1, mode, 0);
19674 emit_move_insn (r, m);
19675 m = adjust_address (op1, mode, 16);
19676 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19677 emit_move_insn (op0, r);
19679 else if (MEM_P (op0))
19681 m = adjust_address (op0, mode, 0);
19682 emit_insn (extract (m, op1, const0_rtx));
19683 m = adjust_address (op0, mode, 16);
19684 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19686 else
19687 gcc_unreachable ();
19689 if (orig_op0)
19690 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19693 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19694 straight to ix86_expand_vector_move. */
19695 /* Code generation for scalar reg-reg moves of single and double precision data:
19696 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19697 movaps reg, reg
19698 else
19699 movss reg, reg
19700 if (x86_sse_partial_reg_dependency == true)
19701 movapd reg, reg
19702 else
19703 movsd reg, reg
19705 Code generation for scalar loads of double precision data:
19706 if (x86_sse_split_regs == true)
19707 movlpd mem, reg (gas syntax)
19708 else
19709 movsd mem, reg
19711 Code generation for unaligned packed loads of single precision data
19712 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19713 if (x86_sse_unaligned_move_optimal)
19714 movups mem, reg
19716 if (x86_sse_partial_reg_dependency == true)
19718 xorps reg, reg
19719 movlps mem, reg
19720 movhps mem+8, reg
19722 else
19724 movlps mem, reg
19725 movhps mem+8, reg
19728 Code generation for unaligned packed loads of double precision data
19729 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19730 if (x86_sse_unaligned_move_optimal)
19731 movupd mem, reg
19733 if (x86_sse_split_regs == true)
19735 movlpd mem, reg
19736 movhpd mem+8, reg
19738 else
19740 movsd mem, reg
19741 movhpd mem+8, reg
19745 void
19746 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19748 rtx op0, op1, m;
19750 op0 = operands[0];
19751 op1 = operands[1];
19753 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19754 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19756 emit_insn (gen_rtx_SET (op0, op1));
19757 return;
19760 if (TARGET_AVX)
19762 if (GET_MODE_SIZE (mode) == 32)
19763 ix86_avx256_split_vector_move_misalign (op0, op1);
19764 else
19765 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19766 emit_insn (gen_rtx_SET (op0, op1));
19767 return;
19770 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19771 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19773 emit_insn (gen_rtx_SET (op0, op1));
19774 return;
19777 /* ??? If we have typed data, then it would appear that using
19778 movdqu is the only way to get unaligned data loaded with
19779 integer type. */
19780 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19782 emit_insn (gen_rtx_SET (op0, op1));
19783 return;
19786 if (MEM_P (op1))
19788 if (TARGET_SSE2 && mode == V2DFmode)
19790 rtx zero;
19792 /* When SSE registers are split into halves, we can avoid
19793 writing to the top half twice. */
19794 if (TARGET_SSE_SPLIT_REGS)
19796 emit_clobber (op0);
19797 zero = op0;
19799 else
19801 /* ??? Not sure about the best option for the Intel chips.
19802 The following would seem to satisfy; the register is
19803 entirely cleared, breaking the dependency chain. We
19804 then store to the upper half, with a dependency depth
19805 of one. A rumor has it that Intel recommends two movsd
19806 followed by an unpacklpd, but this is unconfirmed. And
19807 given that the dependency depth of the unpacklpd would
19808 still be one, I'm not sure why this would be better. */
19809 zero = CONST0_RTX (V2DFmode);
19812 m = adjust_address (op1, DFmode, 0);
19813 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19814 m = adjust_address (op1, DFmode, 8);
19815 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19817 else
19819 rtx t;
19821 if (mode != V4SFmode)
19822 t = gen_reg_rtx (V4SFmode);
19823 else
19824 t = op0;
19826 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19827 emit_move_insn (t, CONST0_RTX (V4SFmode));
19828 else
19829 emit_clobber (t);
19831 m = adjust_address (op1, V2SFmode, 0);
19832 emit_insn (gen_sse_loadlps (t, t, m));
19833 m = adjust_address (op1, V2SFmode, 8);
19834 emit_insn (gen_sse_loadhps (t, t, m));
19835 if (mode != V4SFmode)
19836 emit_move_insn (op0, gen_lowpart (mode, t));
19839 else if (MEM_P (op0))
19841 if (TARGET_SSE2 && mode == V2DFmode)
19843 m = adjust_address (op0, DFmode, 0);
19844 emit_insn (gen_sse2_storelpd (m, op1));
19845 m = adjust_address (op0, DFmode, 8);
19846 emit_insn (gen_sse2_storehpd (m, op1));
19848 else
19850 if (mode != V4SFmode)
19851 op1 = gen_lowpart (V4SFmode, op1);
19853 m = adjust_address (op0, V2SFmode, 0);
19854 emit_insn (gen_sse_storelps (m, op1));
19855 m = adjust_address (op0, V2SFmode, 8);
19856 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19859 else
19860 gcc_unreachable ();
19863 /* Helper function of ix86_fixup_binary_operands to canonicalize
19864 operand order. Returns true if the operands should be swapped. */
19866 static bool
19867 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19868 rtx operands[])
19870 rtx dst = operands[0];
19871 rtx src1 = operands[1];
19872 rtx src2 = operands[2];
19874 /* If the operation is not commutative, we can't do anything. */
19875 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19876 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19877 return false;
19879 /* Highest priority is that src1 should match dst. */
19880 if (rtx_equal_p (dst, src1))
19881 return false;
19882 if (rtx_equal_p (dst, src2))
19883 return true;
19885 /* Next highest priority is that immediate constants come second. */
19886 if (immediate_operand (src2, mode))
19887 return false;
19888 if (immediate_operand (src1, mode))
19889 return true;
19891 /* Lowest priority is that memory references should come second. */
19892 if (MEM_P (src2))
19893 return false;
19894 if (MEM_P (src1))
19895 return true;
19897 return false;
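/* Editorial illustration, not part of the original source: for a
   commutative operation such as (plus (mem A) (reg B)) where the
   destination is the same register as the second source, the operands
   are swapped so that src1 matches the destination and the memory
   operand ends up second, matching the reg/mem forms of the machine
   instructions.  */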
19901 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19902 destination to use for the operation. If different from the true
19903 destination in operands[0], a copy operation will be required. */
19906 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19907 rtx operands[])
19909 rtx dst = operands[0];
19910 rtx src1 = operands[1];
19911 rtx src2 = operands[2];
19913 /* Canonicalize operand order. */
19914 if (ix86_swap_binary_operands_p (code, mode, operands))
19916 /* It is invalid to swap operands of different modes. */
19917 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19919 std::swap (src1, src2);
19922 /* The two source operands cannot both be in memory. */
19923 if (MEM_P (src1) && MEM_P (src2))
19925 /* Optimization: Only read from memory once. */
19926 if (rtx_equal_p (src1, src2))
19928 src2 = force_reg (mode, src2);
19929 src1 = src2;
19931 else if (rtx_equal_p (dst, src1))
19932 src2 = force_reg (mode, src2);
19933 else
19934 src1 = force_reg (mode, src1);
19937 /* If the destination is memory, and we do not have matching source
19938 operands, do things in registers. */
19939 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19940 dst = gen_reg_rtx (mode);
19942 /* Source 1 cannot be a constant. */
19943 if (CONSTANT_P (src1))
19944 src1 = force_reg (mode, src1);
19946 /* Source 1 cannot be a non-matching memory. */
19947 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19948 src1 = force_reg (mode, src1);
19950 /* Improve address combine. */
19951 if (code == PLUS
19952 && GET_MODE_CLASS (mode) == MODE_INT
19953 && MEM_P (src2))
19954 src2 = force_reg (mode, src2);
19956 operands[1] = src1;
19957 operands[2] = src2;
19958 return dst;
19961 /* Similarly, but assume that the destination has already been
19962 set up properly. */
19964 void
19965 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19966 machine_mode mode, rtx operands[])
19968 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19969 gcc_assert (dst == operands[0]);
19972 /* Attempt to expand a binary operator. Make the expansion closer to the
19973 actual machine than just general_operand, which would allow 3 separate
19974 memory references (one output, two inputs) in a single insn. */
19976 void
19977 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19978 rtx operands[])
19980 rtx src1, src2, dst, op, clob;
19982 dst = ix86_fixup_binary_operands (code, mode, operands);
19983 src1 = operands[1];
19984 src2 = operands[2];
19986 /* Emit the instruction. */
19988 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19990 if (reload_completed
19991 && code == PLUS
19992 && !rtx_equal_p (dst, src1))
19994 /* This is going to be an LEA; avoid splitting it later. */
19995 emit_insn (op);
19997 else
19999 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20000 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20003 /* Fix up the destination if needed. */
20004 if (dst != operands[0])
20005 emit_move_insn (operands[0], dst);
20008 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20009 the given OPERANDS. */
20011 void
20012 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20013 rtx operands[])
20015 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20016 if (SUBREG_P (operands[1]))
20018 op1 = operands[1];
20019 op2 = operands[2];
20021 else if (SUBREG_P (operands[2]))
20023 op1 = operands[2];
20024 op2 = operands[1];
20026 /* Optimize (__m128i) d | (__m128i) e and similar code
20027 when d and e are float vectors into a float vector logical
20028 insn. In C/C++, without using intrinsics there is no other way
20029 to express a vector logical operation on float vectors than
20030 to cast them temporarily to integer vectors. */
20031 if (op1
20032 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20033 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20034 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20035 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20036 && SUBREG_BYTE (op1) == 0
20037 && (GET_CODE (op2) == CONST_VECTOR
20038 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20039 && SUBREG_BYTE (op2) == 0))
20040 && can_create_pseudo_p ())
20042 rtx dst;
20043 switch (GET_MODE (SUBREG_REG (op1)))
20045 case E_V4SFmode:
20046 case E_V8SFmode:
20047 case E_V16SFmode:
20048 case E_V2DFmode:
20049 case E_V4DFmode:
20050 case E_V8DFmode:
20051 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20052 if (GET_CODE (op2) == CONST_VECTOR)
20054 op2 = gen_lowpart (GET_MODE (dst), op2);
20055 op2 = force_reg (GET_MODE (dst), op2);
20057 else
20059 op1 = operands[1];
20060 op2 = SUBREG_REG (operands[2]);
20061 if (!vector_operand (op2, GET_MODE (dst)))
20062 op2 = force_reg (GET_MODE (dst), op2);
20064 op1 = SUBREG_REG (op1);
20065 if (!vector_operand (op1, GET_MODE (dst)))
20066 op1 = force_reg (GET_MODE (dst), op1);
20067 emit_insn (gen_rtx_SET (dst,
20068 gen_rtx_fmt_ee (code, GET_MODE (dst),
20069 op1, op2)));
20070 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20071 return;
20072 default:
20073 break;
20076 if (!vector_operand (operands[1], mode))
20077 operands[1] = force_reg (mode, operands[1]);
20078 if (!vector_operand (operands[2], mode))
20079 operands[2] = force_reg (mode, operands[2]);
20080 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20081 emit_insn (gen_rtx_SET (operands[0],
20082 gen_rtx_fmt_ee (code, mode, operands[1],
20083 operands[2])));
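/* Editorial illustration, not part of the original source: the kind of
   user code the cast-folding above targets, written with the GNU vector
   extension.  The type and function names are illustrative only.  */
typedef float ed_v4sf __attribute__ ((vector_size (16)));
typedef int   ed_v4si __attribute__ ((vector_size (16)));

static ed_v4sf
ed_or_float_vectors (ed_v4sf a, ed_v4sf b)
{
  /* Without intrinsics, ORing two float vectors requires a temporary
     cast to an integer vector type; the expander above recognizes this
     pattern and can emit a float-vector logical insn (e.g. orps)
     instead of the integer form.  */
  return (ed_v4sf) ((ed_v4si) a | (ed_v4si) b);
}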
20086 /* Return TRUE or FALSE depending on whether the binary operator meets the
20087 appropriate constraints. */
20089 bool
20090 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20091 rtx operands[3])
20093 rtx dst = operands[0];
20094 rtx src1 = operands[1];
20095 rtx src2 = operands[2];
20097 /* The two source operands cannot both be in memory. */
20098 if (MEM_P (src1) && MEM_P (src2))
20099 return false;
20101 /* Canonicalize operand order for commutative operators. */
20102 if (ix86_swap_binary_operands_p (code, mode, operands))
20103 std::swap (src1, src2);
20105 /* If the destination is memory, we must have a matching source operand. */
20106 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20107 return false;
20109 /* Source 1 cannot be a constant. */
20110 if (CONSTANT_P (src1))
20111 return false;
20113 /* Source 1 cannot be a non-matching memory. */
20114 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20115 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20116 return (code == AND
20117 && (mode == HImode
20118 || mode == SImode
20119 || (TARGET_64BIT && mode == DImode))
20120 && satisfies_constraint_L (src2));
20122 return true;
20125 /* Attempt to expand a unary operator. Make the expansion closer to the
20126 actual machine than just general_operand, which would allow 2 separate
20127 memory references (one output, one input) in a single insn. */
20129 void
20130 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20131 rtx operands[])
20133 bool matching_memory = false;
20134 rtx src, dst, op, clob;
20136 dst = operands[0];
20137 src = operands[1];
20139 /* If the destination is memory, and we do not have matching source
20140 operands, do things in registers. */
20141 if (MEM_P (dst))
20143 if (rtx_equal_p (dst, src))
20144 matching_memory = true;
20145 else
20146 dst = gen_reg_rtx (mode);
20149 /* When source operand is memory, destination must match. */
20150 if (MEM_P (src) && !matching_memory)
20151 src = force_reg (mode, src);
20153 /* Emit the instruction. */
20155 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20157 if (code == NOT)
20158 emit_insn (op);
20159 else
20161 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20162 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20165 /* Fix up the destination if needed. */
20166 if (dst != operands[0])
20167 emit_move_insn (operands[0], dst);
20170 /* Split a 32-bit/64-bit divmod with an 8-bit unsigned divmod if the
20171 dividend and divisor are both within the range [0-255]. */
20173 void
20174 ix86_split_idivmod (machine_mode mode, rtx operands[],
20175 bool signed_p)
20177 rtx_code_label *end_label, *qimode_label;
20178 rtx div, mod;
20179 rtx_insn *insn;
20180 rtx scratch, tmp0, tmp1, tmp2;
20181 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20182 rtx (*gen_zero_extend) (rtx, rtx);
20183 rtx (*gen_test_ccno_1) (rtx, rtx);
20185 switch (mode)
20187 case E_SImode:
20188 if (GET_MODE (operands[0]) == SImode)
20190 if (GET_MODE (operands[1]) == SImode)
20191 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20192 else
20193 gen_divmod4_1
20194 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20195 gen_zero_extend = gen_zero_extendqisi2;
20197 else
20199 gen_divmod4_1
20200 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20201 gen_zero_extend = gen_zero_extendqidi2;
20203 gen_test_ccno_1 = gen_testsi_ccno_1;
20204 break;
20205 case E_DImode:
20206 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20207 gen_test_ccno_1 = gen_testdi_ccno_1;
20208 gen_zero_extend = gen_zero_extendqidi2;
20209 break;
20210 default:
20211 gcc_unreachable ();
20214 end_label = gen_label_rtx ();
20215 qimode_label = gen_label_rtx ();
20217 scratch = gen_reg_rtx (mode);
20219 /* Use 8-bit unsigned divmod if the dividend and divisor are within
20220 the range [0-255]. */
20221 emit_move_insn (scratch, operands[2]);
20222 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20223 scratch, 1, OPTAB_DIRECT);
20224 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20225 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20226 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20227 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20228 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20229 pc_rtx);
20230 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20231 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20232 JUMP_LABEL (insn) = qimode_label;
20234 /* Generate the original signed/unsigned divmod. */
20235 div = gen_divmod4_1 (operands[0], operands[1],
20236 operands[2], operands[3]);
20237 emit_insn (div);
20239 /* Branch to the end. */
20240 emit_jump_insn (gen_jump (end_label));
20241 emit_barrier ();
20243 /* Generate the 8-bit unsigned divide. */
20244 emit_label (qimode_label);
20245 /* Don't use operands[0] for the result of the 8-bit divide since not
20246 all registers support QImode ZERO_EXTRACT. */
20247 tmp0 = lowpart_subreg (HImode, scratch, mode);
20248 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20249 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20250 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20252 if (signed_p)
20254 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20255 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20257 else
20259 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20260 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20262 if (mode == SImode)
20264 if (GET_MODE (operands[0]) != SImode)
20265 div = gen_rtx_ZERO_EXTEND (DImode, div);
20266 if (GET_MODE (operands[1]) != SImode)
20267 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20270 /* Extract remainder from AH. */
20271 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20272 tmp0, GEN_INT (8), GEN_INT (8));
20273 if (REG_P (operands[1]))
20274 insn = emit_move_insn (operands[1], tmp1);
20275 else
20277 /* Need a new scratch register since the old one holds the result
20278 of the 8-bit divide. */
20279 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20280 emit_move_insn (scratch, tmp1);
20281 insn = emit_move_insn (operands[1], scratch);
20283 set_unique_reg_note (insn, REG_EQUAL, mod);
20285 /* Zero extend quotient from AL. */
20286 tmp1 = gen_lowpart (QImode, tmp0);
20287 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20288 set_unique_reg_note (insn, REG_EQUAL, div);
20290 emit_label (end_label);
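/* Editorial sketch, not part of the original source: the shape of the
   split above for the unsigned SImode case, written as plain C.  The
   function name is illustrative only.  */
static void
udivmodsi_split_sketch (unsigned int a, unsigned int b,
                        unsigned int *quo, unsigned int *rem)
{
  if (((a | b) & ~0xffU) == 0)
    {
      /* Both values fit in 8 bits: a single 16-by-8-bit DIV yields
         the quotient in AL and the remainder in AH.  */
      *quo = (unsigned char) a / (unsigned char) b;
      *rem = (unsigned char) a % (unsigned char) b;
    }
  else
    {
      /* Otherwise fall through to the full-width divide.  */
      *quo = a / b;
      *rem = a % b;
    }
}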
20293 #define LEA_MAX_STALL (3)
20294 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20296 /* Increase given DISTANCE in half-cycles according to
20297 dependencies between PREV and NEXT instructions.
20298 Add 1 half-cycle if there is no dependency and
20299 go to the next cycle if there is a dependency. */
20301 static unsigned int
20302 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20304 df_ref def, use;
20306 if (!prev || !next)
20307 return distance + (distance & 1) + 2;
20309 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20310 return distance + 1;
20312 FOR_EACH_INSN_USE (use, next)
20313 FOR_EACH_INSN_DEF (def, prev)
20314 if (!DF_REF_IS_ARTIFICIAL (def)
20315 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20316 return distance + (distance & 1) + 2;
20318 return distance + 1;
20321 /* Check whether instruction INSN defines register number
20322 REGNO1 or REGNO2. */
20324 static bool
20325 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20326 rtx_insn *insn)
20328 df_ref def;
20330 FOR_EACH_INSN_DEF (def, insn)
20331 if (DF_REF_REG_DEF_P (def)
20332 && !DF_REF_IS_ARTIFICIAL (def)
20333 && (regno1 == DF_REF_REGNO (def)
20334 || regno2 == DF_REF_REGNO (def)))
20335 return true;
20337 return false;
20340 /* Check whether instruction INSN uses register number
20341 REGNO as part of an address expression. */
20343 static bool
20344 insn_uses_reg_mem (unsigned int regno, rtx insn)
20346 df_ref use;
20348 FOR_EACH_INSN_USE (use, insn)
20349 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20350 return true;
20352 return false;
20355 /* Search backward for a non-AGU definition of register number REGNO1
20356 or REGNO2 in the basic block, starting from instruction START and
20357 going up to the head of the basic block or to instruction INSN.
20359 Set *FOUND to true if a definition was found and to false otherwise.
20362 The distance in half-cycles between START and the found instruction,
20363 or the head of the BB, is added to DISTANCE and returned. */
20365 static int
20366 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20367 rtx_insn *insn, int distance,
20368 rtx_insn *start, bool *found)
20370 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20371 rtx_insn *prev = start;
20372 rtx_insn *next = NULL;
20374 *found = false;
20376 while (prev
20377 && prev != insn
20378 && distance < LEA_SEARCH_THRESHOLD)
20380 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20382 distance = increase_distance (prev, next, distance);
20383 if (insn_defines_reg (regno1, regno2, prev))
20385 if (recog_memoized (prev) < 0
20386 || get_attr_type (prev) != TYPE_LEA)
20388 *found = true;
20389 return distance;
20393 next = prev;
20395 if (prev == BB_HEAD (bb))
20396 break;
20398 prev = PREV_INSN (prev);
20401 return distance;
20404 /* Search backward for a non-AGU definition of register number REGNO1
20405 or REGNO2 in INSN's basic block until we
20406 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20407 2. Reach a neighboring BB boundary, or
20408 3. Reach an AGU definition.
20409 Return the distance between the non-AGU definition point and INSN.
20410 If there is no definition point, return -1. */
20412 static int
20413 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20414 rtx_insn *insn)
20416 basic_block bb = BLOCK_FOR_INSN (insn);
20417 int distance = 0;
20418 bool found = false;
20420 if (insn != BB_HEAD (bb))
20421 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20422 distance, PREV_INSN (insn),
20423 &found);
20425 if (!found && distance < LEA_SEARCH_THRESHOLD)
20427 edge e;
20428 edge_iterator ei;
20429 bool simple_loop = false;
20431 FOR_EACH_EDGE (e, ei, bb->preds)
20432 if (e->src == bb)
20434 simple_loop = true;
20435 break;
20438 if (simple_loop)
20439 distance = distance_non_agu_define_in_bb (regno1, regno2,
20440 insn, distance,
20441 BB_END (bb), &found);
20442 else
20444 int shortest_dist = -1;
20445 bool found_in_bb = false;
20447 FOR_EACH_EDGE (e, ei, bb->preds)
20449 int bb_dist
20450 = distance_non_agu_define_in_bb (regno1, regno2,
20451 insn, distance,
20452 BB_END (e->src),
20453 &found_in_bb);
20454 if (found_in_bb)
20456 if (shortest_dist < 0)
20457 shortest_dist = bb_dist;
20458 else if (bb_dist > 0)
20459 shortest_dist = MIN (bb_dist, shortest_dist);
20461 found = true;
20465 distance = shortest_dist;
20469 /* get_attr_type may modify recog data. We want to make sure
20470 that recog data is valid for instruction INSN, on which
20471 distance_non_agu_define is called. INSN is unchanged here. */
20472 extract_insn_cached (insn);
20474 if (!found)
20475 return -1;
20477 return distance >> 1;
20480 /* Return the distance in half-cycles between INSN and the next
20481 insn that uses register number REGNO in a memory address, added
20482 to DISTANCE. Return -1 if REGNO is set.
20484 Set *FOUND to true if a register use was found and to
20485 false otherwise.
20486 Set *REDEFINED to true if a register redefinition was
20487 found and to false otherwise. */
20489 static int
20490 distance_agu_use_in_bb (unsigned int regno,
20491 rtx_insn *insn, int distance, rtx_insn *start,
20492 bool *found, bool *redefined)
20494 basic_block bb = NULL;
20495 rtx_insn *next = start;
20496 rtx_insn *prev = NULL;
20498 *found = false;
20499 *redefined = false;
20501 if (start != NULL_RTX)
20503 bb = BLOCK_FOR_INSN (start);
20504 if (start != BB_HEAD (bb))
20505 /* If insn and start belong to the same bb, set prev to insn,
20506 so the call to increase_distance will increase the distance
20507 between insns by 1. */
20508 prev = insn;
20511 while (next
20512 && next != insn
20513 && distance < LEA_SEARCH_THRESHOLD)
20515 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20517 distance = increase_distance(prev, next, distance);
20518 if (insn_uses_reg_mem (regno, next))
20520 /* Return DISTANCE if OP0 is used in memory
20521 address in NEXT. */
20522 *found = true;
20523 return distance;
20526 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20528 /* Return -1 if OP0 is set in NEXT. */
20529 *redefined = true;
20530 return -1;
20533 prev = next;
20536 if (next == BB_END (bb))
20537 break;
20539 next = NEXT_INSN (next);
20542 return distance;
20545 /* Return the distance between INSN and the next insn that uses
20546 register number REGNO0 in a memory address. Return -1 if no such
20547 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
20549 static int
20550 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20552 basic_block bb = BLOCK_FOR_INSN (insn);
20553 int distance = 0;
20554 bool found = false;
20555 bool redefined = false;
20557 if (insn != BB_END (bb))
20558 distance = distance_agu_use_in_bb (regno0, insn, distance,
20559 NEXT_INSN (insn),
20560 &found, &redefined);
20562 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20564 edge e;
20565 edge_iterator ei;
20566 bool simple_loop = false;
20568 FOR_EACH_EDGE (e, ei, bb->succs)
20569 if (e->dest == bb)
20571 simple_loop = true;
20572 break;
20575 if (simple_loop)
20576 distance = distance_agu_use_in_bb (regno0, insn,
20577 distance, BB_HEAD (bb),
20578 &found, &redefined);
20579 else
20581 int shortest_dist = -1;
20582 bool found_in_bb = false;
20583 bool redefined_in_bb = false;
20585 FOR_EACH_EDGE (e, ei, bb->succs)
20587 int bb_dist
20588 = distance_agu_use_in_bb (regno0, insn,
20589 distance, BB_HEAD (e->dest),
20590 &found_in_bb, &redefined_in_bb);
20591 if (found_in_bb)
20593 if (shortest_dist < 0)
20594 shortest_dist = bb_dist;
20595 else if (bb_dist > 0)
20596 shortest_dist = MIN (bb_dist, shortest_dist);
20598 found = true;
20602 distance = shortest_dist;
20606 if (!found || redefined)
20607 return -1;
20609 return distance >> 1;
20612 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20613 there is a choice between LEA and ADD:
20614 Negative value: ADD is preferred over LEA
20615 Zero: Neutral
20616 Positive value: LEA is preferred over ADD. */
20617 #define IX86_LEA_PRIORITY 0
20619 /* Return true if using the lea INSN has a performance advantage
20620 over an equivalent sequence of instructions. The instruction
20621 sequence has SPLIT_COST cycles higher latency than the lea. */
20623 static bool
20624 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20625 unsigned int regno2, int split_cost, bool has_scale)
20627 int dist_define, dist_use;
20629 /* For Silvermont, if a 2-source or 3-source LEA is used for a
20630 non-destructive destination, or in order to be able to use
20631 SCALE, the use of LEA is justified. */
20632 if (TARGET_SILVERMONT || TARGET_INTEL)
20634 if (has_scale)
20635 return true;
20636 if (split_cost < 1)
20637 return false;
20638 if (regno0 == regno1 || regno0 == regno2)
20639 return false;
20640 return true;
20643 dist_define = distance_non_agu_define (regno1, regno2, insn);
20644 dist_use = distance_agu_use (regno0, insn);
20646 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20648 /* If there is no non-AGU operand definition, no AGU
20649 operand use and the split cost is 0, then both the lea
20650 and non-lea variants have the same priority. Currently
20651 we prefer lea for 64-bit code and non-lea for 32-bit
20652 code. */
20653 if (dist_use < 0 && split_cost == 0)
20654 return TARGET_64BIT || IX86_LEA_PRIORITY;
20655 else
20656 return true;
20659 /* With a longer definition distance, lea is preferable.
20660 Adjust the distance to take the splitting cost and the
20661 lea priority into account. */
20662 dist_define += split_cost + IX86_LEA_PRIORITY;
20664 /* If there is no use in a memory address then we just check
20665 that the split cost exceeds the AGU stall. */
20666 if (dist_use < 0)
20667 return dist_define > LEA_MAX_STALL;
20669 /* If this insn has both a backward non-AGU dependence and a forward
20670 AGU dependence, the one with the shorter distance wins. */
20671 return dist_define >= dist_use;
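/* Worked example (editorial, not part of the original source): if
   distance_non_agu_define returns 2, SPLIT_COST is 1 and
   distance_agu_use returns 4, the adjusted definition distance is
   2 + 1 + IX86_LEA_PRIORITY = 3, which is less than 4, so the function
   returns false and the caller splits the lea into plain ALU
   instructions.  */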
20674 /* Return true if it is legal to clobber flags by INSN and
20675 false otherwise. */
20677 static bool
20678 ix86_ok_to_clobber_flags (rtx_insn *insn)
20680 basic_block bb = BLOCK_FOR_INSN (insn);
20681 df_ref use;
20682 bitmap live;
20684 while (insn)
20686 if (NONDEBUG_INSN_P (insn))
20688 FOR_EACH_INSN_USE (use, insn)
20689 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20690 return false;
20692 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20693 return true;
20696 if (insn == BB_END (bb))
20697 break;
20699 insn = NEXT_INSN (insn);
20702 live = df_get_live_out(bb);
20703 return !REGNO_REG_SET_P (live, FLAGS_REG);
20706 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20707 move and add to avoid AGU stalls. */
20709 bool
20710 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20712 unsigned int regno0, regno1, regno2;
20714 /* Check if we need to optimize. */
20715 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20716 return false;
20718 /* Check it is correct to split here. */
20719 if (!ix86_ok_to_clobber_flags(insn))
20720 return false;
20722 regno0 = true_regnum (operands[0]);
20723 regno1 = true_regnum (operands[1]);
20724 regno2 = true_regnum (operands[2]);
20726 /* We only need to split adds with a non-destructive
20727 destination operand. */
20728 if (regno0 == regno1 || regno0 == regno2)
20729 return false;
20730 else
20731 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20734 /* Return true if we should emit lea instruction instead of mov
20735 instruction. */
20737 bool
20738 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20740 unsigned int regno0, regno1;
20742 /* Check if we need to optimize. */
20743 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20744 return false;
20746 /* Use lea for reg to reg moves only. */
20747 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20748 return false;
20750 regno0 = true_regnum (operands[0]);
20751 regno1 = true_regnum (operands[1]);
20753 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20756 /* Return true if we need to split lea into a sequence of
20757 instructions to avoid AGU stalls. */
20759 bool
20760 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20762 unsigned int regno0, regno1, regno2;
20763 int split_cost;
20764 struct ix86_address parts;
20765 int ok;
20767 /* Check we need to optimize. */
20768 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20769 return false;
20771 /* The "at least two components" test below might not catch simple
20772 move or zero extension insns if parts.base is non-NULL and parts.disp
20773 is const0_rtx as the only components in the address, e.g. if the
20774 register is %rbp or %r13. As this test is much cheaper and moves or
20775 zero extensions are the common case, do this check first. */
20776 if (REG_P (operands[1])
20777 || (SImode_address_operand (operands[1], VOIDmode)
20778 && REG_P (XEXP (operands[1], 0))))
20779 return false;
20781 /* Check if it is OK to split here. */
20782 if (!ix86_ok_to_clobber_flags (insn))
20783 return false;
20785 ok = ix86_decompose_address (operands[1], &parts);
20786 gcc_assert (ok);
20788 /* There should be at least two components in the address. */
20789 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20790 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20791 return false;
20793 /* We should not split into add if a non-legitimate PIC
20794 operand is used as the displacement. */
20795 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20796 return false;
20798 regno0 = true_regnum (operands[0]) ;
20799 regno1 = INVALID_REGNUM;
20800 regno2 = INVALID_REGNUM;
20802 if (parts.base)
20803 regno1 = true_regnum (parts.base);
20804 if (parts.index)
20805 regno2 = true_regnum (parts.index);
20807 split_cost = 0;
20809 /* Compute how many cycles we will add to the execution time
20810 if we split the lea into a sequence of instructions. */
20811 if (parts.base || parts.index)
20813 /* Have to use a mov instruction if the non-destructive
20814 destination form is used. */
20815 if (regno1 != regno0 && regno2 != regno0)
20816 split_cost += 1;
20818 /* Have to add index to base if both exist. */
20819 if (parts.base && parts.index)
20820 split_cost += 1;
20822 /* Have to use shift and adds if scale is 2 or greater. */
20823 if (parts.scale > 1)
20825 if (regno0 != regno1)
20826 split_cost += 1;
20827 else if (regno2 == regno0)
20828 split_cost += 4;
20829 else
20830 split_cost += parts.scale;
20833 /* Have to use add instruction with immediate if
20834 disp is non zero. */
20835 if (parts.disp && parts.disp != const0_rtx)
20836 split_cost += 1;
20838 /* Subtract the price of lea. */
20839 split_cost -= 1;
20842 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20843 parts.scale > 1);
20846 /* Emit x86 binary operator CODE in mode MODE, where the first operand
20847 matches the destination. The RTX includes a clobber of FLAGS_REG. */
20849 static void
20850 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20851 rtx dst, rtx src)
20853 rtx op, clob;
20855 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20856 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20858 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20861 /* Return true if regno1 def is nearest to the insn. */
20863 static bool
20864 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20866 rtx_insn *prev = insn;
20867 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20869 if (insn == start)
20870 return false;
20871 while (prev && prev != start)
20873 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20875 prev = PREV_INSN (prev);
20876 continue;
20878 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20879 return true;
20880 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20881 return false;
20882 prev = PREV_INSN (prev);
20885 /* None of the regs is defined in the bb. */
20886 return false;
20889 /* Split lea instructions into a sequence of instructions
20890 which are executed on the ALU to avoid AGU stalls.
20891 It is assumed that the flags register may be clobbered
20892 at the lea position. */
20894 void
20895 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20897 unsigned int regno0, regno1, regno2;
20898 struct ix86_address parts;
20899 rtx target, tmp;
20900 int ok, adds;
20902 ok = ix86_decompose_address (operands[1], &parts);
20903 gcc_assert (ok);
20905 target = gen_lowpart (mode, operands[0]);
20907 regno0 = true_regnum (target);
20908 regno1 = INVALID_REGNUM;
20909 regno2 = INVALID_REGNUM;
20911 if (parts.base)
20913 parts.base = gen_lowpart (mode, parts.base);
20914 regno1 = true_regnum (parts.base);
20917 if (parts.index)
20919 parts.index = gen_lowpart (mode, parts.index);
20920 regno2 = true_regnum (parts.index);
20923 if (parts.disp)
20924 parts.disp = gen_lowpart (mode, parts.disp);
20926 if (parts.scale > 1)
20928 /* Case r1 = r1 + ... */
20929 if (regno1 == regno0)
20931 /* If we have the case r1 = r1 + C * r2 then we would have
20932 to use multiplication, which is very expensive. Assume
20933 the cost model is wrong if we get such a case here. */
20935 gcc_assert (regno2 != regno0);
20937 for (adds = parts.scale; adds > 0; adds--)
20938 ix86_emit_binop (PLUS, mode, target, parts.index);
20940 else
20942 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20943 if (regno0 != regno2)
20944 emit_insn (gen_rtx_SET (target, parts.index));
20946 /* Use shift for scaling. */
20947 ix86_emit_binop (ASHIFT, mode, target,
20948 GEN_INT (exact_log2 (parts.scale)));
20950 if (parts.base)
20951 ix86_emit_binop (PLUS, mode, target, parts.base);
20953 if (parts.disp && parts.disp != const0_rtx)
20954 ix86_emit_binop (PLUS, mode, target, parts.disp);
20957 else if (!parts.base && !parts.index)
20959 gcc_assert(parts.disp);
20960 emit_insn (gen_rtx_SET (target, parts.disp));
20962 else
20964 if (!parts.base)
20966 if (regno0 != regno2)
20967 emit_insn (gen_rtx_SET (target, parts.index));
20969 else if (!parts.index)
20971 if (regno0 != regno1)
20972 emit_insn (gen_rtx_SET (target, parts.base));
20974 else
20976 if (regno0 == regno1)
20977 tmp = parts.index;
20978 else if (regno0 == regno2)
20979 tmp = parts.base;
20980 else
20982 rtx tmp1;
20984 /* Find better operand for SET instruction, depending
20985 on which definition is farther from the insn. */
20986 if (find_nearest_reg_def (insn, regno1, regno2))
20987 tmp = parts.index, tmp1 = parts.base;
20988 else
20989 tmp = parts.base, tmp1 = parts.index;
20991 emit_insn (gen_rtx_SET (target, tmp));
20993 if (parts.disp && parts.disp != const0_rtx)
20994 ix86_emit_binop (PLUS, mode, target, parts.disp);
20996 ix86_emit_binop (PLUS, mode, target, tmp1);
20997 return;
21000 ix86_emit_binop (PLUS, mode, target, tmp);
21003 if (parts.disp && parts.disp != const0_rtx)
21004 ix86_emit_binop (PLUS, mode, target, parts.disp);
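/* Editorial illustration, not part of the original source: for a lea
   whose destination differs from both base and index, e.g.

       leaq 4(%rbx,%rcx,4), %rax

   the splitter above emits roughly

       movq %rcx, %rax          # copy the index
       salq $2, %rax            # apply the scale with a shift
       addq %rbx, %rax          # add the base
       addq $4, %rax            # add the displacement  */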
21008 /* Return true if it is ok to optimize an ADD operation to an LEA
21009 operation to avoid flag register consumption. For most processors,
21010 ADD is faster than LEA. For processors like BONNELL, if the
21011 destination register of the LEA holds an actual address which will
21012 be used soon, LEA is better, and otherwise ADD is better. */
21014 bool
21015 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21017 unsigned int regno0 = true_regnum (operands[0]);
21018 unsigned int regno1 = true_regnum (operands[1]);
21019 unsigned int regno2 = true_regnum (operands[2]);
21021 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21022 if (regno0 != regno1 && regno0 != regno2)
21023 return true;
21025 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21026 return false;
21028 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21031 /* Return true if destination reg of SET_BODY is shift count of
21032 USE_BODY. */
21034 static bool
21035 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21037 rtx set_dest;
21038 rtx shift_rtx;
21039 int i;
21041 /* Retrieve destination of SET_BODY. */
21042 switch (GET_CODE (set_body))
21044 case SET:
21045 set_dest = SET_DEST (set_body);
21046 if (!set_dest || !REG_P (set_dest))
21047 return false;
21048 break;
21049 case PARALLEL:
21050 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21051 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21052 use_body))
21053 return true;
21054 /* FALLTHROUGH */
21055 default:
21056 return false;
21059 /* Retrieve shift count of USE_BODY. */
21060 switch (GET_CODE (use_body))
21062 case SET:
21063 shift_rtx = XEXP (use_body, 1);
21064 break;
21065 case PARALLEL:
21066 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21067 if (ix86_dep_by_shift_count_body (set_body,
21068 XVECEXP (use_body, 0, i)))
21069 return true;
21070 /* FALLTHROUGH */
21071 default:
21072 return false;
21075 if (shift_rtx
21076 && (GET_CODE (shift_rtx) == ASHIFT
21077 || GET_CODE (shift_rtx) == LSHIFTRT
21078 || GET_CODE (shift_rtx) == ASHIFTRT
21079 || GET_CODE (shift_rtx) == ROTATE
21080 || GET_CODE (shift_rtx) == ROTATERT))
21082 rtx shift_count = XEXP (shift_rtx, 1);
21084 /* Return true if shift count is dest of SET_BODY. */
21085 if (REG_P (shift_count))
21087 /* Add this check since the function can be invoked before register
21088 allocation by the pre-reload scheduler. */
21089 if (reload_completed
21090 && true_regnum (set_dest) == true_regnum (shift_count))
21091 return true;
21092 else if (REGNO(set_dest) == REGNO(shift_count))
21093 return true;
21097 return false;
21100 /* Return true if destination reg of SET_INSN is shift count of
21101 USE_INSN. */
21103 bool
21104 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21106 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21107 PATTERN (use_insn));
21110 /* Return TRUE or FALSE depending on whether the unary operator meets the
21111 appropriate constraints. */
21113 bool
21114 ix86_unary_operator_ok (enum rtx_code,
21115 machine_mode,
21116 rtx operands[2])
21118 /* If one of the operands is memory, source and destination must match. */
21119 if ((MEM_P (operands[0])
21120 || MEM_P (operands[1]))
21121 && ! rtx_equal_p (operands[0], operands[1]))
21122 return false;
21123 return true;
21126 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21127 are ok, keeping in mind the possible movddup alternative. */
21129 bool
21130 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21132 if (MEM_P (operands[0]))
21133 return rtx_equal_p (operands[0], operands[1 + high]);
21134 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21135 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21136 return true;
21139 /* Post-reload splitter for converting an SF or DFmode value in an
21140 SSE register into an unsigned SImode value. */
21142 void
21143 ix86_split_convert_uns_si_sse (rtx operands[])
21145 machine_mode vecmode;
21146 rtx value, large, zero_or_two31, input, two31, x;
21148 large = operands[1];
21149 zero_or_two31 = operands[2];
21150 input = operands[3];
21151 two31 = operands[4];
21152 vecmode = GET_MODE (large);
21153 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21155 /* Load up the value into the low element. We must ensure that the other
21156 elements are valid floats -- zero is the easiest such value. */
21157 if (MEM_P (input))
21159 if (vecmode == V4SFmode)
21160 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21161 else
21162 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21164 else
21166 input = gen_rtx_REG (vecmode, REGNO (input));
21167 emit_move_insn (value, CONST0_RTX (vecmode));
21168 if (vecmode == V4SFmode)
21169 emit_insn (gen_sse_movss (value, value, input));
21170 else
21171 emit_insn (gen_sse2_movsd (value, value, input));
21174 emit_move_insn (large, two31);
21175 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21177 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21178 emit_insn (gen_rtx_SET (large, x));
21180 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21181 emit_insn (gen_rtx_SET (zero_or_two31, x));
21183 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21184 emit_insn (gen_rtx_SET (value, x));
21186 large = gen_rtx_REG (V4SImode, REGNO (large));
21187 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21189 x = gen_rtx_REG (V4SImode, REGNO (value));
21190 if (vecmode == V4SFmode)
21191 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21192 else
21193 emit_insn (gen_sse2_cvttpd2dq (x, value));
21194 value = x;
21196 emit_insn (gen_xorv4si3 (value, value, large));
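/* Editorial sketch, not part of the original source: a scalar C model
   of the unsigned conversion above, assuming <stdint.h> and the usual
   truncating (round-toward-zero) float-to-int conversion.  The function
   name is illustrative only.  */
#include <stdint.h>

static uint32_t
double_to_uns32_sketch (double v)
{
  int big = v >= 0x1.0p31;                         /* operand >= 2^31?           */
  double adj = v - (big ? 0x1.0p31 : 0.0);         /* bring it into signed range */
  int32_t t = (int32_t) adj;                       /* the cvtt... truncation     */
  return (uint32_t) t ^ (big ? 0x80000000u : 0u);  /* put the top bit back       */
}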
21199 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21200 Expects the 64-bit DImode to be supplied in a pair of integral
21201 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21202 -mfpmath=sse, !optimize_size only. */
21204 void
21205 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21207 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21208 rtx int_xmm, fp_xmm;
21209 rtx biases, exponents;
21210 rtx x;
21212 int_xmm = gen_reg_rtx (V4SImode);
21213 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21214 emit_insn (gen_movdi_to_sse (int_xmm, input));
21215 else if (TARGET_SSE_SPLIT_REGS)
21217 emit_clobber (int_xmm);
21218 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21220 else
21222 x = gen_reg_rtx (V2DImode);
21223 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21224 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21227 x = gen_rtx_CONST_VECTOR (V4SImode,
21228 gen_rtvec (4, GEN_INT (0x43300000UL),
21229 GEN_INT (0x45300000UL),
21230 const0_rtx, const0_rtx));
21231 exponents = validize_mem (force_const_mem (V4SImode, x));
21233 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21234 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21236 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21237 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21238 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21239 (0x1.0p84 + double(fp_value_hi_xmm)).
21240 Note these exponents differ by 32. */
21242 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21244 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21245 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21246 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21247 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21248 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21249 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21250 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21251 biases = validize_mem (force_const_mem (V2DFmode, biases));
21252 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21254 /* Add the upper and lower DFmode values together. */
21255 if (TARGET_SSE3)
21256 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21257 else
21259 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21260 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21261 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21264 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21267 /* Not used, but eases macroization of patterns. */
21268 void
21269 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21271 gcc_unreachable ();
21274 /* Convert an unsigned SImode value into a DFmode. Only currently used
21275 for SSE, but applicable anywhere. */
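/* The idea, sketched as scalar C (an illustration only; it assumes 32-bit
   two's complement wrap-around):

     int32_t biased = (int32_t) (input - 0x80000000u);
     double  res    = (double) biased + 0x1.0p31;

   Subtracting 0x80000000 flips the sign bit so the value fits the signed
   range; adding 0x1.0p31 afterwards undoes the bias exactly.  */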
21277 void
21278 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21280 REAL_VALUE_TYPE TWO31r;
21281 rtx x, fp;
21283 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21284 NULL, 1, OPTAB_DIRECT);
21286 fp = gen_reg_rtx (DFmode);
21287 emit_insn (gen_floatsidf2 (fp, x));
21289 real_ldexp (&TWO31r, &dconst1, 31);
21290 x = const_double_from_real_value (TWO31r, DFmode);
21292 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21293 if (x != target)
21294 emit_move_insn (target, x);
21297 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21298 32-bit mode; otherwise we have a direct convert instruction. */
21300 void
21301 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21303 REAL_VALUE_TYPE TWO32r;
21304 rtx fp_lo, fp_hi, x;
21306 fp_lo = gen_reg_rtx (DFmode);
21307 fp_hi = gen_reg_rtx (DFmode);
21309 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21311 real_ldexp (&TWO32r, &dconst1, 32);
21312 x = const_double_from_real_value (TWO32r, DFmode);
21313 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21315 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21317 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21318 0, OPTAB_DIRECT);
21319 if (x != target)
21320 emit_move_insn (target, x);
21323 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21324 For x86_32, -mfpmath=sse, !optimize_size only. */
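/* Scalar sketch of the trick used below (illustrative only):

     float hi  = (float) (input >> 16);
     float lo  = (float) (input & 0xffff);
     float res = hi * 0x1.0p16f + lo;

   Each 16-bit half converts to SFmode exactly and the multiply by 2**16
   is exact, so only the final addition rounds.  */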
21325 void
21326 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21328 REAL_VALUE_TYPE ONE16r;
21329 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21331 real_ldexp (&ONE16r, &dconst1, 16);
21332 x = const_double_from_real_value (ONE16r, SFmode);
21333 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21334 NULL, 0, OPTAB_DIRECT);
21335 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21336 NULL, 0, OPTAB_DIRECT);
21337 fp_hi = gen_reg_rtx (SFmode);
21338 fp_lo = gen_reg_rtx (SFmode);
21339 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21340 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21341 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21342 0, OPTAB_DIRECT);
21343 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21344 0, OPTAB_DIRECT);
21345 if (!rtx_equal_p (target, fp_hi))
21346 emit_move_insn (target, fp_hi);
21349 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21350 a vector of unsigned ints VAL to vector of floats TARGET. */
21352 void
21353 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21355 rtx tmp[8];
21356 REAL_VALUE_TYPE TWO16r;
21357 machine_mode intmode = GET_MODE (val);
21358 machine_mode fltmode = GET_MODE (target);
21359 rtx (*cvt) (rtx, rtx);
21361 if (intmode == V4SImode)
21362 cvt = gen_floatv4siv4sf2;
21363 else
21364 cvt = gen_floatv8siv8sf2;
21365 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21366 tmp[0] = force_reg (intmode, tmp[0]);
21367 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21368 OPTAB_DIRECT);
21369 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21370 NULL_RTX, 1, OPTAB_DIRECT);
21371 tmp[3] = gen_reg_rtx (fltmode);
21372 emit_insn (cvt (tmp[3], tmp[1]));
21373 tmp[4] = gen_reg_rtx (fltmode);
21374 emit_insn (cvt (tmp[4], tmp[2]));
21375 real_ldexp (&TWO16r, &dconst1, 16);
21376 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21377 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21378 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21379 OPTAB_DIRECT);
21380 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21381 OPTAB_DIRECT);
21382 if (tmp[7] != target)
21383 emit_move_insn (target, tmp[7]);
21386 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21387 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21388 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21389 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
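/* Per-element effect, written as scalar C for illustration:

     if (val < 0x1.0p31)
       res = (int32_t) val;
     else
       res = (int32_t) (val - 0x1.0p31) ^ 0x80000000;

   The vector code below masks the 0x1p31 subtrahend with the comparison
   result instead of branching, and hands the matching XOR constant back
   through *XORP for the caller to apply after the conversion.  */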
21392 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21394 REAL_VALUE_TYPE TWO31r;
21395 rtx two31r, tmp[4];
21396 machine_mode mode = GET_MODE (val);
21397 machine_mode scalarmode = GET_MODE_INNER (mode);
21398 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21399 rtx (*cmp) (rtx, rtx, rtx, rtx);
21400 int i;
21402 for (i = 0; i < 3; i++)
21403 tmp[i] = gen_reg_rtx (mode);
21404 real_ldexp (&TWO31r, &dconst1, 31);
21405 two31r = const_double_from_real_value (TWO31r, scalarmode);
21406 two31r = ix86_build_const_vector (mode, 1, two31r);
21407 two31r = force_reg (mode, two31r);
21408 switch (mode)
21410 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21411 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21412 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21413 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21414 default: gcc_unreachable ();
21416 tmp[3] = gen_rtx_LE (mode, two31r, val);
21417 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21418 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21419 0, OPTAB_DIRECT);
21420 if (intmode == V4SImode || TARGET_AVX2)
21421 *xorp = expand_simple_binop (intmode, ASHIFT,
21422 gen_lowpart (intmode, tmp[0]),
21423 GEN_INT (31), NULL_RTX, 0,
21424 OPTAB_DIRECT);
21425 else
21427 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21428 two31 = ix86_build_const_vector (intmode, 1, two31);
21429 *xorp = expand_simple_binop (intmode, AND,
21430 gen_lowpart (intmode, tmp[0]),
21431 two31, NULL_RTX, 0,
21432 OPTAB_DIRECT);
21434 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21435 0, OPTAB_DIRECT);
21438 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21439 then replicate the value for all elements of the vector
21440 register. */
21443 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21445 int i, n_elt;
21446 rtvec v;
21447 machine_mode scalar_mode;
21449 switch (mode)
21451 case E_V64QImode:
21452 case E_V32QImode:
21453 case E_V16QImode:
21454 case E_V32HImode:
21455 case E_V16HImode:
21456 case E_V8HImode:
21457 case E_V16SImode:
21458 case E_V8SImode:
21459 case E_V4SImode:
21460 case E_V8DImode:
21461 case E_V4DImode:
21462 case E_V2DImode:
21463 gcc_assert (vect);
21464 /* FALLTHRU */
21465 case E_V16SFmode:
21466 case E_V8SFmode:
21467 case E_V4SFmode:
21468 case E_V8DFmode:
21469 case E_V4DFmode:
21470 case E_V2DFmode:
21471 n_elt = GET_MODE_NUNITS (mode);
21472 v = rtvec_alloc (n_elt);
21473 scalar_mode = GET_MODE_INNER (mode);
21475 RTVEC_ELT (v, 0) = value;
21477 for (i = 1; i < n_elt; ++i)
21478 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21480 return gen_rtx_CONST_VECTOR (mode, v);
21482 default:
21483 gcc_unreachable ();
21487 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21488 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21489 for an SSE register. If VECT is true, then replicate the mask for
21490 all elements of the vector register. If INVERT is true, then create
21491 a mask excluding the sign bit. */
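/* For reference, the masks produced here are the IEEE sign-bit patterns:
   0x80000000 per SFmode element and 0x8000000000000000 per DFmode element,
   or their complements (0x7fffffff, 0x7fffffffffffffff) when INVERT.  */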
21494 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21496 machine_mode vec_mode, imode;
21497 wide_int w;
21498 rtx mask, v;
21500 switch (mode)
21502 case E_V16SImode:
21503 case E_V16SFmode:
21504 case E_V8SImode:
21505 case E_V4SImode:
21506 case E_V8SFmode:
21507 case E_V4SFmode:
21508 vec_mode = mode;
21509 imode = SImode;
21510 break;
21512 case E_V8DImode:
21513 case E_V4DImode:
21514 case E_V2DImode:
21515 case E_V8DFmode:
21516 case E_V4DFmode:
21517 case E_V2DFmode:
21518 vec_mode = mode;
21519 imode = DImode;
21520 break;
21522 case E_TImode:
21523 case E_TFmode:
21524 vec_mode = VOIDmode;
21525 imode = TImode;
21526 break;
21528 default:
21529 gcc_unreachable ();
21532 machine_mode inner_mode = GET_MODE_INNER (mode);
21533 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21534 GET_MODE_BITSIZE (inner_mode));
21535 if (invert)
21536 w = wi::bit_not (w);
21538 /* Force this value into the low part of a fp vector constant. */
21539 mask = immed_wide_int_const (w, imode);
21540 mask = gen_lowpart (inner_mode, mask);
21542 if (vec_mode == VOIDmode)
21543 return force_reg (inner_mode, mask);
21545 v = ix86_build_const_vector (vec_mode, vect, mask);
21546 return force_reg (vec_mode, v);
21549 /* Generate code for floating point ABS or NEG. */
21551 void
21552 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21553 rtx operands[])
21555 rtx mask, set, dst, src;
21556 bool use_sse = false;
21557 bool vector_mode = VECTOR_MODE_P (mode);
21558 machine_mode vmode = mode;
21560 if (vector_mode)
21561 use_sse = true;
21562 else if (mode == TFmode)
21563 use_sse = true;
21564 else if (TARGET_SSE_MATH)
21566 use_sse = SSE_FLOAT_MODE_P (mode);
21567 if (mode == SFmode)
21568 vmode = V4SFmode;
21569 else if (mode == DFmode)
21570 vmode = V2DFmode;
21573 /* NEG and ABS performed with SSE use bitwise mask operations.
21574 Create the appropriate mask now. */
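/* With that mask the operations become plain bitwise logic:
   NEG is dst = src ^ sign-mask, ABS is dst = src & ~sign-mask
   (the inverted mask is what ix86_build_signbit_mask returns when
   CODE is ABS).  */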
21575 if (use_sse)
21576 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21577 else
21578 mask = NULL_RTX;
21580 dst = operands[0];
21581 src = operands[1];
21583 set = gen_rtx_fmt_e (code, mode, src);
21584 set = gen_rtx_SET (dst, set);
21586 if (mask)
21588 rtx use, clob;
21589 rtvec par;
21591 use = gen_rtx_USE (VOIDmode, mask);
21592 if (vector_mode)
21593 par = gen_rtvec (2, set, use);
21594 else
21596 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21597 par = gen_rtvec (3, set, use, clob);
21599 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21601 else
21602 emit_insn (set);
21605 /* Expand a copysign operation. Special case operand 0 being a constant. */
21607 void
21608 ix86_expand_copysign (rtx operands[])
21610 machine_mode mode, vmode;
21611 rtx dest, op0, op1, mask, nmask;
21613 dest = operands[0];
21614 op0 = operands[1];
21615 op1 = operands[2];
21617 mode = GET_MODE (dest);
21619 if (mode == SFmode)
21620 vmode = V4SFmode;
21621 else if (mode == DFmode)
21622 vmode = V2DFmode;
21623 else
21624 vmode = mode;
21626 if (CONST_DOUBLE_P (op0))
21628 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21630 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21631 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21633 if (mode == SFmode || mode == DFmode)
21635 if (op0 == CONST0_RTX (mode))
21636 op0 = CONST0_RTX (vmode);
21637 else
21639 rtx v = ix86_build_const_vector (vmode, false, op0);
21641 op0 = force_reg (vmode, v);
21644 else if (op0 != CONST0_RTX (mode))
21645 op0 = force_reg (mode, op0);
21647 mask = ix86_build_signbit_mask (vmode, 0, 0);
21649 if (mode == SFmode)
21650 copysign_insn = gen_copysignsf3_const;
21651 else if (mode == DFmode)
21652 copysign_insn = gen_copysigndf3_const;
21653 else
21654 copysign_insn = gen_copysigntf3_const;
21656 emit_insn (copysign_insn (dest, op0, op1, mask));
21658 else
21660 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21662 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21663 mask = ix86_build_signbit_mask (vmode, 0, 0);
21665 if (mode == SFmode)
21666 copysign_insn = gen_copysignsf3_var;
21667 else if (mode == DFmode)
21668 copysign_insn = gen_copysigndf3_var;
21669 else
21670 copysign_insn = gen_copysigntf3_var;
21672 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21676 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21677 be a constant, and so has already been expanded into a vector constant. */
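/* The net effect of the split below is dest = (dest & mask) | op0, i.e.
   the sign bit is kept from the value already sitting in DEST and the
   magnitude comes from the constant OP0.  */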
21679 void
21680 ix86_split_copysign_const (rtx operands[])
21682 machine_mode mode, vmode;
21683 rtx dest, op0, mask, x;
21685 dest = operands[0];
21686 op0 = operands[1];
21687 mask = operands[3];
21689 mode = GET_MODE (dest);
21690 vmode = GET_MODE (mask);
21692 dest = lowpart_subreg (vmode, dest, mode);
21693 x = gen_rtx_AND (vmode, dest, mask);
21694 emit_insn (gen_rtx_SET (dest, x));
21696 if (op0 != CONST0_RTX (vmode))
21698 x = gen_rtx_IOR (vmode, dest, op0);
21699 emit_insn (gen_rtx_SET (dest, x));
21703 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21704 so we have to do two masks. */
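/* Logically the result is dest = (op1 & mask) | (op0 & ~mask): the sign
   bit comes from OP1 and everything else from OP0.  The register
   alternatives below only differ in which operand holds each
   intermediate value.  */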
21706 void
21707 ix86_split_copysign_var (rtx operands[])
21709 machine_mode mode, vmode;
21710 rtx dest, scratch, op0, op1, mask, nmask, x;
21712 dest = operands[0];
21713 scratch = operands[1];
21714 op0 = operands[2];
21715 op1 = operands[3];
21716 nmask = operands[4];
21717 mask = operands[5];
21719 mode = GET_MODE (dest);
21720 vmode = GET_MODE (mask);
21722 if (rtx_equal_p (op0, op1))
21724 /* Shouldn't happen often (it's useless, obviously), but when it does
21725 we'd generate incorrect code if we continue below. */
21726 emit_move_insn (dest, op0);
21727 return;
21730 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21732 gcc_assert (REGNO (op1) == REGNO (scratch));
21734 x = gen_rtx_AND (vmode, scratch, mask);
21735 emit_insn (gen_rtx_SET (scratch, x));
21737 dest = mask;
21738 op0 = lowpart_subreg (vmode, op0, mode);
21739 x = gen_rtx_NOT (vmode, dest);
21740 x = gen_rtx_AND (vmode, x, op0);
21741 emit_insn (gen_rtx_SET (dest, x));
21743 else
21745 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21747 x = gen_rtx_AND (vmode, scratch, mask);
21749 else /* alternative 2,4 */
21751 gcc_assert (REGNO (mask) == REGNO (scratch));
21752 op1 = lowpart_subreg (vmode, op1, mode);
21753 x = gen_rtx_AND (vmode, scratch, op1);
21755 emit_insn (gen_rtx_SET (scratch, x));
21757 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21759 dest = lowpart_subreg (vmode, op0, mode);
21760 x = gen_rtx_AND (vmode, dest, nmask);
21762 else /* alternative 3,4 */
21764 gcc_assert (REGNO (nmask) == REGNO (dest));
21765 dest = nmask;
21766 op0 = lowpart_subreg (vmode, op0, mode);
21767 x = gen_rtx_AND (vmode, dest, op0);
21769 emit_insn (gen_rtx_SET (dest, x));
21772 x = gen_rtx_IOR (vmode, dest, scratch);
21773 emit_insn (gen_rtx_SET (dest, x));
21776 /* Return TRUE or FALSE depending on whether the first SET in INSN
21777 has source and destination with matching CC modes, and that the
21778 CC mode is at least as constrained as REQ_MODE. */
21780 bool
21781 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21783 rtx set;
21784 machine_mode set_mode;
21786 set = PATTERN (insn);
21787 if (GET_CODE (set) == PARALLEL)
21788 set = XVECEXP (set, 0, 0);
21789 gcc_assert (GET_CODE (set) == SET);
21790 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21792 set_mode = GET_MODE (SET_DEST (set));
21793 switch (set_mode)
21795 case E_CCNOmode:
21796 if (req_mode != CCNOmode
21797 && (req_mode != CCmode
21798 || XEXP (SET_SRC (set), 1) != const0_rtx))
21799 return false;
21800 break;
21801 case E_CCmode:
21802 if (req_mode == CCGCmode)
21803 return false;
21804 /* FALLTHRU */
21805 case E_CCGCmode:
21806 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21807 return false;
21808 /* FALLTHRU */
21809 case E_CCGOCmode:
21810 if (req_mode == CCZmode)
21811 return false;
21812 /* FALLTHRU */
21813 case E_CCZmode:
21814 break;
21816 case E_CCGZmode:
21818 case E_CCAmode:
21819 case E_CCCmode:
21820 case E_CCOmode:
21821 case E_CCPmode:
21822 case E_CCSmode:
21823 if (set_mode != req_mode)
21824 return false;
21825 break;
21827 default:
21828 gcc_unreachable ();
21831 return GET_MODE (SET_SRC (set)) == set_mode;
21834 /* Generate insn patterns to do an integer compare of OPERANDS. */
21836 static rtx
21837 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21839 machine_mode cmpmode;
21840 rtx tmp, flags;
21842 cmpmode = SELECT_CC_MODE (code, op0, op1);
21843 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21845 /* This is very simple, but making the interface the same as in the
21846 FP case makes the rest of the code easier. */
21847 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21848 emit_insn (gen_rtx_SET (flags, tmp));
21850 /* Return the test that should be put into the flags user, i.e.
21851 the bcc, scc, or cmov instruction. */
21852 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21855 /* Figure out whether to use unordered fp comparisons. */
21857 static bool
21858 ix86_unordered_fp_compare (enum rtx_code code)
21860 if (!TARGET_IEEE_FP)
21861 return false;
21863 switch (code)
21865 case GT:
21866 case GE:
21867 case LT:
21868 case LE:
21869 return false;
21871 case EQ:
21872 case NE:
21874 case LTGT:
21875 case UNORDERED:
21876 case ORDERED:
21877 case UNLT:
21878 case UNLE:
21879 case UNGT:
21880 case UNGE:
21881 case UNEQ:
21882 return true;
21884 default:
21885 gcc_unreachable ();
21889 machine_mode
21890 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21892 machine_mode mode = GET_MODE (op0);
21894 if (SCALAR_FLOAT_MODE_P (mode))
21896 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21897 return CCFPmode;
21900 switch (code)
21902 /* Only zero flag is needed. */
21903 case EQ: /* ZF=0 */
21904 case NE: /* ZF!=0 */
21905 return CCZmode;
21906 /* Codes needing carry flag. */
21907 case GEU: /* CF=0 */
21908 case LTU: /* CF=1 */
21909 /* Detect overflow checks. They need just the carry flag. */
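/* E.g. the unsigned overflow idioms (x + y) < x (LTU) and
   (x + y) >= x (GEU) fall into this category.  */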
21910 if (GET_CODE (op0) == PLUS
21911 && (rtx_equal_p (op1, XEXP (op0, 0))
21912 || rtx_equal_p (op1, XEXP (op0, 1))))
21913 return CCCmode;
21914 else
21915 return CCmode;
21916 case GTU: /* CF=0 & ZF=0 */
21917 case LEU: /* CF=1 | ZF=1 */
21918 return CCmode;
21919 /* Codes possibly doable only with sign flag when
21920 comparing against zero. */
21921 case GE: /* SF=OF or SF=0 */
21922 case LT: /* SF<>OF or SF=1 */
21923 if (op1 == const0_rtx)
21924 return CCGOCmode;
21925 else
21926 /* For other cases Carry flag is not required. */
21927 return CCGCmode;
21928 /* Codes doable only with the sign flag when comparing
21929 against zero, but we lack a jump instruction for them,
21930 so we need to use relational tests against overflow,
21931 which thus needs to be zero. */
21932 case GT: /* ZF=0 & SF=OF */
21933 case LE: /* ZF=1 | SF<>OF */
21934 if (op1 == const0_rtx)
21935 return CCNOmode;
21936 else
21937 return CCGCmode;
21938 /* The strcmp pattern does (use flags), and combine may ask us for a
21939 proper mode. */
21940 case USE:
21941 return CCmode;
21942 default:
21943 gcc_unreachable ();
21947 /* Return the fixed registers used for condition codes. */
21949 static bool
21950 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21952 *p1 = FLAGS_REG;
21953 *p2 = FPSR_REG;
21954 return true;
21957 /* If two condition code modes are compatible, return a condition code
21958 mode which is compatible with both. Otherwise, return
21959 VOIDmode. */
21961 static machine_mode
21962 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21964 if (m1 == m2)
21965 return m1;
21967 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21968 return VOIDmode;
21970 if ((m1 == CCGCmode && m2 == CCGOCmode)
21971 || (m1 == CCGOCmode && m2 == CCGCmode))
21972 return CCGCmode;
21974 if ((m1 == CCNOmode && m2 == CCGOCmode)
21975 || (m1 == CCGOCmode && m2 == CCNOmode))
21976 return CCNOmode;
21978 if (m1 == CCZmode
21979 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21980 return m2;
21981 else if (m2 == CCZmode
21982 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21983 return m1;
21985 switch (m1)
21987 default:
21988 gcc_unreachable ();
21990 case E_CCmode:
21991 case E_CCGCmode:
21992 case E_CCGOCmode:
21993 case E_CCNOmode:
21994 case E_CCAmode:
21995 case E_CCCmode:
21996 case E_CCOmode:
21997 case E_CCPmode:
21998 case E_CCSmode:
21999 case E_CCZmode:
22000 switch (m2)
22002 default:
22003 return VOIDmode;
22005 case E_CCmode:
22006 case E_CCGCmode:
22007 case E_CCGOCmode:
22008 case E_CCNOmode:
22009 case E_CCAmode:
22010 case E_CCCmode:
22011 case E_CCOmode:
22012 case E_CCPmode:
22013 case E_CCSmode:
22014 case E_CCZmode:
22015 return CCmode;
22018 case E_CCFPmode:
22019 /* These are only compatible with themselves, which we already
22020 checked above. */
22021 return VOIDmode;
22026 /* Return a comparison we can do that is equivalent to
22027 swap_condition (code), except possibly for orderedness.
22028 But never change orderedness if TARGET_IEEE_FP, returning
22029 UNKNOWN in that case if necessary. */
22031 static enum rtx_code
22032 ix86_fp_swap_condition (enum rtx_code code)
22034 switch (code)
22036 case GT: /* GTU - CF=0 & ZF=0 */
22037 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22038 case GE: /* GEU - CF=0 */
22039 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22040 case UNLT: /* LTU - CF=1 */
22041 return TARGET_IEEE_FP ? UNKNOWN : GT;
22042 case UNLE: /* LEU - CF=1 | ZF=1 */
22043 return TARGET_IEEE_FP ? UNKNOWN : GE;
22044 default:
22045 return swap_condition (code);
22049 /* Return cost of comparison CODE using the best strategy for performance.
22050 All following functions use the number of instructions as the cost metric.
22051 In the future this should be tweaked to compute bytes for optimize_size and
22052 take into account the performance of various instructions on various CPUs. */
22054 static int
22055 ix86_fp_comparison_cost (enum rtx_code code)
22057 int arith_cost;
22059 /* The cost of code using bit-twiddling on %ah. */
22060 switch (code)
22062 case UNLE:
22063 case UNLT:
22064 case LTGT:
22065 case GT:
22066 case GE:
22067 case UNORDERED:
22068 case ORDERED:
22069 case UNEQ:
22070 arith_cost = 4;
22071 break;
22072 case LT:
22073 case NE:
22074 case EQ:
22075 case UNGE:
22076 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22077 break;
22078 case LE:
22079 case UNGT:
22080 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22081 break;
22082 default:
22083 gcc_unreachable ();
22086 switch (ix86_fp_comparison_strategy (code))
22088 case IX86_FPCMP_COMI:
22089 return arith_cost > 4 ? 3 : 2;
22090 case IX86_FPCMP_SAHF:
22091 return arith_cost > 4 ? 4 : 3;
22092 default:
22093 return arith_cost;
22097 /* Return strategy to use for floating-point. We assume that fcomi is always
22098 preferable where available, since that is also true when looking at size
22099 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22101 enum ix86_fpcmp_strategy
22102 ix86_fp_comparison_strategy (enum rtx_code)
22104 /* Do fcomi/sahf based test when profitable. */
22106 if (TARGET_CMOVE)
22107 return IX86_FPCMP_COMI;
22109 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22110 return IX86_FPCMP_SAHF;
22112 return IX86_FPCMP_ARITH;
22115 /* Swap, force into registers, or otherwise massage the two operands
22116 to a fp comparison. The operands are updated in place; the new
22117 comparison code is returned. */
22119 static enum rtx_code
22120 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22122 bool unordered_compare = ix86_unordered_fp_compare (code);
22123 rtx op0 = *pop0, op1 = *pop1;
22124 machine_mode op_mode = GET_MODE (op0);
22125 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22127 /* All of the unordered compare instructions only work on registers.
22128 The same is true of the fcomi compare instructions. The XFmode
22129 compare instructions require registers except when comparing
22130 against zero or when converting operand 1 from fixed point to
22131 floating point. */
22133 if (!is_sse
22134 && (unordered_compare
22135 || (op_mode == XFmode
22136 && ! (standard_80387_constant_p (op0) == 1
22137 || standard_80387_constant_p (op1) == 1)
22138 && GET_CODE (op1) != FLOAT)
22139 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22141 op0 = force_reg (op_mode, op0);
22142 op1 = force_reg (op_mode, op1);
22144 else
22146 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22147 things around if they appear profitable, otherwise force op0
22148 into a register. */
22150 if (standard_80387_constant_p (op0) == 0
22151 || (MEM_P (op0)
22152 && ! (standard_80387_constant_p (op1) == 0
22153 || MEM_P (op1))))
22155 enum rtx_code new_code = ix86_fp_swap_condition (code);
22156 if (new_code != UNKNOWN)
22158 std::swap (op0, op1);
22159 code = new_code;
22163 if (!REG_P (op0))
22164 op0 = force_reg (op_mode, op0);
22166 if (CONSTANT_P (op1))
22168 int tmp = standard_80387_constant_p (op1);
22169 if (tmp == 0)
22170 op1 = validize_mem (force_const_mem (op_mode, op1));
22171 else if (tmp == 1)
22173 if (TARGET_CMOVE)
22174 op1 = force_reg (op_mode, op1);
22176 else
22177 op1 = force_reg (op_mode, op1);
22181 /* Try to rearrange the comparison to make it cheaper. */
22182 if (ix86_fp_comparison_cost (code)
22183 > ix86_fp_comparison_cost (swap_condition (code))
22184 && (REG_P (op1) || can_create_pseudo_p ()))
22186 std::swap (op0, op1);
22187 code = swap_condition (code);
22188 if (!REG_P (op0))
22189 op0 = force_reg (op_mode, op0);
22192 *pop0 = op0;
22193 *pop1 = op1;
22194 return code;
22197 /* Convert comparison codes we use to represent FP comparison to integer
22198 code that will result in proper branch. Return UNKNOWN if no such code
22199 is available. */
22201 enum rtx_code
22202 ix86_fp_compare_code_to_integer (enum rtx_code code)
22204 switch (code)
22206 case GT:
22207 return GTU;
22208 case GE:
22209 return GEU;
22210 case ORDERED:
22211 case UNORDERED:
22212 return code;
22213 case UNEQ:
22214 return EQ;
22215 case UNLT:
22216 return LTU;
22217 case UNLE:
22218 return LEU;
22219 case LTGT:
22220 return NE;
22221 default:
22222 return UNKNOWN;
22226 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22228 static rtx
22229 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22231 bool unordered_compare = ix86_unordered_fp_compare (code);
22232 machine_mode intcmp_mode;
22233 rtx tmp, tmp2;
22235 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22237 /* Do fcomi/sahf based test when profitable. */
22238 switch (ix86_fp_comparison_strategy (code))
22240 case IX86_FPCMP_COMI:
22241 intcmp_mode = CCFPmode;
22242 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22243 if (unordered_compare)
22244 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22245 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22246 break;
22248 case IX86_FPCMP_SAHF:
22249 intcmp_mode = CCFPmode;
22250 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22251 if (unordered_compare)
22252 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22253 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22254 if (!scratch)
22255 scratch = gen_reg_rtx (HImode);
22256 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22257 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22258 break;
22260 case IX86_FPCMP_ARITH:
22261 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22262 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22263 if (unordered_compare)
22264 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22265 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22266 if (!scratch)
22267 scratch = gen_reg_rtx (HImode);
22268 emit_insn (gen_rtx_SET (scratch, tmp));
22270 /* In the unordered case, we have to check C2 for NaN's, which
22271 doesn't happen to work out to anything nice combination-wise.
22272 So do some bit twiddling on the value we've got in AH to come
22273 up with an appropriate set of condition codes. */
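/* For reference (a note on the masks used below): in AH after fnstsw,
   C0 is 0x01, C2 is 0x04 and C3 is 0x40.  fcom sets C0 for "less",
   C3 for "equal" and C0|C2|C3 for "unordered", so 0x45 tests all three
   condition bits at once and 0x05 tests C0|C2.  */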
22275 intcmp_mode = CCNOmode;
22276 switch (code)
22278 case GT:
22279 case UNGT:
22280 if (code == GT || !TARGET_IEEE_FP)
22282 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22283 code = EQ;
22285 else
22287 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22288 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22289 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22290 intcmp_mode = CCmode;
22291 code = GEU;
22293 break;
22294 case LT:
22295 case UNLT:
22296 if (code == LT && TARGET_IEEE_FP)
22298 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22299 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22300 intcmp_mode = CCmode;
22301 code = EQ;
22303 else
22305 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22306 code = NE;
22308 break;
22309 case GE:
22310 case UNGE:
22311 if (code == GE || !TARGET_IEEE_FP)
22313 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22314 code = EQ;
22316 else
22318 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22319 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22320 code = NE;
22322 break;
22323 case LE:
22324 case UNLE:
22325 if (code == LE && TARGET_IEEE_FP)
22327 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22328 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22329 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22330 intcmp_mode = CCmode;
22331 code = LTU;
22333 else
22335 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22336 code = NE;
22338 break;
22339 case EQ:
22340 case UNEQ:
22341 if (code == EQ && TARGET_IEEE_FP)
22343 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22344 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22345 intcmp_mode = CCmode;
22346 code = EQ;
22348 else
22350 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22351 code = NE;
22353 break;
22354 case NE:
22355 case LTGT:
22356 if (code == NE && TARGET_IEEE_FP)
22358 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22359 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22360 GEN_INT (0x40)));
22361 code = NE;
22363 else
22365 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22366 code = EQ;
22368 break;
22370 case UNORDERED:
22371 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22372 code = NE;
22373 break;
22374 case ORDERED:
22375 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22376 code = EQ;
22377 break;
22379 default:
22380 gcc_unreachable ();
22382 break;
22384 default:
22385 gcc_unreachable();
22388 /* Return the test that should be put into the flags user, i.e.
22389 the bcc, scc, or cmov instruction. */
22390 return gen_rtx_fmt_ee (code, VOIDmode,
22391 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22392 const0_rtx);
22395 static rtx
22396 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22398 rtx ret;
22400 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22401 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22403 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22405 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22406 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22408 else
22409 ret = ix86_expand_int_compare (code, op0, op1);
22411 return ret;
22414 void
22415 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22417 machine_mode mode = GET_MODE (op0);
22418 rtx tmp;
22420 /* Handle the special case of a vector comparison with a boolean result;
22421 transform it using the ptest instruction. */
22422 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22424 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22425 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22427 gcc_assert (code == EQ || code == NE);
22428 /* Generate an XOR since we can't check that one operand is a zero vector. */
22429 tmp = gen_reg_rtx (mode);
22430 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22431 tmp = gen_lowpart (p_mode, tmp);
22432 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22433 gen_rtx_UNSPEC (CCmode,
22434 gen_rtvec (2, tmp, tmp),
22435 UNSPEC_PTEST)));
22436 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22437 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22438 gen_rtx_LABEL_REF (VOIDmode, label),
22439 pc_rtx);
22440 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22441 return;
22444 switch (mode)
22446 case E_SFmode:
22447 case E_DFmode:
22448 case E_XFmode:
22449 case E_QImode:
22450 case E_HImode:
22451 case E_SImode:
22452 simple:
22453 tmp = ix86_expand_compare (code, op0, op1);
22454 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22455 gen_rtx_LABEL_REF (VOIDmode, label),
22456 pc_rtx);
22457 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22458 return;
22460 case E_DImode:
22461 if (TARGET_64BIT)
22462 goto simple;
22463 /* For a 32-bit target, DImode comparison may be performed in
22464 SSE registers. To allow this we should avoid splitting
22465 to SImode, which is achieved by doing the xor in DImode
22466 and then comparing with zero (which is recognized by the
22467 STV pass). We don't compare using xor when optimizing
22468 for size. */
22469 if (!optimize_insn_for_size_p ()
22470 && TARGET_STV
22471 && (code == EQ || code == NE))
22473 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22474 op1 = const0_rtx;
22476 /* FALLTHRU */
22477 case E_TImode:
22478 /* Expand DImode branch into multiple compare+branch. */
22480 rtx lo[2], hi[2];
22481 rtx_code_label *label2;
22482 enum rtx_code code1, code2, code3;
22483 machine_mode submode;
22485 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22487 std::swap (op0, op1);
22488 code = swap_condition (code);
22491 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22492 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22494 submode = mode == DImode ? SImode : DImode;
22496 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22497 avoid two branches. This costs one extra insn, so disable when
22498 optimizing for size. */
22500 if ((code == EQ || code == NE)
22501 && (!optimize_insn_for_size_p ()
22502 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22504 rtx xor0, xor1;
22506 xor1 = hi[0];
22507 if (hi[1] != const0_rtx)
22508 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22509 NULL_RTX, 0, OPTAB_WIDEN);
22511 xor0 = lo[0];
22512 if (lo[1] != const0_rtx)
22513 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22514 NULL_RTX, 0, OPTAB_WIDEN);
22516 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22517 NULL_RTX, 0, OPTAB_WIDEN);
22519 ix86_expand_branch (code, tmp, const0_rtx, label);
22520 return;
22523 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22524 op1 is a constant and the low word is zero, then we can just
22525 examine the high word. Similarly for low word -1 and
22526 less-or-equal-than or greater-than. */
22528 if (CONST_INT_P (hi[1]))
22529 switch (code)
22531 case LT: case LTU: case GE: case GEU:
22532 if (lo[1] == const0_rtx)
22534 ix86_expand_branch (code, hi[0], hi[1], label);
22535 return;
22537 break;
22538 case LE: case LEU: case GT: case GTU:
22539 if (lo[1] == constm1_rtx)
22541 ix86_expand_branch (code, hi[0], hi[1], label);
22542 return;
22544 break;
22545 default:
22546 break;
22549 /* Emulate comparisons that do not depend on Zero flag with
22550 double-word subtraction. Note that only Overflow, Sign
22551 and Carry flags are valid, so swap arguments and condition
22552 of comparisons that would otherwise test Zero flag. */
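/* For example, an unsigned DImode a < b on a 32-bit target becomes:
   compare lo(a) with lo(b), then subtract hi(b) from hi(a) with borrow,
   discard the difference and branch on the carry flag (CCCmode); the
   signed variants branch on CCGZmode instead.  */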
22554 switch (code)
22556 case LE: case LEU: case GT: case GTU:
22557 std::swap (lo[0], lo[1]);
22558 std::swap (hi[0], hi[1]);
22559 code = swap_condition (code);
22560 /* FALLTHRU */
22562 case LT: case LTU: case GE: case GEU:
22564 rtx (*cmp_insn) (rtx, rtx);
22565 rtx (*sbb_insn) (rtx, rtx, rtx);
22566 bool uns = (code == LTU || code == GEU);
22568 if (TARGET_64BIT)
22570 cmp_insn = gen_cmpdi_1;
22571 sbb_insn
22572 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22574 else
22576 cmp_insn = gen_cmpsi_1;
22577 sbb_insn
22578 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22581 if (!nonimmediate_operand (lo[0], submode))
22582 lo[0] = force_reg (submode, lo[0]);
22583 if (!x86_64_general_operand (lo[1], submode))
22584 lo[1] = force_reg (submode, lo[1]);
22586 if (!register_operand (hi[0], submode))
22587 hi[0] = force_reg (submode, hi[0]);
22588 if ((uns && !nonimmediate_operand (hi[1], submode))
22589 || (!uns && !x86_64_general_operand (hi[1], submode)))
22590 hi[1] = force_reg (submode, hi[1]);
22592 emit_insn (cmp_insn (lo[0], lo[1]));
22593 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22595 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22597 ix86_expand_branch (code, tmp, const0_rtx, label);
22598 return;
22601 default:
22602 break;
22605 /* Otherwise, we need two or three jumps. */
22607 label2 = gen_label_rtx ();
22609 code1 = code;
22610 code2 = swap_condition (code);
22611 code3 = unsigned_condition (code);
22613 switch (code)
22615 case LT: case GT: case LTU: case GTU:
22616 break;
22618 case LE: code1 = LT; code2 = GT; break;
22619 case GE: code1 = GT; code2 = LT; break;
22620 case LEU: code1 = LTU; code2 = GTU; break;
22621 case GEU: code1 = GTU; code2 = LTU; break;
22623 case EQ: code1 = UNKNOWN; code2 = NE; break;
22624 case NE: code2 = UNKNOWN; break;
22626 default:
22627 gcc_unreachable ();
22631 * a < b =>
22632 * if (hi(a) < hi(b)) goto true;
22633 * if (hi(a) > hi(b)) goto false;
22634 * if (lo(a) < lo(b)) goto true;
22635 * false:
22638 if (code1 != UNKNOWN)
22639 ix86_expand_branch (code1, hi[0], hi[1], label);
22640 if (code2 != UNKNOWN)
22641 ix86_expand_branch (code2, hi[0], hi[1], label2);
22643 ix86_expand_branch (code3, lo[0], lo[1], label);
22645 if (code2 != UNKNOWN)
22646 emit_label (label2);
22647 return;
22650 default:
22651 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22652 goto simple;
22656 void
22657 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22659 rtx ret;
22661 gcc_assert (GET_MODE (dest) == QImode);
22663 ret = ix86_expand_compare (code, op0, op1);
22664 PUT_MODE (ret, QImode);
22665 emit_insn (gen_rtx_SET (dest, ret));
22668 /* Expand comparison setting or clearing carry flag. Return true when
22669 successful and set pop for the operation. */
22670 static bool
22671 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22673 machine_mode mode =
22674 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22676 /* Do not handle double-mode compares that go through the special path. */
22677 if (mode == (TARGET_64BIT ? TImode : DImode))
22678 return false;
22680 if (SCALAR_FLOAT_MODE_P (mode))
22682 rtx compare_op;
22683 rtx_insn *compare_seq;
22685 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22687 /* Shortcut: the following common codes never translate
22688 into carry flag compares. */
22689 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22690 || code == ORDERED || code == UNORDERED)
22691 return false;
22693 /* These comparisons require zero flag; swap operands so they won't. */
22694 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22695 && !TARGET_IEEE_FP)
22697 std::swap (op0, op1);
22698 code = swap_condition (code);
22701 /* Try to expand the comparison and verify that we end up with
22702 a carry flag based comparison. This fails to be true only when
22703 we decide to expand the comparison using arithmetic, which is not
22704 a common scenario. */
22705 start_sequence ();
22706 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22707 compare_seq = get_insns ();
22708 end_sequence ();
22710 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22711 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22712 else
22713 code = GET_CODE (compare_op);
22715 if (code != LTU && code != GEU)
22716 return false;
22718 emit_insn (compare_seq);
22719 *pop = compare_op;
22720 return true;
22723 if (!INTEGRAL_MODE_P (mode))
22724 return false;
22726 switch (code)
22728 case LTU:
22729 case GEU:
22730 break;
22732 /* Convert a==0 into (unsigned)a<1. */
22733 case EQ:
22734 case NE:
22735 if (op1 != const0_rtx)
22736 return false;
22737 op1 = const1_rtx;
22738 code = (code == EQ ? LTU : GEU);
22739 break;
22741 /* Convert a>b into b<a or a>=b-1. */
22742 case GTU:
22743 case LEU:
22744 if (CONST_INT_P (op1))
22746 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22747 /* Bail out on overflow. We could still swap the operands, but that
22748 would force loading the constant into a register. */
22749 if (op1 == const0_rtx
22750 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22751 return false;
22752 code = (code == GTU ? GEU : LTU);
22754 else
22756 std::swap (op0, op1);
22757 code = (code == GTU ? LTU : GEU);
22759 break;
22761 /* Convert a>=0 into (unsigned)a<0x80000000. */
22762 case LT:
22763 case GE:
22764 if (mode == DImode || op1 != const0_rtx)
22765 return false;
22766 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22767 code = (code == LT ? GEU : LTU);
22768 break;
22769 case LE:
22770 case GT:
22771 if (mode == DImode || op1 != constm1_rtx)
22772 return false;
22773 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22774 code = (code == LE ? GEU : LTU);
22775 break;
22777 default:
22778 return false;
22780 /* Swapping operands may cause constant to appear as first operand. */
22781 if (!nonimmediate_operand (op0, VOIDmode))
22783 if (!can_create_pseudo_p ())
22784 return false;
22785 op0 = force_reg (mode, op0);
22787 *pop = ix86_expand_compare (code, op0, op1);
22788 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22789 return true;
22792 bool
22793 ix86_expand_int_movcc (rtx operands[])
22795 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22796 rtx_insn *compare_seq;
22797 rtx compare_op;
22798 machine_mode mode = GET_MODE (operands[0]);
22799 bool sign_bit_compare_p = false;
22800 rtx op0 = XEXP (operands[1], 0);
22801 rtx op1 = XEXP (operands[1], 1);
22803 if (GET_MODE (op0) == TImode
22804 || (GET_MODE (op0) == DImode
22805 && !TARGET_64BIT))
22806 return false;
22808 start_sequence ();
22809 compare_op = ix86_expand_compare (code, op0, op1);
22810 compare_seq = get_insns ();
22811 end_sequence ();
22813 compare_code = GET_CODE (compare_op);
22815 if ((op1 == const0_rtx && (code == GE || code == LT))
22816 || (op1 == constm1_rtx && (code == GT || code == LE)))
22817 sign_bit_compare_p = true;
22819 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22820 HImode insns, we'd be swallowed in word prefix ops. */
22822 if ((mode != HImode || TARGET_FAST_PREFIX)
22823 && (mode != (TARGET_64BIT ? TImode : DImode))
22824 && CONST_INT_P (operands[2])
22825 && CONST_INT_P (operands[3]))
22827 rtx out = operands[0];
22828 HOST_WIDE_INT ct = INTVAL (operands[2]);
22829 HOST_WIDE_INT cf = INTVAL (operands[3]);
22830 HOST_WIDE_INT diff;
22832 diff = ct - cf;
22833 /* Sign bit compares are better done using shifts than by using
22834 sbb. */
22835 if (sign_bit_compare_p
22836 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22838 /* Detect overlap between destination and compare sources. */
22839 rtx tmp = out;
22841 if (!sign_bit_compare_p)
22843 rtx flags;
22844 bool fpcmp = false;
22846 compare_code = GET_CODE (compare_op);
22848 flags = XEXP (compare_op, 0);
22850 if (GET_MODE (flags) == CCFPmode)
22852 fpcmp = true;
22853 compare_code
22854 = ix86_fp_compare_code_to_integer (compare_code);
22857 /* To simplify rest of code, restrict to the GEU case. */
22858 if (compare_code == LTU)
22860 std::swap (ct, cf);
22861 compare_code = reverse_condition (compare_code);
22862 code = reverse_condition (code);
22864 else
22866 if (fpcmp)
22867 PUT_CODE (compare_op,
22868 reverse_condition_maybe_unordered
22869 (GET_CODE (compare_op)));
22870 else
22871 PUT_CODE (compare_op,
22872 reverse_condition (GET_CODE (compare_op)));
22874 diff = ct - cf;
22876 if (reg_overlap_mentioned_p (out, op0)
22877 || reg_overlap_mentioned_p (out, op1))
22878 tmp = gen_reg_rtx (mode);
22880 if (mode == DImode)
22881 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22882 else
22883 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22884 flags, compare_op));
22886 else
22888 if (code == GT || code == GE)
22889 code = reverse_condition (code);
22890 else
22892 std::swap (ct, cf);
22893 diff = ct - cf;
22895 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22898 if (diff == 1)
22901 * cmpl op0,op1
22902 * sbbl dest,dest
22903 * [addl dest, ct]
22905 * Size 5 - 8.
22907 if (ct)
22908 tmp = expand_simple_binop (mode, PLUS,
22909 tmp, GEN_INT (ct),
22910 copy_rtx (tmp), 1, OPTAB_DIRECT);
22912 else if (cf == -1)
22915 * cmpl op0,op1
22916 * sbbl dest,dest
22917 * orl $ct, dest
22919 * Size 8.
22921 tmp = expand_simple_binop (mode, IOR,
22922 tmp, GEN_INT (ct),
22923 copy_rtx (tmp), 1, OPTAB_DIRECT);
22925 else if (diff == -1 && ct)
22928 * cmpl op0,op1
22929 * sbbl dest,dest
22930 * notl dest
22931 * [addl dest, cf]
22933 * Size 8 - 11.
22935 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22936 if (cf)
22937 tmp = expand_simple_binop (mode, PLUS,
22938 copy_rtx (tmp), GEN_INT (cf),
22939 copy_rtx (tmp), 1, OPTAB_DIRECT);
22941 else
22944 * cmpl op0,op1
22945 * sbbl dest,dest
22946 * [notl dest]
22947 * andl cf - ct, dest
22948 * [addl dest, ct]
22950 * Size 8 - 11.
22953 if (cf == 0)
22955 cf = ct;
22956 ct = 0;
22957 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22960 tmp = expand_simple_binop (mode, AND,
22961 copy_rtx (tmp),
22962 gen_int_mode (cf - ct, mode),
22963 copy_rtx (tmp), 1, OPTAB_DIRECT);
22964 if (ct)
22965 tmp = expand_simple_binop (mode, PLUS,
22966 copy_rtx (tmp), GEN_INT (ct),
22967 copy_rtx (tmp), 1, OPTAB_DIRECT);
22970 if (!rtx_equal_p (tmp, out))
22971 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22973 return true;
22976 if (diff < 0)
22978 machine_mode cmp_mode = GET_MODE (op0);
22979 enum rtx_code new_code;
22981 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22983 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22985 /* We may be reversing an unordered compare to a normal compare, which
22986 is not valid in general (we may convert a non-trapping condition
22987 to a trapping one), however on i386 we currently emit all
22988 comparisons unordered. */
22989 new_code = reverse_condition_maybe_unordered (code);
22991 else
22992 new_code = ix86_reverse_condition (code, cmp_mode);
22993 if (new_code != UNKNOWN)
22995 std::swap (ct, cf);
22996 diff = -diff;
22997 code = new_code;
23001 compare_code = UNKNOWN;
23002 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23003 && CONST_INT_P (op1))
23005 if (op1 == const0_rtx
23006 && (code == LT || code == GE))
23007 compare_code = code;
23008 else if (op1 == constm1_rtx)
23010 if (code == LE)
23011 compare_code = LT;
23012 else if (code == GT)
23013 compare_code = GE;
23017 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23018 if (compare_code != UNKNOWN
23019 && GET_MODE (op0) == GET_MODE (out)
23020 && (cf == -1 || ct == -1))
23022 /* If lea code below could be used, only optimize
23023 if it results in a 2 insn sequence. */
23025 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23026 || diff == 3 || diff == 5 || diff == 9)
23027 || (compare_code == LT && ct == -1)
23028 || (compare_code == GE && cf == -1))
23031 * notl op1 (if necessary)
23032 * sarl $31, op1
23033 * orl cf, op1
23035 if (ct != -1)
23037 cf = ct;
23038 ct = -1;
23039 code = reverse_condition (code);
23042 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23044 out = expand_simple_binop (mode, IOR,
23045 out, GEN_INT (cf),
23046 out, 1, OPTAB_DIRECT);
23047 if (out != operands[0])
23048 emit_move_insn (operands[0], out);
23050 return true;
23055 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23056 || diff == 3 || diff == 5 || diff == 9)
23057 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23058 && (mode != DImode
23059 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23062 * xorl dest,dest
23063 * cmpl op1,op2
23064 * setcc dest
23065 * lea cf(dest*(ct-cf)),dest
23067 * Size 14.
23069 * This also catches the degenerate setcc-only case.
23072 rtx tmp;
23073 int nops;
23075 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23077 nops = 0;
23078 /* On x86_64 the lea instruction operates on Pmode, so we need
23079 to get the arithmetic done in the proper mode to match. */
23080 if (diff == 1)
23081 tmp = copy_rtx (out);
23082 else
23084 rtx out1;
23085 out1 = copy_rtx (out);
23086 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23087 nops++;
23088 if (diff & 1)
23090 tmp = gen_rtx_PLUS (mode, tmp, out1);
23091 nops++;
23094 if (cf != 0)
23096 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23097 nops++;
23099 if (!rtx_equal_p (tmp, out))
23101 if (nops == 1)
23102 out = force_operand (tmp, copy_rtx (out));
23103 else
23104 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23106 if (!rtx_equal_p (out, operands[0]))
23107 emit_move_insn (operands[0], copy_rtx (out));
23109 return true;
23113 * General case: Jumpful:
23114 * xorl dest,dest cmpl op1, op2
23115 * cmpl op1, op2 movl ct, dest
23116 * setcc dest jcc 1f
23117 * decl dest movl cf, dest
23118 * andl (cf-ct),dest 1:
23119 * addl ct,dest
23121 * Size 20. Size 14.
23123 * This is reasonably steep, but branch mispredict costs are
23124 * high on modern cpus, so consider failing only if optimizing
23125 * for space.
23128 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23129 && BRANCH_COST (optimize_insn_for_speed_p (),
23130 false) >= 2)
23132 if (cf == 0)
23134 machine_mode cmp_mode = GET_MODE (op0);
23135 enum rtx_code new_code;
23137 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23139 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23141 /* We may be reversing an unordered compare to a normal compare,
23142 which is not valid in general (we may convert a non-trapping
23143 condition to a trapping one), however on i386 we currently
23144 emit all comparisons unordered. */
23145 new_code = reverse_condition_maybe_unordered (code);
23147 else
23149 new_code = ix86_reverse_condition (code, cmp_mode);
23150 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23151 compare_code = reverse_condition (compare_code);
23154 if (new_code != UNKNOWN)
23156 cf = ct;
23157 ct = 0;
23158 code = new_code;
23162 if (compare_code != UNKNOWN)
23164 /* notl op1 (if needed)
23165 sarl $31, op1
23166 andl (cf-ct), op1
23167 addl ct, op1
23169 For x < 0 (resp. x <= -1) there will be no notl,
23170 so if possible swap the constants to get rid of the
23171 complement.
23172 True/false will be -1/0 while code below (store flag
23173 followed by decrement) is 0/-1, so the constants need
23174 to be exchanged once more. */
23176 if (compare_code == GE || !cf)
23178 code = reverse_condition (code);
23179 compare_code = LT;
23181 else
23182 std::swap (ct, cf);
23184 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23186 else
23188 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23190 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23191 constm1_rtx,
23192 copy_rtx (out), 1, OPTAB_DIRECT);
23195 out = expand_simple_binop (mode, AND, copy_rtx (out),
23196 gen_int_mode (cf - ct, mode),
23197 copy_rtx (out), 1, OPTAB_DIRECT);
23198 if (ct)
23199 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23200 copy_rtx (out), 1, OPTAB_DIRECT);
23201 if (!rtx_equal_p (out, operands[0]))
23202 emit_move_insn (operands[0], copy_rtx (out));
23204 return true;
23208 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23210 /* Try a few more things with specific constants and a variable. */
23212 optab op;
23213 rtx var, orig_out, out, tmp;
23215 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23216 return false;
23218 /* If one of the two operands is an interesting constant, load a
23219 constant with the above and mask it in with a logical operation. */
23221 if (CONST_INT_P (operands[2]))
23223 var = operands[3];
23224 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23225 operands[3] = constm1_rtx, op = and_optab;
23226 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23227 operands[3] = const0_rtx, op = ior_optab;
23228 else
23229 return false;
23231 else if (CONST_INT_P (operands[3]))
23233 var = operands[2];
23234 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23235 operands[2] = constm1_rtx, op = and_optab;
23236 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23237 operands[2] = const0_rtx, op = ior_optab;
23238 else
23239 return false;
23241 else
23242 return false;
23244 orig_out = operands[0];
23245 tmp = gen_reg_rtx (mode);
23246 operands[0] = tmp;
23248 /* Recurse to get the constant loaded. */
23249 if (!ix86_expand_int_movcc (operands))
23250 return false;
23252 /* Mask in the interesting variable. */
23253 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23254 OPTAB_WIDEN);
23255 if (!rtx_equal_p (out, orig_out))
23256 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23258 return true;
23262 * For comparison with above,
23264 * movl cf,dest
23265 * movl ct,tmp
23266 * cmpl op1,op2
23267 * cmovcc tmp,dest
23269 * Size 15.
23272 if (! nonimmediate_operand (operands[2], mode))
23273 operands[2] = force_reg (mode, operands[2]);
23274 if (! nonimmediate_operand (operands[3], mode))
23275 operands[3] = force_reg (mode, operands[3]);
23277 if (! register_operand (operands[2], VOIDmode)
23278 && (mode == QImode
23279 || ! register_operand (operands[3], VOIDmode)))
23280 operands[2] = force_reg (mode, operands[2]);
23282 if (mode == QImode
23283 && ! register_operand (operands[3], VOIDmode))
23284 operands[3] = force_reg (mode, operands[3]);
23286 emit_insn (compare_seq);
23287 emit_insn (gen_rtx_SET (operands[0],
23288 gen_rtx_IF_THEN_ELSE (mode,
23289 compare_op, operands[2],
23290 operands[3])));
23291 return true;
23294 /* Swap, force into registers, or otherwise massage the two operands
23295 to an sse comparison with a mask result. Thus we differ a bit from
23296 ix86_prepare_fp_compare_args which expects to produce a flags result.
23298 The DEST operand exists to help determine whether to commute commutative
23299 operators. The POP0/POP1 operands are updated in place. The new
23300 comparison code is returned, or UNKNOWN if not implementable. */
23302 static enum rtx_code
23303 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23304 rtx *pop0, rtx *pop1)
23306 switch (code)
23308 case LTGT:
23309 case UNEQ:
23310 /* AVX supports all the needed comparisons. */
23311 if (TARGET_AVX)
23312 break;
23313 /* We have no LTGT as an operator. We could implement it with
23314 NE & ORDERED, but this requires an extra temporary. It's
23315 not clear that it's worth it. */
23316 return UNKNOWN;
23318 case LT:
23319 case LE:
23320 case UNGT:
23321 case UNGE:
23322 /* These are supported directly. */
23323 break;
23325 case EQ:
23326 case NE:
23327 case UNORDERED:
23328 case ORDERED:
23329 /* AVX has 3 operand comparisons, no need to swap anything. */
23330 if (TARGET_AVX)
23331 break;
23332 /* For commutative operators, try to canonicalize the destination
23333 operand to be first in the comparison - this helps reload to
23334 avoid extra moves. */
23335 if (!dest || !rtx_equal_p (dest, *pop1))
23336 break;
23337 /* FALLTHRU */
23339 case GE:
23340 case GT:
23341 case UNLE:
23342 case UNLT:
23343 /* These are not supported directly before AVX, and furthermore
23344 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23345 comparison operands to transform into something that is
23346 supported. */
23347 std::swap (*pop0, *pop1);
23348 code = swap_condition (code);
23349 break;
23351 default:
23352 gcc_unreachable ();
23355 return code;
23358 /* Detect conditional moves that exactly match min/max operational
23359 semantics. Note that this is IEEE safe, as long as we don't
23360 interchange the operands.
23362 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23363 and TRUE if the operation is successful and instructions are emitted. */
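/* Note (not from the original comment): SSE min/max instructions return
   the second source operand when the comparison is unordered, so keeping
   the operand order fixed is what makes the transformation safe in the
   presence of NaNs.  */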
23365 static bool
23366 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23367 rtx cmp_op1, rtx if_true, rtx if_false)
23369 machine_mode mode;
23370 bool is_min;
23371 rtx tmp;
23373 if (code == LT)
23375 else if (code == UNGE)
23376 std::swap (if_true, if_false);
23377 else
23378 return false;
23380 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23381 is_min = true;
23382 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23383 is_min = false;
23384 else
23385 return false;
23387 mode = GET_MODE (dest);
23389 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23390 but MODE may be a vector mode and thus not appropriate. */
23391 if (!flag_finite_math_only || flag_signed_zeros)
23393 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23394 rtvec v;
23396 if_true = force_reg (mode, if_true);
23397 v = gen_rtvec (2, if_true, if_false);
23398 tmp = gen_rtx_UNSPEC (mode, v, u);
23400 else
23402 code = is_min ? SMIN : SMAX;
23403 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23406 emit_insn (gen_rtx_SET (dest, tmp));
23407 return true;
23410 /* Expand an sse vector comparison. Return the register with the result. */
23412 static rtx
23413 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23414 rtx op_true, rtx op_false)
23416 machine_mode mode = GET_MODE (dest);
23417 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23420 /* In the general case the result of the comparison can differ from the operands' type. */
23420 machine_mode cmp_mode;
23422 /* In AVX512F the result of comparison is an integer mask. */
23423 bool maskcmp = false;
23424 rtx x;
23426 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23428 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23429 cmp_mode = int_mode_for_size (nbits, 0).require ();
23430 maskcmp = true;
23432 else
23433 cmp_mode = cmp_ops_mode;
23436 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23437 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23438 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
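  /* When optimizing, when a mask compare produces its result in a different
     mode, or when DEST overlaps one of the selected arms, compute the
     comparison result into a fresh pseudo instead.  */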
23440 if (optimize
23441 || (maskcmp && cmp_mode != mode)
23442 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23443 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23444 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23446 /* Compare patterns for int modes are unspec in AVX512F only. */
23447 if (maskcmp && (code == GT || code == EQ))
23449 rtx (*gen)(rtx, rtx, rtx);
23451 switch (cmp_ops_mode)
23453 case E_V64QImode:
23454 gcc_assert (TARGET_AVX512BW);
23455 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23456 break;
23457 case E_V32HImode:
23458 gcc_assert (TARGET_AVX512BW);
23459 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23460 break;
23461 case E_V16SImode:
23462 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23463 break;
23464 case E_V8DImode:
23465 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23466 break;
23467 default:
23468 gen = NULL;
23471 if (gen)
23473 emit_insn (gen (dest, cmp_op0, cmp_op1));
23474 return dest;
23477 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23479 if (cmp_mode != mode && !maskcmp)
23481 x = force_reg (cmp_ops_mode, x);
23482 convert_move (dest, x, false);
23484 else
23485 emit_insn (gen_rtx_SET (dest, x));
23487 return dest;
23490 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23491 operations. This is used for both scalar and vector conditional moves. */
23493 void
23494 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23496 machine_mode mode = GET_MODE (dest);
23497 machine_mode cmpmode = GET_MODE (cmp);
23499 /* In AVX512F the result of comparison is an integer mask. */
23500 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23502 rtx t2, t3, x;
23504 /* If we have an integer mask and an FP value then we need
23505 to cast the mask to the FP mode. */
23506 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23508 cmp = force_reg (cmpmode, cmp);
23509 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23512 if (vector_all_ones_operand (op_true, mode)
23513 && rtx_equal_p (op_false, CONST0_RTX (mode))
23514 && !maskcmp)
23516 emit_insn (gen_rtx_SET (dest, cmp));
23518 else if (op_false == CONST0_RTX (mode)
23519 && !maskcmp)
23521 op_true = force_reg (mode, op_true);
23522 x = gen_rtx_AND (mode, cmp, op_true);
23523 emit_insn (gen_rtx_SET (dest, x));
23525 else if (op_true == CONST0_RTX (mode)
23526 && !maskcmp)
23528 op_false = force_reg (mode, op_false);
23529 x = gen_rtx_NOT (mode, cmp);
23530 x = gen_rtx_AND (mode, x, op_false);
23531 emit_insn (gen_rtx_SET (dest, x));
23533 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23534 && !maskcmp)
23536 op_false = force_reg (mode, op_false);
23537 x = gen_rtx_IOR (mode, cmp, op_false);
23538 emit_insn (gen_rtx_SET (dest, x));
23540 else if (TARGET_XOP
23541 && !maskcmp)
23543 op_true = force_reg (mode, op_true);
23545 if (!nonimmediate_operand (op_false, mode))
23546 op_false = force_reg (mode, op_false);
23548 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23549 op_true,
23550 op_false)));
23552 else
23554 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23555 rtx d = dest;
23557 if (!nonimmediate_operand (op_true, mode))
23558 op_true = force_reg (mode, op_true);
23560 op_false = force_reg (mode, op_false);
23562 switch (mode)
23564 case E_V4SFmode:
23565 if (TARGET_SSE4_1)
23566 gen = gen_sse4_1_blendvps;
23567 break;
23568 case E_V2DFmode:
23569 if (TARGET_SSE4_1)
23570 gen = gen_sse4_1_blendvpd;
23571 break;
23572 case E_V16QImode:
23573 case E_V8HImode:
23574 case E_V4SImode:
23575 case E_V2DImode:
23576 if (TARGET_SSE4_1)
23578 gen = gen_sse4_1_pblendvb;
23579 if (mode != V16QImode)
23580 d = gen_reg_rtx (V16QImode);
23581 op_false = gen_lowpart (V16QImode, op_false);
23582 op_true = gen_lowpart (V16QImode, op_true);
23583 cmp = gen_lowpart (V16QImode, cmp);
23585 break;
23586 case E_V8SFmode:
23587 if (TARGET_AVX)
23588 gen = gen_avx_blendvps256;
23589 break;
23590 case E_V4DFmode:
23591 if (TARGET_AVX)
23592 gen = gen_avx_blendvpd256;
23593 break;
23594 case E_V32QImode:
23595 case E_V16HImode:
23596 case E_V8SImode:
23597 case E_V4DImode:
23598 if (TARGET_AVX2)
23600 gen = gen_avx2_pblendvb;
23601 if (mode != V32QImode)
23602 d = gen_reg_rtx (V32QImode);
23603 op_false = gen_lowpart (V32QImode, op_false);
23604 op_true = gen_lowpart (V32QImode, op_true);
23605 cmp = gen_lowpart (V32QImode, cmp);
23607 break;
23609 case E_V64QImode:
23610 gen = gen_avx512bw_blendmv64qi;
23611 break;
23612 case E_V32HImode:
23613 gen = gen_avx512bw_blendmv32hi;
23614 break;
23615 case E_V16SImode:
23616 gen = gen_avx512f_blendmv16si;
23617 break;
23618 case E_V8DImode:
23619 gen = gen_avx512f_blendmv8di;
23620 break;
23621 case E_V8DFmode:
23622 gen = gen_avx512f_blendmv8df;
23623 break;
23624 case E_V16SFmode:
23625 gen = gen_avx512f_blendmv16sf;
23626 break;
23628 default:
23629 break;
23632 if (gen != NULL)
23634 emit_insn (gen (d, op_false, op_true, cmp));
23635 if (d != dest)
23636 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23638 else
23640 op_true = force_reg (mode, op_true);
23642 t2 = gen_reg_rtx (mode);
23643 if (optimize)
23644 t3 = gen_reg_rtx (mode);
23645 else
23646 t3 = dest;
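  /* Generic fallback: DEST = (CMP & OP_TRUE) | (~CMP & OP_FALSE).  */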
23648 x = gen_rtx_AND (mode, op_true, cmp);
23649 emit_insn (gen_rtx_SET (t2, x));
23651 x = gen_rtx_NOT (mode, cmp);
23652 x = gen_rtx_AND (mode, x, op_false);
23653 emit_insn (gen_rtx_SET (t3, x));
23655 x = gen_rtx_IOR (mode, t3, t2);
23656 emit_insn (gen_rtx_SET (dest, x));
23661 /* Expand a floating-point conditional move. Return true if successful. */
23663 bool
23664 ix86_expand_fp_movcc (rtx operands[])
23666 machine_mode mode = GET_MODE (operands[0]);
23667 enum rtx_code code = GET_CODE (operands[1]);
23668 rtx tmp, compare_op;
23669 rtx op0 = XEXP (operands[1], 0);
23670 rtx op1 = XEXP (operands[1], 1);
23672 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23674 machine_mode cmode;
23676 /* Since we've no cmove for sse registers, don't force bad register
23677 allocation just to gain access to it. Deny movcc when the
23678 comparison mode doesn't match the move mode. */
23679 cmode = GET_MODE (op0);
23680 if (cmode == VOIDmode)
23681 cmode = GET_MODE (op1);
23682 if (cmode != mode)
23683 return false;
23685 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23686 if (code == UNKNOWN)
23687 return false;
23689 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23690 operands[2], operands[3]))
23691 return true;
23693 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23694 operands[2], operands[3]);
23695 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23696 return true;
23699 if (GET_MODE (op0) == TImode
23700 || (GET_MODE (op0) == DImode
23701 && !TARGET_64BIT))
23702 return false;
23704 /* The floating point conditional move instructions don't directly
23705 support conditions resulting from a signed integer comparison. */
23707 compare_op = ix86_expand_compare (code, op0, op1);
23708 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23710 tmp = gen_reg_rtx (QImode);
23711 ix86_expand_setcc (tmp, code, op0, op1);
23713 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23716 emit_insn (gen_rtx_SET (operands[0],
23717 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23718 operands[2], operands[3])));
23720 return true;
23723 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23725 static int
23726 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23728 switch (code)
23730 case EQ:
23731 return 0;
23732 case LT:
23733 case LTU:
23734 return 1;
23735 case LE:
23736 case LEU:
23737 return 2;
23738 case NE:
23739 return 4;
23740 case GE:
23741 case GEU:
23742 return 5;
23743 case GT:
23744 case GTU:
23745 return 6;
23746 default:
23747 gcc_unreachable ();
23751 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23753 static int
23754 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23756 switch (code)
23758 case EQ:
23759 return 0x00;
23760 case NE:
23761 return 0x04;
23762 case GT:
23763 return 0x0e;
23764 case LE:
23765 return 0x02;
23766 case GE:
23767 return 0x0d;
23768 case LT:
23769 return 0x01;
23770 case UNLE:
23771 return 0x0a;
23772 case UNLT:
23773 return 0x09;
23774 case UNGE:
23775 return 0x05;
23776 case UNGT:
23777 return 0x06;
23778 case UNEQ:
23779 return 0x18;
23780 case LTGT:
23781 return 0x0c;
23782 case ORDERED:
23783 return 0x07;
23784 case UNORDERED:
23785 return 0x03;
23786 default:
23787 gcc_unreachable ();
23791 /* Return immediate value to be used in UNSPEC_PCMP
23792 for comparison CODE in MODE. */
23794 static int
23795 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23797 if (FLOAT_MODE_P (mode))
23798 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23799 return ix86_int_cmp_code_to_pcmp_immediate (code);
23802 /* Expand AVX-512 vector comparison. */
23804 bool
23805 ix86_expand_mask_vec_cmp (rtx operands[])
23807 machine_mode mask_mode = GET_MODE (operands[0]);
23808 machine_mode cmp_mode = GET_MODE (operands[2]);
23809 enum rtx_code code = GET_CODE (operands[1]);
23810 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23811 int unspec_code;
23812 rtx unspec;
23814 switch (code)
23816 case LEU:
23817 case GTU:
23818 case GEU:
23819 case LTU:
23820 unspec_code = UNSPEC_UNSIGNED_PCMP;
23821 break;
23823 default:
23824 unspec_code = UNSPEC_PCMP;
23827 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23828 operands[3], imm),
23829 unspec_code);
23830 emit_insn (gen_rtx_SET (operands[0], unspec));
23832 return true;
23835 /* Expand fp vector comparison. */
23837 bool
23838 ix86_expand_fp_vec_cmp (rtx operands[])
23840 enum rtx_code code = GET_CODE (operands[1]);
23841 rtx cmp;
23843 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23844 &operands[2], &operands[3]);
23845 if (code == UNKNOWN)
23847 rtx temp;
23848 switch (GET_CODE (operands[1]))
23850 case LTGT:
23851 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23852 operands[3], NULL, NULL);
23853 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23854 operands[3], NULL, NULL);
23855 code = AND;
23856 break;
23857 case UNEQ:
23858 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23859 operands[3], NULL, NULL);
23860 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23861 operands[3], NULL, NULL);
23862 code = IOR;
23863 break;
23864 default:
23865 gcc_unreachable ();
23867 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23868 OPTAB_DIRECT);
23870 else
23871 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23872 operands[1], operands[2]);
23874 if (operands[0] != cmp)
23875 emit_move_insn (operands[0], cmp);
23877 return true;
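/* Helper for ix86_expand_int_vec_cmp and ix86_expand_int_vcond.  Expand an
   integer vector comparison: unless XOP handles CODE directly, canonicalize
   it to EQ/GT/GTU, emulate the unsigned comparisons the hardware lacks, and
   set *NEGATE when the caller must invert the result.  Return the comparison
   result, or NULL when the mode/ISA combination is not supported.  */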
23880 static rtx
23881 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23882 rtx op_true, rtx op_false, bool *negate)
23884 machine_mode data_mode = GET_MODE (dest);
23885 machine_mode mode = GET_MODE (cop0);
23886 rtx x;
23888 *negate = false;
23890 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23891 if (TARGET_XOP
23892 && (mode == V16QImode || mode == V8HImode
23893 || mode == V4SImode || mode == V2DImode))
23895 else
23897 /* Canonicalize the comparison to EQ, GT, GTU. */
23898 switch (code)
23900 case EQ:
23901 case GT:
23902 case GTU:
23903 break;
23905 case NE:
23906 case LE:
23907 case LEU:
23908 code = reverse_condition (code);
23909 *negate = true;
23910 break;
23912 case GE:
23913 case GEU:
23914 code = reverse_condition (code);
23915 *negate = true;
23916 /* FALLTHRU */
23918 case LT:
23919 case LTU:
23920 std::swap (cop0, cop1);
23921 code = swap_condition (code);
23922 break;
23924 default:
23925 gcc_unreachable ();
23928 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23929 if (mode == V2DImode)
23931 switch (code)
23933 case EQ:
23934 /* SSE4.1 supports EQ. */
23935 if (!TARGET_SSE4_1)
23936 return NULL;
23937 break;
23939 case GT:
23940 case GTU:
23941 /* SSE4.2 supports GT/GTU. */
23942 if (!TARGET_SSE4_2)
23943 return NULL;
23944 break;
23946 default:
23947 gcc_unreachable ();
23951 /* Unsigned parallel compare is not supported by the hardware.
23952 Play some tricks to turn this into a signed comparison
23953 against 0. */
23954 if (code == GTU)
23956 cop0 = force_reg (mode, cop0);
23958 switch (mode)
23960 case E_V16SImode:
23961 case E_V8DImode:
23962 case E_V8SImode:
23963 case E_V4DImode:
23964 case E_V4SImode:
23965 case E_V2DImode:
23967 rtx t1, t2, mask;
23968 rtx (*gen_sub3) (rtx, rtx, rtx);
23970 switch (mode)
23972 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23973 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23974 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23975 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23976 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23977 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23978 default:
23979 gcc_unreachable ();
23981 /* Subtract (-(INT MAX) - 1) from both operands to make
23982 them signed. */
23983 mask = ix86_build_signbit_mask (mode, true, false);
23984 t1 = gen_reg_rtx (mode);
23985 emit_insn (gen_sub3 (t1, cop0, mask));
23987 t2 = gen_reg_rtx (mode);
23988 emit_insn (gen_sub3 (t2, cop1, mask));
23990 cop0 = t1;
23991 cop1 = t2;
23992 code = GT;
23994 break;
23996 case E_V64QImode:
23997 case E_V32HImode:
23998 case E_V32QImode:
23999 case E_V16HImode:
24000 case E_V16QImode:
24001 case E_V8HImode:
24002 /* Perform a parallel unsigned saturating subtraction. */
24003 x = gen_reg_rtx (mode);
24004 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24005 cop1)));
24007 cop0 = x;
24008 cop1 = CONST0_RTX (mode);
24009 code = EQ;
24010 *negate = !*negate;
24011 break;
24013 default:
24014 gcc_unreachable ();
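  /* If the comparison had to be negated, swap the true and false arms so
     the caller still selects the intended values.  */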
24019 if (*negate)
24020 std::swap (op_true, op_false);
24022 /* Allow the comparison to be done in one mode, but the movcc to
24023 happen in another mode. */
24024 if (data_mode == mode)
24026 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24027 op_true, op_false);
24029 else
24031 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24032 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24033 op_true, op_false);
24034 if (GET_MODE (x) == mode)
24035 x = gen_lowpart (data_mode, x);
24038 return x;
24041 /* Expand integer vector comparison. */
24043 bool
24044 ix86_expand_int_vec_cmp (rtx operands[])
24046 rtx_code code = GET_CODE (operands[1]);
24047 bool negate = false;
24048 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24049 operands[3], NULL, NULL, &negate);
24051 if (!cmp)
24052 return false;
24054 if (negate)
24055 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24056 CONST0_RTX (GET_MODE (cmp)),
24057 NULL, NULL, &negate);
24059 gcc_assert (!negate);
24061 if (operands[0] != cmp)
24062 emit_move_insn (operands[0], cmp);
24064 return true;
24067 /* Expand a floating-point vector conditional move; a vcond operation
24068 rather than a movcc operation. */
24070 bool
24071 ix86_expand_fp_vcond (rtx operands[])
24073 enum rtx_code code = GET_CODE (operands[3]);
24074 rtx cmp;
24076 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24077 &operands[4], &operands[5]);
24078 if (code == UNKNOWN)
24080 rtx temp;
24081 switch (GET_CODE (operands[3]))
24083 case LTGT:
24084 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24085 operands[5], operands[0], operands[0]);
24086 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24087 operands[5], operands[1], operands[2]);
24088 code = AND;
24089 break;
24090 case UNEQ:
24091 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24092 operands[5], operands[0], operands[0]);
24093 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24094 operands[5], operands[1], operands[2]);
24095 code = IOR;
24096 break;
24097 default:
24098 gcc_unreachable ();
24100 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24101 OPTAB_DIRECT);
24102 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24103 return true;
24106 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24107 operands[5], operands[1], operands[2]))
24108 return true;
24110 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24111 operands[1], operands[2]);
24112 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24113 return true;
24116 /* Expand a signed/unsigned integral vector conditional move. */
24118 bool
24119 ix86_expand_int_vcond (rtx operands[])
24121 machine_mode data_mode = GET_MODE (operands[0]);
24122 machine_mode mode = GET_MODE (operands[4]);
24123 enum rtx_code code = GET_CODE (operands[3]);
24124 bool negate = false;
24125 rtx x, cop0, cop1;
24127 cop0 = operands[4];
24128 cop1 = operands[5];
24130 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24131 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24132 if ((code == LT || code == GE)
24133 && data_mode == mode
24134 && cop1 == CONST0_RTX (mode)
24135 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24136 && GET_MODE_UNIT_SIZE (data_mode) > 1
24137 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24138 && (GET_MODE_SIZE (data_mode) == 16
24139 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24141 rtx negop = operands[2 - (code == LT)];
24142 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24143 if (negop == CONST1_RTX (data_mode))
24145 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24146 operands[0], 1, OPTAB_DIRECT);
24147 if (res != operands[0])
24148 emit_move_insn (operands[0], res);
24149 return true;
24151 else if (GET_MODE_INNER (data_mode) != DImode
24152 && vector_all_ones_operand (negop, data_mode))
24154 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24155 operands[0], 0, OPTAB_DIRECT);
24156 if (res != operands[0])
24157 emit_move_insn (operands[0], res);
24158 return true;
24162 if (!nonimmediate_operand (cop1, mode))
24163 cop1 = force_reg (mode, cop1);
24164 if (!general_operand (operands[1], data_mode))
24165 operands[1] = force_reg (data_mode, operands[1]);
24166 if (!general_operand (operands[2], data_mode))
24167 operands[2] = force_reg (data_mode, operands[2]);
24169 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24170 operands[1], operands[2], &negate);
24172 if (!x)
24173 return false;
24175 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24176 operands[2-negate]);
24177 return true;
24180 /* AVX512F does support 64-byte integer vector operations,
24181 thus the longest vector we are faced with is V64QImode. */
24182 #define MAX_VECT_LEN 64
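/* Describes a constant permutation to be expanded: the destination and
   input operands, the permutation indices, the vector mode and element
   count, plus flags for single-input shuffles and for dry runs that only
   test whether the permutation is expandable.  */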
24184 struct expand_vec_perm_d
24186 rtx target, op0, op1;
24187 unsigned char perm[MAX_VECT_LEN];
24188 machine_mode vmode;
24189 unsigned char nelt;
24190 bool one_operand_p;
24191 bool testing_p;
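/* Try to expand a permutation with one of the AVX-512 vpermt2var patterns.
   Return true if an insn was emitted, false if the mode is not supported
   with the enabled ISA extensions.  */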
24194 static bool
24195 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24196 struct expand_vec_perm_d *d)
24198 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24199 expander, so args are either in d, or in op0, op1 etc. */
24200 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24201 machine_mode maskmode = mode;
24202 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24204 switch (mode)
24206 case E_V8HImode:
24207 if (TARGET_AVX512VL && TARGET_AVX512BW)
24208 gen = gen_avx512vl_vpermt2varv8hi3;
24209 break;
24210 case E_V16HImode:
24211 if (TARGET_AVX512VL && TARGET_AVX512BW)
24212 gen = gen_avx512vl_vpermt2varv16hi3;
24213 break;
24214 case E_V64QImode:
24215 if (TARGET_AVX512VBMI)
24216 gen = gen_avx512bw_vpermt2varv64qi3;
24217 break;
24218 case E_V32HImode:
24219 if (TARGET_AVX512BW)
24220 gen = gen_avx512bw_vpermt2varv32hi3;
24221 break;
24222 case E_V4SImode:
24223 if (TARGET_AVX512VL)
24224 gen = gen_avx512vl_vpermt2varv4si3;
24225 break;
24226 case E_V8SImode:
24227 if (TARGET_AVX512VL)
24228 gen = gen_avx512vl_vpermt2varv8si3;
24229 break;
24230 case E_V16SImode:
24231 if (TARGET_AVX512F)
24232 gen = gen_avx512f_vpermt2varv16si3;
24233 break;
24234 case E_V4SFmode:
24235 if (TARGET_AVX512VL)
24237 gen = gen_avx512vl_vpermt2varv4sf3;
24238 maskmode = V4SImode;
24240 break;
24241 case E_V8SFmode:
24242 if (TARGET_AVX512VL)
24244 gen = gen_avx512vl_vpermt2varv8sf3;
24245 maskmode = V8SImode;
24247 break;
24248 case E_V16SFmode:
24249 if (TARGET_AVX512F)
24251 gen = gen_avx512f_vpermt2varv16sf3;
24252 maskmode = V16SImode;
24254 break;
24255 case E_V2DImode:
24256 if (TARGET_AVX512VL)
24257 gen = gen_avx512vl_vpermt2varv2di3;
24258 break;
24259 case E_V4DImode:
24260 if (TARGET_AVX512VL)
24261 gen = gen_avx512vl_vpermt2varv4di3;
24262 break;
24263 case E_V8DImode:
24264 if (TARGET_AVX512F)
24265 gen = gen_avx512f_vpermt2varv8di3;
24266 break;
24267 case E_V2DFmode:
24268 if (TARGET_AVX512VL)
24270 gen = gen_avx512vl_vpermt2varv2df3;
24271 maskmode = V2DImode;
24273 break;
24274 case E_V4DFmode:
24275 if (TARGET_AVX512VL)
24277 gen = gen_avx512vl_vpermt2varv4df3;
24278 maskmode = V4DImode;
24280 break;
24281 case E_V8DFmode:
24282 if (TARGET_AVX512F)
24284 gen = gen_avx512f_vpermt2varv8df3;
24285 maskmode = V8DImode;
24287 break;
24288 default:
24289 break;
24292 if (gen == NULL)
24293 return false;
24295 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24296 expander, so args are either in d, or in op0, op1 etc. */
24297 if (d)
24299 rtx vec[64];
24300 target = d->target;
24301 op0 = d->op0;
24302 op1 = d->op1;
24303 for (int i = 0; i < d->nelt; ++i)
24304 vec[i] = GEN_INT (d->perm[i]);
24305 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24308 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24309 return true;
24312 /* Expand a variable vector permutation. */
24314 void
24315 ix86_expand_vec_perm (rtx operands[])
24317 rtx target = operands[0];
24318 rtx op0 = operands[1];
24319 rtx op1 = operands[2];
24320 rtx mask = operands[3];
24321 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24322 machine_mode mode = GET_MODE (op0);
24323 machine_mode maskmode = GET_MODE (mask);
24324 int w, e, i;
24325 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24327 /* Number of elements in the vector. */
24328 w = GET_MODE_NUNITS (mode);
24329 e = GET_MODE_UNIT_SIZE (mode);
24330 gcc_assert (w <= 64);
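  /* With AVX-512F, a single-input shuffle of a 512-bit vector maps directly
     onto one variable-permute instruction.  */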
24332 if (TARGET_AVX512F && one_operand_shuffle)
24334 rtx (*gen) (rtx, rtx, rtx) = NULL;
24335 switch (mode)
24337 case E_V16SImode:
24338 gen = gen_avx512f_permvarv16si;
24339 break;
24340 case E_V16SFmode:
24341 gen = gen_avx512f_permvarv16sf;
24342 break;
24343 case E_V8DImode:
24344 gen = gen_avx512f_permvarv8di;
24345 break;
24346 case E_V8DFmode:
24347 gen = gen_avx512f_permvarv8df;
24348 break;
24349 default:
24350 break;
24352 if (gen != NULL)
24354 emit_insn (gen (target, op0, mask));
24355 return;
24359 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24360 return;
24362 if (TARGET_AVX2)
24364 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24366 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24367 a constant shuffle operand. With a tiny bit of effort we can
24368 use VPERMD instead. A re-interpretation stall for V4DFmode is
24369 unfortunate but there's no avoiding it.
24370 Similarly, V16HImode has no instructions for variable shuffling,
24371 while for V32QImode we can, after preparing suitable masks, use
24372 vpshufb; vpshufb; vpermq; vpor. */
24374 if (mode == V16HImode)
24376 maskmode = mode = V32QImode;
24377 w = 32;
24378 e = 1;
24380 else
24382 maskmode = mode = V8SImode;
24383 w = 8;
24384 e = 4;
24386 t1 = gen_reg_rtx (maskmode);
24388 /* Replicate the low bits of the V4DImode mask into V8SImode:
24389 mask = { A B C D }
24390 t1 = { A A B B C C D D }. */
24391 for (i = 0; i < w / 2; ++i)
24392 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24393 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24394 vt = force_reg (maskmode, vt);
24395 mask = gen_lowpart (maskmode, mask);
24396 if (maskmode == V8SImode)
24397 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24398 else
24399 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24401 /* Multiply the shuffle indices by two. */
24402 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24403 OPTAB_DIRECT);
24405 /* Add one to the odd shuffle indices:
24406 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24407 for (i = 0; i < w / 2; ++i)
24409 vec[i * 2] = const0_rtx;
24410 vec[i * 2 + 1] = const1_rtx;
24412 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24413 vt = validize_mem (force_const_mem (maskmode, vt));
24414 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24415 OPTAB_DIRECT);
24417 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24418 operands[3] = mask = t1;
24419 target = gen_reg_rtx (mode);
24420 op0 = gen_lowpart (mode, op0);
24421 op1 = gen_lowpart (mode, op1);
24424 switch (mode)
24426 case E_V8SImode:
24427 /* The VPERMD and VPERMPS instructions already properly ignore
24428 the high bits of the shuffle elements. No need for us to
24429 perform an AND ourselves. */
24430 if (one_operand_shuffle)
24432 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24433 if (target != operands[0])
24434 emit_move_insn (operands[0],
24435 gen_lowpart (GET_MODE (operands[0]), target));
24437 else
24439 t1 = gen_reg_rtx (V8SImode);
24440 t2 = gen_reg_rtx (V8SImode);
24441 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24442 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24443 goto merge_two;
24445 return;
24447 case E_V8SFmode:
24448 mask = gen_lowpart (V8SImode, mask);
24449 if (one_operand_shuffle)
24450 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24451 else
24453 t1 = gen_reg_rtx (V8SFmode);
24454 t2 = gen_reg_rtx (V8SFmode);
24455 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24456 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24457 goto merge_two;
24459 return;
24461 case E_V4SImode:
24462 /* By combining the two 128-bit input vectors into one 256-bit
24463 input vector, we can use VPERMD and VPERMPS for the full
24464 two-operand shuffle. */
24465 t1 = gen_reg_rtx (V8SImode);
24466 t2 = gen_reg_rtx (V8SImode);
24467 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24468 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24469 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24470 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24471 return;
24473 case E_V4SFmode:
24474 t1 = gen_reg_rtx (V8SFmode);
24475 t2 = gen_reg_rtx (V8SImode);
24476 mask = gen_lowpart (V4SImode, mask);
24477 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24478 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24479 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24480 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24481 return;
24483 case E_V32QImode:
24484 t1 = gen_reg_rtx (V32QImode);
24485 t2 = gen_reg_rtx (V32QImode);
24486 t3 = gen_reg_rtx (V32QImode);
24487 vt2 = GEN_INT (-128);
24488 vt = gen_const_vec_duplicate (V32QImode, vt2);
24489 vt = force_reg (V32QImode, vt);
24490 for (i = 0; i < 32; i++)
24491 vec[i] = i < 16 ? vt2 : const0_rtx;
24492 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24493 vt2 = force_reg (V32QImode, vt2);
24494 /* From mask create two adjusted masks, which contain the same
24495 bits as mask in the low 7 bits of each vector element.
24496 The first mask will have the most significant bit clear
24497 if it requests element from the same 128-bit lane
24498 and MSB set if it requests element from the other 128-bit lane.
24499 The second mask will have the opposite values of the MSB,
24500 and additionally will have its 128-bit lanes swapped.
24501 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24502 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24503 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24504 stands for the other 12 bytes. */
24505 /* The bit that tells whether an element is from the same lane or the
24506 other lane is bit 4, so shift it up by 3 to the MSB position. */
24507 t5 = gen_reg_rtx (V4DImode);
24508 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24509 GEN_INT (3)));
24510 /* Clear MSB bits from the mask just in case it had them set. */
24511 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24512 /* After this t1 will have MSB set for elements from other lane. */
24513 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24514 /* Clear bits other than MSB. */
24515 emit_insn (gen_andv32qi3 (t1, t1, vt));
24516 /* Or in the lower bits from mask into t3. */
24517 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24518 /* And invert MSB bits in t1, so MSB is set for elements from the same
24519 lane. */
24520 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24521 /* Swap 128-bit lanes in t3. */
24522 t6 = gen_reg_rtx (V4DImode);
24523 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24524 const2_rtx, GEN_INT (3),
24525 const0_rtx, const1_rtx));
24526 /* And or in the lower bits from mask into t1. */
24527 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24528 if (one_operand_shuffle)
24530 /* Each of these shuffles will put 0s in places where an
24531 element from the other 128-bit lane is needed, and otherwise
24532 will shuffle in the requested value. */
24533 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24534 gen_lowpart (V32QImode, t6)));
24535 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24536 /* For t3 the 128-bit lanes are swapped again. */
24537 t7 = gen_reg_rtx (V4DImode);
24538 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24539 const2_rtx, GEN_INT (3),
24540 const0_rtx, const1_rtx));
24541 /* And oring both together leads to the result. */
24542 emit_insn (gen_iorv32qi3 (target, t1,
24543 gen_lowpart (V32QImode, t7)));
24544 if (target != operands[0])
24545 emit_move_insn (operands[0],
24546 gen_lowpart (GET_MODE (operands[0]), target));
24547 return;
24550 t4 = gen_reg_rtx (V32QImode);
24551 /* Similar to the one_operand_shuffle code above, just
24552 repeated twice, once for each operand. The merge_two:
24553 code below will merge the two results together. */
24554 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24555 gen_lowpart (V32QImode, t6)));
24556 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24557 gen_lowpart (V32QImode, t6)));
24558 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24559 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24560 t7 = gen_reg_rtx (V4DImode);
24561 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24562 const2_rtx, GEN_INT (3),
24563 const0_rtx, const1_rtx));
24564 t8 = gen_reg_rtx (V4DImode);
24565 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24566 const2_rtx, GEN_INT (3),
24567 const0_rtx, const1_rtx));
24568 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24569 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24570 t1 = t4;
24571 t2 = t3;
24572 goto merge_two;
24574 default:
24575 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24576 break;
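  /* Otherwise fall back to the generic 128-bit pshufb (or XOP vpperm) path
     below: reduce the mask modulo the element count and expand it into a
     byte-granular permutation control.  */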
24580 if (TARGET_XOP)
24582 /* The XOP VPPERM insn supports three inputs. By ignoring the
24583 one_operand_shuffle special case, we avoid creating another
24584 set of constant vectors in memory. */
24585 one_operand_shuffle = false;
24587 /* mask = mask & {2*w-1, ...} */
24588 vt = GEN_INT (2*w - 1);
24590 else
24592 /* mask = mask & {w-1, ...} */
24593 vt = GEN_INT (w - 1);
24596 vt = gen_const_vec_duplicate (maskmode, vt);
24597 mask = expand_simple_binop (maskmode, AND, mask, vt,
24598 NULL_RTX, 0, OPTAB_DIRECT);
24600 /* For non-QImode operations, convert the word permutation control
24601 into a byte permutation control. */
24602 if (mode != V16QImode)
24604 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24605 GEN_INT (exact_log2 (e)),
24606 NULL_RTX, 0, OPTAB_DIRECT);
24608 /* Convert mask to vector of chars. */
24609 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24611 /* Replicate each of the input bytes into byte positions:
24612 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24613 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24614 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24615 for (i = 0; i < 16; ++i)
24616 vec[i] = GEN_INT (i/e * e);
24617 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24618 vt = validize_mem (force_const_mem (V16QImode, vt));
24619 if (TARGET_XOP)
24620 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24621 else
24622 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24624 /* Convert it into the byte positions by doing
24625 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24626 for (i = 0; i < 16; ++i)
24627 vec[i] = GEN_INT (i % e);
24628 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24629 vt = validize_mem (force_const_mem (V16QImode, vt));
24630 emit_insn (gen_addv16qi3 (mask, mask, vt));
24633 /* The actual shuffle operations all operate on V16QImode. */
24634 op0 = gen_lowpart (V16QImode, op0);
24635 op1 = gen_lowpart (V16QImode, op1);
24637 if (TARGET_XOP)
24639 if (GET_MODE (target) != V16QImode)
24640 target = gen_reg_rtx (V16QImode);
24641 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24642 if (target != operands[0])
24643 emit_move_insn (operands[0],
24644 gen_lowpart (GET_MODE (operands[0]), target));
24646 else if (one_operand_shuffle)
24648 if (GET_MODE (target) != V16QImode)
24649 target = gen_reg_rtx (V16QImode);
24650 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24651 if (target != operands[0])
24652 emit_move_insn (operands[0],
24653 gen_lowpart (GET_MODE (operands[0]), target));
24655 else
24657 rtx xops[6];
24658 bool ok;
24660 /* Shuffle the two input vectors independently. */
24661 t1 = gen_reg_rtx (V16QImode);
24662 t2 = gen_reg_rtx (V16QImode);
24663 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24664 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24666 merge_two:
24667 /* Then merge them together. The key is whether any given control
24668 element contained a bit set that indicates the second word. */
24669 mask = operands[3];
24670 vt = GEN_INT (w);
24671 if (maskmode == V2DImode && !TARGET_SSE4_1)
24673 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24674 more shuffle to convert the V2DI input mask into a V4SI
24675 input mask. At which point the masking that expand_int_vcond
24676 performs will work as desired. */
24677 rtx t3 = gen_reg_rtx (V4SImode);
24678 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24679 const0_rtx, const0_rtx,
24680 const2_rtx, const2_rtx));
24681 mask = t3;
24682 maskmode = V4SImode;
24683 e = w = 4;
24686 vt = gen_const_vec_duplicate (maskmode, vt);
24687 vt = force_reg (maskmode, vt);
24688 mask = expand_simple_binop (maskmode, AND, mask, vt,
24689 NULL_RTX, 0, OPTAB_DIRECT);
24691 if (GET_MODE (target) != mode)
24692 target = gen_reg_rtx (mode);
24693 xops[0] = target;
24694 xops[1] = gen_lowpart (mode, t2);
24695 xops[2] = gen_lowpart (mode, t1);
24696 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24697 xops[4] = mask;
24698 xops[5] = vt;
24699 ok = ix86_expand_int_vcond (xops);
24700 gcc_assert (ok);
24701 if (target != operands[0])
24702 emit_move_insn (operands[0],
24703 gen_lowpart (GET_MODE (operands[0]), target));
24707 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
24708 true if we should do zero extension, else sign extension. HIGH_P is
24709 true if we want the N/2 high elements, else the low elements. */
24711 void
24712 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24714 machine_mode imode = GET_MODE (src);
24715 rtx tmp;
24717 if (TARGET_SSE4_1)
24719 rtx (*unpack)(rtx, rtx);
24720 rtx (*extract)(rtx, rtx) = NULL;
24721 machine_mode halfmode = BLKmode;
24723 switch (imode)
24725 case E_V64QImode:
24726 if (unsigned_p)
24727 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24728 else
24729 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24730 halfmode = V32QImode;
24731 extract
24732 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24733 break;
24734 case E_V32QImode:
24735 if (unsigned_p)
24736 unpack = gen_avx2_zero_extendv16qiv16hi2;
24737 else
24738 unpack = gen_avx2_sign_extendv16qiv16hi2;
24739 halfmode = V16QImode;
24740 extract
24741 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24742 break;
24743 case E_V32HImode:
24744 if (unsigned_p)
24745 unpack = gen_avx512f_zero_extendv16hiv16si2;
24746 else
24747 unpack = gen_avx512f_sign_extendv16hiv16si2;
24748 halfmode = V16HImode;
24749 extract
24750 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24751 break;
24752 case E_V16HImode:
24753 if (unsigned_p)
24754 unpack = gen_avx2_zero_extendv8hiv8si2;
24755 else
24756 unpack = gen_avx2_sign_extendv8hiv8si2;
24757 halfmode = V8HImode;
24758 extract
24759 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24760 break;
24761 case E_V16SImode:
24762 if (unsigned_p)
24763 unpack = gen_avx512f_zero_extendv8siv8di2;
24764 else
24765 unpack = gen_avx512f_sign_extendv8siv8di2;
24766 halfmode = V8SImode;
24767 extract
24768 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24769 break;
24770 case E_V8SImode:
24771 if (unsigned_p)
24772 unpack = gen_avx2_zero_extendv4siv4di2;
24773 else
24774 unpack = gen_avx2_sign_extendv4siv4di2;
24775 halfmode = V4SImode;
24776 extract
24777 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24778 break;
24779 case E_V16QImode:
24780 if (unsigned_p)
24781 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24782 else
24783 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24784 break;
24785 case E_V8HImode:
24786 if (unsigned_p)
24787 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24788 else
24789 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24790 break;
24791 case E_V4SImode:
24792 if (unsigned_p)
24793 unpack = gen_sse4_1_zero_extendv2siv2di2;
24794 else
24795 unpack = gen_sse4_1_sign_extendv2siv2di2;
24796 break;
24797 default:
24798 gcc_unreachable ();
24801 if (GET_MODE_SIZE (imode) >= 32)
24803 tmp = gen_reg_rtx (halfmode);
24804 emit_insn (extract (tmp, src));
24806 else if (high_p)
24808 /* Shift higher 8 bytes to lower 8 bytes. */
24809 tmp = gen_reg_rtx (V1TImode);
24810 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24811 GEN_INT (64)));
24812 tmp = gen_lowpart (imode, tmp);
24814 else
24815 tmp = src;
24817 emit_insn (unpack (dest, tmp));
24819 else
24821 rtx (*unpack)(rtx, rtx, rtx);
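      /* Without SSE4.1 we emulate the widening conversion by interleaving
	 SRC with zero (zero extension) or with a vector holding copies of
	 its sign bits (sign extension).  */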
24823 switch (imode)
24825 case E_V16QImode:
24826 if (high_p)
24827 unpack = gen_vec_interleave_highv16qi;
24828 else
24829 unpack = gen_vec_interleave_lowv16qi;
24830 break;
24831 case E_V8HImode:
24832 if (high_p)
24833 unpack = gen_vec_interleave_highv8hi;
24834 else
24835 unpack = gen_vec_interleave_lowv8hi;
24836 break;
24837 case E_V4SImode:
24838 if (high_p)
24839 unpack = gen_vec_interleave_highv4si;
24840 else
24841 unpack = gen_vec_interleave_lowv4si;
24842 break;
24843 default:
24844 gcc_unreachable ();
24847 if (unsigned_p)
24848 tmp = force_reg (imode, CONST0_RTX (imode));
24849 else
24850 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24851 src, pc_rtx, pc_rtx);
24853 rtx tmp2 = gen_reg_rtx (imode);
24854 emit_insn (unpack (tmp2, src, tmp));
24855 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24859 /* Expand conditional increment or decrement using adc/sbb instructions.
24860 The default case using setcc followed by the conditional move can be
24861 done by generic code. */
24862 bool
24863 ix86_expand_int_addcc (rtx operands[])
24865 enum rtx_code code = GET_CODE (operands[1]);
24866 rtx flags;
24867 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24868 rtx compare_op;
24869 rtx val = const0_rtx;
24870 bool fpcmp = false;
24871 machine_mode mode;
24872 rtx op0 = XEXP (operands[1], 0);
24873 rtx op1 = XEXP (operands[1], 1);
24875 if (operands[3] != const1_rtx
24876 && operands[3] != constm1_rtx)
24877 return false;
24878 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24879 return false;
24880 code = GET_CODE (compare_op);
24882 flags = XEXP (compare_op, 0);
24884 if (GET_MODE (flags) == CCFPmode)
24886 fpcmp = true;
24887 code = ix86_fp_compare_code_to_integer (code);
24890 if (code != LTU)
24892 val = constm1_rtx;
24893 if (fpcmp)
24894 PUT_CODE (compare_op,
24895 reverse_condition_maybe_unordered
24896 (GET_CODE (compare_op)));
24897 else
24898 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24901 mode = GET_MODE (operands[0]);
24903 /* Construct either adc or sbb insn. */
24904 if ((code == LTU) == (operands[3] == constm1_rtx))
24906 switch (mode)
24908 case E_QImode:
24909 insn = gen_subqi3_carry;
24910 break;
24911 case E_HImode:
24912 insn = gen_subhi3_carry;
24913 break;
24914 case E_SImode:
24915 insn = gen_subsi3_carry;
24916 break;
24917 case E_DImode:
24918 insn = gen_subdi3_carry;
24919 break;
24920 default:
24921 gcc_unreachable ();
24924 else
24926 switch (mode)
24928 case E_QImode:
24929 insn = gen_addqi3_carry;
24930 break;
24931 case E_HImode:
24932 insn = gen_addhi3_carry;
24933 break;
24934 case E_SImode:
24935 insn = gen_addsi3_carry;
24936 break;
24937 case E_DImode:
24938 insn = gen_adddi3_carry;
24939 break;
24940 default:
24941 gcc_unreachable ();
24944 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24946 return true;
24950 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
24951 but works for floating point parameters and non-offsettable memories.
24952 For pushes, it returns just stack offsets; the values will be saved
24953 in the right order. At most four parts are generated. */
24955 static int
24956 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24958 int size;
24960 if (!TARGET_64BIT)
24961 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24962 else
24963 size = (GET_MODE_SIZE (mode) + 4) / 8;
24965 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24966 gcc_assert (size >= 2 && size <= 4);
24968 /* Optimize constant pool references to immediates. This is used by fp
24969 moves, which force all constants to memory to allow combining. */
24970 if (MEM_P (operand) && MEM_READONLY_P (operand))
24971 operand = avoid_constant_pool_reference (operand);
24973 if (MEM_P (operand) && !offsettable_memref_p (operand))
24975 /* The only non-offsettable memories we handle are pushes. */
24976 int ok = push_operand (operand, VOIDmode);
24978 gcc_assert (ok);
24980 operand = copy_rtx (operand);
24981 PUT_MODE (operand, word_mode);
24982 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24983 return size;
24986 if (GET_CODE (operand) == CONST_VECTOR)
24988 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24989 /* Caution: if we looked through a constant pool memory above,
24990 the operand may actually have a different mode now. That's
24991 ok, since we want to pun this all the way back to an integer. */
24992 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
24993 gcc_assert (operand != NULL);
24994 mode = imode;
24997 if (!TARGET_64BIT)
24999 if (mode == DImode)
25000 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25001 else
25003 int i;
25005 if (REG_P (operand))
25007 gcc_assert (reload_completed);
25008 for (i = 0; i < size; i++)
25009 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25011 else if (offsettable_memref_p (operand))
25013 operand = adjust_address (operand, SImode, 0);
25014 parts[0] = operand;
25015 for (i = 1; i < size; i++)
25016 parts[i] = adjust_address (operand, SImode, 4 * i);
25018 else if (CONST_DOUBLE_P (operand))
25020 const REAL_VALUE_TYPE *r;
25021 long l[4];
25023 r = CONST_DOUBLE_REAL_VALUE (operand);
25024 switch (mode)
25026 case E_TFmode:
25027 real_to_target (l, r, mode);
25028 parts[3] = gen_int_mode (l[3], SImode);
25029 parts[2] = gen_int_mode (l[2], SImode);
25030 break;
25031 case E_XFmode:
25032 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25033 long double may not be 80-bit. */
25034 real_to_target (l, r, mode);
25035 parts[2] = gen_int_mode (l[2], SImode);
25036 break;
25037 case E_DFmode:
25038 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25039 break;
25040 default:
25041 gcc_unreachable ();
25043 parts[1] = gen_int_mode (l[1], SImode);
25044 parts[0] = gen_int_mode (l[0], SImode);
25046 else
25047 gcc_unreachable ();
25050 else
25052 if (mode == TImode)
25053 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25054 if (mode == XFmode || mode == TFmode)
25056 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25057 if (REG_P (operand))
25059 gcc_assert (reload_completed);
25060 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25061 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25063 else if (offsettable_memref_p (operand))
25065 operand = adjust_address (operand, DImode, 0);
25066 parts[0] = operand;
25067 parts[1] = adjust_address (operand, upper_mode, 8);
25069 else if (CONST_DOUBLE_P (operand))
25071 long l[4];
25073 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25075 /* real_to_target puts 32-bit pieces in each long. */
25076 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25077 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25078 << 32), DImode);
25080 if (upper_mode == SImode)
25081 parts[1] = gen_int_mode (l[2], SImode);
25082 else
25083 parts[1]
25084 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25085 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25086 << 32), DImode);
25088 else
25089 gcc_unreachable ();
25093 return size;
25096 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25097 Return false when normal moves are needed; true when all required
25098 insns have been emitted. Operands 2-4 contain the input values
25099 in the correct order; operands 5-7 contain the output values. */
25101 void
25102 ix86_split_long_move (rtx operands[])
25104 rtx part[2][4];
25105 int nparts, i, j;
25106 int push = 0;
25107 int collisions = 0;
25108 machine_mode mode = GET_MODE (operands[0]);
25109 bool collisionparts[4];
25111 /* The DFmode expanders may ask us to move a double.
25112 For a 64-bit target this is a single move. By hiding that fact
25113 here we simplify the i386.md splitters. */
25114 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25116 /* Optimize constant pool references to immediates. This is used by
25117 fp moves, which force all constants to memory to allow combining. */
25119 if (MEM_P (operands[1])
25120 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25121 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25122 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25123 if (push_operand (operands[0], VOIDmode))
25125 operands[0] = copy_rtx (operands[0]);
25126 PUT_MODE (operands[0], word_mode);
25128 else
25129 operands[0] = gen_lowpart (DImode, operands[0]);
25130 operands[1] = gen_lowpart (DImode, operands[1]);
25131 emit_move_insn (operands[0], operands[1]);
25132 return;
25135 /* The only non-offsettable memory we handle is push. */
25136 if (push_operand (operands[0], VOIDmode))
25137 push = 1;
25138 else
25139 gcc_assert (!MEM_P (operands[0])
25140 || offsettable_memref_p (operands[0]));
25142 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25143 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25145 /* When emitting a push, take care of source operands on the stack. */
25146 if (push && MEM_P (operands[1])
25147 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25149 rtx src_base = XEXP (part[1][nparts - 1], 0);
25151 /* Compensate for the stack decrement by 4. */
25152 if (!TARGET_64BIT && nparts == 3
25153 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25154 src_base = plus_constant (Pmode, src_base, 4);
25156 /* src_base refers to the stack pointer and is
25157 automatically decreased by emitted push. */
25158 for (i = 0; i < nparts; i++)
25159 part[1][i] = change_address (part[1][i],
25160 GET_MODE (part[1][i]), src_base);
25163 /* We need to do the copy in the right order in case an address register
25164 of the source overlaps the destination. */
25165 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25167 rtx tmp;
25169 for (i = 0; i < nparts; i++)
25171 collisionparts[i]
25172 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25173 if (collisionparts[i])
25174 collisions++;
25177 /* Collision in the middle part can be handled by reordering. */
25178 if (collisions == 1 && nparts == 3 && collisionparts [1])
25180 std::swap (part[0][1], part[0][2]);
25181 std::swap (part[1][1], part[1][2]);
25183 else if (collisions == 1
25184 && nparts == 4
25185 && (collisionparts [1] || collisionparts [2]))
25187 if (collisionparts [1])
25189 std::swap (part[0][1], part[0][2]);
25190 std::swap (part[1][1], part[1][2]);
25192 else
25194 std::swap (part[0][2], part[0][3]);
25195 std::swap (part[1][2], part[1][3]);
25199 /* If there are more collisions, we can't handle it by reordering.
25200 Do an lea to the last part and use only one colliding move. */
25201 else if (collisions > 1)
25203 rtx base, addr;
25205 collisions = 1;
25207 base = part[0][nparts - 1];
25209 /* Handle the case when the last part isn't valid for lea.
25210 Happens in 64-bit mode storing the 12-byte XFmode. */
25211 if (GET_MODE (base) != Pmode)
25212 base = gen_rtx_REG (Pmode, REGNO (base));
25214 addr = XEXP (part[1][0], 0);
25215 if (TARGET_TLS_DIRECT_SEG_REFS)
25217 struct ix86_address parts;
25218 int ok = ix86_decompose_address (addr, &parts);
25219 gcc_assert (ok);
25220 /* It is not valid to use %gs: or %fs: in lea. */
25221 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25223 emit_insn (gen_rtx_SET (base, addr));
25224 part[1][0] = replace_equiv_address (part[1][0], base);
25225 for (i = 1; i < nparts; i++)
25227 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25228 part[1][i] = replace_equiv_address (part[1][i], tmp);
25233 if (push)
25235 if (!TARGET_64BIT)
25237 if (nparts == 3)
25239 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25240 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25241 stack_pointer_rtx, GEN_INT (-4)));
25242 emit_move_insn (part[0][2], part[1][2]);
25244 else if (nparts == 4)
25246 emit_move_insn (part[0][3], part[1][3]);
25247 emit_move_insn (part[0][2], part[1][2]);
25250 else
25252 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25253 register, that is OK - we will just use the larger counterpart. We also
25254 retype memory - these come from an attempt to avoid a REX prefix on
25255 moves of the second half of a TFmode value. */
25256 if (GET_MODE (part[1][1]) == SImode)
25258 switch (GET_CODE (part[1][1]))
25260 case MEM:
25261 part[1][1] = adjust_address (part[1][1], DImode, 0);
25262 break;
25264 case REG:
25265 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25266 break;
25268 default:
25269 gcc_unreachable ();
25272 if (GET_MODE (part[1][0]) == SImode)
25273 part[1][0] = part[1][1];
25276 emit_move_insn (part[0][1], part[1][1]);
25277 emit_move_insn (part[0][0], part[1][0]);
25278 return;
25281 /* Choose correct order to not overwrite the source before it is copied. */
25282 if ((REG_P (part[0][0])
25283 && REG_P (part[1][1])
25284 && (REGNO (part[0][0]) == REGNO (part[1][1])
25285 || (nparts == 3
25286 && REGNO (part[0][0]) == REGNO (part[1][2]))
25287 || (nparts == 4
25288 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25289 || (collisions > 0
25290 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25292 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25294 operands[2 + i] = part[0][j];
25295 operands[6 + i] = part[1][j];
25298 else
25300 for (i = 0; i < nparts; i++)
25302 operands[2 + i] = part[0][i];
25303 operands[6 + i] = part[1][i];
25307 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25308 if (optimize_insn_for_size_p ())
25310 for (j = 0; j < nparts - 1; j++)
25311 if (CONST_INT_P (operands[6 + j])
25312 && operands[6 + j] != const0_rtx
25313 && REG_P (operands[2 + j]))
25314 for (i = j; i < nparts - 1; i++)
25315 if (CONST_INT_P (operands[7 + i])
25316 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25317 operands[7 + i] = operands[2 + j];
25320 for (i = 0; i < nparts; i++)
25321 emit_move_insn (operands[2 + i], operands[6 + i]);
25323 return;
25326 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25327 left shift by a constant, either using a single shift or
25328 a sequence of add instructions. */
25330 static void
25331 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25333 rtx (*insn)(rtx, rtx, rtx);
25335 if (count == 1
25336 || (count * ix86_cost->add <= ix86_cost->shift_const
25337 && !optimize_insn_for_size_p ()))
25339 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25340 while (count-- > 0)
25341 emit_insn (insn (operand, operand, operand));
25343 else
25345 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25346 emit_insn (insn (operand, operand, GEN_INT (count)));
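/* Split a double-word left shift (DImode on 32-bit targets, TImode on
   64-bit targets) into operations on the two half-word parts.  SCRATCH,
   if available, is used by the cmove-based fixup for variable counts.  */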
25350 void
25351 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25353 rtx (*gen_ashl3)(rtx, rtx, rtx);
25354 rtx (*gen_shld)(rtx, rtx, rtx);
25355 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25357 rtx low[2], high[2];
25358 int count;
25360 if (CONST_INT_P (operands[2]))
25362 split_double_mode (mode, operands, 2, low, high);
25363 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25365 if (count >= half_width)
25367 emit_move_insn (high[0], low[1]);
25368 emit_move_insn (low[0], const0_rtx);
25370 if (count > half_width)
25371 ix86_expand_ashl_const (high[0], count - half_width, mode);
25373 else
25375 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25377 if (!rtx_equal_p (operands[0], operands[1]))
25378 emit_move_insn (operands[0], operands[1]);
25380 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25381 ix86_expand_ashl_const (low[0], count, mode);
25383 return;
25386 split_double_mode (mode, operands, 1, low, high);
25388 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25390 if (operands[1] == const1_rtx)
25392 /* Assuming we've chosen QImode-capable registers, 1 << N
25393 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25394 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25396 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25398 ix86_expand_clear (low[0]);
25399 ix86_expand_clear (high[0]);
25400 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25402 d = gen_lowpart (QImode, low[0]);
25403 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25404 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25405 emit_insn (gen_rtx_SET (d, s));
25407 d = gen_lowpart (QImode, high[0]);
25408 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25409 s = gen_rtx_NE (QImode, flags, const0_rtx);
25410 emit_insn (gen_rtx_SET (d, s));
25413 /* Otherwise, we can get the same results by manually performing
25414 a bit extract operation on bit 5/6, and then performing the two
25415 shifts. The two methods of getting 0/1 into low/high are exactly
25416 the same size. Avoiding the shift in the bit extract case helps
25417 pentium4 a bit; no one else seems to care much either way. */
25418 else
25420 machine_mode half_mode;
25421 rtx (*gen_lshr3)(rtx, rtx, rtx);
25422 rtx (*gen_and3)(rtx, rtx, rtx);
25423 rtx (*gen_xor3)(rtx, rtx, rtx);
25424 HOST_WIDE_INT bits;
25425 rtx x;
25427 if (mode == DImode)
25429 half_mode = SImode;
25430 gen_lshr3 = gen_lshrsi3;
25431 gen_and3 = gen_andsi3;
25432 gen_xor3 = gen_xorsi3;
25433 bits = 5;
25435 else
25437 half_mode = DImode;
25438 gen_lshr3 = gen_lshrdi3;
25439 gen_and3 = gen_anddi3;
25440 gen_xor3 = gen_xordi3;
25441 bits = 6;
25444 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25445 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25446 else
25447 x = gen_lowpart (half_mode, operands[2]);
25448 emit_insn (gen_rtx_SET (high[0], x));
25450 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25451 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25452 emit_move_insn (low[0], high[0]);
25453 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25456 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25457 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25458 return;
25461 if (operands[1] == constm1_rtx)
25463 /* For -1 << N, we can avoid the shld instruction, because we
25464 know that we're shifting 0...31/63 ones into a -1. */
25465 emit_move_insn (low[0], constm1_rtx);
25466 if (optimize_insn_for_size_p ())
25467 emit_move_insn (high[0], low[0]);
25468 else
25469 emit_move_insn (high[0], constm1_rtx);
25471 else
25473 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25475 if (!rtx_equal_p (operands[0], operands[1]))
25476 emit_move_insn (operands[0], operands[1]);
25478 split_double_mode (mode, operands, 1, low, high);
25479 emit_insn (gen_shld (high[0], low[0], operands[2]));
25482 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25484 if (TARGET_CMOVE && scratch)
25486 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25487 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25489 ix86_expand_clear (scratch);
25490 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25492 else
25494 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25495 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25497 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
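/* Illustrative sketch of the variable-count path above for DImode on a
   32-bit target (exposition only): with the value in EDX:EAX and the count
   in ECX, the expansion is roughly

       shld %cl, %eax, %edx      high half shifted left, filled from low half
       sal  %cl, %eax            low half shifted left

   followed by an adjustment that, when bit 5 of the count is set (count >=
   32), moves the shifted low half into the high half and clears the low
   half, using cmov when available and a conditional jump otherwise.  */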
25501 void
25502 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25504 rtx (*gen_ashr3)(rtx, rtx, rtx)
25505 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25506 rtx (*gen_shrd)(rtx, rtx, rtx);
25507 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25509 rtx low[2], high[2];
25510 int count;
25512 if (CONST_INT_P (operands[2]))
25514 split_double_mode (mode, operands, 2, low, high);
25515 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25517 if (count == GET_MODE_BITSIZE (mode) - 1)
25519 emit_move_insn (high[0], high[1]);
25520 emit_insn (gen_ashr3 (high[0], high[0],
25521 GEN_INT (half_width - 1)));
25522 emit_move_insn (low[0], high[0]);
25525 else if (count >= half_width)
25527 emit_move_insn (low[0], high[1]);
25528 emit_move_insn (high[0], low[0]);
25529 emit_insn (gen_ashr3 (high[0], high[0],
25530 GEN_INT (half_width - 1)));
25532 if (count > half_width)
25533 emit_insn (gen_ashr3 (low[0], low[0],
25534 GEN_INT (count - half_width)));
25536 else
25538 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25540 if (!rtx_equal_p (operands[0], operands[1]))
25541 emit_move_insn (operands[0], operands[1]);
25543 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25544 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25547 else
25549 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25551 if (!rtx_equal_p (operands[0], operands[1]))
25552 emit_move_insn (operands[0], operands[1]);
25554 split_double_mode (mode, operands, 1, low, high);
25556 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25557 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25559 if (TARGET_CMOVE && scratch)
25561 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25562 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25564 emit_move_insn (scratch, high[0]);
25565 emit_insn (gen_ashr3 (scratch, scratch,
25566 GEN_INT (half_width - 1)));
25567 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25568 scratch));
25570 else
25572 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25573 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25575 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
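/* Illustrative example for the constant-count case above (exposition only):
   an arithmetic right shift of a DImode value in EDX:EAX by 63 on a 32-bit
   target needs no shrd at all; both result halves are simply the replicated
   sign bit, roughly

       sar $31, %edx             high half = sign
       mov %edx, %eax            low half  = sign

   For other constant counts >= 32 the low half instead receives the old
   high half shifted right by COUNT - 32.  */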
25580 void
25581 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25583 rtx (*gen_lshr3)(rtx, rtx, rtx)
25584 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25585 rtx (*gen_shrd)(rtx, rtx, rtx);
25586 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25588 rtx low[2], high[2];
25589 int count;
25591 if (CONST_INT_P (operands[2]))
25593 split_double_mode (mode, operands, 2, low, high);
25594 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25596 if (count >= half_width)
25598 emit_move_insn (low[0], high[1]);
25599 ix86_expand_clear (high[0]);
25601 if (count > half_width)
25602 emit_insn (gen_lshr3 (low[0], low[0],
25603 GEN_INT (count - half_width)));
25605 else
25607 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25609 if (!rtx_equal_p (operands[0], operands[1]))
25610 emit_move_insn (operands[0], operands[1]);
25612 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25613 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25616 else
25618 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25620 if (!rtx_equal_p (operands[0], operands[1]))
25621 emit_move_insn (operands[0], operands[1]);
25623 split_double_mode (mode, operands, 1, low, high);
25625 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25626 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25628 if (TARGET_CMOVE && scratch)
25630 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25631 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25633 ix86_expand_clear (scratch);
25634 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25635 scratch));
25637 else
25639 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25640 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25642 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25647 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25648 static void
25649 predict_jump (int prob)
25651 rtx_insn *insn = get_last_insn ();
25652 gcc_assert (JUMP_P (insn));
25653 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25656 /* Helper function for the string operations below. Test whether VARIABLE
25657 has the VALUE bits clear (i.e. is aligned to VALUE bytes); if so, jump to the returned label. */
25658 static rtx_code_label *
25659 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25661 rtx_code_label *label = gen_label_rtx ();
25662 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25663 if (GET_MODE (variable) == DImode)
25664 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25665 else
25666 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25667 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25668 1, label);
25669 if (epilogue)
25670 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25671 else
25672 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25673 return label;
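/* Illustrative sketch (exposition only): a call such as
   ix86_expand_aligntest (ptr, 4, ...) emits RTL corresponding roughly to

       tmp = ptr & 4;
       if (tmp == 0) goto label;

   so the returned label is reached when the tested alignment bit is clear,
   and the conditional jump carries the probability chosen above.  */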
25676 /* Decrease COUNTREG by VALUE. */
25677 static void
25678 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25680 rtx (*gen_add)(rtx, rtx, rtx)
25681 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25683 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25686 /* Zero-extend EXP, which may be SImode, into a Pmode register. */
25687 rtx
25688 ix86_zero_extend_to_Pmode (rtx exp)
25690 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25693 /* Divide COUNTREG by SCALE. */
25694 static rtx
25695 scale_counter (rtx countreg, int scale)
25697 rtx sc;
25699 if (scale == 1)
25700 return countreg;
25701 if (CONST_INT_P (countreg))
25702 return GEN_INT (INTVAL (countreg) / scale);
25703 gcc_assert (REG_P (countreg));
25705 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25706 GEN_INT (exact_log2 (scale)),
25707 NULL, 1, OPTAB_DIRECT);
25708 return sc;
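/* Illustrative example (exposition only): scaling a byte count for an
   SImode rep operation divides it by 4; a constant 64 becomes 16 directly,
   while a register count is shifted, roughly count = count >> 2.  SCALE is
   expected to be a power of two so that exact_log2 yields a valid shift.  */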
25711 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25712 DImode for constant loop counts. */
25714 static machine_mode
25715 counter_mode (rtx count_exp)
25717 if (GET_MODE (count_exp) != VOIDmode)
25718 return GET_MODE (count_exp);
25719 if (!CONST_INT_P (count_exp))
25720 return Pmode;
25721 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25722 return DImode;
25723 return SImode;
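/* Illustrative examples of the choice above (exposition only): a count
   already held in an SImode or DImode register keeps its mode; a constant
   such as 0x100 yields SImode even on 64-bit targets, while on 64-bit
   targets a constant that does not fit in 32 bits yields DImode.  */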
25726 /* Copy the address to a Pmode register. This is used for x32 to
25727 truncate DImode TLS address to a SImode register. */
25729 static rtx
25730 ix86_copy_addr_to_reg (rtx addr)
25732 rtx reg;
25733 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25735 reg = copy_addr_to_reg (addr);
25736 REG_POINTER (reg) = 1;
25737 return reg;
25739 else
25741 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25742 reg = copy_to_mode_reg (DImode, addr);
25743 REG_POINTER (reg) = 1;
25744 return gen_rtx_SUBREG (SImode, reg, 0);
25748 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
25749 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
25750 is COUNT specified in bytes. When ISSETMEM is TRUE, output the equivalent
25751 loop to set memory by VALUE (supposed to be in MODE).
25753 The size is rounded down to a whole number of chunks moved at once.
25754 SRCMEM and DESTMEM provide MEM rtx to supply proper aliasing info. */
25757 static void
25758 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25759 rtx destptr, rtx srcptr, rtx value,
25760 rtx count, machine_mode mode, int unroll,
25761 int expected_size, bool issetmem)
25763 rtx_code_label *out_label, *top_label;
25764 rtx iter, tmp;
25765 machine_mode iter_mode = counter_mode (count);
25766 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25767 rtx piece_size = GEN_INT (piece_size_n);
25768 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25769 rtx size;
25770 int i;
25772 top_label = gen_label_rtx ();
25773 out_label = gen_label_rtx ();
25774 iter = gen_reg_rtx (iter_mode);
25776 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25777 NULL, 1, OPTAB_DIRECT);
25778 /* Those two should combine. */
25779 if (piece_size == const1_rtx)
25781 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25782 true, out_label);
25783 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25785 emit_move_insn (iter, const0_rtx);
25787 emit_label (top_label);
25789 tmp = convert_modes (Pmode, iter_mode, iter, true);
25791 /* This assert could be relaxed - in that case we'd need to compute
25792 the smallest power of two contained in PIECE_SIZE_N and pass it to
25793 offset_address. */
25794 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25795 destmem = offset_address (destmem, tmp, piece_size_n);
25796 destmem = adjust_address (destmem, mode, 0);
25798 if (!issetmem)
25800 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25801 srcmem = adjust_address (srcmem, mode, 0);
25803 /* When unrolling for chips that reorder memory reads and writes,
25804 we can save registers by using a single temporary.
25805 Also, using 4 temporaries is overkill in 32-bit mode. */
25806 if (!TARGET_64BIT && 0)
25808 for (i = 0; i < unroll; i++)
25810 if (i)
25812 destmem =
25813 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25814 srcmem =
25815 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25817 emit_move_insn (destmem, srcmem);
25820 else
25822 rtx tmpreg[4];
25823 gcc_assert (unroll <= 4);
25824 for (i = 0; i < unroll; i++)
25826 tmpreg[i] = gen_reg_rtx (mode);
25827 if (i)
25829 srcmem =
25830 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25832 emit_move_insn (tmpreg[i], srcmem);
25834 for (i = 0; i < unroll; i++)
25836 if (i)
25838 destmem =
25839 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25841 emit_move_insn (destmem, tmpreg[i]);
25845 else
25846 for (i = 0; i < unroll; i++)
25848 if (i)
25849 destmem =
25850 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25851 emit_move_insn (destmem, value);
25854 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25855 true, OPTAB_LIB_WIDEN);
25856 if (tmp != iter)
25857 emit_move_insn (iter, tmp);
25859 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25860 true, top_label);
25861 if (expected_size != -1)
25863 expected_size /= GET_MODE_SIZE (mode) * unroll;
25864 if (expected_size == 0)
25865 predict_jump (0);
25866 else if (expected_size > REG_BR_PROB_BASE)
25867 predict_jump (REG_BR_PROB_BASE - 1);
25868 else
25869 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25871 else
25872 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25873 iter = ix86_zero_extend_to_Pmode (iter);
25874 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25875 true, OPTAB_LIB_WIDEN);
25876 if (tmp != destptr)
25877 emit_move_insn (destptr, tmp);
25878 if (!issetmem)
25880 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25881 true, OPTAB_LIB_WIDEN);
25882 if (tmp != srcptr)
25883 emit_move_insn (srcptr, tmp);
25885 emit_label (out_label);
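/* Illustrative sketch of the code generated above for a memcpy with
   MODE == SImode and UNROLL == 2 (8 bytes per iteration), written as C-like
   pseudocode rather than the actual RTL (exposition only):

       size = count & ~7;
       iter = 0;
     top:
       copy 8 bytes from src + iter to dest + iter via 2 SImode temporaries;
       iter += 8;
       if (iter < size) goto top;
       dest += iter;  src += iter;

   The body runs at least once, which is why callers emit a separate zero
   size guard, and the count is rounded down to whole chunks; the remainder
   is left to the epilogue.  */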
25888 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
25889 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25890 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25891 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25892 ORIG_VALUE is the original value passed to memset to fill the memory with.
25893 Other arguments have the same meaning as for the previous function. */
25895 static void
25896 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25897 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25898 rtx count,
25899 machine_mode mode, bool issetmem)
25901 rtx destexp;
25902 rtx srcexp;
25903 rtx countreg;
25904 HOST_WIDE_INT rounded_count;
25906 /* If possible, it is shorter to use rep movs.
25907 TODO: Maybe it is better to move this logic to decide_alg. */
25908 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25909 && (!issetmem || orig_value == const0_rtx))
25910 mode = SImode;
25912 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25913 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25915 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25916 GET_MODE_SIZE (mode)));
25917 if (mode != QImode)
25919 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25920 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25921 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25923 else
25924 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25925 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25927 rounded_count
25928 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25929 destmem = shallow_copy_rtx (destmem);
25930 set_mem_size (destmem, rounded_count);
25932 else if (MEM_SIZE_KNOWN_P (destmem))
25933 clear_mem_size (destmem);
25935 if (issetmem)
25937 value = force_reg (mode, gen_lowpart (mode, value));
25938 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25940 else
25942 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25943 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25944 if (mode != QImode)
25946 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25947 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25948 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25950 else
25951 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25952 if (CONST_INT_P (count))
25954 rounded_count
25955 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25956 srcmem = shallow_copy_rtx (srcmem);
25957 set_mem_size (srcmem, rounded_count);
25959 else
25961 if (MEM_SIZE_KNOWN_P (srcmem))
25962 clear_mem_size (srcmem);
25964 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25965 destexp, srcexp));
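/* Illustrative sketch (exposition only): for a memcpy expanded here with
   MODE == SImode the result corresponds roughly to

       edi = dest;  esi = src;  ecx = count >> 2;
       rep movsl

   and a zeroing memset with MODE == SImode to

       edi = dest;  eax = 0;  ecx = count >> 2;
       rep stosl

   The QImode variants skip the count scaling and use movsb/stosb.  */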
25969 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25970 DESTMEM.
25971 SRCMEM is passed by pointer so it can be updated on return.
25972 Return value is the updated DESTMEM. */
25973 static rtx
25974 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25975 HOST_WIDE_INT size_to_move)
25977 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25978 enum insn_code code;
25979 machine_mode move_mode;
25980 int piece_size, i;
25982 /* Find the widest mode in which we could perform moves.
25983 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
25984 it until a move of such size is supported. */
25985 piece_size = 1 << floor_log2 (size_to_move);
25986 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25987 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25989 gcc_assert (piece_size > 1);
25990 piece_size >>= 1;
25993 /* Find the corresponding vector mode with the same size as MOVE_MODE.
25994 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
25995 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
25997 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
25998 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
25999 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26001 move_mode = word_mode;
26002 piece_size = GET_MODE_SIZE (move_mode);
26003 code = optab_handler (mov_optab, move_mode);
26006 gcc_assert (code != CODE_FOR_nothing);
26008 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26009 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26011 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26012 gcc_assert (size_to_move % piece_size == 0);
26013 adjust = GEN_INT (piece_size);
26014 for (i = 0; i < size_to_move; i += piece_size)
26016 /* We move from memory to memory, so we'll need to do it via
26017 a temporary register. */
26018 tempreg = gen_reg_rtx (move_mode);
26019 emit_insn (GEN_FCN (code) (tempreg, src));
26020 emit_insn (GEN_FCN (code) (dst, tempreg));
26022 emit_move_insn (destptr,
26023 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26024 emit_move_insn (srcptr,
26025 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26027 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26028 piece_size);
26029 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26030 piece_size);
26033 /* Update DST and SRC rtx. */
26034 *srcmem = src;
26035 return dst;
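/* Illustrative example (exposition only): emit_memmov with SIZE_TO_MOVE == 8
   on a 64-bit target picks DImode and emits a single load/store pair through
   a fresh temporary, roughly

       tmp = load 8 bytes from *srcptr;  store tmp to *destptr;
       srcptr += 8;  destptr += 8;

   Larger power-of-two sizes use one wider (possibly vector) move per piece
   when a suitable move pattern exists, otherwise word_mode pieces.  */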
26038 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26039 static void
26040 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26041 rtx destptr, rtx srcptr, rtx count, int max_size)
26043 rtx src, dest;
26044 if (CONST_INT_P (count))
26046 HOST_WIDE_INT countval = INTVAL (count);
26047 HOST_WIDE_INT epilogue_size = countval % max_size;
26048 int i;
26050 /* For now MAX_SIZE should be a power of 2. This assert could be
26051 relaxed, but it'll require a bit more complicated epilogue
26052 expansion. */
26053 gcc_assert ((max_size & (max_size - 1)) == 0);
26054 for (i = max_size; i >= 1; i >>= 1)
26056 if (epilogue_size & i)
26057 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26059 return;
26061 if (max_size > 8)
26063 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26064 count, 1, OPTAB_DIRECT);
26065 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26066 count, QImode, 1, 4, false);
26067 return;
26070 /* When there are stringops, we can cheaply increase dest and src pointers.
26071 Otherwise we save code size by maintaining offset (zero is readily
26072 available from the preceding rep operation) and using x86 addressing modes. */
26074 if (TARGET_SINGLE_STRINGOP)
26076 if (max_size > 4)
26078 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26079 src = change_address (srcmem, SImode, srcptr);
26080 dest = change_address (destmem, SImode, destptr);
26081 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26082 emit_label (label);
26083 LABEL_NUSES (label) = 1;
26085 if (max_size > 2)
26087 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26088 src = change_address (srcmem, HImode, srcptr);
26089 dest = change_address (destmem, HImode, destptr);
26090 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26091 emit_label (label);
26092 LABEL_NUSES (label) = 1;
26094 if (max_size > 1)
26096 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26097 src = change_address (srcmem, QImode, srcptr);
26098 dest = change_address (destmem, QImode, destptr);
26099 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26100 emit_label (label);
26101 LABEL_NUSES (label) = 1;
26104 else
26106 rtx offset = force_reg (Pmode, const0_rtx);
26107 rtx tmp;
26109 if (max_size > 4)
26111 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26112 src = change_address (srcmem, SImode, srcptr);
26113 dest = change_address (destmem, SImode, destptr);
26114 emit_move_insn (dest, src);
26115 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26116 true, OPTAB_LIB_WIDEN);
26117 if (tmp != offset)
26118 emit_move_insn (offset, tmp);
26119 emit_label (label);
26120 LABEL_NUSES (label) = 1;
26122 if (max_size > 2)
26124 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26125 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26126 src = change_address (srcmem, HImode, tmp);
26127 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26128 dest = change_address (destmem, HImode, tmp);
26129 emit_move_insn (dest, src);
26130 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26131 true, OPTAB_LIB_WIDEN);
26132 if (tmp != offset)
26133 emit_move_insn (offset, tmp);
26134 emit_label (label);
26135 LABEL_NUSES (label) = 1;
26137 if (max_size > 1)
26139 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26140 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26141 src = change_address (srcmem, QImode, tmp);
26142 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26143 dest = change_address (destmem, QImode, tmp);
26144 emit_move_insn (dest, src);
26145 emit_label (label);
26146 LABEL_NUSES (label) = 1;
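/* Illustrative example (exposition only): with a constant total count of 71
   and MAX_SIZE of 16, the epilogue size is 71 % 16 == 7, which the constant
   path above decomposes into one 4-byte, one 2-byte and one 1-byte move
   (the set bits of 7), each emitted via emit_memmov.  */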
26151 /* This function emits moves to fill SIZE_TO_MOVE bytes starting at DESTMEM
26152 with the value PROMOTED_VAL.
26153 DESTPTR is advanced past the bytes written.
26154 Return value is the updated DESTMEM. */
26155 static rtx
26156 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26157 HOST_WIDE_INT size_to_move)
26159 rtx dst = destmem, adjust;
26160 enum insn_code code;
26161 machine_mode move_mode;
26162 int piece_size, i;
26164 /* Find the widest mode in which we could perform moves.
26165 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
26166 it until a move of such size is supported. */
26167 move_mode = GET_MODE (promoted_val);
26168 if (move_mode == VOIDmode)
26169 move_mode = QImode;
26170 if (size_to_move < GET_MODE_SIZE (move_mode))
26172 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26173 move_mode = int_mode_for_size (move_bits, 0).require ();
26174 promoted_val = gen_lowpart (move_mode, promoted_val);
26176 piece_size = GET_MODE_SIZE (move_mode);
26177 code = optab_handler (mov_optab, move_mode);
26178 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26180 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26182 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26183 gcc_assert (size_to_move % piece_size == 0);
26184 adjust = GEN_INT (piece_size);
26185 for (i = 0; i < size_to_move; i += piece_size)
26187 if (piece_size <= GET_MODE_SIZE (word_mode))
26189 emit_insn (gen_strset (destptr, dst, promoted_val));
26190 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26191 piece_size);
26192 continue;
26195 emit_insn (GEN_FCN (code) (dst, promoted_val));
26197 emit_move_insn (destptr,
26198 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26200 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26201 piece_size);
26204 /* Update DST rtx. */
26205 return dst;
26207 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26208 static void
26209 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26210 rtx count, int max_size)
26212 count =
26213 expand_simple_binop (counter_mode (count), AND, count,
26214 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26215 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26216 gen_lowpart (QImode, value), count, QImode,
26217 1, max_size / 2, true);
26220 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26221 static void
26222 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26223 rtx count, int max_size)
26225 rtx dest;
26227 if (CONST_INT_P (count))
26229 HOST_WIDE_INT countval = INTVAL (count);
26230 HOST_WIDE_INT epilogue_size = countval % max_size;
26231 int i;
26233 /* For now MAX_SIZE should be a power of 2. This assert could be
26234 relaxed, but it'll require a bit more complicated epilogue
26235 expansion. */
26236 gcc_assert ((max_size & (max_size - 1)) == 0);
26237 for (i = max_size; i >= 1; i >>= 1)
26239 if (epilogue_size & i)
26241 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26242 destmem = emit_memset (destmem, destptr, vec_value, i);
26243 else
26244 destmem = emit_memset (destmem, destptr, value, i);
26247 return;
26249 if (max_size > 32)
26251 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26252 return;
26254 if (max_size > 16)
26256 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26257 if (TARGET_64BIT)
26259 dest = change_address (destmem, DImode, destptr);
26260 emit_insn (gen_strset (destptr, dest, value));
26261 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26262 emit_insn (gen_strset (destptr, dest, value));
26264 else
26266 dest = change_address (destmem, SImode, destptr);
26267 emit_insn (gen_strset (destptr, dest, value));
26268 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26269 emit_insn (gen_strset (destptr, dest, value));
26270 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26271 emit_insn (gen_strset (destptr, dest, value));
26272 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26273 emit_insn (gen_strset (destptr, dest, value));
26275 emit_label (label);
26276 LABEL_NUSES (label) = 1;
26278 if (max_size > 8)
26280 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26281 if (TARGET_64BIT)
26283 dest = change_address (destmem, DImode, destptr);
26284 emit_insn (gen_strset (destptr, dest, value));
26286 else
26288 dest = change_address (destmem, SImode, destptr);
26289 emit_insn (gen_strset (destptr, dest, value));
26290 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26291 emit_insn (gen_strset (destptr, dest, value));
26293 emit_label (label);
26294 LABEL_NUSES (label) = 1;
26296 if (max_size > 4)
26298 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26299 dest = change_address (destmem, SImode, destptr);
26300 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26301 emit_label (label);
26302 LABEL_NUSES (label) = 1;
26304 if (max_size > 2)
26306 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26307 dest = change_address (destmem, HImode, destptr);
26308 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26309 emit_label (label);
26310 LABEL_NUSES (label) = 1;
26312 if (max_size > 1)
26314 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26315 dest = change_address (destmem, QImode, destptr);
26316 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26317 emit_label (label);
26318 LABEL_NUSES (label) = 1;
26322 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
26323 enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The original
26324 alignment is ALIGN. Depending on ISSETMEM, either arguments SRCMEM/SRCPTR
26325 or VALUE/VEC_VALUE are ignored.
26326 Return value is the updated DESTMEM. */
26327 static rtx
26328 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26329 rtx destptr, rtx srcptr, rtx value,
26330 rtx vec_value, rtx count, int align,
26331 int desired_alignment, bool issetmem)
26333 int i;
26334 for (i = 1; i < desired_alignment; i <<= 1)
26336 if (align <= i)
26338 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26339 if (issetmem)
26341 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26342 destmem = emit_memset (destmem, destptr, vec_value, i);
26343 else
26344 destmem = emit_memset (destmem, destptr, value, i);
26346 else
26347 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26348 ix86_adjust_counter (count, i);
26349 emit_label (label);
26350 LABEL_NUSES (label) = 1;
26351 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26354 return destmem;
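/* Illustrative sketch of the prologue above for ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 in the memcpy case, as pseudocode (exposition
   only):

       if (dest & 1) { copy 1 byte;  count -= 1; }
       if (dest & 2) { copy 2 bytes; count -= 2; }
       if (dest & 4) { copy 4 bytes; count -= 4; }

   One conditional block is emitted per alignment step; afterwards the
   destination is 8-byte aligned and the main loop can use aligned
   accesses.  */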
26357 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26358 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26359 and jump to DONE_LABEL. */
26360 static void
26361 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26362 rtx destptr, rtx srcptr,
26363 rtx value, rtx vec_value,
26364 rtx count, int size,
26365 rtx done_label, bool issetmem)
26367 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26368 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26369 rtx modesize;
26370 int n;
26372 /* If we do not have a vector value to copy, we must reduce the size. */
26373 if (issetmem)
26375 if (!vec_value)
26377 if (GET_MODE (value) == VOIDmode && size > 8)
26378 mode = Pmode;
26379 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26380 mode = GET_MODE (value);
26382 else
26383 mode = GET_MODE (vec_value), value = vec_value;
26385 else
26387 /* Choose appropriate vector mode. */
26388 if (size >= 32)
26389 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26390 else if (size >= 16)
26391 mode = TARGET_SSE ? V16QImode : DImode;
26392 srcmem = change_address (srcmem, mode, srcptr);
26394 destmem = change_address (destmem, mode, destptr);
26395 modesize = GEN_INT (GET_MODE_SIZE (mode));
26396 gcc_assert (GET_MODE_SIZE (mode) <= size);
26397 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26399 if (issetmem)
26400 emit_move_insn (destmem, gen_lowpart (mode, value));
26401 else
26403 emit_move_insn (destmem, srcmem);
26404 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26406 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26409 destmem = offset_address (destmem, count, 1);
26410 destmem = offset_address (destmem, GEN_INT (-2 * size),
26411 GET_MODE_SIZE (mode));
26412 if (!issetmem)
26414 srcmem = offset_address (srcmem, count, 1);
26415 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26416 GET_MODE_SIZE (mode));
26418 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26420 if (issetmem)
26421 emit_move_insn (destmem, gen_lowpart (mode, value));
26422 else
26424 emit_move_insn (destmem, srcmem);
26425 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26427 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26429 emit_jump_insn (gen_jump (done_label));
26430 emit_barrier ();
26432 emit_label (label);
26433 LABEL_NUSES (label) = 1;
26436 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26437 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26438 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
26439 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26440 DONE_LABEL is a label after the whole copying sequence. The label is created
26441 on demand if *DONE_LABEL is NULL.
26442 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26443 bounds after the initial copies.
26445 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26446 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26447 we will dispatch to a library call for large blocks.
26449 In pseudocode we do:
26451 if (COUNT < SIZE)
26453 Assume that SIZE is 4. Bigger sizes are handled analogously
26454 if (COUNT & 4)
26456 copy 4 bytes from SRCPTR to DESTPTR
26457 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26458 goto done_label
26460 if (!COUNT)
26461 goto done_label;
26462 copy 1 byte from SRCPTR to DESTPTR
26463 if (COUNT & 2)
26465 copy 2 bytes from SRCPTR to DESTPTR
26466 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26469 else
26471 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26472 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26474 OLD_DESTPTR = DESTPTR;
26475 Align DESTPTR up to DESIRED_ALIGN
26476 SRCPTR += DESTPTR - OLD_DESTPTR
26477 COUNT -= DESTPTR - OLD_DESTPTR
26478 if (DYNAMIC_CHECK)
26479 Round COUNT down to multiple of SIZE
26480 << optional caller supplied zero size guard is here >>
26481 << optional caller supplied dynamic check is here >>
26482 << caller supplied main copy loop is here >>
26484 done_label:
26486 static void
26487 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26488 rtx *destptr, rtx *srcptr,
26489 machine_mode mode,
26490 rtx value, rtx vec_value,
26491 rtx *count,
26492 rtx_code_label **done_label,
26493 int size,
26494 int desired_align,
26495 int align,
26496 unsigned HOST_WIDE_INT *min_size,
26497 bool dynamic_check,
26498 bool issetmem)
26500 rtx_code_label *loop_label = NULL, *label;
26501 int n;
26502 rtx modesize;
26503 int prolog_size = 0;
26504 rtx mode_value;
26506 /* Choose the proper value to copy. */
26507 if (issetmem && VECTOR_MODE_P (mode))
26508 mode_value = vec_value;
26509 else
26510 mode_value = value;
26511 gcc_assert (GET_MODE_SIZE (mode) <= size);
26513 /* See if block is big or small, handle small blocks. */
26514 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26516 int size2 = size;
26517 loop_label = gen_label_rtx ();
26519 if (!*done_label)
26520 *done_label = gen_label_rtx ();
26522 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26523 1, loop_label);
26524 size2 >>= 1;
26526 /* Handle sizes > 3. */
26527 for (;size2 > 2; size2 >>= 1)
26528 expand_small_movmem_or_setmem (destmem, srcmem,
26529 *destptr, *srcptr,
26530 value, vec_value,
26531 *count,
26532 size2, *done_label, issetmem);
26533 /* Nothing to copy? Jump to DONE_LABEL if so. */
26534 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26535 1, *done_label);
26537 /* Do a byte copy. */
26538 destmem = change_address (destmem, QImode, *destptr);
26539 if (issetmem)
26540 emit_move_insn (destmem, gen_lowpart (QImode, value));
26541 else
26543 srcmem = change_address (srcmem, QImode, *srcptr);
26544 emit_move_insn (destmem, srcmem);
26547 /* Handle sizes 2 and 3. */
26548 label = ix86_expand_aligntest (*count, 2, false);
26549 destmem = change_address (destmem, HImode, *destptr);
26550 destmem = offset_address (destmem, *count, 1);
26551 destmem = offset_address (destmem, GEN_INT (-2), 2);
26552 if (issetmem)
26553 emit_move_insn (destmem, gen_lowpart (HImode, value));
26554 else
26556 srcmem = change_address (srcmem, HImode, *srcptr);
26557 srcmem = offset_address (srcmem, *count, 1);
26558 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26559 emit_move_insn (destmem, srcmem);
26562 emit_label (label);
26563 LABEL_NUSES (label) = 1;
26564 emit_jump_insn (gen_jump (*done_label));
26565 emit_barrier ();
26567 else
26568 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26569 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26571 /* Start memcpy for COUNT >= SIZE. */
26572 if (loop_label)
26574 emit_label (loop_label);
26575 LABEL_NUSES (loop_label) = 1;
26578 /* Copy first desired_align bytes. */
26579 if (!issetmem)
26580 srcmem = change_address (srcmem, mode, *srcptr);
26581 destmem = change_address (destmem, mode, *destptr);
26582 modesize = GEN_INT (GET_MODE_SIZE (mode));
26583 for (n = 0; prolog_size < desired_align - align; n++)
26585 if (issetmem)
26586 emit_move_insn (destmem, mode_value);
26587 else
26589 emit_move_insn (destmem, srcmem);
26590 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26592 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26593 prolog_size += GET_MODE_SIZE (mode);
26597 /* Copy last SIZE bytes. */
26598 destmem = offset_address (destmem, *count, 1);
26599 destmem = offset_address (destmem,
26600 GEN_INT (-size - prolog_size),
26602 if (issetmem)
26603 emit_move_insn (destmem, mode_value);
26604 else
26606 srcmem = offset_address (srcmem, *count, 1);
26607 srcmem = offset_address (srcmem,
26608 GEN_INT (-size - prolog_size),
26610 emit_move_insn (destmem, srcmem);
26612 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26614 destmem = offset_address (destmem, modesize, 1);
26615 if (issetmem)
26616 emit_move_insn (destmem, mode_value);
26617 else
26619 srcmem = offset_address (srcmem, modesize, 1);
26620 emit_move_insn (destmem, srcmem);
26624 /* Align destination. */
26625 if (desired_align > 1 && desired_align > align)
26627 rtx saveddest = *destptr;
26629 gcc_assert (desired_align <= size);
26630 /* Align destptr up, place it to new register. */
26631 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26632 GEN_INT (prolog_size),
26633 NULL_RTX, 1, OPTAB_DIRECT);
26634 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26635 REG_POINTER (*destptr) = 1;
26636 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26637 GEN_INT (-desired_align),
26638 *destptr, 1, OPTAB_DIRECT);
26639 /* See how many bytes we skipped. */
26640 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26641 *destptr,
26642 saveddest, 1, OPTAB_DIRECT);
26643 /* Adjust srcptr and count. */
26644 if (!issetmem)
26645 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26646 saveddest, *srcptr, 1, OPTAB_DIRECT);
26647 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26648 saveddest, *count, 1, OPTAB_DIRECT);
26649 /* We copied at most size + prolog_size. */
26650 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26651 *min_size
26652 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26653 else
26654 *min_size = 0;
26656 /* Our loops always round down the block size, but for dispatch to
26657 library we need the precise value. */
26658 if (dynamic_check)
26659 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26660 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26662 else
26664 gcc_assert (prolog_size == 0);
26665 /* Decrease count, so we won't end up copying last word twice. */
26666 if (!CONST_INT_P (*count))
26667 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26668 constm1_rtx, *count, 1, OPTAB_DIRECT);
26669 else
26670 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26671 (unsigned HOST_WIDE_INT)size));
26672 if (*min_size)
26673 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26678 /* This function is like the previous one, except here we know how many bytes
26679 need to be copied. That allows us to update alignment not only of DST, which
26680 is returned, but also of SRC, which is passed as a pointer for that
26681 reason. */
26682 static rtx
26683 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26684 rtx srcreg, rtx value, rtx vec_value,
26685 int desired_align, int align_bytes,
26686 bool issetmem)
26688 rtx src = NULL;
26689 rtx orig_dst = dst;
26690 rtx orig_src = NULL;
26691 int piece_size = 1;
26692 int copied_bytes = 0;
26694 if (!issetmem)
26696 gcc_assert (srcp != NULL);
26697 src = *srcp;
26698 orig_src = src;
26701 for (piece_size = 1;
26702 piece_size <= desired_align && copied_bytes < align_bytes;
26703 piece_size <<= 1)
26705 if (align_bytes & piece_size)
26707 if (issetmem)
26709 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26710 dst = emit_memset (dst, destreg, vec_value, piece_size);
26711 else
26712 dst = emit_memset (dst, destreg, value, piece_size);
26714 else
26715 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26716 copied_bytes += piece_size;
26719 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26720 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26721 if (MEM_SIZE_KNOWN_P (orig_dst))
26722 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26724 if (!issetmem)
26726 int src_align_bytes = get_mem_align_offset (src, desired_align
26727 * BITS_PER_UNIT);
26728 if (src_align_bytes >= 0)
26729 src_align_bytes = desired_align - src_align_bytes;
26730 if (src_align_bytes >= 0)
26732 unsigned int src_align;
26733 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26735 if ((src_align_bytes & (src_align - 1))
26736 == (align_bytes & (src_align - 1)))
26737 break;
26739 if (src_align > (unsigned int) desired_align)
26740 src_align = desired_align;
26741 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26742 set_mem_align (src, src_align * BITS_PER_UNIT);
26744 if (MEM_SIZE_KNOWN_P (orig_src))
26745 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26746 *srcp = src;
26749 return dst;
26752 /* Return true if ALG can be used in current context.
26753 Assume we expand memset if MEMSET is true. */
26754 static bool
26755 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26757 if (alg == no_stringop)
26758 return false;
26759 if (alg == vector_loop)
26760 return TARGET_SSE || TARGET_AVX;
26761 /* Algorithms using the rep prefix want at least edi and ecx;
26762 additionally, memset wants eax and memcpy wants esi. Don't
26763 consider such algorithms if the user has appropriated those
26764 registers for their own purposes, or if we have a non-default
26765 address space, since some string insns cannot override the segment. */
26766 if (alg == rep_prefix_1_byte
26767 || alg == rep_prefix_4_byte
26768 || alg == rep_prefix_8_byte)
26770 if (have_as)
26771 return false;
26772 if (fixed_regs[CX_REG]
26773 || fixed_regs[DI_REG]
26774 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26775 return false;
26777 return true;
26780 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26781 static enum stringop_alg
26782 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26783 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26784 bool memset, bool zero_memset, bool have_as,
26785 int *dynamic_check, bool *noalign, bool recur)
26787 const struct stringop_algs *algs;
26788 bool optimize_for_speed;
26789 int max = 0;
26790 const struct processor_costs *cost;
26791 int i;
26792 bool any_alg_usable_p = false;
26794 *noalign = false;
26795 *dynamic_check = -1;
26797 /* Even if the string operation call is cold, we still might spend a lot
26798 of time processing large blocks. */
26799 if (optimize_function_for_size_p (cfun)
26800 || (optimize_insn_for_size_p ()
26801 && (max_size < 256
26802 || (expected_size != -1 && expected_size < 256))))
26803 optimize_for_speed = false;
26804 else
26805 optimize_for_speed = true;
26807 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26808 if (memset)
26809 algs = &cost->memset[TARGET_64BIT != 0];
26810 else
26811 algs = &cost->memcpy[TARGET_64BIT != 0];
26813 /* See maximal size for user defined algorithm. */
26814 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26816 enum stringop_alg candidate = algs->size[i].alg;
26817 bool usable = alg_usable_p (candidate, memset, have_as);
26818 any_alg_usable_p |= usable;
26820 if (candidate != libcall && candidate && usable)
26821 max = algs->size[i].max;
26824 /* If the expected size is not known but the max size is small enough
26825 that the inline version is a win, set the expected size into
26826 the range. */
26827 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26828 && expected_size == -1)
26829 expected_size = min_size / 2 + max_size / 2;
26831 /* If user specified the algorithm, honor it if possible. */
26832 if (ix86_stringop_alg != no_stringop
26833 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26834 return ix86_stringop_alg;
26835 /* rep; movq or rep; movl is the smallest variant. */
26836 else if (!optimize_for_speed)
26838 *noalign = true;
26839 if (!count || (count & 3) || (memset && !zero_memset))
26840 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26841 ? rep_prefix_1_byte : loop_1_byte;
26842 else
26843 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26844 ? rep_prefix_4_byte : loop;
26846 /* Very tiny blocks are best handled via the loop; REP is expensive to
26847 set up. */
26848 else if (expected_size != -1 && expected_size < 4)
26849 return loop_1_byte;
26850 else if (expected_size != -1)
26852 enum stringop_alg alg = libcall;
26853 bool alg_noalign = false;
26854 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26856 /* We get here if the algorithms that were not libcall-based
26857 were rep-prefix based and we are unable to use rep prefixes
26858 based on global register usage. Break out of the loop and
26859 use the heuristic below. */
26860 if (algs->size[i].max == 0)
26861 break;
26862 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26864 enum stringop_alg candidate = algs->size[i].alg;
26866 if (candidate != libcall
26867 && alg_usable_p (candidate, memset, have_as))
26869 alg = candidate;
26870 alg_noalign = algs->size[i].noalign;
26872 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26873 last non-libcall inline algorithm. */
26874 if (TARGET_INLINE_ALL_STRINGOPS)
26876 /* When the current size is best to be copied by a libcall,
26877 but we are still forced to inline, run the heuristic below
26878 that will pick code for medium sized blocks. */
26879 if (alg != libcall)
26881 *noalign = alg_noalign;
26882 return alg;
26884 else if (!any_alg_usable_p)
26885 break;
26887 else if (alg_usable_p (candidate, memset, have_as))
26889 *noalign = algs->size[i].noalign;
26890 return candidate;
26895 /* When asked to inline the call anyway, try to pick a meaningful choice.
26896 We look for the maximal size of block that is faster to copy by hand and
26897 take blocks of at most that size, guessing that the average size will
26898 be roughly half of the block.
26900 If this turns out to be bad, we might simply specify the preferred
26901 choice in ix86_costs. */
26902 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26903 && (algs->unknown_size == libcall
26904 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26906 enum stringop_alg alg;
26907 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26909 /* If there aren't any usable algorithms or if recursing already,
26910 then recursing on smaller sizes or same size isn't going to
26911 find anything. Just return the simple byte-at-a-time copy loop. */
26912 if (!any_alg_usable_p || recur)
26914 /* Pick something reasonable. */
26915 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26916 *dynamic_check = 128;
26917 return loop_1_byte;
26919 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26920 zero_memset, have_as, dynamic_check, noalign, true);
26921 gcc_assert (*dynamic_check == -1);
26922 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26923 *dynamic_check = max;
26924 else
26925 gcc_assert (alg != libcall);
26926 return alg;
26928 return (alg_usable_p (algs->unknown_size, memset, have_as)
26929 ? algs->unknown_size : libcall);
26932 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26933 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26934 static int
26935 decide_alignment (int align,
26936 enum stringop_alg alg,
26937 int expected_size,
26938 machine_mode move_mode)
26940 int desired_align = 0;
26942 gcc_assert (alg != no_stringop);
26944 if (alg == libcall)
26945 return 0;
26946 if (move_mode == VOIDmode)
26947 return 0;
26949 desired_align = GET_MODE_SIZE (move_mode);
26950 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
26951 copying a whole cache line at once. */
26952 if (TARGET_PENTIUMPRO
26953 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26954 desired_align = 8;
26956 if (optimize_size)
26957 desired_align = 1;
26958 if (desired_align < align)
26959 desired_align = align;
26960 if (expected_size != -1 && expected_size < 4)
26961 desired_align = align;
26963 return desired_align;
26967 /* Helper function for memset. For a QImode value 0xXY produce
26968 0xXYXYXYXY of the width specified by MODE. This is essentially
26969 a * 0x01010101, but we can do slightly better than
26970 synth_mult by unwinding the sequence by hand on CPUs with
26971 slow multiply. */
26972 static rtx
26973 promote_duplicated_reg (machine_mode mode, rtx val)
26975 machine_mode valmode = GET_MODE (val);
26976 rtx tmp;
26977 int nops = mode == DImode ? 3 : 2;
26979 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26980 if (val == const0_rtx)
26981 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26982 if (CONST_INT_P (val))
26984 HOST_WIDE_INT v = INTVAL (val) & 255;
26986 v |= v << 8;
26987 v |= v << 16;
26988 if (mode == DImode)
26989 v |= (v << 16) << 16;
26990 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
26993 if (valmode == VOIDmode)
26994 valmode = QImode;
26995 if (valmode != QImode)
26996 val = gen_lowpart (QImode, val);
26997 if (mode == QImode)
26998 return val;
26999 if (!TARGET_PARTIAL_REG_STALL)
27000 nops--;
27001 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27002 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27003 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27004 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27006 rtx reg = convert_modes (mode, QImode, val, true);
27007 tmp = promote_duplicated_reg (mode, const1_rtx);
27008 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27009 OPTAB_DIRECT);
27011 else
27013 rtx reg = convert_modes (mode, QImode, val, true);
27015 if (!TARGET_PARTIAL_REG_STALL)
27016 if (mode == SImode)
27017 emit_insn (gen_insvsi_1 (reg, reg));
27018 else
27019 emit_insn (gen_insvdi_1 (reg, reg));
27020 else
27022 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27023 NULL, 1, OPTAB_DIRECT);
27024 reg =
27025 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27027 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27028 NULL, 1, OPTAB_DIRECT);
27029 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27030 if (mode == SImode)
27031 return reg;
27032 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27033 NULL, 1, OPTAB_DIRECT);
27034 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27035 return reg;
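/* Illustrative example of the shift/or broadcast above, used when the
   multiply would be slower (exposition only): starting from a byte value in
   a DImode register,

       v |= v << 8;       0x00000000000000XY -> 0x000000000000XYXY
       v |= v << 16;                          -> 0x00000000XYXYXYXY
       v |= v << 32;                          -> 0xXYXYXYXYXYXYXYXY

   For a constant input the replicated value is computed at compile time,
   and without a partial register stall the first step uses an insv-style
   instruction instead.  */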
27039 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27040 be needed by the main loop copying SIZE_NEEDED chunks and the prologue getting
27041 alignment from ALIGN to DESIRED_ALIGN. */
27042 static rtx
27043 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27044 int align)
27046 rtx promoted_val;
27048 if (TARGET_64BIT
27049 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27050 promoted_val = promote_duplicated_reg (DImode, val);
27051 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27052 promoted_val = promote_duplicated_reg (SImode, val);
27053 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27054 promoted_val = promote_duplicated_reg (HImode, val);
27055 else
27056 promoted_val = val;
27058 return promoted_val;
27061 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27062 operations when profitable. The code depends upon architecture, block size
27063 and alignment, but always has one of the following overall structures:
27065 Aligned move sequence:
27067 1) Prologue guard: Conditional that jumps up to epilogues for small
27068 blocks that can be handled by epilogue alone. This is faster
27069 but also needed for correctness, since the prologue assumes the block
27070 is larger than the desired alignment.
27072 Optional dynamic check for size and libcall for large
27073 blocks is emitted here too, with -minline-stringops-dynamically.
27075 2) Prologue: copy first few bytes in order to get destination
27076 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27077 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27078 copied. We emit either a jump tree on power of two sized
27079 blocks, or a byte loop.
27081 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27082 with specified algorithm.
27084 4) Epilogue: code copying tail of the block that is too small to be
27085 handled by main body (or up to size guarded by prologue guard).
27087 Misaligned move sequence
27089 1) misaligned move prologue/epilogue containing:
27090 a) Prologue handling small memory blocks and jumping to done_label
27091 (skipped if blocks are known to be large enough)
27092 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
27093 needed by single possibly misaligned move
27094 (skipped if alignment is not needed)
27095 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27097 2) Zero size guard dispatching to done_label, if needed
27099 3) Dispatch to library call, if needed,
27101 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27102 with specified algorithm. */
27103 bool
27104 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27105 rtx align_exp, rtx expected_align_exp,
27106 rtx expected_size_exp, rtx min_size_exp,
27107 rtx max_size_exp, rtx probable_max_size_exp,
27108 bool issetmem)
27110 rtx destreg;
27111 rtx srcreg = NULL;
27112 rtx_code_label *label = NULL;
27113 rtx tmp;
27114 rtx_code_label *jump_around_label = NULL;
27115 HOST_WIDE_INT align = 1;
27116 unsigned HOST_WIDE_INT count = 0;
27117 HOST_WIDE_INT expected_size = -1;
27118 int size_needed = 0, epilogue_size_needed;
27119 int desired_align = 0, align_bytes = 0;
27120 enum stringop_alg alg;
27121 rtx promoted_val = NULL;
27122 rtx vec_promoted_val = NULL;
27123 bool force_loopy_epilogue = false;
27124 int dynamic_check;
27125 bool need_zero_guard = false;
27126 bool noalign;
27127 machine_mode move_mode = VOIDmode;
27128 machine_mode wider_mode;
27129 int unroll_factor = 1;
27130 /* TODO: Once value ranges are available, fill in proper data. */
27131 unsigned HOST_WIDE_INT min_size = 0;
27132 unsigned HOST_WIDE_INT max_size = -1;
27133 unsigned HOST_WIDE_INT probable_max_size = -1;
27134 bool misaligned_prologue_used = false;
27135 bool have_as;
27137 if (CONST_INT_P (align_exp))
27138 align = INTVAL (align_exp);
27139 /* i386 can do misaligned access at reasonably small extra cost. */
27140 if (CONST_INT_P (expected_align_exp)
27141 && INTVAL (expected_align_exp) > align)
27142 align = INTVAL (expected_align_exp);
27143 /* ALIGN is the minimum of destination and source alignment, but we care here
27144 just about destination alignment. */
27145 else if (!issetmem
27146 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27147 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27149 if (CONST_INT_P (count_exp))
27151 min_size = max_size = probable_max_size = count = expected_size
27152 = INTVAL (count_exp);
27153 /* When COUNT is 0, there is nothing to do. */
27154 if (!count)
27155 return true;
27157 else
27159 if (min_size_exp)
27160 min_size = INTVAL (min_size_exp);
27161 if (max_size_exp)
27162 max_size = INTVAL (max_size_exp);
27163 if (probable_max_size_exp)
27164 probable_max_size = INTVAL (probable_max_size_exp);
27165 if (CONST_INT_P (expected_size_exp))
27166 expected_size = INTVAL (expected_size_exp);
27169 /* Make sure we don't need to care about overflow later on. */
27170 if (count > (HOST_WIDE_INT_1U << 30))
27171 return false;
27173 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27174 if (!issetmem)
27175 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27177 /* Step 0: Decide on preferred algorithm, desired alignment and
27178 size of chunks to be copied by main loop. */
27179 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27180 issetmem,
27181 issetmem && val_exp == const0_rtx, have_as,
27182 &dynamic_check, &noalign, false);
27183 if (alg == libcall)
27184 return false;
27185 gcc_assert (alg != no_stringop);
27187 /* For now the vector version of memset is generated only for memory zeroing, as
27188 creating a promoted vector value is very cheap in this case. */
27189 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27190 alg = unrolled_loop;
27192 if (!count)
27193 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27194 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27195 if (!issetmem)
27196 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27198 unroll_factor = 1;
27199 move_mode = word_mode;
27200 switch (alg)
27202 case libcall:
27203 case no_stringop:
27204 case last_alg:
27205 gcc_unreachable ();
27206 case loop_1_byte:
27207 need_zero_guard = true;
27208 move_mode = QImode;
27209 break;
27210 case loop:
27211 need_zero_guard = true;
27212 break;
27213 case unrolled_loop:
27214 need_zero_guard = true;
27215 unroll_factor = (TARGET_64BIT ? 4 : 2);
27216 break;
27217 case vector_loop:
27218 need_zero_guard = true;
27219 unroll_factor = 4;
27220 /* Find the widest supported mode. */
27221 move_mode = word_mode;
27222 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27223 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27224 move_mode = wider_mode;
27226 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27227 move_mode = TImode;
27229 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27230 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27231 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27233 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27234 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27235 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27236 move_mode = word_mode;
27238 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27239 break;
27240 case rep_prefix_8_byte:
27241 move_mode = DImode;
27242 break;
27243 case rep_prefix_4_byte:
27244 move_mode = SImode;
27245 break;
27246 case rep_prefix_1_byte:
27247 move_mode = QImode;
27248 break;
27250 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27251 epilogue_size_needed = size_needed;
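/* For instance (illustrative numbers), the unrolled_loop strategy on a
   64-bit target uses word_mode (DImode, 8 bytes) with an unroll factor
   of 4, so SIZE_NEEDED is 32 bytes per main-loop iteration.  */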
27253 /* If we are going to make any library calls conditionally, make sure any
27254 pending stack adjustments happen before the first conditional branch;
27255 otherwise they would be emitted only before the library call and would not
27256 happen on the other branches. */
27257 if (dynamic_check != -1)
27258 do_pending_stack_adjust ();
27260 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27261 if (!TARGET_ALIGN_STRINGOPS || noalign)
27262 align = desired_align;
27264 /* Step 1: Prologue guard. */
27266 /* Alignment code needs count to be in register. */
27267 if (CONST_INT_P (count_exp) && desired_align > align)
27269 if (INTVAL (count_exp) > desired_align
27270 && INTVAL (count_exp) > size_needed)
27272 align_bytes
27273 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27274 if (align_bytes <= 0)
27275 align_bytes = 0;
27276 else
27277 align_bytes = desired_align - align_bytes;
27279 if (align_bytes == 0)
27280 count_exp = force_reg (counter_mode (count_exp), count_exp);
27282 gcc_assert (desired_align >= 1 && align >= 1);
27284 /* Misaligned move sequences handle both prologue and epilogue at once.
27285 Default code generation results in smaller code for large alignments
27286 and also avoids redundant work when sizes are known precisely. */
27287 misaligned_prologue_used
27288 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27289 && MAX (desired_align, epilogue_size_needed) <= 32
27290 && desired_align <= epilogue_size_needed
27291 && ((desired_align > align && !align_bytes)
27292 || (!count && epilogue_size_needed > 1)));
27294 /* Do the cheap promotion to allow better CSE across the
27295 main loop and epilogue (i.e. one load of the big constant at the
27296 front of all the code).
27297 For now the misaligned move sequences do not have a fast path
27298 without broadcasting. */
27299 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27301 if (alg == vector_loop)
27303 gcc_assert (val_exp == const0_rtx);
27304 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27305 promoted_val = promote_duplicated_reg_to_size (val_exp,
27306 GET_MODE_SIZE (word_mode),
27307 desired_align, align);
27309 else
27311 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27312 desired_align, align);
27315 /* Misaligned move sequences handle both prologues and epilogues at once.
27316 Default code generation results in smaller code for large alignments and
27317 also avoids redundant work when sizes are known precisely. */
27318 if (misaligned_prologue_used)
27320 /* The misaligned move prologue handles small blocks by itself. */
27321 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27322 (dst, src, &destreg, &srcreg,
27323 move_mode, promoted_val, vec_promoted_val,
27324 &count_exp,
27325 &jump_around_label,
27326 desired_align < align
27327 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27328 desired_align, align, &min_size, dynamic_check, issetmem);
27329 if (!issetmem)
27330 src = change_address (src, BLKmode, srcreg);
27331 dst = change_address (dst, BLKmode, destreg);
27332 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27333 epilogue_size_needed = 0;
27334 if (need_zero_guard
27335 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27337 /* It is possible that we copied enough so the main loop will not
27338 execute. */
27339 gcc_assert (size_needed > 1);
27340 if (jump_around_label == NULL_RTX)
27341 jump_around_label = gen_label_rtx ();
27342 emit_cmp_and_jump_insns (count_exp,
27343 GEN_INT (size_needed),
27344 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27345 if (expected_size == -1
27346 || expected_size < (desired_align - align) / 2 + size_needed)
27347 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27348 else
27349 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27352 /* Ensure that alignment prologue won't copy past end of block. */
27353 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27355 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27356 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27357 Make sure it is a power of 2. */
27358 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
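/* For example (illustrative values), with SIZE_NEEDED == 32 and no extra
   alignment bytes the MAX above is 31 and this rounds it back up to 32,
   keeping EPILOGUE_SIZE_NEEDED a power of two.  */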
27360 /* To improve performance for small blocks, we jump around the code
27361 promoting VAL. This means that if the promoted VAL is not constant,
27362 we might not use it in the epilogue and have to fall back to the byte
27363 loop variant. */
27364 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27365 force_loopy_epilogue = true;
27366 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27367 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27369 /* If the main algorithm works on QImode, no epilogue is needed.
27370 For small sizes just don't align anything. */
27371 if (size_needed == 1)
27372 desired_align = align;
27373 else
27374 goto epilogue;
27376 else if (!count
27377 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27379 label = gen_label_rtx ();
27380 emit_cmp_and_jump_insns (count_exp,
27381 GEN_INT (epilogue_size_needed),
27382 LTU, 0, counter_mode (count_exp), 1, label);
27383 if (expected_size == -1 || expected_size < epilogue_size_needed)
27384 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27385 else
27386 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27390 /* Emit code to decide at runtime whether a library call or the inline
27391 expansion should be used. */
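/* Rough shape of the emitted check (sketch, runtime-count case):
       if (count <= DYNAMIC_CHECK - 1)
         goto hot_label;                    inline expansion
       <library call to memcpy/memset>
       goto jump_around_label;
     hot_label:
       ...inline prologue, main loop and epilogue...  */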
27392 if (dynamic_check != -1)
27394 if (!issetmem && CONST_INT_P (count_exp))
27396 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27398 emit_block_copy_via_libcall (dst, src, count_exp);
27399 count_exp = const0_rtx;
27400 goto epilogue;
27403 else
27405 rtx_code_label *hot_label = gen_label_rtx ();
27406 if (jump_around_label == NULL_RTX)
27407 jump_around_label = gen_label_rtx ();
27408 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27409 LEU, 0, counter_mode (count_exp),
27410 1, hot_label);
27411 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27412 if (issetmem)
27413 set_storage_via_libcall (dst, count_exp, val_exp);
27414 else
27415 emit_block_copy_via_libcall (dst, src, count_exp);
27416 emit_jump (jump_around_label);
27417 emit_label (hot_label);
27421 /* Step 2: Alignment prologue. */
27422 /* Do the expensive promotion once we branched off the small blocks. */
27423 if (issetmem && !promoted_val)
27424 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27425 desired_align, align);
27427 if (desired_align > align && !misaligned_prologue_used)
27429 if (align_bytes == 0)
27431 /* Except for the first move in the prologue, we no longer know the
27432 constant offset in the aliasing info. It doesn't seem worth
27433 the pain to maintain it for the first move, so throw away
27434 the info early. */
27435 dst = change_address (dst, BLKmode, destreg);
27436 if (!issetmem)
27437 src = change_address (src, BLKmode, srcreg);
27438 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27439 promoted_val, vec_promoted_val,
27440 count_exp, align, desired_align,
27441 issetmem);
27442 /* At most desired_align - align bytes are copied. */
27443 if (min_size < (unsigned)(desired_align - align))
27444 min_size = 0;
27445 else
27446 min_size -= desired_align - align;
27448 else
27450 /* If we know how many bytes need to be stored before dst is
27451 sufficiently aligned, maintain aliasing info accurately. */
27452 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27453 srcreg,
27454 promoted_val,
27455 vec_promoted_val,
27456 desired_align,
27457 align_bytes,
27458 issetmem);
27460 count_exp = plus_constant (counter_mode (count_exp),
27461 count_exp, -align_bytes);
27462 count -= align_bytes;
27463 min_size -= align_bytes;
27464 max_size -= align_bytes;
27466 if (need_zero_guard
27467 && min_size < (unsigned HOST_WIDE_INT) size_needed
27468 && (count < (unsigned HOST_WIDE_INT) size_needed
27469 || (align_bytes == 0
27470 && count < ((unsigned HOST_WIDE_INT) size_needed
27471 + desired_align - align))))
27473 /* It is possible that we copied enough so the main loop will not
27474 execute. */
27475 gcc_assert (size_needed > 1);
27476 if (label == NULL_RTX)
27477 label = gen_label_rtx ();
27478 emit_cmp_and_jump_insns (count_exp,
27479 GEN_INT (size_needed),
27480 LTU, 0, counter_mode (count_exp), 1, label);
27481 if (expected_size == -1
27482 || expected_size < (desired_align - align) / 2 + size_needed)
27483 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27484 else
27485 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27488 if (label && size_needed == 1)
27490 emit_label (label);
27491 LABEL_NUSES (label) = 1;
27492 label = NULL;
27493 epilogue_size_needed = 1;
27494 if (issetmem)
27495 promoted_val = val_exp;
27497 else if (label == NULL_RTX && !misaligned_prologue_used)
27498 epilogue_size_needed = size_needed;
27500 /* Step 3: Main loop. */
27502 switch (alg)
27504 case libcall:
27505 case no_stringop:
27506 case last_alg:
27507 gcc_unreachable ();
27508 case loop_1_byte:
27509 case loop:
27510 case unrolled_loop:
27511 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27512 count_exp, move_mode, unroll_factor,
27513 expected_size, issetmem);
27514 break;
27515 case vector_loop:
27516 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27517 vec_promoted_val, count_exp, move_mode,
27518 unroll_factor, expected_size, issetmem);
27519 break;
27520 case rep_prefix_8_byte:
27521 case rep_prefix_4_byte:
27522 case rep_prefix_1_byte:
27523 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27524 val_exp, count_exp, move_mode, issetmem);
27525 break;
27527 /* Properly adjust the offset of src and dest memory for aliasing. */
27528 if (CONST_INT_P (count_exp))
27530 if (!issetmem)
27531 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27532 (count / size_needed) * size_needed);
27533 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27534 (count / size_needed) * size_needed);
27536 else
27538 if (!issetmem)
27539 src = change_address (src, BLKmode, srcreg);
27540 dst = change_address (dst, BLKmode, destreg);
27543 /* Step 4: Epilogue to copy the remaining bytes. */
27544 epilogue:
27545 if (label)
27547 /* When the main loop is done, COUNT_EXP might hold the original count,
27548 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27549 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27550 bytes. Compensate if needed. */
27552 if (size_needed < epilogue_size_needed)
27554 tmp =
27555 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27556 GEN_INT (size_needed - 1), count_exp, 1,
27557 OPTAB_DIRECT);
27558 if (tmp != count_exp)
27559 emit_move_insn (count_exp, tmp);
27561 emit_label (label);
27562 LABEL_NUSES (label) = 1;
27565 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27567 if (force_loopy_epilogue)
27568 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27569 epilogue_size_needed);
27570 else
27572 if (issetmem)
27573 expand_setmem_epilogue (dst, destreg, promoted_val,
27574 vec_promoted_val, count_exp,
27575 epilogue_size_needed);
27576 else
27577 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27578 epilogue_size_needed);
27581 if (jump_around_label)
27582 emit_label (jump_around_label);
27583 return true;
27587 /* Expand the appropriate insns for doing strlen if not just doing
27588 repnz; scasb
27590 out = result, initialized with the start address
27591 align_rtx = alignment of the address.
27592 scratch = scratch register, initialized with the start address when
27593 not aligned, otherwise undefined
27595 This is just the body. It needs the initializations mentioned above and
27596 some address computation at the end. These things are done in i386.md. */
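/* Rough outline of the expansion below (sketch): compare up to three
   leading bytes one at a time until OUT is 4-byte aligned, then scan
   SImode words with a zero-byte test, and finally step OUT back to the
   exact position of the terminating zero byte.  */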
27598 static void
27599 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27601 int align;
27602 rtx tmp;
27603 rtx_code_label *align_2_label = NULL;
27604 rtx_code_label *align_3_label = NULL;
27605 rtx_code_label *align_4_label = gen_label_rtx ();
27606 rtx_code_label *end_0_label = gen_label_rtx ();
27607 rtx mem;
27608 rtx tmpreg = gen_reg_rtx (SImode);
27609 rtx scratch = gen_reg_rtx (SImode);
27610 rtx cmp;
27612 align = 0;
27613 if (CONST_INT_P (align_rtx))
27614 align = INTVAL (align_rtx);
27616 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27618 /* Is there a known alignment and is it less than 4? */
27619 if (align < 4)
27621 rtx scratch1 = gen_reg_rtx (Pmode);
27622 emit_move_insn (scratch1, out);
27623 /* Is there a known alignment and is it not 2? */
27624 if (align != 2)
27626 align_3_label = gen_label_rtx (); /* Label when addr & 3 == 3. */
27627 align_2_label = gen_label_rtx (); /* Label when addr & 3 == 2. */
27629 /* Leave just the 3 lower bits. */
27630 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27631 NULL_RTX, 0, OPTAB_WIDEN);
27633 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27634 Pmode, 1, align_4_label);
27635 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27636 Pmode, 1, align_2_label);
27637 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27638 Pmode, 1, align_3_label);
27640 else
27642 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27643 check whether it is aligned to a 4-byte boundary. */
27645 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27646 NULL_RTX, 0, OPTAB_WIDEN);
27648 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27649 Pmode, 1, align_4_label);
27652 mem = change_address (src, QImode, out);
27654 /* Now compare the bytes. */
27656 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27657 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27658 QImode, 1, end_0_label);
27660 /* Increment the address. */
27661 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27663 /* Not needed with an alignment of 2 */
27664 if (align != 2)
27666 emit_label (align_2_label);
27668 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27669 end_0_label);
27671 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27673 emit_label (align_3_label);
27676 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27677 end_0_label);
27679 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27682 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27683 align this loop; it only makes the program bigger and does not help
27684 to speed it up. */
27685 emit_label (align_4_label);
27687 mem = change_address (src, SImode, out);
27688 emit_move_insn (scratch, mem);
27689 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27691 /* This formula yields a nonzero result iff one of the bytes is zero.
27692 This saves three branches inside the loop and many cycles. */
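/* Concretely the value computed below is

       (X - 0x01010101) & ~X & 0x80808080

   for the word X just loaded; e.g. X = 0x11220033 gives
   0x1020FF32 & 0xEEDDFFCC & 0x80808080 == 0x00008000, flagging the
   zero byte (illustrative value).  */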
27694 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27695 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27696 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27697 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27698 gen_int_mode (0x80808080, SImode)));
27699 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27700 align_4_label);
27702 if (TARGET_CMOVE)
27704 rtx reg = gen_reg_rtx (SImode);
27705 rtx reg2 = gen_reg_rtx (Pmode);
27706 emit_move_insn (reg, tmpreg);
27707 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27709 /* If zero is not in the first two bytes, move two bytes forward. */
27710 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27711 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27712 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27713 emit_insn (gen_rtx_SET (tmpreg,
27714 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27715 reg,
27716 tmpreg)));
27717 /* Emit lea manually to avoid clobbering of flags. */
27718 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27720 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27721 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27722 emit_insn (gen_rtx_SET (out,
27723 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27724 reg2,
27725 out)));
27727 else
27729 rtx_code_label *end_2_label = gen_label_rtx ();
27730 /* Is zero in the first two bytes? */
27732 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27733 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27734 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27735 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27736 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27737 pc_rtx);
27738 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27739 JUMP_LABEL (tmp) = end_2_label;
27741 /* Not in the first two. Move two bytes forward. */
27742 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27743 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27745 emit_label (end_2_label);
27749 /* Avoid branch in fixing the byte. */
27750 tmpreg = gen_lowpart (QImode, tmpreg);
27751 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27752 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27753 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27754 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27756 emit_label (end_0_label);
27759 /* Expand strlen. */
27761 bool
27762 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27764 rtx addr, scratch1, scratch2, scratch3, scratch4;
27766 /* The generic case of the strlen expander is long. Avoid expanding
27767 it unless TARGET_INLINE_ALL_STRINGOPS. */
27769 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27770 && !TARGET_INLINE_ALL_STRINGOPS
27771 && !optimize_insn_for_size_p ()
27772 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27773 return false;
27775 addr = force_reg (Pmode, XEXP (src, 0));
27776 scratch1 = gen_reg_rtx (Pmode);
27778 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27779 && !optimize_insn_for_size_p ())
27781 /* Well it seems that some optimizer does not combine a call like
27782 foo(strlen(bar), strlen(bar));
27783 when the move and the subtraction are done here. It does calculate
27784 the length just once when these instructions are done inside of
27785 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
27786 often used and I use one fewer register for the lifetime of
27787 output_strlen_unroll() this is better. */
27789 emit_move_insn (out, addr);
27791 ix86_expand_strlensi_unroll_1 (out, src, align);
27793 /* strlensi_unroll_1 returns the address of the zero at the end of
27794 the string, like memchr(), so compute the length by subtracting
27795 the start address. */
27796 emit_insn (ix86_gen_sub3 (out, out, addr));
27798 else
27800 rtx unspec;
27802 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27803 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27804 return false;
27805 /* Can't use this for non-default address spaces. */
27806 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27807 return false;
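/* The sequence emitted below is conceptually (sketch):
       count = -1;  repnz scasb;          scan for EOSCHAR
       length = ~count - 1
   which is what the one's complement and the final add of -1 compute.  */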
27809 scratch2 = gen_reg_rtx (Pmode);
27810 scratch3 = gen_reg_rtx (Pmode);
27811 scratch4 = force_reg (Pmode, constm1_rtx);
27813 emit_move_insn (scratch3, addr);
27814 eoschar = force_reg (QImode, eoschar);
27816 src = replace_equiv_address_nv (src, scratch3);
27818 /* If .md starts supporting :P, this can be done in .md. */
27819 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27820 scratch4), UNSPEC_SCAS);
27821 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27822 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27823 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27825 return true;
27828 /* For a given symbol (function) construct code to compute the address of its PLT
27829 entry in the large x86-64 PIC model. */
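/* Roughly (sketch of the two insns emitted below):
       tmp = <symbol>@PLTOFF;  tmp += PIC base register;
   giving the address of the symbol's PLT entry.  */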
27830 static rtx
27831 construct_plt_address (rtx symbol)
27833 rtx tmp, unspec;
27835 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27836 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27837 gcc_assert (Pmode == DImode);
27839 tmp = gen_reg_rtx (Pmode);
27840 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27842 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27843 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27844 return tmp;
27848 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27849 rtx callarg2,
27850 rtx pop, bool sibcall)
27852 rtx vec[3];
27853 rtx use = NULL, call;
27854 unsigned int vec_len = 0;
27855 tree fndecl;
27857 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27859 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27860 if (fndecl
27861 && (lookup_attribute ("interrupt",
27862 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27863 error ("interrupt service routine can't be called directly");
27865 else
27866 fndecl = NULL_TREE;
27868 if (pop == const0_rtx)
27869 pop = NULL;
27870 gcc_assert (!TARGET_64BIT || !pop);
27872 if (TARGET_MACHO && !TARGET_64BIT)
27874 #if TARGET_MACHO
27875 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27876 fnaddr = machopic_indirect_call_target (fnaddr);
27877 #endif
27879 else
27881 /* Static functions and indirect calls don't need the pic register. Also,
27882 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
27883 it an indirect call. */
27884 rtx addr = XEXP (fnaddr, 0);
27885 if (flag_pic
27886 && GET_CODE (addr) == SYMBOL_REF
27887 && !SYMBOL_REF_LOCAL_P (addr))
27889 if (flag_plt
27890 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27891 || !lookup_attribute ("noplt",
27892 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27894 if (!TARGET_64BIT
27895 || (ix86_cmodel == CM_LARGE_PIC
27896 && DEFAULT_ABI != MS_ABI))
27898 use_reg (&use, gen_rtx_REG (Pmode,
27899 REAL_PIC_OFFSET_TABLE_REGNUM));
27900 if (ix86_use_pseudo_pic_reg ())
27901 emit_move_insn (gen_rtx_REG (Pmode,
27902 REAL_PIC_OFFSET_TABLE_REGNUM),
27903 pic_offset_table_rtx);
27906 else if (!TARGET_PECOFF && !TARGET_MACHO)
27908 if (TARGET_64BIT)
27910 fnaddr = gen_rtx_UNSPEC (Pmode,
27911 gen_rtvec (1, addr),
27912 UNSPEC_GOTPCREL);
27913 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27915 else
27917 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27918 UNSPEC_GOT);
27919 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27920 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27921 fnaddr);
27923 fnaddr = gen_const_mem (Pmode, fnaddr);
27924 /* Pmode may not be the same as word_mode for x32, which
27925 doesn't support indirect branch via 32-bit memory slot.
27926 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27927 indirect branch via x32 GOT slot is OK. */
27928 if (GET_MODE (fnaddr) != word_mode)
27929 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27930 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27935 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27936 parameters passed in vector registers. */
27937 if (TARGET_64BIT
27938 && (INTVAL (callarg2) > 0
27939 || (INTVAL (callarg2) == 0
27940 && (TARGET_SSE || !flag_skip_rax_setup))))
27942 rtx al = gen_rtx_REG (QImode, AX_REG);
27943 emit_move_insn (al, callarg2);
27944 use_reg (&use, al);
27947 if (ix86_cmodel == CM_LARGE_PIC
27948 && !TARGET_PECOFF
27949 && MEM_P (fnaddr)
27950 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27951 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27952 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27953 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27954 branch via x32 GOT slot is OK. */
27955 else if (!(TARGET_X32
27956 && MEM_P (fnaddr)
27957 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27958 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27959 && (sibcall
27960 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27961 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27963 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27964 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27967 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27969 if (retval)
27971 /* We should add bounds as a destination register in case a
27972 pointer with bounds may be returned. */
27973 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27975 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27976 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27977 if (GET_CODE (retval) == PARALLEL)
27979 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27980 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27981 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27982 retval = chkp_join_splitted_slot (retval, par);
27984 else
27986 retval = gen_rtx_PARALLEL (VOIDmode,
27987 gen_rtvec (3, retval, b0, b1));
27988 chkp_put_regs_to_expr_list (retval);
27992 call = gen_rtx_SET (retval, call);
27994 vec[vec_len++] = call;
27996 if (pop)
27998 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
27999 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28000 vec[vec_len++] = pop;
28003 if (cfun->machine->no_caller_saved_registers
28004 && (!fndecl
28005 || (!TREE_THIS_VOLATILE (fndecl)
28006 && !lookup_attribute ("no_caller_saved_registers",
28007 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28009 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28010 bool is_64bit_ms_abi = (TARGET_64BIT
28011 && ix86_function_abi (fndecl) == MS_ABI);
28012 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28014 /* If there are no caller-saved registers, add all registers
28015 that are clobbered by the call which returns. */
28016 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28017 if (!fixed_regs[i]
28018 && (ix86_call_used_regs[i] == 1
28019 || (ix86_call_used_regs[i] & c_mask))
28020 && !STACK_REGNO_P (i)
28021 && !MMX_REGNO_P (i))
28022 clobber_reg (&use,
28023 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28025 else if (TARGET_64BIT_MS_ABI
28026 && (!callarg2 || INTVAL (callarg2) != -2))
28028 unsigned i;
28030 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28032 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28033 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28035 clobber_reg (&use, gen_rtx_REG (mode, regno));
28038 /* Set here, but it may get cleared later. */
28039 if (TARGET_CALL_MS2SYSV_XLOGUES)
28041 if (!TARGET_SSE)
28044 /* Don't break hot-patched functions. */
28045 else if (ix86_function_ms_hook_prologue (current_function_decl))
28048 /* TODO: Cases not yet examined. */
28049 else if (flag_split_stack)
28050 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28052 else
28054 gcc_assert (!reload_completed);
28055 cfun->machine->call_ms2sysv = true;
28060 if (vec_len > 1)
28061 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28062 call = emit_call_insn (call);
28063 if (use)
28064 CALL_INSN_FUNCTION_USAGE (call) = use;
28066 return call;
28069 /* Return true if the function being called was marked with attribute
28070 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28071 to handle the non-PIC case in the backend because there is no easy
28072 interface for the front-end to force non-PLT calls to use the GOT.
28073 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28074 to call the function marked "noplt" indirectly. */
28076 static bool
28077 ix86_nopic_noplt_attribute_p (rtx call_op)
28079 if (flag_pic || ix86_cmodel == CM_LARGE
28080 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28081 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28082 || SYMBOL_REF_LOCAL_P (call_op))
28083 return false;
28085 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28087 if (!flag_plt
28088 || (symbol_decl != NULL_TREE
28089 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28090 return true;
28092 return false;
28095 /* Output the assembly for a call instruction. */
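/* For illustration, a direct call to a "noplt"/-fno-plt function is
   printed as "call *foo@GOTPCREL(%rip)" on 64-bit ELF (AT&T syntax),
   an ordinary direct call as "call foo", and the sibcall variants use
   "jmp" instead (example mnemonics only).  */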
28097 const char *
28098 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28100 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28101 bool seh_nop_p = false;
28102 const char *xasm;
28104 if (SIBLING_CALL_P (insn))
28106 if (direct_p)
28108 if (ix86_nopic_noplt_attribute_p (call_op))
28110 if (TARGET_64BIT)
28111 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28112 else
28113 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28115 else
28116 xasm = "%!jmp\t%P0";
28118 /* SEH epilogue detection requires the indirect branch case
28119 to include REX.W. */
28120 else if (TARGET_SEH)
28121 xasm = "%!rex.W jmp\t%A0";
28122 else
28123 xasm = "%!jmp\t%A0";
28125 output_asm_insn (xasm, &call_op);
28126 return "";
28129 /* SEH unwinding can require an extra nop to be emitted in several
28130 circumstances. Determine if we have one of those. */
28131 if (TARGET_SEH)
28133 rtx_insn *i;
28135 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28137 /* If we get to another real insn, we don't need the nop. */
28138 if (INSN_P (i))
28139 break;
28141 /* If we get to the epilogue note, prevent a catch region from
28142 being adjacent to the standard epilogue sequence. If non-call
28143 exceptions are enabled, we'll have done this during epilogue emission. */
28144 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28145 && !flag_non_call_exceptions
28146 && !can_throw_internal (insn))
28148 seh_nop_p = true;
28149 break;
28153 /* If we didn't find a real insn following the call, prevent the
28154 unwinder from looking into the next function. */
28155 if (i == NULL)
28156 seh_nop_p = true;
28159 if (direct_p)
28161 if (ix86_nopic_noplt_attribute_p (call_op))
28163 if (TARGET_64BIT)
28164 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28165 else
28166 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28168 else
28169 xasm = "%!call\t%P0";
28171 else
28172 xasm = "%!call\t%A0";
28174 output_asm_insn (xasm, &call_op);
28176 if (seh_nop_p)
28177 return "nop";
28179 return "";
28182 /* Clear stack slot assignments remembered from previous functions.
28183 This is called from INIT_EXPANDERS once before RTL is emitted for each
28184 function. */
28186 static struct machine_function *
28187 ix86_init_machine_status (void)
28189 struct machine_function *f;
28191 f = ggc_cleared_alloc<machine_function> ();
28192 f->call_abi = ix86_abi;
28194 return f;
28197 /* Return a MEM corresponding to a stack slot with mode MODE.
28198 Allocate a new slot if necessary.
28200 The RTL for a function can have several slots available: N is
28201 which slot to use. */
28204 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28206 struct stack_local_entry *s;
28208 gcc_assert (n < MAX_386_STACK_LOCALS);
28210 for (s = ix86_stack_locals; s; s = s->next)
28211 if (s->mode == mode && s->n == n)
28212 return validize_mem (copy_rtx (s->rtl));
28214 s = ggc_alloc<stack_local_entry> ();
28215 s->n = n;
28216 s->mode = mode;
28217 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28219 s->next = ix86_stack_locals;
28220 ix86_stack_locals = s;
28221 return validize_mem (copy_rtx (s->rtl));
28224 static void
28225 ix86_instantiate_decls (void)
28227 struct stack_local_entry *s;
28229 for (s = ix86_stack_locals; s; s = s->next)
28230 if (s->rtl != NULL_RTX)
28231 instantiate_decl_rtl (s->rtl);
28234 /* Return the number used for encoding REG, in the range 0..7. */
28236 static int
28237 reg_encoded_number (rtx reg)
28239 unsigned regno = REGNO (reg);
28240 switch (regno)
28242 case AX_REG:
28243 return 0;
28244 case CX_REG:
28245 return 1;
28246 case DX_REG:
28247 return 2;
28248 case BX_REG:
28249 return 3;
28250 case SP_REG:
28251 return 4;
28252 case BP_REG:
28253 return 5;
28254 case SI_REG:
28255 return 6;
28256 case DI_REG:
28257 return 7;
28258 default:
28259 break;
28261 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28262 return regno - FIRST_STACK_REG;
28263 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28264 return regno - FIRST_SSE_REG;
28265 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28266 return regno - FIRST_MMX_REG;
28267 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28268 return regno - FIRST_REX_SSE_REG;
28269 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28270 return regno - FIRST_REX_INT_REG;
28271 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28272 return regno - FIRST_MASK_REG;
28273 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28274 return regno - FIRST_BND_REG;
28275 return -1;
28278 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28279 in its encoding if it could be relevant for ROP mitigation, otherwise
28280 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28281 used for calculating it into them. */
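/* For two register operands the register-direct form is returned:
   0xC0 | (reg-field encoding << 3) | r/m-field encoding; e.g. encodings
   1 and 0 give 0xC8 (illustrative).  */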
28283 static int
28284 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28285 int *popno0 = 0, int *popno1 = 0)
28287 if (asm_noperands (PATTERN (insn)) >= 0)
28288 return -1;
28289 int has_modrm = get_attr_modrm (insn);
28290 if (!has_modrm)
28291 return -1;
28292 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28293 rtx op0, op1;
28294 switch (cls)
28296 case MODRM_CLASS_OP02:
28297 gcc_assert (noperands >= 3);
28298 if (popno0)
28300 *popno0 = 0;
28301 *popno1 = 2;
28303 op0 = operands[0];
28304 op1 = operands[2];
28305 break;
28306 case MODRM_CLASS_OP01:
28307 gcc_assert (noperands >= 2);
28308 if (popno0)
28310 *popno0 = 0;
28311 *popno1 = 1;
28313 op0 = operands[0];
28314 op1 = operands[1];
28315 break;
28316 default:
28317 return -1;
28319 if (REG_P (op0) && REG_P (op1))
28321 int enc0 = reg_encoded_number (op0);
28322 int enc1 = reg_encoded_number (op1);
28323 return 0xc0 + (enc1 << 3) + enc0;
28325 return -1;
28328 /* Check whether x86 address PARTS is a pc-relative address. */
28330 bool
28331 ix86_rip_relative_addr_p (struct ix86_address *parts)
28333 rtx base, index, disp;
28335 base = parts->base;
28336 index = parts->index;
28337 disp = parts->disp;
28339 if (disp && !base && !index)
28341 if (TARGET_64BIT)
28343 rtx symbol = disp;
28345 if (GET_CODE (disp) == CONST)
28346 symbol = XEXP (disp, 0);
28347 if (GET_CODE (symbol) == PLUS
28348 && CONST_INT_P (XEXP (symbol, 1)))
28349 symbol = XEXP (symbol, 0);
28351 if (GET_CODE (symbol) == LABEL_REF
28352 || (GET_CODE (symbol) == SYMBOL_REF
28353 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28354 || (GET_CODE (symbol) == UNSPEC
28355 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28356 || XINT (symbol, 1) == UNSPEC_PCREL
28357 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28358 return true;
28361 return false;
28364 /* Calculate the length of the memory address in the instruction encoding.
28365 Includes the addr32 prefix but does not include the one-byte modrm, opcode,
28366 or other prefixes. We never generate the addr32 prefix for the LEA insn. */
28369 memory_address_length (rtx addr, bool lea)
28371 struct ix86_address parts;
28372 rtx base, index, disp;
28373 int len;
28374 int ok;
28376 if (GET_CODE (addr) == PRE_DEC
28377 || GET_CODE (addr) == POST_INC
28378 || GET_CODE (addr) == PRE_MODIFY
28379 || GET_CODE (addr) == POST_MODIFY)
28380 return 0;
28382 ok = ix86_decompose_address (addr, &parts);
28383 gcc_assert (ok);
28385 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28387 /* If this is not LEA instruction, add the length of addr32 prefix. */
28388 if (TARGET_64BIT && !lea
28389 && (SImode_address_operand (addr, VOIDmode)
28390 || (parts.base && GET_MODE (parts.base) == SImode)
28391 || (parts.index && GET_MODE (parts.index) == SImode)))
28392 len++;
28394 base = parts.base;
28395 index = parts.index;
28396 disp = parts.disp;
28398 if (base && SUBREG_P (base))
28399 base = SUBREG_REG (base);
28400 if (index && SUBREG_P (index))
28401 index = SUBREG_REG (index);
28403 gcc_assert (base == NULL_RTX || REG_P (base));
28404 gcc_assert (index == NULL_RTX || REG_P (index));
28406 /* Rule of thumb:
28407 - esp as the base always wants an index,
28408 - ebp as the base always wants a displacement,
28409 - r12 as the base always wants an index,
28410 - r13 as the base always wants a displacement. */
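/* Illustratively: (%esp) or (%r12) as a bare base needs a SIB byte, and
   (%ebp) or (%r13) needs at least a disp8, so each costs one byte more
   than e.g. a bare (%eax) base; that is what the increments below add.  */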
28412 /* Register Indirect. */
28413 if (base && !index && !disp)
28415 /* esp (for its index) and ebp (for its displacement) need
28416 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28417 code. */
28418 if (base == arg_pointer_rtx
28419 || base == frame_pointer_rtx
28420 || REGNO (base) == SP_REG
28421 || REGNO (base) == BP_REG
28422 || REGNO (base) == R12_REG
28423 || REGNO (base) == R13_REG)
28424 len++;
28427 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28428 is not disp32, but disp32(%rip), so for disp32
28429 SIB byte is needed, unless print_operand_address
28430 optimizes it into disp32(%rip) or (%rip) is implied
28431 by UNSPEC. */
28432 else if (disp && !base && !index)
28434 len += 4;
28435 if (!ix86_rip_relative_addr_p (&parts))
28436 len++;
28438 else
28440 /* Find the length of the displacement constant. */
28441 if (disp)
28443 if (base && satisfies_constraint_K (disp))
28444 len += 1;
28445 else
28446 len += 4;
28448 /* ebp always wants a displacement. Similarly r13. */
28449 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28450 len++;
28452 /* An index requires the two-byte modrm form.... */
28453 if (index
28454 /* ...like esp (or r12), which always wants an index. */
28455 || base == arg_pointer_rtx
28456 || base == frame_pointer_rtx
28457 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28458 len++;
28461 return len;
28464 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28465 is set, expect that the insn has an 8-bit immediate alternative. */
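/* E.g. with a short-form alternative an SImode immediate in [-128, 127]
   counts as a single byte; otherwise SImode and DImode immediates count
   as four bytes.  */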
28467 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28469 int len = 0;
28470 int i;
28471 extract_insn_cached (insn);
28472 for (i = recog_data.n_operands - 1; i >= 0; --i)
28473 if (CONSTANT_P (recog_data.operand[i]))
28475 enum attr_mode mode = get_attr_mode (insn);
28477 gcc_assert (!len);
28478 if (shortform && CONST_INT_P (recog_data.operand[i]))
28480 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28481 switch (mode)
28483 case MODE_QI:
28484 len = 1;
28485 continue;
28486 case MODE_HI:
28487 ival = trunc_int_for_mode (ival, HImode);
28488 break;
28489 case MODE_SI:
28490 ival = trunc_int_for_mode (ival, SImode);
28491 break;
28492 default:
28493 break;
28495 if (IN_RANGE (ival, -128, 127))
28497 len = 1;
28498 continue;
28501 switch (mode)
28503 case MODE_QI:
28504 len = 1;
28505 break;
28506 case MODE_HI:
28507 len = 2;
28508 break;
28509 case MODE_SI:
28510 len = 4;
28511 break;
28512 /* Immediates for DImode instructions are encoded
28513 as 32-bit sign-extended values. */
28514 case MODE_DI:
28515 len = 4;
28516 break;
28517 default:
28518 fatal_insn ("unknown insn mode", insn);
28521 return len;
28524 /* Compute default value for "length_address" attribute. */
28526 ix86_attr_length_address_default (rtx_insn *insn)
28528 int i;
28530 if (get_attr_type (insn) == TYPE_LEA)
28532 rtx set = PATTERN (insn), addr;
28534 if (GET_CODE (set) == PARALLEL)
28535 set = XVECEXP (set, 0, 0);
28537 gcc_assert (GET_CODE (set) == SET);
28539 addr = SET_SRC (set);
28541 return memory_address_length (addr, true);
28544 extract_insn_cached (insn);
28545 for (i = recog_data.n_operands - 1; i >= 0; --i)
28547 rtx op = recog_data.operand[i];
28548 if (MEM_P (op))
28550 constrain_operands_cached (insn, reload_completed);
28551 if (which_alternative != -1)
28553 const char *constraints = recog_data.constraints[i];
28554 int alt = which_alternative;
28556 while (*constraints == '=' || *constraints == '+')
28557 constraints++;
28558 while (alt-- > 0)
28559 while (*constraints++ != ',')
28561 /* Skip ignored operands. */
28562 if (*constraints == 'X')
28563 continue;
28566 int len = memory_address_length (XEXP (op, 0), false);
28568 /* Account for segment prefix for non-default addr spaces. */
28569 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28570 len++;
28572 return len;
28575 return 0;
28578 /* Compute default value for "length_vex" attribute. It includes
28579 2 or 3 byte VEX prefix and 1 opcode byte. */
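/* E.g. an insn that only needs the 0f opcode map, no VEX.W and (in
   64-bit mode) no extended registers gets the 2-byte prefix and returns
   3; anything needing VEX.W, REX.X/B or another opcode map returns 4
   (sketch of the cases handled below).  */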
28582 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28583 bool has_vex_w)
28585 int i;
28587 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
28588 requires the 3-byte VEX prefix. */
28589 if (!has_0f_opcode || has_vex_w)
28590 return 3 + 1;
28592 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28593 if (!TARGET_64BIT)
28594 return 2 + 1;
28596 extract_insn_cached (insn);
28598 for (i = recog_data.n_operands - 1; i >= 0; --i)
28599 if (REG_P (recog_data.operand[i]))
28601 /* REX.W bit uses 3 byte VEX prefix. */
28602 if (GET_MODE (recog_data.operand[i]) == DImode
28603 && GENERAL_REG_P (recog_data.operand[i]))
28604 return 3 + 1;
28606 else
28608 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28609 if (MEM_P (recog_data.operand[i])
28610 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28611 return 3 + 1;
28614 return 2 + 1;
28618 static bool
28619 ix86_class_likely_spilled_p (reg_class_t);
28621 /* Return true if the lhs of INSN is a HW function argument register; set
28622 IS_SPILLED to true if it is a likely-spilled HW register. */
28623 static bool
28624 insn_is_function_arg (rtx insn, bool* is_spilled)
28626 rtx dst;
28628 if (!NONDEBUG_INSN_P (insn))
28629 return false;
28630 /* Call instructions are not movable; ignore them. */
28631 if (CALL_P (insn))
28632 return false;
28633 insn = PATTERN (insn);
28634 if (GET_CODE (insn) == PARALLEL)
28635 insn = XVECEXP (insn, 0, 0);
28636 if (GET_CODE (insn) != SET)
28637 return false;
28638 dst = SET_DEST (insn);
28639 if (REG_P (dst) && HARD_REGISTER_P (dst)
28640 && ix86_function_arg_regno_p (REGNO (dst)))
28642 /* Is it likely spilled HW register? */
28643 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28644 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28645 *is_spilled = true;
28646 return true;
28648 return false;
28651 /* Add output dependencies for a chain of adjacent function arguments, but only
28652 if there is a move to a likely-spilled HW register. Return the first argument
28653 if at least one dependence was added, or NULL otherwise. */
28654 static rtx_insn *
28655 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28657 rtx_insn *insn;
28658 rtx_insn *last = call;
28659 rtx_insn *first_arg = NULL;
28660 bool is_spilled = false;
28662 head = PREV_INSN (head);
28664 /* Find the argument-passing instruction nearest to the call. */
28665 while (true)
28667 last = PREV_INSN (last);
28668 if (last == head)
28669 return NULL;
28670 if (!NONDEBUG_INSN_P (last))
28671 continue;
28672 if (insn_is_function_arg (last, &is_spilled))
28673 break;
28674 return NULL;
28677 first_arg = last;
28678 while (true)
28680 insn = PREV_INSN (last);
28681 if (!INSN_P (insn))
28682 break;
28683 if (insn == head)
28684 break;
28685 if (!NONDEBUG_INSN_P (insn))
28687 last = insn;
28688 continue;
28690 if (insn_is_function_arg (insn, &is_spilled))
28692 /* Add an output dependence between two function arguments if the chain
28693 of output arguments contains likely-spilled HW registers. */
28694 if (is_spilled)
28695 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28696 first_arg = last = insn;
28698 else
28699 break;
28701 if (!is_spilled)
28702 return NULL;
28703 return first_arg;
28706 /* Add output or anti dependency from insn to first_arg to restrict its code
28707 motion. */
28708 static void
28709 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28711 rtx set;
28712 rtx tmp;
28714 /* Add anti dependencies for bounds stores. */
28715 if (INSN_P (insn)
28716 && GET_CODE (PATTERN (insn)) == PARALLEL
28717 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28718 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28720 add_dependence (first_arg, insn, REG_DEP_ANTI);
28721 return;
28724 set = single_set (insn);
28725 if (!set)
28726 return;
28727 tmp = SET_DEST (set);
28728 if (REG_P (tmp))
28730 /* Add output dependency to the first function argument. */
28731 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28732 return;
28734 /* Add anti dependency. */
28735 add_dependence (first_arg, insn, REG_DEP_ANTI);
28738 /* Avoid cross-block motion of a function argument by adding a dependency
28739 from the first non-jump instruction in BB. */
28740 static void
28741 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28743 rtx_insn *insn = BB_END (bb);
28745 while (insn)
28747 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28749 rtx set = single_set (insn);
28750 if (set)
28752 avoid_func_arg_motion (arg, insn);
28753 return;
28756 if (insn == BB_HEAD (bb))
28757 return;
28758 insn = PREV_INSN (insn);
28762 /* Hook for pre-reload schedule - avoid motion of function arguments
28763 passed in likely spilled HW registers. */
28764 static void
28765 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28767 rtx_insn *insn;
28768 rtx_insn *first_arg = NULL;
28769 if (reload_completed)
28770 return;
28771 while (head != tail && DEBUG_INSN_P (head))
28772 head = NEXT_INSN (head);
28773 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28774 if (INSN_P (insn) && CALL_P (insn))
28776 first_arg = add_parameter_dependencies (insn, head);
28777 if (first_arg)
28779 /* Add a dependee for the first argument to predecessors, but only if the
28780 region contains more than one block. */
28781 basic_block bb = BLOCK_FOR_INSN (insn);
28782 int rgn = CONTAINING_RGN (bb->index);
28783 int nr_blks = RGN_NR_BLOCKS (rgn);
28784 /* Skip trivial regions and region head blocks that can have
28785 predecessors outside of region. */
28786 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28788 edge e;
28789 edge_iterator ei;
28791 /* Regions are SCCs with the exception of selective
28792 scheduling with pipelining of outer blocks enabled.
28793 So also check that immediate predecessors of a non-head
28794 block are in the same region. */
28795 FOR_EACH_EDGE (e, ei, bb->preds)
28797 /* Avoid creating loop-carried dependencies by
28798 using the topological ordering in the region. */
28799 if (rgn == CONTAINING_RGN (e->src->index)
28800 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28801 add_dependee_for_func_arg (first_arg, e->src);
28804 insn = first_arg;
28805 if (insn == head)
28806 break;
28809 else if (first_arg)
28810 avoid_func_arg_motion (first_arg, insn);
28813 /* Hook for pre-reload schedule - set priority of moves from likely spilled
28814 HW registers to the maximum, to schedule them as soon as possible. These are
28815 moves from function argument registers at the top of the function entry
28816 and moves from function return value registers after call. */
28817 static int
28818 ix86_adjust_priority (rtx_insn *insn, int priority)
28820 rtx set;
28822 if (reload_completed)
28823 return priority;
28825 if (!NONDEBUG_INSN_P (insn))
28826 return priority;
28828 set = single_set (insn);
28829 if (set)
28831 rtx tmp = SET_SRC (set);
28832 if (REG_P (tmp)
28833 && HARD_REGISTER_P (tmp)
28834 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28835 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28836 return current_sched_info->sched_max_insns_priority;
28839 return priority;
28842 /* Prepare for scheduling pass. */
28843 static void
28844 ix86_sched_init_global (FILE *, int, int)
28846 /* Install scheduling hooks for current CPU. Some of these hooks are used
28847 in time-critical parts of the scheduler, so we only set them up when
28848 they are actually used. */
28849 switch (ix86_tune)
28851 case PROCESSOR_CORE2:
28852 case PROCESSOR_NEHALEM:
28853 case PROCESSOR_SANDYBRIDGE:
28854 case PROCESSOR_HASWELL:
28855 case PROCESSOR_GENERIC:
28856 /* Do not perform multipass scheduling for pre-reload schedule
28857 to save compile time. */
28858 if (reload_completed)
28860 ix86_core2i7_init_hooks ();
28861 break;
28863 /* Fall through. */
28864 default:
28865 targetm.sched.dfa_post_advance_cycle = NULL;
28866 targetm.sched.first_cycle_multipass_init = NULL;
28867 targetm.sched.first_cycle_multipass_begin = NULL;
28868 targetm.sched.first_cycle_multipass_issue = NULL;
28869 targetm.sched.first_cycle_multipass_backtrack = NULL;
28870 targetm.sched.first_cycle_multipass_end = NULL;
28871 targetm.sched.first_cycle_multipass_fini = NULL;
28872 break;
28877 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28879 static HOST_WIDE_INT
28880 ix86_static_rtx_alignment (machine_mode mode)
28882 if (mode == DFmode)
28883 return 64;
28884 if (ALIGN_MODE_128 (mode))
28885 return MAX (128, GET_MODE_ALIGNMENT (mode));
28886 return GET_MODE_ALIGNMENT (mode);
28889 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28891 static HOST_WIDE_INT
28892 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28894 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28895 || TREE_CODE (exp) == INTEGER_CST)
28897 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28898 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28899 return MAX (mode_align, align);
28901 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28902 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28903 return BITS_PER_WORD;
28905 return align;
28908 /* Implement TARGET_EMPTY_RECORD_P. */
28910 static bool
28911 ix86_is_empty_record (const_tree type)
28913 if (!TARGET_64BIT)
28914 return false;
28915 return default_is_empty_record (type);
28918 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28920 static void
28921 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28923 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28925 if (!cum->warn_empty)
28926 return;
28928 if (!TYPE_EMPTY_P (type))
28929 return;
28931 const_tree ctx = get_ultimate_context (cum->decl);
28932 if (ctx != NULL_TREE
28933 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28934 return;
28936 /* If the actual size of the type is zero, then there is no change
28937 in how objects of this size are passed. */
28938 if (int_size_in_bytes (type) == 0)
28939 return;
28941 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
28942 "changes in -fabi-version=12 (GCC 8)", type);
28944 /* Only warn once. */
28945 cum->warn_empty = false;
28948 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28949 the data type, and ALIGN is the alignment that the object would
28950 ordinarily have. */
28952 static int
28953 iamcu_alignment (tree type, int align)
28955 machine_mode mode;
28957 if (align < 32 || TYPE_USER_ALIGN (type))
28958 return align;
28960 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28961 bytes. */
28962 mode = TYPE_MODE (strip_array_types (type));
28963 switch (GET_MODE_CLASS (mode))
28965 case MODE_INT:
28966 case MODE_COMPLEX_INT:
28967 case MODE_COMPLEX_FLOAT:
28968 case MODE_FLOAT:
28969 case MODE_DECIMAL_FLOAT:
28970 return 32;
28971 default:
28972 return align;
28976 /* Compute the alignment for a static variable.
28977 TYPE is the data type, and ALIGN is the alignment that
28978 the object would ordinarily have. The value of this function is used
28979 instead of that alignment to align the object. */
28982 ix86_data_alignment (tree type, int align, bool opt)
28984 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28985 for symbols from other compilation units or symbols that don't need
28986 to bind locally. In order to preserve some ABI compatibility with
28987 those compilers, ensure we don't decrease alignment from what we
28988 used to assume. */
28990 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
28992 /* A data structure equal to or greater than the size of a cache line
28993 (64 bytes in the Pentium 4 and other recent Intel processors, including
28994 processors based on Intel Core microarchitecture) should be aligned
28995 so that its base address is a multiple of the cache line size. */
28997 int max_align
28998 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
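/* E.g. a 64-byte prefetch block caps the preferred alignment at 512 bits
   (illustrative; the actual value depends on the active tuning).  */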
29000 if (max_align < BITS_PER_WORD)
29001 max_align = BITS_PER_WORD;
29003 switch (ix86_align_data_type)
29005 case ix86_align_data_type_abi: opt = false; break;
29006 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29007 case ix86_align_data_type_cacheline: break;
29010 if (TARGET_IAMCU)
29011 align = iamcu_alignment (type, align);
29013 if (opt
29014 && AGGREGATE_TYPE_P (type)
29015 && TYPE_SIZE (type)
29016 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29018 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29019 && align < max_align_compat)
29020 align = max_align_compat;
29021 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29022 && align < max_align)
29023 align = max_align;
29026 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29027 to a 16-byte boundary. */
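/* E.g. a global "char buf[32]" (256 bits >= 128) gets 128-bit alignment
   from the check below (illustrative).  */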
29028 if (TARGET_64BIT)
29030 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29031 && TYPE_SIZE (type)
29032 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29033 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29034 && align < 128)
29035 return 128;
29038 if (!opt)
29039 return align;
29041 if (TREE_CODE (type) == ARRAY_TYPE)
29043 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29044 return 64;
29045 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29046 return 128;
29048 else if (TREE_CODE (type) == COMPLEX_TYPE)
29051 if (TYPE_MODE (type) == DCmode && align < 64)
29052 return 64;
29053 if ((TYPE_MODE (type) == XCmode
29054 || TYPE_MODE (type) == TCmode) && align < 128)
29055 return 128;
29057 else if ((TREE_CODE (type) == RECORD_TYPE
29058 || TREE_CODE (type) == UNION_TYPE
29059 || TREE_CODE (type) == QUAL_UNION_TYPE)
29060 && TYPE_FIELDS (type))
29062 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29063 return 64;
29064 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29065 return 128;
29067 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29068 || TREE_CODE (type) == INTEGER_TYPE)
29070 if (TYPE_MODE (type) == DFmode && align < 64)
29071 return 64;
29072 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29073 return 128;
29076 return align;
29079 /* Compute the alignment for a local variable or a stack slot. EXP is
29080 the data type or decl itself, MODE is the widest mode available and
29081 ALIGN is the alignment that the object would ordinarily have. The
29082 value of this macro is used instead of that alignment to align the
29083 object. */
29085 unsigned int
29086 ix86_local_alignment (tree exp, machine_mode mode,
29087 unsigned int align)
29089 tree type, decl;
29091 if (exp && DECL_P (exp))
29093 type = TREE_TYPE (exp);
29094 decl = exp;
29096 else
29098 type = exp;
29099 decl = NULL;
29102 /* Don't do dynamic stack realignment for long long objects with
29103 -mpreferred-stack-boundary=2. */
29104 if (!TARGET_64BIT
29105 && align == 64
29106 && ix86_preferred_stack_boundary < 64
29107 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29108 && (!type || !TYPE_USER_ALIGN (type))
29109 && (!decl || !DECL_USER_ALIGN (decl)))
29110 align = 32;
29112 /* If TYPE is NULL, we are allocating a stack slot for caller-save
29113 register in MODE. We will return the largest alignment of XF
29114 and DF. */
29115 if (!type)
29117 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29118 align = GET_MODE_ALIGNMENT (DFmode);
29119 return align;
29122 /* Don't increase alignment for Intel MCU psABI. */
29123 if (TARGET_IAMCU)
29124 return align;
29126 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
29127 to a 16-byte boundary. The exact wording is:
29129 An array uses the same alignment as its elements, except that a local or
29130 global array variable of length at least 16 bytes or
29131 a C99 variable-length array variable always has alignment of at least 16 bytes.
29133 This was added to allow the use of aligned SSE instructions on arrays. The
29134 rule is meant for static storage (where the compiler cannot do the analysis
29135 by itself). We follow it for automatic variables only when convenient:
29136 we fully control everything in the function being compiled, and functions
29137 from other units cannot rely on the alignment.
29139 Exclude the va_list type. It is the common case of a local array where
29140 we cannot benefit from the alignment.
29142 TODO: Probably one should optimize for size only when the variable does not escape. */
29143 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29144 && TARGET_SSE)
29146 if (AGGREGATE_TYPE_P (type)
29147 && (va_list_type_node == NULL_TREE
29148 || (TYPE_MAIN_VARIANT (type)
29149 != TYPE_MAIN_VARIANT (va_list_type_node)))
29150 && TYPE_SIZE (type)
29151 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29152 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29153 && align < 128)
29154 return 128;
29156 if (TREE_CODE (type) == ARRAY_TYPE)
29158 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29159 return 64;
29160 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29161 return 128;
29163 else if (TREE_CODE (type) == COMPLEX_TYPE)
29165 if (TYPE_MODE (type) == DCmode && align < 64)
29166 return 64;
29167 if ((TYPE_MODE (type) == XCmode
29168 || TYPE_MODE (type) == TCmode) && align < 128)
29169 return 128;
29171 else if ((TREE_CODE (type) == RECORD_TYPE
29172 || TREE_CODE (type) == UNION_TYPE
29173 || TREE_CODE (type) == QUAL_UNION_TYPE)
29174 && TYPE_FIELDS (type))
29176 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29177 return 64;
29178 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29179 return 128;
29181 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29182 || TREE_CODE (type) == INTEGER_TYPE)
29185 if (TYPE_MODE (type) == DFmode && align < 64)
29186 return 64;
29187 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29188 return 128;
29190 return align;
29193 /* Compute the minimum required alignment for dynamic stack realignment
29194 purposes for a local variable, parameter or a stack slot. EXP is
29195 the data type or decl itself, MODE is its mode and ALIGN is the
29196 alignment that the object would ordinarily have. */
29198 unsigned int
29199 ix86_minimum_alignment (tree exp, machine_mode mode,
29200 unsigned int align)
29202 tree type, decl;
29204 if (exp && DECL_P (exp))
29206 type = TREE_TYPE (exp);
29207 decl = exp;
29209 else
29211 type = exp;
29212 decl = NULL;
29215 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29216 return align;
29218 /* Don't do dynamic stack realignment for long long objects with
29219 -mpreferred-stack-boundary=2. */
29220 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29221 && (!type || !TYPE_USER_ALIGN (type))
29222 && (!decl || !DECL_USER_ALIGN (decl)))
29224 gcc_checking_assert (!TARGET_STV);
29225 return 32;
29228 return align;
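/* Illustrative sketch (editor addition): the two alignment hooks above
   cooperate so that, e.g., compiling the hypothetical

     void g (void) { long long v = 0; consume (&v); }

   with -m32 -mpreferred-stack-boundary=2 leaves V at 4-byte alignment
   (ix86_local_alignment drops it to 32 bits and ix86_minimum_alignment
   agrees), so no dynamic stack realignment is forced just for that slot.
   G and CONSUME are made-up names.  */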
29231 /* Find a location for the static chain incoming to a nested function.
29232 This is a register, unless all free registers are used by arguments. */
29234 static rtx
29235 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29237 unsigned regno;
29239 /* While this function won't be called by the middle-end when a static
29240 chain isn't needed, it's also used throughout the backend so it's
29241 easiest to keep this check centralized. */
29242 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
29243 return NULL;
29245 if (TARGET_64BIT)
29247 /* We always use R10 in 64-bit mode. */
29248 regno = R10_REG;
29250 else
29252 const_tree fntype, fndecl;
29253 unsigned int ccvt;
29255 /* By default in 32-bit mode we use ECX to pass the static chain. */
29256 regno = CX_REG;
29258 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29260 fntype = TREE_TYPE (fndecl_or_type);
29261 fndecl = fndecl_or_type;
29263 else
29265 fntype = fndecl_or_type;
29266 fndecl = NULL;
29269 ccvt = ix86_get_callcvt (fntype);
29270 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29272 /* Fastcall functions use ecx/edx for arguments, which leaves
29273 us with EAX for the static chain.
29274 Thiscall functions use ecx for arguments, which also
29275 leaves us with EAX for the static chain. */
29276 regno = AX_REG;
29278 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29280 /* Thiscall functions use ecx for arguments, which leaves
29281 us with EAX and EDX for the static chain.
29282 We use EAX for ABI compatibility. */
29283 regno = AX_REG;
29285 else if (ix86_function_regparm (fntype, fndecl) == 3)
29287 /* For regparm 3, we have no free call-clobbered registers in
29288 which to store the static chain. In order to implement this,
29289 we have the trampoline push the static chain to the stack.
29290 However, we can't push a value below the return address when
29291 we call the nested function directly, so we have to use an
29292 alternate entry point. For this we use ESI, and have the
29293 alternate entry point push ESI, so that things appear the
29294 same once we're executing the nested function. */
29295 if (incoming_p)
29297 if (fndecl == current_function_decl
29298 && !ix86_static_chain_on_stack)
29300 gcc_assert (!reload_completed);
29301 ix86_static_chain_on_stack = true;
29303 return gen_frame_mem (SImode,
29304 plus_constant (Pmode,
29305 arg_pointer_rtx, -8));
29307 regno = SI_REG;
29311 return gen_rtx_REG (Pmode, regno);
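/* Illustrative sketch (editor addition): for a GNU C nested function whose
   address escapes, e.g. the hypothetical

     void outer (int x)
     {
       int inner (void) { return x; }
       consume (inner);
     }

   the static chain pointer selected above arrives in R10 for 64-bit code and
   normally in ECX for 32-bit code (EAX for fastcall/thiscall, or a stack
   slot below the return address for regparm-3 functions).  OUTER, INNER and
   CONSUME are made-up names.  */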
29314 /* Emit RTL insns to initialize the variable parts of a trampoline.
29315 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29316 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29317 to be passed to the target function. */
29319 static void
29320 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29322 rtx mem, fnaddr;
29323 int opcode;
29324 int offset = 0;
29326 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29328 if (TARGET_64BIT)
29330 int size;
29332 /* Load the function address to r11. Try to load address using
29333 the shorter movl instead of movabs. We may want to support
29334 movq for kernel mode, but kernel does not use trampolines at
29335 the moment. FNADDR is a 32-bit address and may not be in
29336 DImode when ptr_mode == SImode. Always use movl in this
29337 case. */
29338 if (ptr_mode == SImode
29339 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29341 fnaddr = copy_addr_to_reg (fnaddr);
29343 mem = adjust_address (m_tramp, HImode, offset);
29344 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29346 mem = adjust_address (m_tramp, SImode, offset + 2);
29347 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29348 offset += 6;
29350 else
29352 mem = adjust_address (m_tramp, HImode, offset);
29353 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29355 mem = adjust_address (m_tramp, DImode, offset + 2);
29356 emit_move_insn (mem, fnaddr);
29357 offset += 10;
29360 /* Load static chain using movabs to r10. Use the shorter movl
29361 instead of movabs when ptr_mode == SImode. */
29362 if (ptr_mode == SImode)
29364 opcode = 0xba41;
29365 size = 6;
29367 else
29369 opcode = 0xba49;
29370 size = 10;
29373 mem = adjust_address (m_tramp, HImode, offset);
29374 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29376 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29377 emit_move_insn (mem, chain_value);
29378 offset += size;
29380 /* Jump to r11; the last (unused) byte is a nop, only there to
29381 pad the write out to a single 32-bit store. */
29382 mem = adjust_address (m_tramp, SImode, offset);
29383 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29384 offset += 4;
29386 else
29388 rtx disp, chain;
29390 /* Depending on the static chain location, either load a register
29391 with a constant, or push the constant to the stack. All of the
29392 instructions are the same size. */
29393 chain = ix86_static_chain (fndecl, true);
29394 if (REG_P (chain))
29396 switch (REGNO (chain))
29398 case AX_REG:
29399 opcode = 0xb8; break;
29400 case CX_REG:
29401 opcode = 0xb9; break;
29402 default:
29403 gcc_unreachable ();
29406 else
29407 opcode = 0x68;
29409 mem = adjust_address (m_tramp, QImode, offset);
29410 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29412 mem = adjust_address (m_tramp, SImode, offset + 1);
29413 emit_move_insn (mem, chain_value);
29414 offset += 5;
29416 mem = adjust_address (m_tramp, QImode, offset);
29417 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29419 mem = adjust_address (m_tramp, SImode, offset + 1);
29421 /* Compute offset from the end of the jmp to the target function.
29422 In the case in which the trampoline stores the static chain on
29423 the stack, we need to skip the first insn which pushes the
29424 (call-saved) register static chain; this push is 1 byte. */
29425 offset += 5;
29426 disp = expand_binop (SImode, sub_optab, fnaddr,
29427 plus_constant (Pmode, XEXP (m_tramp, 0),
29428 offset - (MEM_P (chain) ? 1 : 0)),
29429 NULL_RTX, 1, OPTAB_DIRECT);
29430 emit_move_insn (mem, disp);
29433 gcc_assert (offset <= TRAMPOLINE_SIZE);
29435 #ifdef HAVE_ENABLE_EXECUTE_STACK
29436 #ifdef CHECK_EXECUTE_STACK_ENABLED
29437 if (CHECK_EXECUTE_STACK_ENABLED)
29438 #endif
29439 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29440 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29441 #endif
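/* Illustrative byte layout (editor sketch) of the 64-bit trampoline written
   above, for the common ptr_mode == DImode case:

     49 bb <fnaddr, 8 bytes>    movabs $fnaddr, %r11
     49 ba <chain,  8 bytes>    movabs $chain,  %r10
     49 ff e3 90                jmp *%r11; nop (pads the final 32-bit store)

   matching the 0xbb49, 0xba49 and 0x90e3ff49 immediates emitted above; the
   shorter movl encodings are used instead when the addresses fit in
   32 bits.  */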
29444 static bool
29445 ix86_allocate_stack_slots_for_args (void)
29447 /* Naked functions should not allocate stack slots for arguments. */
29448 return !ix86_function_naked (current_function_decl);
29451 static bool
29452 ix86_warn_func_return (tree decl)
29454 /* Naked functions are implemented entirely in assembly, including the
29455 return sequence, so suppress warnings about this. */
29456 return !ix86_function_naked (decl);
29459 /* The following file contains several enumerations and data structures
29460 built from the definitions in i386-builtin-types.def. */
29462 #include "i386-builtin-types.inc"
29464 /* Table for the ix86 builtin non-function types. */
29465 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29467 /* Retrieve an element from the above table, building some of
29468 the types lazily. */
29470 static tree
29471 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29473 unsigned int index;
29474 tree type, itype;
29476 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29478 type = ix86_builtin_type_tab[(int) tcode];
29479 if (type != NULL)
29480 return type;
29482 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29483 if (tcode <= IX86_BT_LAST_VECT)
29485 machine_mode mode;
29487 index = tcode - IX86_BT_LAST_PRIM - 1;
29488 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29489 mode = ix86_builtin_type_vect_mode[index];
29491 type = build_vector_type_for_mode (itype, mode);
29493 else
29495 int quals;
29497 index = tcode - IX86_BT_LAST_VECT - 1;
29498 if (tcode <= IX86_BT_LAST_PTR)
29499 quals = TYPE_UNQUALIFIED;
29500 else
29501 quals = TYPE_QUAL_CONST;
29503 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29504 if (quals != TYPE_UNQUALIFIED)
29505 itype = build_qualified_type (itype, quals);
29507 type = build_pointer_type (itype);
29510 ix86_builtin_type_tab[(int) tcode] = type;
29511 return type;
29514 /* Table for the ix86 builtin function types. */
29515 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29517 /* Retrieve an element from the above table, building some of
29518 the types lazily. */
29520 static tree
29521 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29523 tree type;
29525 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29527 type = ix86_builtin_func_type_tab[(int) tcode];
29528 if (type != NULL)
29529 return type;
29531 if (tcode <= IX86_BT_LAST_FUNC)
29533 unsigned start = ix86_builtin_func_start[(int) tcode];
29534 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29535 tree rtype, atype, args = void_list_node;
29536 unsigned i;
29538 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29539 for (i = after - 1; i > start; --i)
29541 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29542 args = tree_cons (NULL, atype, args);
29545 type = build_function_type (rtype, args);
29547 else
29549 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29550 enum ix86_builtin_func_type icode;
29552 icode = ix86_builtin_func_alias_base[index];
29553 type = ix86_get_builtin_func_type (icode);
29556 ix86_builtin_func_type_tab[(int) tcode] = type;
29557 return type;
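/* Illustrative sketch (editor addition): for a function-type code such as
   V2DI_FTYPE_V2DI_V2DI the loop above effectively builds

     build_function_type (v2di_type,
                          tree_cons (NULL, v2di_type,
                                     tree_cons (NULL, v2di_type,
                                                void_list_node)));

   i.e. "__v2di f (__v2di, __v2di)", with the argument list consed from the
   last argument back to the first.  V2DI_TYPE is shorthand here, not a real
   identifier in this file.  */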
29561 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29562 bdesc_* arrays below should come first, then builtins for each bdesc_*
29563 array in ascending order, so that we can use direct array accesses. */
29564 enum ix86_builtins
29566 IX86_BUILTIN_MASKMOVQ,
29567 IX86_BUILTIN_LDMXCSR,
29568 IX86_BUILTIN_STMXCSR,
29569 IX86_BUILTIN_MASKMOVDQU,
29570 IX86_BUILTIN_PSLLDQ128,
29571 IX86_BUILTIN_CLFLUSH,
29572 IX86_BUILTIN_MONITOR,
29573 IX86_BUILTIN_MWAIT,
29574 IX86_BUILTIN_CLZERO,
29575 IX86_BUILTIN_VEC_INIT_V2SI,
29576 IX86_BUILTIN_VEC_INIT_V4HI,
29577 IX86_BUILTIN_VEC_INIT_V8QI,
29578 IX86_BUILTIN_VEC_EXT_V2DF,
29579 IX86_BUILTIN_VEC_EXT_V2DI,
29580 IX86_BUILTIN_VEC_EXT_V4SF,
29581 IX86_BUILTIN_VEC_EXT_V4SI,
29582 IX86_BUILTIN_VEC_EXT_V8HI,
29583 IX86_BUILTIN_VEC_EXT_V2SI,
29584 IX86_BUILTIN_VEC_EXT_V4HI,
29585 IX86_BUILTIN_VEC_EXT_V16QI,
29586 IX86_BUILTIN_VEC_SET_V2DI,
29587 IX86_BUILTIN_VEC_SET_V4SF,
29588 IX86_BUILTIN_VEC_SET_V4SI,
29589 IX86_BUILTIN_VEC_SET_V8HI,
29590 IX86_BUILTIN_VEC_SET_V4HI,
29591 IX86_BUILTIN_VEC_SET_V16QI,
29592 IX86_BUILTIN_GATHERSIV2DF,
29593 IX86_BUILTIN_GATHERSIV4DF,
29594 IX86_BUILTIN_GATHERDIV2DF,
29595 IX86_BUILTIN_GATHERDIV4DF,
29596 IX86_BUILTIN_GATHERSIV4SF,
29597 IX86_BUILTIN_GATHERSIV8SF,
29598 IX86_BUILTIN_GATHERDIV4SF,
29599 IX86_BUILTIN_GATHERDIV8SF,
29600 IX86_BUILTIN_GATHERSIV2DI,
29601 IX86_BUILTIN_GATHERSIV4DI,
29602 IX86_BUILTIN_GATHERDIV2DI,
29603 IX86_BUILTIN_GATHERDIV4DI,
29604 IX86_BUILTIN_GATHERSIV4SI,
29605 IX86_BUILTIN_GATHERSIV8SI,
29606 IX86_BUILTIN_GATHERDIV4SI,
29607 IX86_BUILTIN_GATHERDIV8SI,
29608 IX86_BUILTIN_VFMSUBSD3_MASK3,
29609 IX86_BUILTIN_VFMSUBSS3_MASK3,
29610 IX86_BUILTIN_GATHER3SIV8SF,
29611 IX86_BUILTIN_GATHER3SIV4SF,
29612 IX86_BUILTIN_GATHER3SIV4DF,
29613 IX86_BUILTIN_GATHER3SIV2DF,
29614 IX86_BUILTIN_GATHER3DIV8SF,
29615 IX86_BUILTIN_GATHER3DIV4SF,
29616 IX86_BUILTIN_GATHER3DIV4DF,
29617 IX86_BUILTIN_GATHER3DIV2DF,
29618 IX86_BUILTIN_GATHER3SIV8SI,
29619 IX86_BUILTIN_GATHER3SIV4SI,
29620 IX86_BUILTIN_GATHER3SIV4DI,
29621 IX86_BUILTIN_GATHER3SIV2DI,
29622 IX86_BUILTIN_GATHER3DIV8SI,
29623 IX86_BUILTIN_GATHER3DIV4SI,
29624 IX86_BUILTIN_GATHER3DIV4DI,
29625 IX86_BUILTIN_GATHER3DIV2DI,
29626 IX86_BUILTIN_SCATTERSIV8SF,
29627 IX86_BUILTIN_SCATTERSIV4SF,
29628 IX86_BUILTIN_SCATTERSIV4DF,
29629 IX86_BUILTIN_SCATTERSIV2DF,
29630 IX86_BUILTIN_SCATTERDIV8SF,
29631 IX86_BUILTIN_SCATTERDIV4SF,
29632 IX86_BUILTIN_SCATTERDIV4DF,
29633 IX86_BUILTIN_SCATTERDIV2DF,
29634 IX86_BUILTIN_SCATTERSIV8SI,
29635 IX86_BUILTIN_SCATTERSIV4SI,
29636 IX86_BUILTIN_SCATTERSIV4DI,
29637 IX86_BUILTIN_SCATTERSIV2DI,
29638 IX86_BUILTIN_SCATTERDIV8SI,
29639 IX86_BUILTIN_SCATTERDIV4SI,
29640 IX86_BUILTIN_SCATTERDIV4DI,
29641 IX86_BUILTIN_SCATTERDIV2DI,
29642 /* Alternate 4- and 8-element gather/scatter for the vectorizer,
29643 where all operands are 32-byte or 64-byte wide, respectively. */
29644 IX86_BUILTIN_GATHERALTSIV4DF,
29645 IX86_BUILTIN_GATHERALTDIV8SF,
29646 IX86_BUILTIN_GATHERALTSIV4DI,
29647 IX86_BUILTIN_GATHERALTDIV8SI,
29648 IX86_BUILTIN_GATHER3ALTDIV16SF,
29649 IX86_BUILTIN_GATHER3ALTDIV16SI,
29650 IX86_BUILTIN_GATHER3ALTSIV4DF,
29651 IX86_BUILTIN_GATHER3ALTDIV8SF,
29652 IX86_BUILTIN_GATHER3ALTSIV4DI,
29653 IX86_BUILTIN_GATHER3ALTDIV8SI,
29654 IX86_BUILTIN_GATHER3ALTSIV8DF,
29655 IX86_BUILTIN_GATHER3ALTSIV8DI,
29656 IX86_BUILTIN_GATHER3DIV16SF,
29657 IX86_BUILTIN_GATHER3DIV16SI,
29658 IX86_BUILTIN_GATHER3DIV8DF,
29659 IX86_BUILTIN_GATHER3DIV8DI,
29660 IX86_BUILTIN_GATHER3SIV16SF,
29661 IX86_BUILTIN_GATHER3SIV16SI,
29662 IX86_BUILTIN_GATHER3SIV8DF,
29663 IX86_BUILTIN_GATHER3SIV8DI,
29664 IX86_BUILTIN_SCATTERALTSIV8DF,
29665 IX86_BUILTIN_SCATTERALTDIV16SF,
29666 IX86_BUILTIN_SCATTERALTSIV8DI,
29667 IX86_BUILTIN_SCATTERALTDIV16SI,
29668 IX86_BUILTIN_SCATTERDIV16SF,
29669 IX86_BUILTIN_SCATTERDIV16SI,
29670 IX86_BUILTIN_SCATTERDIV8DF,
29671 IX86_BUILTIN_SCATTERDIV8DI,
29672 IX86_BUILTIN_SCATTERSIV16SF,
29673 IX86_BUILTIN_SCATTERSIV16SI,
29674 IX86_BUILTIN_SCATTERSIV8DF,
29675 IX86_BUILTIN_SCATTERSIV8DI,
29676 IX86_BUILTIN_GATHERPFQPD,
29677 IX86_BUILTIN_GATHERPFDPS,
29678 IX86_BUILTIN_GATHERPFDPD,
29679 IX86_BUILTIN_GATHERPFQPS,
29680 IX86_BUILTIN_SCATTERPFDPD,
29681 IX86_BUILTIN_SCATTERPFDPS,
29682 IX86_BUILTIN_SCATTERPFQPD,
29683 IX86_BUILTIN_SCATTERPFQPS,
29684 IX86_BUILTIN_CLWB,
29685 IX86_BUILTIN_CLFLUSHOPT,
29686 IX86_BUILTIN_INFQ,
29687 IX86_BUILTIN_HUGE_VALQ,
29688 IX86_BUILTIN_NANQ,
29689 IX86_BUILTIN_NANSQ,
29690 IX86_BUILTIN_XABORT,
29691 IX86_BUILTIN_ADDCARRYX32,
29692 IX86_BUILTIN_ADDCARRYX64,
29693 IX86_BUILTIN_SBB32,
29694 IX86_BUILTIN_SBB64,
29695 IX86_BUILTIN_RDRAND16_STEP,
29696 IX86_BUILTIN_RDRAND32_STEP,
29697 IX86_BUILTIN_RDRAND64_STEP,
29698 IX86_BUILTIN_RDSEED16_STEP,
29699 IX86_BUILTIN_RDSEED32_STEP,
29700 IX86_BUILTIN_RDSEED64_STEP,
29701 IX86_BUILTIN_MONITORX,
29702 IX86_BUILTIN_MWAITX,
29703 IX86_BUILTIN_CFSTRING,
29704 IX86_BUILTIN_CPU_INIT,
29705 IX86_BUILTIN_CPU_IS,
29706 IX86_BUILTIN_CPU_SUPPORTS,
29707 IX86_BUILTIN_READ_FLAGS,
29708 IX86_BUILTIN_WRITE_FLAGS,
29710 /* All the remaining builtins are tracked in bdesc_* arrays in
29711 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29712 this point. */
29713 #define BDESC(mask, icode, name, code, comparison, flag) \
29714 code,
29715 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29716 code, \
29717 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29718 #define BDESC_END(kind, next_kind)
29720 #include "i386-builtin.def"
29722 #undef BDESC
29723 #undef BDESC_FIRST
29724 #undef BDESC_END
29726 IX86_BUILTIN_MAX,
29728 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29730 /* Now just the aliases for bdesc_* start/end. */
29731 #define BDESC(mask, icode, name, code, comparison, flag)
29732 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29733 #define BDESC_END(kind, next_kind) \
29734 IX86_BUILTIN__BDESC_##kind##_LAST \
29735 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29737 #include "i386-builtin.def"
29739 #undef BDESC
29740 #undef BDESC_FIRST
29741 #undef BDESC_END
29743 /* Just to make sure there is no comma after the last enumerator. */
29744 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
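/* Illustrative expansion (editor sketch): for a hypothetical entry

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_foo, "__builtin_ia32_foo",
            IX86_BUILTIN_FOO, UNKNOWN, 0)

   the first inclusion of i386-builtin.def above contributes only
   "IX86_BUILTIN_FOO," to this enum (plus the *_FIRST marker for the first
   entry of each bdesc_* kind), while the second inclusion generates only the
   *_LAST aliases, keeping the enumerators and the bdesc_* array indices in
   lockstep.  */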
29747 /* Table for the ix86 builtin decls. */
29748 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29750 /* Table of all of the builtin functions that are possible with different ISAs
29751 but are waiting to be built until a function is declared to use that
29752 ISA. */
29753 struct builtin_isa {
29754 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29755 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29756 const char *name; /* function name */
29757 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29758 unsigned char const_p:1; /* true if the declaration is constant */
29759 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29760 bool leaf_p; /* true if the declaration has leaf attribute */
29761 bool nothrow_p; /* true if the declaration has nothrow attribute */
29762 bool set_and_not_built_p;
29765 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29767 /* Bits that can still enable any inclusion of a builtin. */
29768 static HOST_WIDE_INT deferred_isa_values = 0;
29769 static HOST_WIDE_INT deferred_isa_values2 = 0;
29771 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
29772 of isa_flags to use in the ix86_builtins_isa array. Stores the
29773 function decl in the ix86_builtins array. Returns the function decl,
29774 or NULL_TREE if the builtin was not added.
29776 If the front end has a special hook for builtin functions, delay adding
29777 builtin functions that aren't in the current ISA until the ISA is changed
29778 by a function-specific optimization. Doing so can save about 300K for the
29779 default compiler. When the builtin is expanded, check at that time whether
29780 it is valid.
29782 If the front end doesn't have a special hook, record all builtins, even
29783 those that aren't in the current ISA, in case the user uses
29784 function-specific options for a different ISA, so that we don't get scope
29785 errors if a builtin is added in the middle of a function scope. */
29787 static inline tree
29788 def_builtin (HOST_WIDE_INT mask, const char *name,
29789 enum ix86_builtin_func_type tcode,
29790 enum ix86_builtins code)
29792 tree decl = NULL_TREE;
29794 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29796 ix86_builtins_isa[(int) code].isa = mask;
29798 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
29799 where any set bit means the built-in is enabled, this bit must be *and-ed*
29800 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
29801 means that *both* cpuid bits must be set for the built-in to be available.
29802 Handle this here. */
29803 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29804 mask &= ~OPTION_MASK_ISA_AVX512VL;
29806 mask &= ~OPTION_MASK_ISA_64BIT;
29807 if (mask == 0
29808 || (mask & ix86_isa_flags) != 0
29809 || (lang_hooks.builtin_function
29810 == lang_hooks.builtin_function_ext_scope))
29813 tree type = ix86_get_builtin_func_type (tcode);
29814 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29815 NULL, NULL_TREE);
29816 ix86_builtins[(int) code] = decl;
29817 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29819 else
29821 /* Just record the MASK; only entries with set_and_not_built_p == true
29822 can still have their builtin added later. */
29823 deferred_isa_values |= mask;
29824 ix86_builtins[(int) code] = NULL_TREE;
29825 ix86_builtins_isa[(int) code].tcode = tcode;
29826 ix86_builtins_isa[(int) code].name = name;
29827 ix86_builtins_isa[(int) code].leaf_p = false;
29828 ix86_builtins_isa[(int) code].nothrow_p = false;
29829 ix86_builtins_isa[(int) code].const_p = false;
29830 ix86_builtins_isa[(int) code].pure_p = false;
29831 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29835 return decl;
29838 /* Like def_builtin, but also marks the function decl "const". */
29840 static inline tree
29841 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29842 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29844 tree decl = def_builtin (mask, name, tcode, code);
29845 if (decl)
29846 TREE_READONLY (decl) = 1;
29847 else
29848 ix86_builtins_isa[(int) code].const_p = true;
29850 return decl;
29853 /* Like def_builtin, but also marks the function decl "pure". */
29855 static inline tree
29856 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29857 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29859 tree decl = def_builtin (mask, name, tcode, code);
29860 if (decl)
29861 DECL_PURE_P (decl) = 1;
29862 else
29863 ix86_builtins_isa[(int) code].pure_p = true;
29865 return decl;
29868 /* Like def_builtin, but for additional isa2 flags. */
29870 static inline tree
29871 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29872 enum ix86_builtin_func_type tcode,
29873 enum ix86_builtins code)
29875 tree decl = NULL_TREE;
29877 ix86_builtins_isa[(int) code].isa2 = mask;
29879 if (mask == 0
29880 || (mask & ix86_isa_flags2) != 0
29881 || (lang_hooks.builtin_function
29882 == lang_hooks.builtin_function_ext_scope))
29885 tree type = ix86_get_builtin_func_type (tcode);
29886 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29887 NULL, NULL_TREE);
29888 ix86_builtins[(int) code] = decl;
29889 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29891 else
29893 /* Just record the MASK; only entries with set_and_not_built_p == true
29894 can still have their builtin added later. */
29895 deferred_isa_values2 |= mask;
29896 ix86_builtins[(int) code] = NULL_TREE;
29897 ix86_builtins_isa[(int) code].tcode = tcode;
29898 ix86_builtins_isa[(int) code].name = name;
29899 ix86_builtins_isa[(int) code].leaf_p = false;
29900 ix86_builtins_isa[(int) code].nothrow_p = false;
29901 ix86_builtins_isa[(int) code].const_p = false;
29902 ix86_builtins_isa[(int) code].pure_p = false;
29903 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29906 return decl;
29909 /* Like def_builtin2, but also marks the function decl "const". */
29911 static inline tree
29912 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29913 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29915 tree decl = def_builtin2 (mask, name, tcode, code);
29916 if (decl)
29917 TREE_READONLY (decl) = 1;
29918 else
29919 ix86_builtins_isa[(int) code].const_p = true;
29921 return decl;
29924 /* Like def_builtin2, but also marks the function decl "pure". */
29926 static inline tree
29927 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29928 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29930 tree decl = def_builtin2 (mask, name, tcode, code);
29931 if (decl)
29932 DECL_PURE_P (decl) = 1;
29933 else
29934 ix86_builtins_isa[(int) code].pure_p = true;
29936 return decl;
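/* Illustrative usage (editor sketch): a typical registration mirrors the
   calls further below, e.g.

     def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
                        V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);

   If OPTION_MASK_ISA_SHA is not in ix86_isa_flags and the front end lacks an
   ext-scope builtin hook, the decl is only recorded in ix86_builtins_isa and
   is materialized later by ix86_add_new_builtins once a function selects
   that ISA.  */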
29939 /* Add any new builtin functions for a given ISA that may not have been
29940 declared. This saves a bit of space compared to adding all of the
29941 declarations to the tree, even if we didn't use them. */
29943 static void
29944 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29946 if ((isa & deferred_isa_values) == 0
29947 && (isa2 & deferred_isa_values2) == 0)
29948 return;
29950 /* Bits in ISA value can be removed from potential isa values. */
29951 deferred_isa_values &= ~isa;
29952 deferred_isa_values2 &= ~isa2;
29954 int i;
29955 tree saved_current_target_pragma = current_target_pragma;
29956 current_target_pragma = NULL_TREE;
29958 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29960 if (((ix86_builtins_isa[i].isa & isa) != 0
29961 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29962 && ix86_builtins_isa[i].set_and_not_built_p)
29964 tree decl, type;
29966 /* Don't define the builtin again. */
29967 ix86_builtins_isa[i].set_and_not_built_p = false;
29969 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29970 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29971 type, i, BUILT_IN_MD, NULL,
29972 NULL_TREE);
29974 ix86_builtins[i] = decl;
29975 if (ix86_builtins_isa[i].const_p)
29976 TREE_READONLY (decl) = 1;
29977 if (ix86_builtins_isa[i].pure_p)
29978 DECL_PURE_P (decl) = 1;
29979 if (ix86_builtins_isa[i].leaf_p)
29980 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29981 NULL_TREE);
29982 if (ix86_builtins_isa[i].nothrow_p)
29983 TREE_NOTHROW (decl) = 1;
29987 current_target_pragma = saved_current_target_pragma;
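/* Illustrative sketch (editor addition): deferred builtins become visible
   once a translation unit or function opts into the ISA, e.g. hypothetical
   user code such as

     __attribute__((target ("avx512f")))
     void f (void) { ... __builtin_ia32_gathersiv16sf (...); ... }

   Processing the target attribute re-enters the option machinery, which in
   turn calls ix86_add_new_builtins with the newly enabled isa/isa2 bits and
   builds the decls recorded above.  F is a made-up name.  */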
29990 /* Bits for builtin_description.flag. */
29992 /* Set when we don't support the comparison natively, and should
29993 swap the comparison operands in order to support it. */
29994 #define BUILTIN_DESC_SWAP_OPERANDS 1
29996 struct builtin_description
29998 const HOST_WIDE_INT mask;
29999 const enum insn_code icode;
30000 const char *const name;
30001 const enum ix86_builtins code;
30002 const enum rtx_code comparison;
30003 const int flag;
30006 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30007 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30008 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30009 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30010 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30011 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30012 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30013 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30014 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30015 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30016 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30017 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30018 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30019 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30020 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30021 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30022 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30023 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30024 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30025 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30026 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30027 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30028 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30029 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30030 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30031 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30032 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30033 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30034 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30035 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30036 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30037 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30038 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30039 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30040 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30041 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30042 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30043 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30044 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30045 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30046 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30047 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30048 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30049 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30050 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30051 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30052 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30053 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30054 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30055 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30056 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30057 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30059 #define BDESC(mask, icode, name, code, comparison, flag) \
30060 { mask, icode, name, code, comparison, flag },
30061 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30062 static const struct builtin_description bdesc_##kind[] = \
30064 BDESC (mask, icode, name, code, comparison, flag)
30065 #define BDESC_END(kind, next_kind) \
30068 #include "i386-builtin.def"
30070 #undef BDESC
30071 #undef BDESC_FIRST
30072 #undef BDESC_END
30074 /* TM vector builtins. */
30076 /* Reuse the existing x86-specific `struct builtin_description' because
30077 we're lazy. Add casts to make them fit. */
30078 static const struct builtin_description bdesc_tm[] =
30080 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30081 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30082 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30083 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30084 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30085 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30086 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30088 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30089 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30090 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30091 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30092 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30093 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30094 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30096 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30097 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30098 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30099 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30100 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30101 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30102 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30104 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30105 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30106 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30109 /* Initialize the transactional memory vector load/store builtins. */
30111 static void
30112 ix86_init_tm_builtins (void)
30114 enum ix86_builtin_func_type ftype;
30115 const struct builtin_description *d;
30116 size_t i;
30117 tree decl;
30118 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30119 tree attrs_log, attrs_type_log;
30121 if (!flag_tm)
30122 return;
30124 /* If there are no builtins defined, we must be compiling in a
30125 language without trans-mem support. */
30126 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30127 return;
30129 /* Use whatever attributes a normal TM load has. */
30130 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30131 attrs_load = DECL_ATTRIBUTES (decl);
30132 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30133 /* Use whatever attributes a normal TM store has. */
30134 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30135 attrs_store = DECL_ATTRIBUTES (decl);
30136 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30137 /* Use whatever attributes a normal TM log has. */
30138 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30139 attrs_log = DECL_ATTRIBUTES (decl);
30140 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30142 for (i = 0, d = bdesc_tm;
30143 i < ARRAY_SIZE (bdesc_tm);
30144 i++, d++)
30146 if ((d->mask & ix86_isa_flags) != 0
30147 || (lang_hooks.builtin_function
30148 == lang_hooks.builtin_function_ext_scope))
30150 tree type, attrs, attrs_type;
30151 enum built_in_function code = (enum built_in_function) d->code;
30153 ftype = (enum ix86_builtin_func_type) d->flag;
30154 type = ix86_get_builtin_func_type (ftype);
30156 if (BUILTIN_TM_LOAD_P (code))
30158 attrs = attrs_load;
30159 attrs_type = attrs_type_load;
30161 else if (BUILTIN_TM_STORE_P (code))
30163 attrs = attrs_store;
30164 attrs_type = attrs_type_store;
30166 else
30168 attrs = attrs_log;
30169 attrs_type = attrs_type_log;
30171 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30172 /* The builtin without the prefix for
30173 calling it directly. */
30174 d->name + strlen ("__builtin_"),
30175 attrs);
30176 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
30177 set the TYPE_ATTRIBUTES. */
30178 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30180 set_builtin_decl (code, decl, false);
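/* Editorial note (sketch): each TM entry above is registered with
   add_builtin_function using d->name + strlen ("__builtin_") as the library
   name, so e.g. "__builtin__ITM_WM128" in instrumented code is expected to
   resolve to the libitm entry point "_ITM_WM128" when called directly.  */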
30185 /* Macros for verification of enum ix86_builtins order. */
30186 #define BDESC_VERIFY(x, y, z) \
30187 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30188 #define BDESC_VERIFYS(x, y, z) \
30189 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30191 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30192 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30193 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30194 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30195 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30196 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30197 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30198 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30199 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30200 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30201 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30202 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30203 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30204 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30205 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30206 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
30207 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30208 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30209 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30210 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30211 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30212 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30213 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30214 IX86_BUILTIN__BDESC_CET_LAST, 1);
30215 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30216 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30218 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30219 in the current target ISA, to allow the user to compile particular modules
30220 with target-specific options that differ from the command-line
30221 options. */
30222 static void
30223 ix86_init_mmx_sse_builtins (void)
30225 const struct builtin_description * d;
30226 enum ix86_builtin_func_type ftype;
30227 size_t i;
30229 /* Add all special builtins with variable number of operands. */
30230 for (i = 0, d = bdesc_special_args;
30231 i < ARRAY_SIZE (bdesc_special_args);
30232 i++, d++)
30234 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30235 if (d->name == 0)
30236 continue;
30238 ftype = (enum ix86_builtin_func_type) d->flag;
30239 def_builtin (d->mask, d->name, ftype, d->code);
30241 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30242 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30243 ARRAY_SIZE (bdesc_special_args) - 1);
30245 /* Add all builtins with variable number of operands. */
30246 for (i = 0, d = bdesc_args;
30247 i < ARRAY_SIZE (bdesc_args);
30248 i++, d++)
30250 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30251 if (d->name == 0)
30252 continue;
30254 ftype = (enum ix86_builtin_func_type) d->flag;
30255 def_builtin_const (d->mask, d->name, ftype, d->code);
30257 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30258 IX86_BUILTIN__BDESC_ARGS_FIRST,
30259 ARRAY_SIZE (bdesc_args) - 1);
30261 /* Add all builtins with variable number of operands. */
30262 for (i = 0, d = bdesc_args2;
30263 i < ARRAY_SIZE (bdesc_args2);
30264 i++, d++)
30266 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30267 if (d->name == 0)
30268 continue;
30270 ftype = (enum ix86_builtin_func_type) d->flag;
30271 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30273 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30274 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30275 ARRAY_SIZE (bdesc_args2) - 1);
30277 for (i = 0, d = bdesc_special_args2;
30278 i < ARRAY_SIZE (bdesc_special_args2);
30279 i++, d++)
30281 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
30282 if (d->name == 0)
30283 continue;
30285 ftype = (enum ix86_builtin_func_type) d->flag;
30286 def_builtin2 (d->mask, d->name, ftype, d->code);
30288 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
30289 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30290 ARRAY_SIZE (bdesc_special_args2) - 1);
30292 /* Add all builtins with rounding. */
30293 for (i = 0, d = bdesc_round_args;
30294 i < ARRAY_SIZE (bdesc_round_args);
30295 i++, d++)
30297 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30298 if (d->name == 0)
30299 continue;
30301 ftype = (enum ix86_builtin_func_type) d->flag;
30302 def_builtin_const (d->mask, d->name, ftype, d->code);
30304 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30305 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30306 ARRAY_SIZE (bdesc_round_args) - 1);
30308 /* pcmpestr[im] insns. */
30309 for (i = 0, d = bdesc_pcmpestr;
30310 i < ARRAY_SIZE (bdesc_pcmpestr);
30311 i++, d++)
30313 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30314 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30315 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30316 else
30317 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30318 def_builtin_const (d->mask, d->name, ftype, d->code);
30320 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30321 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30322 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30324 /* pcmpistr[im] insns. */
30325 for (i = 0, d = bdesc_pcmpistr;
30326 i < ARRAY_SIZE (bdesc_pcmpistr);
30327 i++, d++)
30329 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30330 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30331 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30332 else
30333 ftype = INT_FTYPE_V16QI_V16QI_INT;
30334 def_builtin_const (d->mask, d->name, ftype, d->code);
30336 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30337 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30338 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30340 /* comi/ucomi insns. */
30341 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30343 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30344 if (d->mask == OPTION_MASK_ISA_SSE2)
30345 ftype = INT_FTYPE_V2DF_V2DF;
30346 else
30347 ftype = INT_FTYPE_V4SF_V4SF;
30348 def_builtin_const (d->mask, d->name, ftype, d->code);
30350 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30351 IX86_BUILTIN__BDESC_COMI_FIRST,
30352 ARRAY_SIZE (bdesc_comi) - 1);
30354 /* SSE */
30355 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30356 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30357 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30358 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30360 /* SSE or 3DNow!A */
30361 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30362 /* As it uses V4HImode, we have to require -mmmx too. */
30363 | OPTION_MASK_ISA_MMX,
30364 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30365 IX86_BUILTIN_MASKMOVQ);
30367 /* SSE2 */
30368 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30369 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30371 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30372 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30373 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30374 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30376 /* SSE3. */
30377 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30378 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30379 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30380 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30382 /* AES */
30383 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30384 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30385 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30386 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30387 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30388 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30389 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30390 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30391 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30392 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30393 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30394 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30396 /* PCLMUL */
30397 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30398 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30400 /* RDRND */
30401 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30402 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30403 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30404 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30405 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30406 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30407 IX86_BUILTIN_RDRAND64_STEP);
30409 /* AVX2 */
30410 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30411 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30412 IX86_BUILTIN_GATHERSIV2DF);
30414 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30415 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30416 IX86_BUILTIN_GATHERSIV4DF);
30418 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30419 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30420 IX86_BUILTIN_GATHERDIV2DF);
30422 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30423 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30424 IX86_BUILTIN_GATHERDIV4DF);
30426 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30427 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30428 IX86_BUILTIN_GATHERSIV4SF);
30430 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30431 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30432 IX86_BUILTIN_GATHERSIV8SF);
30434 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30435 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30436 IX86_BUILTIN_GATHERDIV4SF);
30438 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30439 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30440 IX86_BUILTIN_GATHERDIV8SF);
30442 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30443 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30444 IX86_BUILTIN_GATHERSIV2DI);
30446 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30447 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30448 IX86_BUILTIN_GATHERSIV4DI);
30450 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30451 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30452 IX86_BUILTIN_GATHERDIV2DI);
30454 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30455 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30456 IX86_BUILTIN_GATHERDIV4DI);
30458 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30459 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30460 IX86_BUILTIN_GATHERSIV4SI);
30462 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30463 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30464 IX86_BUILTIN_GATHERSIV8SI);
30466 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30467 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30468 IX86_BUILTIN_GATHERDIV4SI);
30470 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30471 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30472 IX86_BUILTIN_GATHERDIV8SI);
30474 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30475 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30476 IX86_BUILTIN_GATHERALTSIV4DF);
30478 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30479 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30480 IX86_BUILTIN_GATHERALTDIV8SF);
30482 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30483 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30484 IX86_BUILTIN_GATHERALTSIV4DI);
30486 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30487 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30488 IX86_BUILTIN_GATHERALTDIV8SI);
30490 /* AVX512F */
30491 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30492 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30493 IX86_BUILTIN_GATHER3SIV16SF);
30495 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30496 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30497 IX86_BUILTIN_GATHER3SIV8DF);
30499 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30500 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30501 IX86_BUILTIN_GATHER3DIV16SF);
30503 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30504 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30505 IX86_BUILTIN_GATHER3DIV8DF);
30507 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30508 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30509 IX86_BUILTIN_GATHER3SIV16SI);
30511 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30512 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30513 IX86_BUILTIN_GATHER3SIV8DI);
30515 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30516 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30517 IX86_BUILTIN_GATHER3DIV16SI);
30519 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30520 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30521 IX86_BUILTIN_GATHER3DIV8DI);
30523 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30524 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30525 IX86_BUILTIN_GATHER3ALTSIV8DF);
30527 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30528 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30529 IX86_BUILTIN_GATHER3ALTDIV16SF);
30531 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30532 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30533 IX86_BUILTIN_GATHER3ALTSIV8DI);
30535 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30536 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30537 IX86_BUILTIN_GATHER3ALTDIV16SI);
30539 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30540 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30541 IX86_BUILTIN_SCATTERSIV16SF);
30543 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30544 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30545 IX86_BUILTIN_SCATTERSIV8DF);
30547 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30548 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30549 IX86_BUILTIN_SCATTERDIV16SF);
30551 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30552 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30553 IX86_BUILTIN_SCATTERDIV8DF);
30555 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30556 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30557 IX86_BUILTIN_SCATTERSIV16SI);
30559 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30560 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30561 IX86_BUILTIN_SCATTERSIV8DI);
30563 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30564 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30565 IX86_BUILTIN_SCATTERDIV16SI);
30567 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30568 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30569 IX86_BUILTIN_SCATTERDIV8DI);
30571 /* AVX512VL */
30572 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30573 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30574 IX86_BUILTIN_GATHER3SIV2DF);
30576 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30577 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30578 IX86_BUILTIN_GATHER3SIV4DF);
30580 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30581 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30582 IX86_BUILTIN_GATHER3DIV2DF);
30584 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30585 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30586 IX86_BUILTIN_GATHER3DIV4DF);
30588 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30589 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30590 IX86_BUILTIN_GATHER3SIV4SF);
30592 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30593 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30594 IX86_BUILTIN_GATHER3SIV8SF);
30596 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30597 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30598 IX86_BUILTIN_GATHER3DIV4SF);
30600 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30601 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30602 IX86_BUILTIN_GATHER3DIV8SF);
30604 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30605 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30606 IX86_BUILTIN_GATHER3SIV2DI);
30608 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30609 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30610 IX86_BUILTIN_GATHER3SIV4DI);
30612 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30613 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30614 IX86_BUILTIN_GATHER3DIV2DI);
30616 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30617 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30618 IX86_BUILTIN_GATHER3DIV4DI);
30620 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30621 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30622 IX86_BUILTIN_GATHER3SIV4SI);
30624 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30625 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30626 IX86_BUILTIN_GATHER3SIV8SI);
30628 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30629 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30630 IX86_BUILTIN_GATHER3DIV4SI);
30632 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30633 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30634 IX86_BUILTIN_GATHER3DIV8SI);
30636 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30637 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30638 IX86_BUILTIN_GATHER3ALTSIV4DF);
30640 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30641 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30642 IX86_BUILTIN_GATHER3ALTDIV8SF);
30644 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30645 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30646 IX86_BUILTIN_GATHER3ALTSIV4DI);
30648 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30649 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30650 IX86_BUILTIN_GATHER3ALTDIV8SI);
30652 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30653 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30654 IX86_BUILTIN_SCATTERSIV8SF);
30656 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30657 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30658 IX86_BUILTIN_SCATTERSIV4SF);
30660 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30661 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30662 IX86_BUILTIN_SCATTERSIV4DF);
30664 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30665 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30666 IX86_BUILTIN_SCATTERSIV2DF);
30668 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30669 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30670 IX86_BUILTIN_SCATTERDIV8SF);
30672 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30673 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30674 IX86_BUILTIN_SCATTERDIV4SF);
30676 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30677 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30678 IX86_BUILTIN_SCATTERDIV4DF);
30680 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30681 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30682 IX86_BUILTIN_SCATTERDIV2DF);
30684 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30685 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30686 IX86_BUILTIN_SCATTERSIV8SI);
30688 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30689 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30690 IX86_BUILTIN_SCATTERSIV4SI);
30692 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30693 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30694 IX86_BUILTIN_SCATTERSIV4DI);
30696 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30697 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30698 IX86_BUILTIN_SCATTERSIV2DI);
30700 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30701 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30702 IX86_BUILTIN_SCATTERDIV8SI);
30704 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30705 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30706 IX86_BUILTIN_SCATTERDIV4SI);
30708 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30709 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30710 IX86_BUILTIN_SCATTERDIV4DI);
30712 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30713 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30714 IX86_BUILTIN_SCATTERDIV2DI);
30715 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30716 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30717 IX86_BUILTIN_SCATTERALTSIV8DF);
30719 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30720 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30721 IX86_BUILTIN_SCATTERALTDIV16SF);
30723 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30724 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30725 IX86_BUILTIN_SCATTERALTSIV8DI);
30727 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30728 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30729 IX86_BUILTIN_SCATTERALTDIV16SI);
30731 /* AVX512PF */
30732 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30733 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30734 IX86_BUILTIN_GATHERPFDPD);
30735 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30736 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30737 IX86_BUILTIN_GATHERPFDPS);
30738 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30739 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30740 IX86_BUILTIN_GATHERPFQPD);
30741 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30742 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30743 IX86_BUILTIN_GATHERPFQPS);
30744 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30745 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30746 IX86_BUILTIN_SCATTERPFDPD);
30747 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30748 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30749 IX86_BUILTIN_SCATTERPFDPS);
30750 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30751 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30752 IX86_BUILTIN_SCATTERPFQPD);
30753 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30754 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30755 IX86_BUILTIN_SCATTERPFQPS);
30757 /* SHA */
30758 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30759 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30760 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30761 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30762 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30763 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30764 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30765 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30766 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30767 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30768 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30769 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30770 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30771 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30773 /* RTM. */
30774 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30775 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30777 /* MMX access to the vec_init patterns. */
30778 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30779 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30781 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30782 V4HI_FTYPE_HI_HI_HI_HI,
30783 IX86_BUILTIN_VEC_INIT_V4HI);
30785 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30786 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30787 IX86_BUILTIN_VEC_INIT_V8QI);
30789 /* Access to the vec_extract patterns. */
30790 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30791 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30792 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30793 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30794 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30795 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30796 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30797 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30798 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30799 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30801 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30802 /* As it uses V4HImode, we have to require -mmmx too. */
30803 | OPTION_MASK_ISA_MMX,
30804 "__builtin_ia32_vec_ext_v4hi",
30805 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30807 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30808 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30810 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30811 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30813 /* Access to the vec_set patterns. */
30814 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30815 "__builtin_ia32_vec_set_v2di",
30816 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30818 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30819 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30821 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30822 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30824 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30825 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30827 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30828 /* As it uses V4HImode, we have to require -mmmx too. */
30829 | OPTION_MASK_ISA_MMX,
30830 "__builtin_ia32_vec_set_v4hi",
30831 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30833 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30834 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30836 /* RDSEED */
30837 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30838 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30839 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30840 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30841 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30842 "__builtin_ia32_rdseed_di_step",
30843 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
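/* Illustrative use of the RDSEED step builtins (caller-side sketch only;
   the retry loop and the variable name are examples, not part of this
   file).  Each step builtin stores a hardware seed through its pointer
   argument and returns nonzero on success:

     unsigned int seed;
     while (!__builtin_ia32_rdseed_si_step (&seed))
       ;
   */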
30845 /* ADCX */
30846 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30847 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30848 def_builtin (OPTION_MASK_ISA_64BIT,
30849 "__builtin_ia32_addcarryx_u64",
30850 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30851 IX86_BUILTIN_ADDCARRYX64);
30853 /* SBB */
30854 def_builtin (0, "__builtin_ia32_sbb_u32",
30855 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30856 def_builtin (OPTION_MASK_ISA_64BIT,
30857 "__builtin_ia32_sbb_u64",
30858 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30859 IX86_BUILTIN_SBB64);
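/* Caller-side sketch (example only, not part of this file): the ADCX/SBB
   step builtins chain a carry or borrow through multi-word arithmetic.
   The returned char is the carry-out and the result word is stored
   through the pointer:

     unsigned int lo, hi;
     unsigned char c;
     c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
     c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);

   a0/a1/b0/b1 are hypothetical operands of a 64-bit addition done in
   32-bit halves.  */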
30861 /* Read/write FLAGS. */
30862 def_builtin (0, "__builtin_ia32_readeflags_u32",
30863 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30864 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30865 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30866 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30867 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30868 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30869 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
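/* Caller-side sketch (example only): the EFLAGS builtins allow saving and
   later restoring the flags register around code that clobbers it:

     unsigned int flags = __builtin_ia32_readeflags_u32 ();
     ...
     __builtin_ia32_writeeflags_u32 (flags);
   */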
30871 /* CLFLUSHOPT. */
30872 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30873 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30875 /* CLWB. */
30876 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30877 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30879 /* MONITORX and MWAITX. */
30880 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30881 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30882 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30883 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30885 /* CLZERO. */
30886 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30887 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30889 /* Add FMA4 multi-argument instructions.  */
30890 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30892 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30893 if (d->name == 0)
30894 continue;
30896 ftype = (enum ix86_builtin_func_type) d->flag;
30897 def_builtin_const (d->mask, d->name, ftype, d->code);
30899 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30900 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30901 ARRAY_SIZE (bdesc_multi_arg) - 1);
30903 /* Add CET intrinsics.  */
30904 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30906 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30907 if (d->name == 0)
30908 continue;
30910 ftype = (enum ix86_builtin_func_type) d->flag;
30911 def_builtin2 (d->mask, d->name, ftype, d->code);
30913 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30914 IX86_BUILTIN__BDESC_CET_FIRST,
30915 ARRAY_SIZE (bdesc_cet) - 1);
30917 for (i = 0, d = bdesc_cet_rdssp;
30918 i < ARRAY_SIZE (bdesc_cet_rdssp);
30919 i++, d++)
30921 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30922 if (d->name == 0)
30923 continue;
30925 ftype = (enum ix86_builtin_func_type) d->flag;
30926 def_builtin2 (d->mask, d->name, ftype, d->code);
30928 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30929 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30930 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30933 static void
30934 ix86_init_mpx_builtins ()
30936 const struct builtin_description * d;
30937 enum ix86_builtin_func_type ftype;
30938 tree decl;
30939 size_t i;
30941 for (i = 0, d = bdesc_mpx;
30942 i < ARRAY_SIZE (bdesc_mpx);
30943 i++, d++)
30945 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30946 if (d->name == 0)
30947 continue;
30949 ftype = (enum ix86_builtin_func_type) d->flag;
30950 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30952 /* Without the leaf and nothrow flags, abnormal edges may
30953 follow calls to MPX builtins when setjmp is present in the
30954 function.  Since we may have a lot of MPX builtin calls,
30955 this causes lots of useless edges and enormous PHI nodes.
30956 To avoid this we mark MPX builtins as leaf and
30957 nothrow.  */
30958 if (decl)
30960 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30961 NULL_TREE);
30962 TREE_NOTHROW (decl) = 1;
30964 else
30966 ix86_builtins_isa[(int)d->code].leaf_p = true;
30967 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30970 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30971 IX86_BUILTIN__BDESC_MPX_FIRST,
30972 ARRAY_SIZE (bdesc_mpx) - 1);
30974 for (i = 0, d = bdesc_mpx_const;
30975 i < ARRAY_SIZE (bdesc_mpx_const);
30976 i++, d++)
30978 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30979 if (d->name == 0)
30980 continue;
30982 ftype = (enum ix86_builtin_func_type) d->flag;
30983 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
30985 if (decl)
30987 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30988 NULL_TREE);
30989 TREE_NOTHROW (decl) = 1;
30991 else
30993 ix86_builtins_isa[(int)d->code].leaf_p = true;
30994 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30997 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30998 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30999 ARRAY_SIZE (bdesc_mpx_const) - 1);
31001 #undef BDESC_VERIFY
31002 #undef BDESC_VERIFYS
31004 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31005 to return a pointer to VERSION_DECL if the outcome of the expression
31006 formed by PREDICATE_CHAIN is true. This function will be called during
31007 version dispatch to decide which function version to execute. It returns
31008 the basic block at the end, to which more conditions can be added. */
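/* As a rough sketch (not literal GIMPLE dumped by the compiler, and with
   illustrative names foo_avx2 and cond_1), a single condition added for a
   version guarded by the "avx2" feature expands in NEW_BB to:

     cond_1 = __builtin_cpu_supports ("avx2");
     if (cond_1 > 0)
       return (void *) &foo_avx2;

   with the false edge continuing to the block returned by this function,
   where the next condition (or the default return) is emitted.  */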
31010 static basic_block
31011 add_condition_to_bb (tree function_decl, tree version_decl,
31012 tree predicate_chain, basic_block new_bb)
31014 gimple *return_stmt;
31015 tree convert_expr, result_var;
31016 gimple *convert_stmt;
31017 gimple *call_cond_stmt;
31018 gimple *if_else_stmt;
31020 basic_block bb1, bb2, bb3;
31021 edge e12, e23;
31023 tree cond_var, and_expr_var = NULL_TREE;
31024 gimple_seq gseq;
31026 tree predicate_decl, predicate_arg;
31028 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31030 gcc_assert (new_bb != NULL);
31031 gseq = bb_seq (new_bb);
31034 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31035 build_fold_addr_expr (version_decl));
31036 result_var = create_tmp_var (ptr_type_node);
31037 convert_stmt = gimple_build_assign (result_var, convert_expr);
31038 return_stmt = gimple_build_return (result_var);
31040 if (predicate_chain == NULL_TREE)
31042 gimple_seq_add_stmt (&gseq, convert_stmt);
31043 gimple_seq_add_stmt (&gseq, return_stmt);
31044 set_bb_seq (new_bb, gseq);
31045 gimple_set_bb (convert_stmt, new_bb);
31046 gimple_set_bb (return_stmt, new_bb);
31047 pop_cfun ();
31048 return new_bb;
31051 while (predicate_chain != NULL)
31053 cond_var = create_tmp_var (integer_type_node);
31054 predicate_decl = TREE_PURPOSE (predicate_chain);
31055 predicate_arg = TREE_VALUE (predicate_chain);
31056 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31057 gimple_call_set_lhs (call_cond_stmt, cond_var);
31059 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31060 gimple_set_bb (call_cond_stmt, new_bb);
31061 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31063 predicate_chain = TREE_CHAIN (predicate_chain);
31065 if (and_expr_var == NULL)
31066 and_expr_var = cond_var;
31067 else
31069 gimple *assign_stmt;
31070 /* Use MIN_EXPR to check whether any integer is zero:
31071 and_expr_var = min_expr <cond_var, and_expr_var>.  */
31072 assign_stmt = gimple_build_assign (and_expr_var,
31073 build2 (MIN_EXPR, integer_type_node,
31074 cond_var, and_expr_var));
31076 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31077 gimple_set_bb (assign_stmt, new_bb);
31078 gimple_seq_add_stmt (&gseq, assign_stmt);
31082 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31083 integer_zero_node,
31084 NULL_TREE, NULL_TREE);
31085 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31086 gimple_set_bb (if_else_stmt, new_bb);
31087 gimple_seq_add_stmt (&gseq, if_else_stmt);
31089 gimple_seq_add_stmt (&gseq, convert_stmt);
31090 gimple_seq_add_stmt (&gseq, return_stmt);
31091 set_bb_seq (new_bb, gseq);
31093 bb1 = new_bb;
31094 e12 = split_block (bb1, if_else_stmt);
31095 bb2 = e12->dest;
31096 e12->flags &= ~EDGE_FALLTHRU;
31097 e12->flags |= EDGE_TRUE_VALUE;
31099 e23 = split_block (bb2, return_stmt);
31101 gimple_set_bb (convert_stmt, bb2);
31102 gimple_set_bb (return_stmt, bb2);
31104 bb3 = e23->dest;
31105 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31107 remove_edge (e23);
31108 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31110 pop_cfun ();
31112 return bb3;
31115 /* This parses the attribute arguments to target in DECL and determines
31116 the right builtin to use to match the platform specification.
31117 It returns the priority value for this version decl. If PREDICATE_LIST
31118 is not NULL, it stores the list of cpu features that need to be checked
31119 before dispatching this function. */
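/* For illustration only (hypothetical declarations, not from this file),
   the "target" attribute strings parsed here look like
     __attribute__ ((target ("default")))
     __attribute__ ((target ("arch=core2")))
     __attribute__ ((target ("avx2,popcnt")))
   "default" gets priority zero, "arch=" is mapped to a processor name and
   priority, and the remaining comma-separated tokens are looked up in
   feature_list below.  */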
31121 static unsigned int
31122 get_builtin_code_for_version (tree decl, tree *predicate_list)
31124 tree attrs;
31125 struct cl_target_option cur_target;
31126 tree target_node;
31127 struct cl_target_option *new_target;
31128 const char *arg_str = NULL;
31129 const char *attrs_str = NULL;
31130 char *tok_str = NULL;
31131 char *token;
31133 /* Priority of i386 features, greater value is higher priority. This is
31134 used to decide the order in which function dispatch must happen. For
31135 instance, a version specialized for SSE4.2 should be checked for dispatch
31136 before a version for SSE3, as SSE4.2 implies SSE3. */
31137 enum feature_priority
31139 P_ZERO = 0,
31140 P_MMX,
31141 P_SSE,
31142 P_SSE2,
31143 P_SSE3,
31144 P_SSSE3,
31145 P_PROC_SSSE3,
31146 P_SSE4_A,
31147 P_PROC_SSE4_A,
31148 P_SSE4_1,
31149 P_SSE4_2,
31150 P_PROC_SSE4_2,
31151 P_POPCNT,
31152 P_AES,
31153 P_PCLMUL,
31154 P_AVX,
31155 P_PROC_AVX,
31156 P_BMI,
31157 P_PROC_BMI,
31158 P_FMA4,
31159 P_XOP,
31160 P_PROC_XOP,
31161 P_FMA,
31162 P_PROC_FMA,
31163 P_BMI2,
31164 P_AVX2,
31165 P_PROC_AVX2,
31166 P_AVX512F,
31167 P_PROC_AVX512F
31170 enum feature_priority priority = P_ZERO;
31172 /* These are the target attribute strings for which a dispatcher is
31173 available, from fold_builtin_cpu. */
31175 static struct _feature_list
31177 const char *const name;
31178 const enum feature_priority priority;
31180 const feature_list[] =
31182 {"mmx", P_MMX},
31183 {"sse", P_SSE},
31184 {"sse2", P_SSE2},
31185 {"sse3", P_SSE3},
31186 {"sse4a", P_SSE4_A},
31187 {"ssse3", P_SSSE3},
31188 {"sse4.1", P_SSE4_1},
31189 {"sse4.2", P_SSE4_2},
31190 {"popcnt", P_POPCNT},
31191 {"aes", P_AES},
31192 {"pclmul", P_PCLMUL},
31193 {"avx", P_AVX},
31194 {"bmi", P_BMI},
31195 {"fma4", P_FMA4},
31196 {"xop", P_XOP},
31197 {"fma", P_FMA},
31198 {"bmi2", P_BMI2},
31199 {"avx2", P_AVX2},
31200 {"avx512f", P_AVX512F}
31204 static unsigned int NUM_FEATURES
31205 = sizeof (feature_list) / sizeof (struct _feature_list);
31207 unsigned int i;
31209 tree predicate_chain = NULL_TREE;
31210 tree predicate_decl, predicate_arg;
31212 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31213 gcc_assert (attrs != NULL);
31215 attrs = TREE_VALUE (TREE_VALUE (attrs));
31217 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31218 attrs_str = TREE_STRING_POINTER (attrs);
31220 /* Return priority zero for default function. */
31221 if (strcmp (attrs_str, "default") == 0)
31222 return 0;
31224 /* Handle arch= if specified. For priority, set it to be 1 more than
31225 the best instruction set the processor can handle. For instance, if
31226 there is a version for atom and a version for ssse3 (the highest ISA
31227 priority for atom), the atom version must be checked for dispatch
31228 before the ssse3 version. */
31229 if (strstr (attrs_str, "arch=") != NULL)
31231 cl_target_option_save (&cur_target, &global_options);
31232 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31233 &global_options_set);
31235 gcc_assert (target_node);
31236 new_target = TREE_TARGET_OPTION (target_node);
31237 gcc_assert (new_target);
31239 if (new_target->arch_specified && new_target->arch > 0)
31241 switch (new_target->arch)
31243 case PROCESSOR_CORE2:
31244 arg_str = "core2";
31245 priority = P_PROC_SSSE3;
31246 break;
31247 case PROCESSOR_NEHALEM:
31248 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31250 arg_str = "westmere";
31251 priority = P_AES;
31253 else
31255 /* We translate "arch=corei7" and "arch=nehalem" to
31256 "corei7" so that it will be mapped to M_INTEL_COREI7
31257 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31258 arg_str = "corei7";
31259 priority = P_PROC_SSE4_2;
31261 break;
31262 case PROCESSOR_SANDYBRIDGE:
31263 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31264 arg_str = "ivybridge";
31265 else
31266 arg_str = "sandybridge";
31267 priority = P_PROC_AVX;
31268 break;
31269 case PROCESSOR_HASWELL:
31270 case PROCESSOR_SKYLAKE_AVX512:
31271 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31272 arg_str = "cannonlake";
31273 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31274 arg_str = "skylake-avx512";
31275 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31276 arg_str = "skylake";
31277 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31278 arg_str = "broadwell";
31279 else
31280 arg_str = "haswell";
31281 priority = P_PROC_AVX2;
31282 break;
31283 case PROCESSOR_BONNELL:
31284 arg_str = "bonnell";
31285 priority = P_PROC_SSSE3;
31286 break;
31287 case PROCESSOR_KNL:
31288 arg_str = "knl";
31289 priority = P_PROC_AVX512F;
31290 break;
31291 case PROCESSOR_KNM:
31292 arg_str = "knm";
31293 priority = P_PROC_AVX512F;
31294 break;
31295 case PROCESSOR_SILVERMONT:
31296 arg_str = "silvermont";
31297 priority = P_PROC_SSE4_2;
31298 break;
31299 case PROCESSOR_AMDFAM10:
31300 arg_str = "amdfam10h";
31301 priority = P_PROC_SSE4_A;
31302 break;
31303 case PROCESSOR_BTVER1:
31304 arg_str = "btver1";
31305 priority = P_PROC_SSE4_A;
31306 break;
31307 case PROCESSOR_BTVER2:
31308 arg_str = "btver2";
31309 priority = P_PROC_BMI;
31310 break;
31311 case PROCESSOR_BDVER1:
31312 arg_str = "bdver1";
31313 priority = P_PROC_XOP;
31314 break;
31315 case PROCESSOR_BDVER2:
31316 arg_str = "bdver2";
31317 priority = P_PROC_FMA;
31318 break;
31319 case PROCESSOR_BDVER3:
31320 arg_str = "bdver3";
31321 priority = P_PROC_FMA;
31322 break;
31323 case PROCESSOR_BDVER4:
31324 arg_str = "bdver4";
31325 priority = P_PROC_AVX2;
31326 break;
31327 case PROCESSOR_ZNVER1:
31328 arg_str = "znver1";
31329 priority = P_PROC_AVX2;
31330 break;
31334 cl_target_option_restore (&global_options, &cur_target);
31336 if (predicate_list && arg_str == NULL)
31338 error_at (DECL_SOURCE_LOCATION (decl),
31339 "No dispatcher found for the versioning attributes");
31340 return 0;
31343 if (predicate_list)
31345 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31346 /* For a C string literal the length includes the trailing NULL. */
31347 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31348 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31349 predicate_chain);
31353 /* Process feature name. */
31354 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31355 strcpy (tok_str, attrs_str);
31356 token = strtok (tok_str, ",");
31357 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31359 while (token != NULL)
31361 /* Do not process "arch=" */
31362 if (strncmp (token, "arch=", 5) == 0)
31364 token = strtok (NULL, ",");
31365 continue;
31367 for (i = 0; i < NUM_FEATURES; ++i)
31369 if (strcmp (token, feature_list[i].name) == 0)
31371 if (predicate_list)
31373 predicate_arg = build_string_literal (
31374 strlen (feature_list[i].name) + 1,
31375 feature_list[i].name);
31376 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31377 predicate_chain);
31379 /* Find the maximum priority feature. */
31380 if (feature_list[i].priority > priority)
31381 priority = feature_list[i].priority;
31383 break;
31386 if (predicate_list && i == NUM_FEATURES)
31388 error_at (DECL_SOURCE_LOCATION (decl),
31389 "No dispatcher found for %s", token);
31390 return 0;
31392 token = strtok (NULL, ",");
31394 free (tok_str);
31396 if (predicate_list && predicate_chain == NULL_TREE)
31398 error_at (DECL_SOURCE_LOCATION (decl),
31399 "No dispatcher found for the versioning attributes : %s",
31400 attrs_str);
31401 return 0;
31403 else if (predicate_list)
31405 predicate_chain = nreverse (predicate_chain);
31406 *predicate_list = predicate_chain;
31409 return priority;
31412 /* This compares the priority of target features in function DECL1
31413 and DECL2. It returns positive value if DECL1 is higher priority,
31414 negative value if DECL2 is higher priority and 0 if they are the
31415 same. */
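/* For example (illustrative declarations only), for DECL1 declared with
   target ("avx2") and DECL2 declared with target ("sse4.2") this returns a
   positive value, because P_AVX2 follows P_SSE4_2 in the feature_priority
   enum used by get_builtin_code_for_version.  */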
31417 static int
31418 ix86_compare_version_priority (tree decl1, tree decl2)
31420 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31421 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31423 return (int)priority1 - (int)priority2;
31426 /* V1 and V2 point to function versions with different priorities
31427 based on the target ISA. This function compares their priorities. */
31429 static int
31430 feature_compare (const void *v1, const void *v2)
31432 typedef struct _function_version_info
31434 tree version_decl;
31435 tree predicate_chain;
31436 unsigned int dispatch_priority;
31437 } function_version_info;
31439 const function_version_info c1 = *(const function_version_info *)v1;
31440 const function_version_info c2 = *(const function_version_info *)v2;
31441 return (c2.dispatch_priority - c1.dispatch_priority);
31444 /* This function generates the dispatch function for
31445 multi-versioned functions. DISPATCH_DECL is the function which will
31446 contain the dispatch logic. FNDECLS holds the function choices for
31447 dispatch, passed as a vector. EMPTY_BB is the basic block pointer
31448 in DISPATCH_DECL in which the dispatch code is generated. */
31450 static int
31451 dispatch_function_versions (tree dispatch_decl,
31452 void *fndecls_p,
31453 basic_block *empty_bb)
31455 tree default_decl;
31456 gimple *ifunc_cpu_init_stmt;
31457 gimple_seq gseq;
31458 int ix;
31459 tree ele;
31460 vec<tree> *fndecls;
31461 unsigned int num_versions = 0;
31462 unsigned int actual_versions = 0;
31463 unsigned int i;
31465 struct _function_version_info
31467 tree version_decl;
31468 tree predicate_chain;
31469 unsigned int dispatch_priority;
31470 }*function_version_info;
31472 gcc_assert (dispatch_decl != NULL
31473 && fndecls_p != NULL
31474 && empty_bb != NULL);
31476 /* fndecls_p is actually a vector.  */
31477 fndecls = static_cast<vec<tree> *> (fndecls_p);
31479 /* At least one more version other than the default. */
31480 num_versions = fndecls->length ();
31481 gcc_assert (num_versions >= 2);
31483 function_version_info = (struct _function_version_info *)
31484 XNEWVEC (struct _function_version_info, (num_versions - 1));
31486 /* The first version in the vector is the default decl. */
31487 default_decl = (*fndecls)[0];
31489 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31491 gseq = bb_seq (*empty_bb);
31492 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31493 constructors, so explicitly call __builtin_cpu_init here. */
31494 ifunc_cpu_init_stmt = gimple_build_call_vec (
31495 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31496 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31497 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31498 set_bb_seq (*empty_bb, gseq);
31500 pop_cfun ();
31503 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31505 tree version_decl = ele;
31506 tree predicate_chain = NULL_TREE;
31507 unsigned int priority;
31508 /* Get attribute string, parse it and find the right predicate decl.
31509 The predicate function could be a lengthy combination of many
31510 features, like arch-type and various isa-variants. */
31511 priority = get_builtin_code_for_version (version_decl,
31512 &predicate_chain);
31514 if (predicate_chain == NULL_TREE)
31515 continue;
31517 function_version_info [actual_versions].version_decl = version_decl;
31518 function_version_info [actual_versions].predicate_chain
31519 = predicate_chain;
31520 function_version_info [actual_versions].dispatch_priority = priority;
31521 actual_versions++;
31524 /* Sort the versions according to descending order of dispatch priority. The
31525 priority is based on the ISA. This is not a perfect solution. There
31526 could still be ambiguity. If more than one function version is suitable
31527 to execute, which one should be dispatched? In future, allow the user
31528 to specify a dispatch priority next to the version. */
31529 qsort (function_version_info, actual_versions,
31530 sizeof (struct _function_version_info), feature_compare);
31532 for (i = 0; i < actual_versions; ++i)
31533 *empty_bb = add_condition_to_bb (dispatch_decl,
31534 function_version_info[i].version_decl,
31535 function_version_info[i].predicate_chain,
31536 *empty_bb);
31538 /* Dispatch the default version at the end.  */
31539 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31540 NULL, *empty_bb);
31542 free (function_version_info);
31543 return 0;
31546 /* This function changes the assembler name for functions that are
31547 versions. If DECL is a function version and has a "target"
31548 attribute, it appends the attribute string to its assembler name. */
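/* For example (illustrative names only), a version of foo declared with
   __attribute__ ((target ("avx2"))) gets the assembler name "foo.avx2",
   while the version declared with target ("default") keeps its original
   assembler name.  */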
31550 static tree
31551 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31553 tree version_attr;
31554 const char *orig_name, *version_string;
31555 char *attr_str, *assembler_name;
31557 if (DECL_DECLARED_INLINE_P (decl)
31558 && lookup_attribute ("gnu_inline",
31559 DECL_ATTRIBUTES (decl)))
31560 error_at (DECL_SOURCE_LOCATION (decl),
31561 "Function versions cannot be marked as gnu_inline,"
31562 " bodies have to be generated");
31564 if (DECL_VIRTUAL_P (decl)
31565 || DECL_VINDEX (decl))
31566 sorry ("Virtual function multiversioning not supported");
31568 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31570 /* target attribute string cannot be NULL. */
31571 gcc_assert (version_attr != NULL_TREE);
31573 orig_name = IDENTIFIER_POINTER (id);
31574 version_string
31575 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31577 if (strcmp (version_string, "default") == 0)
31578 return id;
31580 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31581 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31583 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31585 /* Allow assembler name to be modified if already set. */
31586 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31587 SET_DECL_RTL (decl, NULL);
31589 tree ret = get_identifier (assembler_name);
31590 XDELETEVEC (attr_str);
31591 XDELETEVEC (assembler_name);
31592 return ret;
31596 static tree
31597 ix86_mangle_decl_assembler_name (tree decl, tree id)
31599 /* For function version, add the target suffix to the assembler name. */
31600 if (TREE_CODE (decl) == FUNCTION_DECL
31601 && DECL_FUNCTION_VERSIONED (decl))
31602 id = ix86_mangle_function_version_assembler_name (decl, id);
31603 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31604 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31605 #endif
31607 return id;
31610 /* Make a dispatcher declaration for the multi-versioned function DECL.
31611 Calls to DECL function will be replaced with calls to the dispatcher
31612 by the front-end. Returns the decl of the dispatcher function. */
31614 static tree
31615 ix86_get_function_versions_dispatcher (void *decl)
31617 tree fn = (tree) decl;
31618 struct cgraph_node *node = NULL;
31619 struct cgraph_node *default_node = NULL;
31620 struct cgraph_function_version_info *node_v = NULL;
31621 struct cgraph_function_version_info *first_v = NULL;
31623 tree dispatch_decl = NULL;
31625 struct cgraph_function_version_info *default_version_info = NULL;
31627 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31629 node = cgraph_node::get (fn);
31630 gcc_assert (node != NULL);
31632 node_v = node->function_version ();
31633 gcc_assert (node_v != NULL);
31635 if (node_v->dispatcher_resolver != NULL)
31636 return node_v->dispatcher_resolver;
31638 /* Find the default version and make it the first node. */
31639 first_v = node_v;
31640 /* Go to the beginning of the chain. */
31641 while (first_v->prev != NULL)
31642 first_v = first_v->prev;
31643 default_version_info = first_v;
31644 while (default_version_info != NULL)
31646 if (is_function_default_version
31647 (default_version_info->this_node->decl))
31648 break;
31649 default_version_info = default_version_info->next;
31652 /* If there is no default node, just return NULL. */
31653 if (default_version_info == NULL)
31654 return NULL;
31656 /* Make default info the first node. */
31657 if (first_v != default_version_info)
31659 default_version_info->prev->next = default_version_info->next;
31660 if (default_version_info->next)
31661 default_version_info->next->prev = default_version_info->prev;
31662 first_v->prev = default_version_info;
31663 default_version_info->next = first_v;
31664 default_version_info->prev = NULL;
31667 default_node = default_version_info->this_node;
31669 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31670 if (targetm.has_ifunc_p ())
31672 struct cgraph_function_version_info *it_v = NULL;
31673 struct cgraph_node *dispatcher_node = NULL;
31674 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31676 /* Right now, the dispatching is done via ifunc. */
31677 dispatch_decl = make_dispatcher_decl (default_node->decl);
31679 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31680 gcc_assert (dispatcher_node != NULL);
31681 dispatcher_node->dispatcher_function = 1;
31682 dispatcher_version_info
31683 = dispatcher_node->insert_new_function_version ();
31684 dispatcher_version_info->next = default_version_info;
31685 dispatcher_node->definition = 1;
31687 /* Set the dispatcher for all the versions. */
31688 it_v = default_version_info;
31689 while (it_v != NULL)
31691 it_v->dispatcher_resolver = dispatch_decl;
31692 it_v = it_v->next;
31695 else
31696 #endif
31698 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31699 "multiversioning needs ifunc which is not supported "
31700 "on this target");
31703 return dispatch_decl;
31706 /* Make the resolver function decl to dispatch the versions of
31707 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
31708 ifunc alias that will point to the created resolver. Create an
31709 empty basic block in the resolver and store the pointer in
31710 EMPTY_BB. Return the decl of the resolver function. */
31712 static tree
31713 make_resolver_func (const tree default_decl,
31714 const tree ifunc_alias_decl,
31715 basic_block *empty_bb)
31717 char *resolver_name;
31718 tree decl, type, decl_name, t;
31720 /* IFUNCs have to be globally visible. So, if the default_decl is
31721 not, then the name of the IFUNC should be made unique. */
31722 if (TREE_PUBLIC (default_decl) == 0)
31724 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31725 symtab->change_decl_assembler_name (ifunc_alias_decl,
31726 get_identifier (ifunc_name));
31727 XDELETEVEC (ifunc_name);
31730 resolver_name = make_unique_name (default_decl, "resolver", false);
31732 /* The resolver function should return a (void *). */
31733 type = build_function_type_list (ptr_type_node, NULL_TREE);
31735 decl = build_fn_decl (resolver_name, type);
31736 decl_name = get_identifier (resolver_name);
31737 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31739 DECL_NAME (decl) = decl_name;
31740 TREE_USED (decl) = 1;
31741 DECL_ARTIFICIAL (decl) = 1;
31742 DECL_IGNORED_P (decl) = 1;
31743 TREE_PUBLIC (decl) = 0;
31744 DECL_UNINLINABLE (decl) = 1;
31746 /* Resolver is not external, body is generated. */
31747 DECL_EXTERNAL (decl) = 0;
31748 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31750 DECL_CONTEXT (decl) = NULL_TREE;
31751 DECL_INITIAL (decl) = make_node (BLOCK);
31752 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31754 if (DECL_COMDAT_GROUP (default_decl)
31755 || TREE_PUBLIC (default_decl))
31757 /* In this case, each translation unit with a call to this
31758 versioned function will put out a resolver. Ensure it
31759 is comdat to keep just one copy. */
31760 DECL_COMDAT (decl) = 1;
31761 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31763 /* Build result decl and add to function_decl. */
31764 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31765 DECL_ARTIFICIAL (t) = 1;
31766 DECL_IGNORED_P (t) = 1;
31767 DECL_RESULT (decl) = t;
31769 gimplify_function_tree (decl);
31770 push_cfun (DECL_STRUCT_FUNCTION (decl));
31771 *empty_bb = init_lowered_empty_function (decl, false,
31772 profile_count::uninitialized ());
31774 cgraph_node::add_new_function (decl, true);
31775 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31777 pop_cfun ();
31779 gcc_assert (ifunc_alias_decl != NULL);
31780 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31781 DECL_ATTRIBUTES (ifunc_alias_decl)
31782 = make_attribute ("ifunc", resolver_name,
31783 DECL_ATTRIBUTES (ifunc_alias_decl));
31785 /* Create the alias for dispatch to resolver here. */
31786 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31787 XDELETEVEC (resolver_name);
31788 return decl;
31791 /* Generate the dispatching code body to dispatch multi-versioned function
31792 DECL. The target hook is called to process the "target" attributes and
31793 provide the code to dispatch the right function at run-time. NODE points
31794 to the dispatcher decl whose body will be created. */
31796 static tree
31797 ix86_generate_version_dispatcher_body (void *node_p)
31799 tree resolver_decl;
31800 basic_block empty_bb;
31801 tree default_ver_decl;
31802 struct cgraph_node *versn;
31803 struct cgraph_node *node;
31805 struct cgraph_function_version_info *node_version_info = NULL;
31806 struct cgraph_function_version_info *versn_info = NULL;
31808 node = (cgraph_node *)node_p;
31810 node_version_info = node->function_version ();
31811 gcc_assert (node->dispatcher_function
31812 && node_version_info != NULL);
31814 if (node_version_info->dispatcher_resolver)
31815 return node_version_info->dispatcher_resolver;
31817 /* The first version in the chain corresponds to the default version. */
31818 default_ver_decl = node_version_info->next->this_node->decl;
31820 /* node is going to be an alias, so remove the finalized bit. */
31821 node->definition = false;
31823 resolver_decl = make_resolver_func (default_ver_decl,
31824 node->decl, &empty_bb);
31826 node_version_info->dispatcher_resolver = resolver_decl;
31828 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31830 auto_vec<tree, 2> fn_ver_vec;
31832 for (versn_info = node_version_info->next; versn_info;
31833 versn_info = versn_info->next)
31835 versn = versn_info->this_node;
31836 /* Check for virtual functions here again, as by this time it should
31837 have been determined if this function needs a vtable index or
31838 not. This happens for methods in derived classes that override
31839 virtual methods in base classes but are not explicitly marked as
31840 virtual. */
31841 if (DECL_VINDEX (versn->decl))
31842 sorry ("Virtual function multiversioning not supported");
31844 fn_ver_vec.safe_push (versn->decl);
31847 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31848 cgraph_edge::rebuild_edges ();
31849 pop_cfun ();
31850 return resolver_decl;
31852 /* This builds the processor_model struct type defined in
31853 libgcc/config/i386/cpuinfo.c */
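/* The layout built below corresponds to (a sketch in C; the authoritative
   definition lives in libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
   */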
31855 static tree
31856 build_processor_model_struct (void)
31858 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31859 "__cpu_features"};
31860 tree field = NULL_TREE, field_chain = NULL_TREE;
31861 int i;
31862 tree type = make_node (RECORD_TYPE);
31864 /* The first 3 fields are unsigned int. */
31865 for (i = 0; i < 3; ++i)
31867 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31868 get_identifier (field_name[i]), unsigned_type_node);
31869 if (field_chain != NULL_TREE)
31870 DECL_CHAIN (field) = field_chain;
31871 field_chain = field;
31874 /* The last field is an array of unsigned integers of size one. */
31875 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31876 get_identifier (field_name[3]),
31877 build_array_type (unsigned_type_node,
31878 build_index_type (size_one_node)));
31879 if (field_chain != NULL_TREE)
31880 DECL_CHAIN (field) = field_chain;
31881 field_chain = field;
31883 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31884 return type;
31887 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31889 static tree
31890 make_var_decl (tree type, const char *name)
31892 tree new_decl;
31894 new_decl = build_decl (UNKNOWN_LOCATION,
31895 VAR_DECL,
31896 get_identifier(name),
31897 type);
31899 DECL_EXTERNAL (new_decl) = 1;
31900 TREE_STATIC (new_decl) = 1;
31901 TREE_PUBLIC (new_decl) = 1;
31902 DECL_INITIAL (new_decl) = 0;
31903 DECL_ARTIFICIAL (new_decl) = 0;
31904 DECL_PRESERVE_P (new_decl) = 1;
31906 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31907 assemble_variable (new_decl, 0, 0, 0);
31909 return new_decl;
31912 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31913 into an integer defined in libgcc/config/i386/cpuinfo.c */
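/* For illustration (a sketch of the folded form, using the names defined
   below), a call such as

     __builtin_cpu_is ("amd")

   folds to a comparison against the matching field of __cpu_model, roughly
   (__cpu_model.__cpu_vendor == M_AMD), while

     __builtin_cpu_supports ("avx2")

   folds to a mask test, roughly
   (__cpu_model.__cpu_features[0] & (1 << F_AVX2)).  */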
31915 static tree
31916 fold_builtin_cpu (tree fndecl, tree *args)
31918 unsigned int i;
31919 enum ix86_builtins fn_code = (enum ix86_builtins)
31920 DECL_FUNCTION_CODE (fndecl);
31921 tree param_string_cst = NULL;
31923 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31924 enum processor_features
31926 F_CMOV = 0,
31927 F_MMX,
31928 F_POPCNT,
31929 F_SSE,
31930 F_SSE2,
31931 F_SSE3,
31932 F_SSSE3,
31933 F_SSE4_1,
31934 F_SSE4_2,
31935 F_AVX,
31936 F_AVX2,
31937 F_SSE4_A,
31938 F_FMA4,
31939 F_XOP,
31940 F_FMA,
31941 F_AVX512F,
31942 F_BMI,
31943 F_BMI2,
31944 F_AES,
31945 F_PCLMUL,
31946 F_AVX512VL,
31947 F_AVX512BW,
31948 F_AVX512DQ,
31949 F_AVX512CD,
31950 F_AVX512ER,
31951 F_AVX512PF,
31952 F_AVX512VBMI,
31953 F_AVX512IFMA,
31954 F_AVX5124VNNIW,
31955 F_AVX5124FMAPS,
31956 F_AVX512VPOPCNTDQ,
31957 F_MAX
31960 /* These are the values for vendor types and cpu types and subtypes
31961 in cpuinfo.c.  CPU types and subtypes must have the corresponding
31962 start value subtracted before being compared.  */
31963 enum processor_model
31965 M_INTEL = 1,
31966 M_AMD,
31967 M_CPU_TYPE_START,
31968 M_INTEL_BONNELL,
31969 M_INTEL_CORE2,
31970 M_INTEL_COREI7,
31971 M_AMDFAM10H,
31972 M_AMDFAM15H,
31973 M_INTEL_SILVERMONT,
31974 M_INTEL_KNL,
31975 M_AMD_BTVER1,
31976 M_AMD_BTVER2,
31977 M_AMDFAM17H,
31978 M_INTEL_KNM,
31979 M_CPU_SUBTYPE_START,
31980 M_INTEL_COREI7_NEHALEM,
31981 M_INTEL_COREI7_WESTMERE,
31982 M_INTEL_COREI7_SANDYBRIDGE,
31983 M_AMDFAM10H_BARCELONA,
31984 M_AMDFAM10H_SHANGHAI,
31985 M_AMDFAM10H_ISTANBUL,
31986 M_AMDFAM15H_BDVER1,
31987 M_AMDFAM15H_BDVER2,
31988 M_AMDFAM15H_BDVER3,
31989 M_AMDFAM15H_BDVER4,
31990 M_AMDFAM17H_ZNVER1,
31991 M_INTEL_COREI7_IVYBRIDGE,
31992 M_INTEL_COREI7_HASWELL,
31993 M_INTEL_COREI7_BROADWELL,
31994 M_INTEL_COREI7_SKYLAKE,
31995 M_INTEL_COREI7_SKYLAKE_AVX512,
31996 M_INTEL_COREI7_CANNONLAKE
31999 static struct _arch_names_table
32001 const char *const name;
32002 const enum processor_model model;
32004 const arch_names_table[] =
32006 {"amd", M_AMD},
32007 {"intel", M_INTEL},
32008 {"atom", M_INTEL_BONNELL},
32009 {"slm", M_INTEL_SILVERMONT},
32010 {"core2", M_INTEL_CORE2},
32011 {"corei7", M_INTEL_COREI7},
32012 {"nehalem", M_INTEL_COREI7_NEHALEM},
32013 {"westmere", M_INTEL_COREI7_WESTMERE},
32014 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32015 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32016 {"haswell", M_INTEL_COREI7_HASWELL},
32017 {"broadwell", M_INTEL_COREI7_BROADWELL},
32018 {"skylake", M_INTEL_COREI7_SKYLAKE},
32019 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32020 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32021 {"bonnell", M_INTEL_BONNELL},
32022 {"silvermont", M_INTEL_SILVERMONT},
32023 {"knl", M_INTEL_KNL},
32024 {"knm", M_INTEL_KNM},
32025 {"amdfam10h", M_AMDFAM10H},
32026 {"barcelona", M_AMDFAM10H_BARCELONA},
32027 {"shanghai", M_AMDFAM10H_SHANGHAI},
32028 {"istanbul", M_AMDFAM10H_ISTANBUL},
32029 {"btver1", M_AMD_BTVER1},
32030 {"amdfam15h", M_AMDFAM15H},
32031 {"bdver1", M_AMDFAM15H_BDVER1},
32032 {"bdver2", M_AMDFAM15H_BDVER2},
32033 {"bdver3", M_AMDFAM15H_BDVER3},
32034 {"bdver4", M_AMDFAM15H_BDVER4},
32035 {"btver2", M_AMD_BTVER2},
32036 {"amdfam17h", M_AMDFAM17H},
32037 {"znver1", M_AMDFAM17H_ZNVER1},
32040 static struct _isa_names_table
32042 const char *const name;
32043 const enum processor_features feature;
32045 const isa_names_table[] =
32047 {"cmov", F_CMOV},
32048 {"mmx", F_MMX},
32049 {"popcnt", F_POPCNT},
32050 {"sse", F_SSE},
32051 {"sse2", F_SSE2},
32052 {"sse3", F_SSE3},
32053 {"ssse3", F_SSSE3},
32054 {"sse4a", F_SSE4_A},
32055 {"sse4.1", F_SSE4_1},
32056 {"sse4.2", F_SSE4_2},
32057 {"avx", F_AVX},
32058 {"fma4", F_FMA4},
32059 {"xop", F_XOP},
32060 {"fma", F_FMA},
32061 {"avx2", F_AVX2},
32062 {"avx512f", F_AVX512F},
32063 {"bmi", F_BMI},
32064 {"bmi2", F_BMI2},
32065 {"aes", F_AES},
32066 {"pclmul", F_PCLMUL},
32067 {"avx512vl",F_AVX512VL},
32068 {"avx512bw",F_AVX512BW},
32069 {"avx512dq",F_AVX512DQ},
32070 {"avx512cd",F_AVX512CD},
32071 {"avx512er",F_AVX512ER},
32072 {"avx512pf",F_AVX512PF},
32073 {"avx512vbmi",F_AVX512VBMI},
32074 {"avx512ifma",F_AVX512IFMA},
32075 {"avx5124vnniw",F_AVX5124VNNIW},
32076 {"avx5124fmaps",F_AVX5124FMAPS},
32077 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32080 tree __processor_model_type = build_processor_model_struct ();
32081 tree __cpu_model_var = make_var_decl (__processor_model_type,
32082 "__cpu_model");
32085 varpool_node::add (__cpu_model_var);
32087 gcc_assert ((args != NULL) && (*args != NULL));
32089 param_string_cst = *args;
32090 while (param_string_cst
32091 && TREE_CODE (param_string_cst) != STRING_CST)
32093 /* *args must be an expr that can contain other EXPRs leading to a
32094 STRING_CST. */
32095 if (!EXPR_P (param_string_cst))
32097 error ("Parameter to builtin must be a string constant or literal");
32098 return integer_zero_node;
32100 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32103 gcc_assert (param_string_cst);
32105 if (fn_code == IX86_BUILTIN_CPU_IS)
32107 tree ref;
32108 tree field;
32109 tree final;
32111 unsigned int field_val = 0;
32112 unsigned int NUM_ARCH_NAMES
32113 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32115 for (i = 0; i < NUM_ARCH_NAMES; i++)
32116 if (strcmp (arch_names_table[i].name,
32117 TREE_STRING_POINTER (param_string_cst)) == 0)
32118 break;
32120 if (i == NUM_ARCH_NAMES)
32122 error ("Parameter to builtin not valid: %s",
32123 TREE_STRING_POINTER (param_string_cst));
32124 return integer_zero_node;
32127 field = TYPE_FIELDS (__processor_model_type);
32128 field_val = arch_names_table[i].model;
32130 /* CPU types are stored in the next field. */
32131 if (field_val > M_CPU_TYPE_START
32132 && field_val < M_CPU_SUBTYPE_START)
32134 field = DECL_CHAIN (field);
32135 field_val -= M_CPU_TYPE_START;
32138 /* CPU subtypes are stored in the next field. */
32139 if (field_val > M_CPU_SUBTYPE_START)
32141 field = DECL_CHAIN ( DECL_CHAIN (field));
32142 field_val -= M_CPU_SUBTYPE_START;
32145 /* Get the appropriate field in __cpu_model. */
32146 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32147 field, NULL_TREE);
32149 /* Check the value. */
32150 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32151 build_int_cstu (unsigned_type_node, field_val));
32152 return build1 (CONVERT_EXPR, integer_type_node, final);
32154 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32156 tree ref;
32157 tree array_elt;
32158 tree field;
32159 tree final;
32161 unsigned int field_val = 0;
32162 unsigned int NUM_ISA_NAMES
32163 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32165 for (i = 0; i < NUM_ISA_NAMES; i++)
32166 if (strcmp (isa_names_table[i].name,
32167 TREE_STRING_POINTER (param_string_cst)) == 0)
32168 break;
32170 if (i == NUM_ISA_NAMES)
32172 error ("Parameter to builtin not valid: %s",
32173 TREE_STRING_POINTER (param_string_cst));
32174 return integer_zero_node;
32177 field = TYPE_FIELDS (__processor_model_type);
32178 /* Get the last field, which is __cpu_features. */
32179 while (DECL_CHAIN (field))
32180 field = DECL_CHAIN (field);
32182 /* Get the appropriate field: __cpu_model.__cpu_features */
32183 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32184 field, NULL_TREE);
32186 /* Access the 0th element of __cpu_features array. */
32187 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32188 integer_zero_node, NULL_TREE, NULL_TREE);
32190 field_val = (1 << isa_names_table[i].feature);
32191 /* Return __cpu_model.__cpu_features[0] & field_val */
32192 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32193 build_int_cstu (unsigned_type_node, field_val));
32194 return build1 (CONVERT_EXPR, integer_type_node, final);
32196 gcc_unreachable ();
32199 static tree
32200 ix86_fold_builtin (tree fndecl, int n_args,
32201 tree *args, bool ignore ATTRIBUTE_UNUSED)
32203 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32205 enum ix86_builtins fn_code = (enum ix86_builtins)
32206 DECL_FUNCTION_CODE (fndecl);
32207 switch (fn_code)
32209 case IX86_BUILTIN_CPU_IS:
32210 case IX86_BUILTIN_CPU_SUPPORTS:
32211 gcc_assert (n_args == 1);
32212 return fold_builtin_cpu (fndecl, args);
32214 case IX86_BUILTIN_NANQ:
32215 case IX86_BUILTIN_NANSQ:
32217 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32218 const char *str = c_getstr (*args);
32219 int quiet = fn_code == IX86_BUILTIN_NANQ;
32220 REAL_VALUE_TYPE real;
32222 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32223 return build_real (type, real);
32224 return NULL_TREE;
32227 case IX86_BUILTIN_INFQ:
32228 case IX86_BUILTIN_HUGE_VALQ:
32230 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32231 REAL_VALUE_TYPE inf;
32232 real_inf (&inf);
32233 return build_real (type, inf);
32236 case IX86_BUILTIN_TZCNT16:
32237 case IX86_BUILTIN_CTZS:
32238 case IX86_BUILTIN_TZCNT32:
32239 case IX86_BUILTIN_TZCNT64:
32240 gcc_assert (n_args == 1);
32241 if (TREE_CODE (args[0]) == INTEGER_CST)
32243 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32244 tree arg = args[0];
32245 if (fn_code == IX86_BUILTIN_TZCNT16
32246 || fn_code == IX86_BUILTIN_CTZS)
32247 arg = fold_convert (short_unsigned_type_node, arg);
32248 if (integer_zerop (arg))
32249 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32250 else
32251 return fold_const_call (CFN_CTZ, type, arg);
32253 break;
32255 case IX86_BUILTIN_LZCNT16:
32256 case IX86_BUILTIN_CLZS:
32257 case IX86_BUILTIN_LZCNT32:
32258 case IX86_BUILTIN_LZCNT64:
32259 gcc_assert (n_args == 1);
32260 if (TREE_CODE (args[0]) == INTEGER_CST)
32262 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32263 tree arg = args[0];
32264 if (fn_code == IX86_BUILTIN_LZCNT16
32265 || fn_code == IX86_BUILTIN_CLZS)
32266 arg = fold_convert (short_unsigned_type_node, arg);
32267 if (integer_zerop (arg))
32268 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32269 else
32270 return fold_const_call (CFN_CLZ, type, arg);
32272 break;
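/* BEXTR folding example (constants chosen only for illustration): the low
   byte of the second operand is the start bit and the next byte the field
   length, so __builtin_ia32_bextr_u32 (0xabcd1234, 0x0804) extracts 8 bits
   starting at bit 4, giving 0x23.  */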
32274 case IX86_BUILTIN_BEXTR32:
32275 case IX86_BUILTIN_BEXTR64:
32276 case IX86_BUILTIN_BEXTRI32:
32277 case IX86_BUILTIN_BEXTRI64:
32278 gcc_assert (n_args == 2);
32279 if (tree_fits_uhwi_p (args[1]))
32281 unsigned HOST_WIDE_INT res = 0;
32282 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32283 unsigned int start = tree_to_uhwi (args[1]);
32284 unsigned int len = (start & 0xff00) >> 8;
32285 start &= 0xff;
32286 if (start >= prec || len == 0)
32287 res = 0;
32288 else if (!tree_fits_uhwi_p (args[0]))
32289 break;
32290 else
32291 res = tree_to_uhwi (args[0]) >> start;
32292 if (len > prec)
32293 len = prec;
32294 if (len < HOST_BITS_PER_WIDE_INT)
32295 res &= (HOST_WIDE_INT_1U << len) - 1;
32296 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32298 break;
32300 case IX86_BUILTIN_BZHI32:
32301 case IX86_BUILTIN_BZHI64:
32302 gcc_assert (n_args == 2);
32303 if (tree_fits_uhwi_p (args[1]))
32305 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32306 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32307 return args[0];
32308 if (!tree_fits_uhwi_p (args[0]))
32309 break;
32310 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32311 res &= ~(HOST_WIDE_INT_M1U << idx);
32312 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32314 break;
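/* Worked example for the PDEP/PEXT folding below (values are illustrative):
   with mask 0b11010, __builtin_ia32_pdep_si (0b101, 0b11010) deposits the
   low source bits into the set positions of the mask, giving 0b10010,
   while __builtin_ia32_pext_si (0b10010, 0b11010) extracts the bits at
   those positions back into the low bits, giving 0b101.  */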
32316 case IX86_BUILTIN_PDEP32:
32317 case IX86_BUILTIN_PDEP64:
32318 gcc_assert (n_args == 2);
32319 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32321 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32322 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32323 unsigned HOST_WIDE_INT res = 0;
32324 unsigned HOST_WIDE_INT m, k = 1;
32325 for (m = 1; m; m <<= 1)
32326 if ((mask & m) != 0)
32328 if ((src & k) != 0)
32329 res |= m;
32330 k <<= 1;
32332 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32334 break;
32336 case IX86_BUILTIN_PEXT32:
32337 case IX86_BUILTIN_PEXT64:
32338 gcc_assert (n_args == 2);
32339 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32341 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32342 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32343 unsigned HOST_WIDE_INT res = 0;
32344 unsigned HOST_WIDE_INT m, k = 1;
32345 for (m = 1; m; m <<= 1)
32346 if ((mask & m) != 0)
32348 if ((src & m) != 0)
32349 res |= k;
32350 k <<= 1;
32352 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32354 break;
32356 default:
32357 break;
32361 #ifdef SUBTARGET_FOLD_BUILTIN
32362 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32363 #endif
32365 return NULL_TREE;
32368 /* Fold a MD builtin (use ix86_fold_builtin for folding into
32369 constant) in GIMPLE. */
32371 bool
32372 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32374 gimple *stmt = gsi_stmt (*gsi);
32375 tree fndecl = gimple_call_fndecl (stmt);
32376 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32377 int n_args = gimple_call_num_args (stmt);
32378 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32379 tree decl = NULL_TREE;
32380 tree arg0, arg1;
32382 switch (fn_code)
32384 case IX86_BUILTIN_TZCNT32:
32385 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32386 goto fold_tzcnt_lzcnt;
32388 case IX86_BUILTIN_TZCNT64:
32389 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32390 goto fold_tzcnt_lzcnt;
32392 case IX86_BUILTIN_LZCNT32:
32393 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32394 goto fold_tzcnt_lzcnt;
32396 case IX86_BUILTIN_LZCNT64:
32397 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32398 goto fold_tzcnt_lzcnt;
32400 fold_tzcnt_lzcnt:
32401 gcc_assert (n_args == 1);
32402 arg0 = gimple_call_arg (stmt, 0);
32403 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32405 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32406 /* If arg0 is provably non-zero, optimize into generic
32407 __builtin_c[tl]z{,ll} function the middle-end handles
32408 better. */
32409 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32410 return false;
32412 location_t loc = gimple_location (stmt);
32413 gimple *g = gimple_build_call (decl, 1, arg0);
32414 gimple_set_location (g, loc);
32415 tree lhs = make_ssa_name (integer_type_node);
32416 gimple_call_set_lhs (g, lhs);
32417 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32418 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32419 gimple_set_location (g, loc);
32420 gsi_replace (gsi, g, false);
32421 return true;
32423 break;
32425 case IX86_BUILTIN_BZHI32:
32426 case IX86_BUILTIN_BZHI64:
32427 gcc_assert (n_args == 2);
32428 arg1 = gimple_call_arg (stmt, 1);
32429 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32431 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32432 arg0 = gimple_call_arg (stmt, 0);
32433 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32434 break;
32435 location_t loc = gimple_location (stmt);
32436 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32437 gimple_set_location (g, loc);
32438 gsi_replace (gsi, g, false);
32439 return true;
32441 break;
32443 case IX86_BUILTIN_PDEP32:
32444 case IX86_BUILTIN_PDEP64:
32445 case IX86_BUILTIN_PEXT32:
32446 case IX86_BUILTIN_PEXT64:
32447 gcc_assert (n_args == 2);
32448 arg1 = gimple_call_arg (stmt, 1);
32449 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32451 location_t loc = gimple_location (stmt);
32452 arg0 = gimple_call_arg (stmt, 0);
32453 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32454 gimple_set_location (g, loc);
32455 gsi_replace (gsi, g, false);
32456 return true;
32458 break;
32460 default:
32461 break;
32464 return false;
32467 /* Make builtins to detect cpu type and features supported. NAME is
32468 the builtin name, CODE is the builtin code, and FTYPE is the function
32469 type of the builtin. */
32471 static void
32472 make_cpu_type_builtin (const char* name, int code,
32473 enum ix86_builtin_func_type ftype, bool is_const)
32475 tree decl;
32476 tree type;
32478 type = ix86_get_builtin_func_type (ftype);
32479 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32480 NULL, NULL_TREE);
32481 gcc_assert (decl != NULL_TREE);
32482 ix86_builtins[(int) code] = decl;
32483 TREE_READONLY (decl) = is_const;
32486 /* Make builtins to get CPU type and features supported. The created
32487 builtins are:
32489 __builtin_cpu_init (), to detect cpu type and features,
32490 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32491 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>. */
32494 static void
32495 ix86_init_platform_type_builtins (void)
32497 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32498 INT_FTYPE_VOID, false);
32499 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32500 INT_FTYPE_PCCHAR, true);
32501 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32502 INT_FTYPE_PCCHAR, true);
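/* Usage sketch for the builtins created above (these are the documented
   __builtin_cpu_* builtins, so no target headers are needed):

     void (*impl) (void);

     void
     pick_implementation (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         impl = impl_avx2;
       else if (__builtin_cpu_is ("intel"))
         impl = impl_intel;
       else
         impl = impl_generic;
     }

   impl_avx2, impl_intel and impl_generic are placeholders for the
   caller's own functions.  */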
32505 /* Internal method for ix86_init_builtins. */
32507 static void
32508 ix86_init_builtins_va_builtins_abi (void)
32510 tree ms_va_ref, sysv_va_ref;
32511 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32512 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32513 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32514 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32516 if (!TARGET_64BIT)
32517 return;
32518 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32519 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32520 ms_va_ref = build_reference_type (ms_va_list_type_node);
32521 sysv_va_ref =
32522 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32524 fnvoid_va_end_ms =
32525 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32526 fnvoid_va_start_ms =
32527 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32528 fnvoid_va_end_sysv =
32529 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32530 fnvoid_va_start_sysv =
32531 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32532 NULL_TREE);
32533 fnvoid_va_copy_ms =
32534 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32535 NULL_TREE);
32536 fnvoid_va_copy_sysv =
32537 build_function_type_list (void_type_node, sysv_va_ref,
32538 sysv_va_ref, NULL_TREE);
32540 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32541 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32542 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32543 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32544 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32545 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32546 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32547 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32548 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32549 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32550 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32551 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
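/* Usage sketch for the ms_abi va builtins registered above (assuming an
   x86-64 target; the argument fetch itself goes through the normal
   va_arg machinery from <stdarg.h>):

     #include <stdarg.h>

     int __attribute__ ((ms_abi))
     sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;

       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }  */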
32554 static void
32555 ix86_init_builtin_types (void)
32557 tree float80_type_node, const_string_type_node;
32559 /* The __float80 type. */
32560 float80_type_node = long_double_type_node;
32561 if (TYPE_MODE (float80_type_node) != XFmode)
32563 if (float64x_type_node != NULL_TREE
32564 && TYPE_MODE (float64x_type_node) == XFmode)
32565 float80_type_node = float64x_type_node;
32566 else
32568 /* The __float80 type. */
32569 float80_type_node = make_node (REAL_TYPE);
32571 TYPE_PRECISION (float80_type_node) = 80;
32572 layout_type (float80_type_node);
32575 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32577 /* The __float128 type. The node has already been created as
32578 _Float128, so we only need to register the __float128 name for
32579 it. */
32580 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32582 const_string_type_node
32583 = build_pointer_type (build_qualified_type
32584 (char_type_node, TYPE_QUAL_CONST));
32586 /* This macro is built by i386-builtin-types.awk. */
32587 DEFINE_BUILTIN_PRIMITIVE_TYPES;
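/* With the type registrations above in place, user code can declare
   objects of the 80-bit extended type (XFmode) and the 128-bit quad
   type (TFmode) directly; the w and q literal suffixes are the
   documented GCC extensions for these types:

     __float80  e = 1.0w;
     __float128 q = 1.0q;  */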
32590 static void
32591 ix86_init_builtins (void)
32593 tree ftype, decl;
32595 ix86_init_builtin_types ();
32597 /* Builtins to get CPU type and features. */
32598 ix86_init_platform_type_builtins ();
32600 /* TFmode support builtins. */
32601 def_builtin_const (0, "__builtin_infq",
32602 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32603 def_builtin_const (0, "__builtin_huge_valq",
32604 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32606 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32607 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32608 BUILT_IN_MD, "nanq", NULL_TREE);
32609 TREE_READONLY (decl) = 1;
32610 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32612 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32613 BUILT_IN_MD, "nansq", NULL_TREE);
32614 TREE_READONLY (decl) = 1;
32615 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32617 /* We will expand them to a normal call if SSE isn't available, since
32618 they are used by libgcc. */
32619 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32620 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32621 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32622 TREE_READONLY (decl) = 1;
32623 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32625 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32626 decl = add_builtin_function ("__builtin_copysignq", ftype,
32627 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32628 "__copysigntf3", NULL_TREE);
32629 TREE_READONLY (decl) = 1;
32630 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32632 ix86_init_tm_builtins ();
32633 ix86_init_mmx_sse_builtins ();
32634 ix86_init_mpx_builtins ();
32636 if (TARGET_LP64)
32637 ix86_init_builtins_va_builtins_abi ();
32639 #ifdef SUBTARGET_INIT_BUILTINS
32640 SUBTARGET_INIT_BUILTINS;
32641 #endif
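/* Usage sketch for the TFmode builtins registered above:

     __float128
     flip_sign_or_nan (__float128 x)
     {
       if (__builtin_fabsq (x) == __builtin_infq ())
         return __builtin_nanq ("");
       return __builtin_copysignq (x, -1.0q);
     }

   The builtin names are exactly the ones registered by
   ix86_init_builtins; only the surrounding function is made up for the
   example.  */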
32644 /* Return the ix86 builtin for CODE. */
32646 static tree
32647 ix86_builtin_decl (unsigned code, bool)
32649 if (code >= IX86_BUILTIN_MAX)
32650 return error_mark_node;
32652 return ix86_builtins[code];
32655 /* Errors in the source file can cause expand_expr to return const0_rtx
32656 where we expect a vector. To avoid crashing, use one of the vector
32657 clear instructions. */
32658 static rtx
32659 safe_vector_operand (rtx x, machine_mode mode)
32661 if (x == const0_rtx)
32662 x = CONST0_RTX (mode);
32663 return x;
32666 /* Fixup modeless constants to fit required mode. */
32667 static rtx
32668 fixup_modeless_constant (rtx x, machine_mode mode)
32670 if (GET_MODE (x) == VOIDmode)
32671 x = convert_to_mode (mode, x, 1);
32672 return x;
32675 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32677 static rtx
32678 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32680 rtx pat;
32681 tree arg0 = CALL_EXPR_ARG (exp, 0);
32682 tree arg1 = CALL_EXPR_ARG (exp, 1);
32683 rtx op0 = expand_normal (arg0);
32684 rtx op1 = expand_normal (arg1);
32685 machine_mode tmode = insn_data[icode].operand[0].mode;
32686 machine_mode mode0 = insn_data[icode].operand[1].mode;
32687 machine_mode mode1 = insn_data[icode].operand[2].mode;
32689 if (VECTOR_MODE_P (mode0))
32690 op0 = safe_vector_operand (op0, mode0);
32691 if (VECTOR_MODE_P (mode1))
32692 op1 = safe_vector_operand (op1, mode1);
32694 if (optimize || !target
32695 || GET_MODE (target) != tmode
32696 || !insn_data[icode].operand[0].predicate (target, tmode))
32697 target = gen_reg_rtx (tmode);
32699 if (GET_MODE (op1) == SImode && mode1 == TImode)
32701 rtx x = gen_reg_rtx (V4SImode);
32702 emit_insn (gen_sse2_loadd (x, op1));
32703 op1 = gen_lowpart (TImode, x);
32706 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32707 op0 = copy_to_mode_reg (mode0, op0);
32708 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32709 op1 = copy_to_mode_reg (mode1, op1);
32711 pat = GEN_FCN (icode) (target, op0, op1);
32712 if (! pat)
32713 return 0;
32715 emit_insn (pat);
32717 return target;
32720 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32722 static rtx
32723 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32724 enum ix86_builtin_func_type m_type,
32725 enum rtx_code sub_code)
32727 rtx pat;
32728 int i;
32729 int nargs;
32730 bool comparison_p = false;
32731 bool tf_p = false;
32732 bool last_arg_constant = false;
32733 int num_memory = 0;
32734 struct {
32735 rtx op;
32736 machine_mode mode;
32737 } args[4];
32739 machine_mode tmode = insn_data[icode].operand[0].mode;
32741 switch (m_type)
32743 case MULTI_ARG_4_DF2_DI_I:
32744 case MULTI_ARG_4_DF2_DI_I1:
32745 case MULTI_ARG_4_SF2_SI_I:
32746 case MULTI_ARG_4_SF2_SI_I1:
32747 nargs = 4;
32748 last_arg_constant = true;
32749 break;
32751 case MULTI_ARG_3_SF:
32752 case MULTI_ARG_3_DF:
32753 case MULTI_ARG_3_SF2:
32754 case MULTI_ARG_3_DF2:
32755 case MULTI_ARG_3_DI:
32756 case MULTI_ARG_3_SI:
32757 case MULTI_ARG_3_SI_DI:
32758 case MULTI_ARG_3_HI:
32759 case MULTI_ARG_3_HI_SI:
32760 case MULTI_ARG_3_QI:
32761 case MULTI_ARG_3_DI2:
32762 case MULTI_ARG_3_SI2:
32763 case MULTI_ARG_3_HI2:
32764 case MULTI_ARG_3_QI2:
32765 nargs = 3;
32766 break;
32768 case MULTI_ARG_2_SF:
32769 case MULTI_ARG_2_DF:
32770 case MULTI_ARG_2_DI:
32771 case MULTI_ARG_2_SI:
32772 case MULTI_ARG_2_HI:
32773 case MULTI_ARG_2_QI:
32774 nargs = 2;
32775 break;
32777 case MULTI_ARG_2_DI_IMM:
32778 case MULTI_ARG_2_SI_IMM:
32779 case MULTI_ARG_2_HI_IMM:
32780 case MULTI_ARG_2_QI_IMM:
32781 nargs = 2;
32782 last_arg_constant = true;
32783 break;
32785 case MULTI_ARG_1_SF:
32786 case MULTI_ARG_1_DF:
32787 case MULTI_ARG_1_SF2:
32788 case MULTI_ARG_1_DF2:
32789 case MULTI_ARG_1_DI:
32790 case MULTI_ARG_1_SI:
32791 case MULTI_ARG_1_HI:
32792 case MULTI_ARG_1_QI:
32793 case MULTI_ARG_1_SI_DI:
32794 case MULTI_ARG_1_HI_DI:
32795 case MULTI_ARG_1_HI_SI:
32796 case MULTI_ARG_1_QI_DI:
32797 case MULTI_ARG_1_QI_SI:
32798 case MULTI_ARG_1_QI_HI:
32799 nargs = 1;
32800 break;
32802 case MULTI_ARG_2_DI_CMP:
32803 case MULTI_ARG_2_SI_CMP:
32804 case MULTI_ARG_2_HI_CMP:
32805 case MULTI_ARG_2_QI_CMP:
32806 nargs = 2;
32807 comparison_p = true;
32808 break;
32810 case MULTI_ARG_2_SF_TF:
32811 case MULTI_ARG_2_DF_TF:
32812 case MULTI_ARG_2_DI_TF:
32813 case MULTI_ARG_2_SI_TF:
32814 case MULTI_ARG_2_HI_TF:
32815 case MULTI_ARG_2_QI_TF:
32816 nargs = 2;
32817 tf_p = true;
32818 break;
32820 default:
32821 gcc_unreachable ();
32824 if (optimize || !target
32825 || GET_MODE (target) != tmode
32826 || !insn_data[icode].operand[0].predicate (target, tmode))
32827 target = gen_reg_rtx (tmode);
32828 else if (memory_operand (target, tmode))
32829 num_memory++;
32831 gcc_assert (nargs <= 4);
32833 for (i = 0; i < nargs; i++)
32835 tree arg = CALL_EXPR_ARG (exp, i);
32836 rtx op = expand_normal (arg);
32837 int adjust = (comparison_p) ? 1 : 0;
32838 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32840 if (last_arg_constant && i == nargs - 1)
32842 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32844 enum insn_code new_icode = icode;
32845 switch (icode)
32847 case CODE_FOR_xop_vpermil2v2df3:
32848 case CODE_FOR_xop_vpermil2v4sf3:
32849 case CODE_FOR_xop_vpermil2v4df3:
32850 case CODE_FOR_xop_vpermil2v8sf3:
32851 error ("the last argument must be a 2-bit immediate");
32852 return gen_reg_rtx (tmode);
32853 case CODE_FOR_xop_rotlv2di3:
32854 new_icode = CODE_FOR_rotlv2di3;
32855 goto xop_rotl;
32856 case CODE_FOR_xop_rotlv4si3:
32857 new_icode = CODE_FOR_rotlv4si3;
32858 goto xop_rotl;
32859 case CODE_FOR_xop_rotlv8hi3:
32860 new_icode = CODE_FOR_rotlv8hi3;
32861 goto xop_rotl;
32862 case CODE_FOR_xop_rotlv16qi3:
32863 new_icode = CODE_FOR_rotlv16qi3;
32864 xop_rotl:
32865 if (CONST_INT_P (op))
32867 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32868 op = GEN_INT (INTVAL (op) & mask);
32869 gcc_checking_assert
32870 (insn_data[icode].operand[i + 1].predicate (op, mode));
32872 else
32874 gcc_checking_assert
32875 (nargs == 2
32876 && insn_data[new_icode].operand[0].mode == tmode
32877 && insn_data[new_icode].operand[1].mode == tmode
32878 && insn_data[new_icode].operand[2].mode == mode
32879 && insn_data[new_icode].operand[0].predicate
32880 == insn_data[icode].operand[0].predicate
32881 && insn_data[new_icode].operand[1].predicate
32882 == insn_data[icode].operand[1].predicate);
32883 icode = new_icode;
32884 goto non_constant;
32886 break;
32887 default:
32888 gcc_unreachable ();
32892 else
32894 non_constant:
32895 if (VECTOR_MODE_P (mode))
32896 op = safe_vector_operand (op, mode);
32898 /* If we aren't optimizing, only allow one memory operand to be
32899 generated. */
32900 if (memory_operand (op, mode))
32901 num_memory++;
32903 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32905 if (optimize
32906 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32907 || num_memory > 1)
32908 op = force_reg (mode, op);
32911 args[i].op = op;
32912 args[i].mode = mode;
32915 switch (nargs)
32917 case 1:
32918 pat = GEN_FCN (icode) (target, args[0].op);
32919 break;
32921 case 2:
32922 if (tf_p)
32923 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32924 GEN_INT ((int)sub_code));
32925 else if (! comparison_p)
32926 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32927 else
32929 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32930 args[0].op,
32931 args[1].op);
32933 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32935 break;
32937 case 3:
32938 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32939 break;
32941 case 4:
32942 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32943 break;
32945 default:
32946 gcc_unreachable ();
32949 if (! pat)
32950 return 0;
32952 emit_insn (pat);
32953 return target;
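/* For the XOP rotate builtins handled above, a constant rotate count
   that the xop_rotl pattern's predicate rejects is reduced modulo the
   element width, matching rotate semantics, while a non-constant count
   falls back to the generic rotl<mode>3 pattern.  A hypothetical sketch
   of the constant case, assuming the xopintrin.h _mm_roti_epi64 wrapper
   and -mxop:

     __m128i
     rot1 (__m128i x)
     {
       return _mm_roti_epi64 (x, 65);
     }

   If the 65 reaches this code as an out-of-range immediate it is
   masked with 63, i.e. the rotate is emitted as a rotate by 1.  */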
32956 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32957 insns with vec_merge. */
32959 static rtx
32960 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32961 rtx target)
32963 rtx pat;
32964 tree arg0 = CALL_EXPR_ARG (exp, 0);
32965 rtx op1, op0 = expand_normal (arg0);
32966 machine_mode tmode = insn_data[icode].operand[0].mode;
32967 machine_mode mode0 = insn_data[icode].operand[1].mode;
32969 if (optimize || !target
32970 || GET_MODE (target) != tmode
32971 || !insn_data[icode].operand[0].predicate (target, tmode))
32972 target = gen_reg_rtx (tmode);
32974 if (VECTOR_MODE_P (mode0))
32975 op0 = safe_vector_operand (op0, mode0);
32977 if ((optimize && !register_operand (op0, mode0))
32978 || !insn_data[icode].operand[1].predicate (op0, mode0))
32979 op0 = copy_to_mode_reg (mode0, op0);
32981 op1 = op0;
32982 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32983 op1 = copy_to_mode_reg (mode0, op1);
32985 pat = GEN_FCN (icode) (target, op0, op1);
32986 if (! pat)
32987 return 0;
32988 emit_insn (pat);
32989 return target;
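/* The vec_merge unop expansion above is what implements scalar SSE
   builtins such as __builtin_ia32_sqrtss, where element 0 holds the
   result and the remaining elements are passed through from the input.
   Sketch of the user-level view (xmmintrin.h wraps the builtin the same
   way):

     __m128
     scalar_sqrt (__m128 a)
     {
       return (__m128) __builtin_ia32_sqrtss ((__v4sf) a);
     }  */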
32992 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32994 static rtx
32995 ix86_expand_sse_compare (const struct builtin_description *d,
32996 tree exp, rtx target, bool swap)
32998 rtx pat;
32999 tree arg0 = CALL_EXPR_ARG (exp, 0);
33000 tree arg1 = CALL_EXPR_ARG (exp, 1);
33001 rtx op0 = expand_normal (arg0);
33002 rtx op1 = expand_normal (arg1);
33003 rtx op2;
33004 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33005 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33006 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33007 enum rtx_code comparison = d->comparison;
33009 if (VECTOR_MODE_P (mode0))
33010 op0 = safe_vector_operand (op0, mode0);
33011 if (VECTOR_MODE_P (mode1))
33012 op1 = safe_vector_operand (op1, mode1);
33014 /* Swap operands if we have a comparison that isn't available in
33015 hardware. */
33016 if (swap)
33017 std::swap (op0, op1);
33019 if (optimize || !target
33020 || GET_MODE (target) != tmode
33021 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33022 target = gen_reg_rtx (tmode);
33024 if ((optimize && !register_operand (op0, mode0))
33025 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33026 op0 = copy_to_mode_reg (mode0, op0);
33027 if ((optimize && !register_operand (op1, mode1))
33028 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33029 op1 = copy_to_mode_reg (mode1, op1);
33031 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33032 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33033 if (! pat)
33034 return 0;
33035 emit_insn (pat);
33036 return target;
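/* The SWAP path above covers comparisons the hardware only provides in
   one direction: SSE has cmpltps but no cmpgtps, so the "greater than"
   builtin is expanded by swapping its operands.  Sketch (assuming the
   usual __builtin_ia32_cmpgtps name used by xmmintrin.h):

     __m128
     gt (__m128 a, __m128 b)
     {
       return (__m128) __builtin_ia32_cmpgtps ((__v4sf) a, (__v4sf) b);
     }

   This emits cmpltps with the two inputs interchanged.  */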
33039 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33041 static rtx
33042 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33043 rtx target)
33045 rtx pat;
33046 tree arg0 = CALL_EXPR_ARG (exp, 0);
33047 tree arg1 = CALL_EXPR_ARG (exp, 1);
33048 rtx op0 = expand_normal (arg0);
33049 rtx op1 = expand_normal (arg1);
33050 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33051 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33052 enum rtx_code comparison = d->comparison;
33054 if (VECTOR_MODE_P (mode0))
33055 op0 = safe_vector_operand (op0, mode0);
33056 if (VECTOR_MODE_P (mode1))
33057 op1 = safe_vector_operand (op1, mode1);
33059 /* Swap operands if we have a comparison that isn't available in
33060 hardware. */
33061 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33062 std::swap (op0, op1);
33064 target = gen_reg_rtx (SImode);
33065 emit_move_insn (target, const0_rtx);
33066 target = gen_rtx_SUBREG (QImode, target, 0);
33068 if ((optimize && !register_operand (op0, mode0))
33069 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33070 op0 = copy_to_mode_reg (mode0, op0);
33071 if ((optimize && !register_operand (op1, mode1))
33072 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33073 op1 = copy_to_mode_reg (mode1, op1);
33075 pat = GEN_FCN (d->icode) (op0, op1);
33076 if (! pat)
33077 return 0;
33078 emit_insn (pat);
33079 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33080 gen_rtx_fmt_ee (comparison, QImode,
33081 SET_DEST (pat),
33082 const0_rtx)));
33084 return SUBREG_REG (target);
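/* The comi expansion above produces an int result by reading the flags
   set by [u]comiss/[u]comisd into the low byte of a fresh SImode
   register.  Sketch of a builtin that goes through it (assuming the
   usual __builtin_ia32_comieq name used by xmmintrin.h):

     int
     scalar_eq (__m128 a, __m128 b)
     {
       return __builtin_ia32_comieq ((__v4sf) a, (__v4sf) b);
     }  */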
33087 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33089 static rtx
33090 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33091 rtx target)
33093 rtx pat;
33094 tree arg0 = CALL_EXPR_ARG (exp, 0);
33095 rtx op1, op0 = expand_normal (arg0);
33096 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33097 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33099 if (optimize || target == 0
33100 || GET_MODE (target) != tmode
33101 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33102 target = gen_reg_rtx (tmode);
33104 if (VECTOR_MODE_P (mode0))
33105 op0 = safe_vector_operand (op0, mode0);
33107 if ((optimize && !register_operand (op0, mode0))
33108 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33109 op0 = copy_to_mode_reg (mode0, op0);
33111 op1 = GEN_INT (d->comparison);
33113 pat = GEN_FCN (d->icode) (target, op0, op1);
33114 if (! pat)
33115 return 0;
33116 emit_insn (pat);
33117 return target;
33120 static rtx
33121 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33122 tree exp, rtx target)
33124 rtx pat;
33125 tree arg0 = CALL_EXPR_ARG (exp, 0);
33126 tree arg1 = CALL_EXPR_ARG (exp, 1);
33127 rtx op0 = expand_normal (arg0);
33128 rtx op1 = expand_normal (arg1);
33129 rtx op2;
33130 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33131 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33132 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33134 if (optimize || target == 0
33135 || GET_MODE (target) != tmode
33136 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33137 target = gen_reg_rtx (tmode);
33139 op0 = safe_vector_operand (op0, mode0);
33140 op1 = safe_vector_operand (op1, mode1);
33142 if ((optimize && !register_operand (op0, mode0))
33143 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33144 op0 = copy_to_mode_reg (mode0, op0);
33145 if ((optimize && !register_operand (op1, mode1))
33146 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33147 op1 = copy_to_mode_reg (mode1, op1);
33149 op2 = GEN_INT (d->comparison);
33151 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33152 if (! pat)
33153 return 0;
33154 emit_insn (pat);
33155 return target;
33158 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33160 static rtx
33161 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33162 rtx target)
33164 rtx pat;
33165 tree arg0 = CALL_EXPR_ARG (exp, 0);
33166 tree arg1 = CALL_EXPR_ARG (exp, 1);
33167 rtx op0 = expand_normal (arg0);
33168 rtx op1 = expand_normal (arg1);
33169 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33170 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33171 enum rtx_code comparison = d->comparison;
33173 if (VECTOR_MODE_P (mode0))
33174 op0 = safe_vector_operand (op0, mode0);
33175 if (VECTOR_MODE_P (mode1))
33176 op1 = safe_vector_operand (op1, mode1);
33178 target = gen_reg_rtx (SImode);
33179 emit_move_insn (target, const0_rtx);
33180 target = gen_rtx_SUBREG (QImode, target, 0);
33182 if ((optimize && !register_operand (op0, mode0))
33183 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33184 op0 = copy_to_mode_reg (mode0, op0);
33185 if ((optimize && !register_operand (op1, mode1))
33186 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33187 op1 = copy_to_mode_reg (mode1, op1);
33189 pat = GEN_FCN (d->icode) (op0, op1);
33190 if (! pat)
33191 return 0;
33192 emit_insn (pat);
33193 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33194 gen_rtx_fmt_ee (comparison, QImode,
33195 SET_DEST (pat),
33196 const0_rtx)));
33198 return SUBREG_REG (target);
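/* The ptest expansion above likewise materializes a flag bit as an int.
   Sketch (assuming the usual __builtin_ia32_ptestz128 name used by
   smmintrin.h for _mm_testz_si128):

     int
     all_bits_clear (__m128i m, __m128i v)
     {
       return __builtin_ia32_ptestz128 ((__v2di) m, (__v2di) v);
     }  */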
33201 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33203 static rtx
33204 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33205 tree exp, rtx target)
33207 rtx pat;
33208 tree arg0 = CALL_EXPR_ARG (exp, 0);
33209 tree arg1 = CALL_EXPR_ARG (exp, 1);
33210 tree arg2 = CALL_EXPR_ARG (exp, 2);
33211 tree arg3 = CALL_EXPR_ARG (exp, 3);
33212 tree arg4 = CALL_EXPR_ARG (exp, 4);
33213 rtx scratch0, scratch1;
33214 rtx op0 = expand_normal (arg0);
33215 rtx op1 = expand_normal (arg1);
33216 rtx op2 = expand_normal (arg2);
33217 rtx op3 = expand_normal (arg3);
33218 rtx op4 = expand_normal (arg4);
33219 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33221 tmode0 = insn_data[d->icode].operand[0].mode;
33222 tmode1 = insn_data[d->icode].operand[1].mode;
33223 modev2 = insn_data[d->icode].operand[2].mode;
33224 modei3 = insn_data[d->icode].operand[3].mode;
33225 modev4 = insn_data[d->icode].operand[4].mode;
33226 modei5 = insn_data[d->icode].operand[5].mode;
33227 modeimm = insn_data[d->icode].operand[6].mode;
33229 if (VECTOR_MODE_P (modev2))
33230 op0 = safe_vector_operand (op0, modev2);
33231 if (VECTOR_MODE_P (modev4))
33232 op2 = safe_vector_operand (op2, modev4);
33234 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33235 op0 = copy_to_mode_reg (modev2, op0);
33236 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33237 op1 = copy_to_mode_reg (modei3, op1);
33238 if ((optimize && !register_operand (op2, modev4))
33239 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33240 op2 = copy_to_mode_reg (modev4, op2);
33241 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33242 op3 = copy_to_mode_reg (modei5, op3);
33244 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33246 error ("the fifth argument must be an 8-bit immediate");
33247 return const0_rtx;
33250 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33252 if (optimize || !target
33253 || GET_MODE (target) != tmode0
33254 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33255 target = gen_reg_rtx (tmode0);
33257 scratch1 = gen_reg_rtx (tmode1);
33259 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33261 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33263 if (optimize || !target
33264 || GET_MODE (target) != tmode1
33265 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33266 target = gen_reg_rtx (tmode1);
33268 scratch0 = gen_reg_rtx (tmode0);
33270 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33272 else
33274 gcc_assert (d->flag);
33276 scratch0 = gen_reg_rtx (tmode0);
33277 scratch1 = gen_reg_rtx (tmode1);
33279 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33282 if (! pat)
33283 return 0;
33285 emit_insn (pat);
33287 if (d->flag)
33289 target = gen_reg_rtx (SImode);
33290 emit_move_insn (target, const0_rtx);
33291 target = gen_rtx_SUBREG (QImode, target, 0);
33293 emit_insn
33294 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33295 gen_rtx_fmt_ee (EQ, QImode,
33296 gen_rtx_REG ((machine_mode) d->flag,
33297 FLAGS_REG),
33298 const0_rtx)));
33299 return SUBREG_REG (target);
33301 else
33302 return target;
33306 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33308 static rtx
33309 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33310 tree exp, rtx target)
33312 rtx pat;
33313 tree arg0 = CALL_EXPR_ARG (exp, 0);
33314 tree arg1 = CALL_EXPR_ARG (exp, 1);
33315 tree arg2 = CALL_EXPR_ARG (exp, 2);
33316 rtx scratch0, scratch1;
33317 rtx op0 = expand_normal (arg0);
33318 rtx op1 = expand_normal (arg1);
33319 rtx op2 = expand_normal (arg2);
33320 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33322 tmode0 = insn_data[d->icode].operand[0].mode;
33323 tmode1 = insn_data[d->icode].operand[1].mode;
33324 modev2 = insn_data[d->icode].operand[2].mode;
33325 modev3 = insn_data[d->icode].operand[3].mode;
33326 modeimm = insn_data[d->icode].operand[4].mode;
33328 if (VECTOR_MODE_P (modev2))
33329 op0 = safe_vector_operand (op0, modev2);
33330 if (VECTOR_MODE_P (modev3))
33331 op1 = safe_vector_operand (op1, modev3);
33333 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33334 op0 = copy_to_mode_reg (modev2, op0);
33335 if ((optimize && !register_operand (op1, modev3))
33336 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33337 op1 = copy_to_mode_reg (modev3, op1);
33339 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33341 error ("the third argument must be an 8-bit immediate");
33342 return const0_rtx;
33345 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33347 if (optimize || !target
33348 || GET_MODE (target) != tmode0
33349 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33350 target = gen_reg_rtx (tmode0);
33352 scratch1 = gen_reg_rtx (tmode1);
33354 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33356 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33358 if (optimize || !target
33359 || GET_MODE (target) != tmode1
33360 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33361 target = gen_reg_rtx (tmode1);
33363 scratch0 = gen_reg_rtx (tmode0);
33365 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33367 else
33369 gcc_assert (d->flag);
33371 scratch0 = gen_reg_rtx (tmode0);
33372 scratch1 = gen_reg_rtx (tmode1);
33374 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33377 if (! pat)
33378 return 0;
33380 emit_insn (pat);
33382 if (d->flag)
33384 target = gen_reg_rtx (SImode);
33385 emit_move_insn (target, const0_rtx);
33386 target = gen_rtx_SUBREG (QImode, target, 0);
33388 emit_insn
33389 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33390 gen_rtx_fmt_ee (EQ, QImode,
33391 gen_rtx_REG ((machine_mode) d->flag,
33392 FLAGS_REG),
33393 const0_rtx)));
33394 return SUBREG_REG (target);
33396 else
33397 return target;
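/* Both string-compare expanders above require the mode operand to be a
   compile-time 8-bit immediate, which is why the intrinsic wrappers
   declare it as const int.  Usage sketch with the documented _SIDD_*
   mode flags from smmintrin.h:

     int
     find_first_equal (__m128i a, int la, __m128i b, int lb)
     {
       return _mm_cmpestri (a, la, b, lb,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
     }

   Passing a run-time variable as the last argument triggers the
   "must be an 8-bit immediate" errors emitted above.  */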
33400 /* Subroutine of ix86_expand_builtin to take care of insns with
33401 variable number of operands. */
33403 static rtx
33404 ix86_expand_args_builtin (const struct builtin_description *d,
33405 tree exp, rtx target)
33407 rtx pat, real_target;
33408 unsigned int i, nargs;
33409 unsigned int nargs_constant = 0;
33410 unsigned int mask_pos = 0;
33411 int num_memory = 0;
33412 struct
33414 rtx op;
33415 machine_mode mode;
33416 } args[6];
33417 bool second_arg_count = false;
33418 enum insn_code icode = d->icode;
33419 const struct insn_data_d *insn_p = &insn_data[icode];
33420 machine_mode tmode = insn_p->operand[0].mode;
33421 machine_mode rmode = VOIDmode;
33422 bool swap = false;
33423 enum rtx_code comparison = d->comparison;
33425 switch ((enum ix86_builtin_func_type) d->flag)
33427 case V2DF_FTYPE_V2DF_ROUND:
33428 case V4DF_FTYPE_V4DF_ROUND:
33429 case V8DF_FTYPE_V8DF_ROUND:
33430 case V4SF_FTYPE_V4SF_ROUND:
33431 case V8SF_FTYPE_V8SF_ROUND:
33432 case V16SF_FTYPE_V16SF_ROUND:
33433 case V4SI_FTYPE_V4SF_ROUND:
33434 case V8SI_FTYPE_V8SF_ROUND:
33435 case V16SI_FTYPE_V16SF_ROUND:
33436 return ix86_expand_sse_round (d, exp, target);
33437 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33438 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33439 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33440 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33441 case INT_FTYPE_V8SF_V8SF_PTEST:
33442 case INT_FTYPE_V4DI_V4DI_PTEST:
33443 case INT_FTYPE_V4DF_V4DF_PTEST:
33444 case INT_FTYPE_V4SF_V4SF_PTEST:
33445 case INT_FTYPE_V2DI_V2DI_PTEST:
33446 case INT_FTYPE_V2DF_V2DF_PTEST:
33447 return ix86_expand_sse_ptest (d, exp, target);
33448 case FLOAT128_FTYPE_FLOAT128:
33449 case FLOAT_FTYPE_FLOAT:
33450 case INT_FTYPE_INT:
33451 case UINT_FTYPE_UINT:
33452 case UINT16_FTYPE_UINT16:
33453 case UINT64_FTYPE_INT:
33454 case UINT64_FTYPE_UINT64:
33455 case INT64_FTYPE_INT64:
33456 case INT64_FTYPE_V4SF:
33457 case INT64_FTYPE_V2DF:
33458 case INT_FTYPE_V16QI:
33459 case INT_FTYPE_V8QI:
33460 case INT_FTYPE_V8SF:
33461 case INT_FTYPE_V4DF:
33462 case INT_FTYPE_V4SF:
33463 case INT_FTYPE_V2DF:
33464 case INT_FTYPE_V32QI:
33465 case V16QI_FTYPE_V16QI:
33466 case V8SI_FTYPE_V8SF:
33467 case V8SI_FTYPE_V4SI:
33468 case V8HI_FTYPE_V8HI:
33469 case V8HI_FTYPE_V16QI:
33470 case V8QI_FTYPE_V8QI:
33471 case V8SF_FTYPE_V8SF:
33472 case V8SF_FTYPE_V8SI:
33473 case V8SF_FTYPE_V4SF:
33474 case V8SF_FTYPE_V8HI:
33475 case V4SI_FTYPE_V4SI:
33476 case V4SI_FTYPE_V16QI:
33477 case V4SI_FTYPE_V4SF:
33478 case V4SI_FTYPE_V8SI:
33479 case V4SI_FTYPE_V8HI:
33480 case V4SI_FTYPE_V4DF:
33481 case V4SI_FTYPE_V2DF:
33482 case V4HI_FTYPE_V4HI:
33483 case V4DF_FTYPE_V4DF:
33484 case V4DF_FTYPE_V4SI:
33485 case V4DF_FTYPE_V4SF:
33486 case V4DF_FTYPE_V2DF:
33487 case V4SF_FTYPE_V4SF:
33488 case V4SF_FTYPE_V4SI:
33489 case V4SF_FTYPE_V8SF:
33490 case V4SF_FTYPE_V4DF:
33491 case V4SF_FTYPE_V8HI:
33492 case V4SF_FTYPE_V2DF:
33493 case V2DI_FTYPE_V2DI:
33494 case V2DI_FTYPE_V16QI:
33495 case V2DI_FTYPE_V8HI:
33496 case V2DI_FTYPE_V4SI:
33497 case V2DF_FTYPE_V2DF:
33498 case V2DF_FTYPE_V4SI:
33499 case V2DF_FTYPE_V4DF:
33500 case V2DF_FTYPE_V4SF:
33501 case V2DF_FTYPE_V2SI:
33502 case V2SI_FTYPE_V2SI:
33503 case V2SI_FTYPE_V4SF:
33504 case V2SI_FTYPE_V2SF:
33505 case V2SI_FTYPE_V2DF:
33506 case V2SF_FTYPE_V2SF:
33507 case V2SF_FTYPE_V2SI:
33508 case V32QI_FTYPE_V32QI:
33509 case V32QI_FTYPE_V16QI:
33510 case V16HI_FTYPE_V16HI:
33511 case V16HI_FTYPE_V8HI:
33512 case V8SI_FTYPE_V8SI:
33513 case V16HI_FTYPE_V16QI:
33514 case V8SI_FTYPE_V16QI:
33515 case V4DI_FTYPE_V16QI:
33516 case V8SI_FTYPE_V8HI:
33517 case V4DI_FTYPE_V8HI:
33518 case V4DI_FTYPE_V4SI:
33519 case V4DI_FTYPE_V2DI:
33520 case UQI_FTYPE_UQI:
33521 case UHI_FTYPE_UHI:
33522 case USI_FTYPE_USI:
33523 case USI_FTYPE_UQI:
33524 case USI_FTYPE_UHI:
33525 case UDI_FTYPE_UDI:
33526 case UHI_FTYPE_V16QI:
33527 case USI_FTYPE_V32QI:
33528 case UDI_FTYPE_V64QI:
33529 case V16QI_FTYPE_UHI:
33530 case V32QI_FTYPE_USI:
33531 case V64QI_FTYPE_UDI:
33532 case V8HI_FTYPE_UQI:
33533 case V16HI_FTYPE_UHI:
33534 case V32HI_FTYPE_USI:
33535 case V4SI_FTYPE_UQI:
33536 case V8SI_FTYPE_UQI:
33537 case V4SI_FTYPE_UHI:
33538 case V8SI_FTYPE_UHI:
33539 case UQI_FTYPE_V8HI:
33540 case UHI_FTYPE_V16HI:
33541 case USI_FTYPE_V32HI:
33542 case UQI_FTYPE_V4SI:
33543 case UQI_FTYPE_V8SI:
33544 case UHI_FTYPE_V16SI:
33545 case UQI_FTYPE_V2DI:
33546 case UQI_FTYPE_V4DI:
33547 case UQI_FTYPE_V8DI:
33548 case V16SI_FTYPE_UHI:
33549 case V2DI_FTYPE_UQI:
33550 case V4DI_FTYPE_UQI:
33551 case V16SI_FTYPE_INT:
33552 case V16SF_FTYPE_V8SF:
33553 case V16SI_FTYPE_V8SI:
33554 case V16SF_FTYPE_V4SF:
33555 case V16SI_FTYPE_V4SI:
33556 case V16SI_FTYPE_V16SF:
33557 case V16SI_FTYPE_V16SI:
33558 case V16SF_FTYPE_V16SF:
33559 case V8DI_FTYPE_UQI:
33560 case V8DI_FTYPE_V8DI:
33561 case V8DF_FTYPE_V4DF:
33562 case V8DF_FTYPE_V2DF:
33563 case V8DF_FTYPE_V8DF:
33564 nargs = 1;
33565 break;
33566 case V4SF_FTYPE_V4SF_VEC_MERGE:
33567 case V2DF_FTYPE_V2DF_VEC_MERGE:
33568 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33569 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33570 case V16QI_FTYPE_V16QI_V16QI:
33571 case V16QI_FTYPE_V8HI_V8HI:
33572 case V16SF_FTYPE_V16SF_V16SF:
33573 case V8QI_FTYPE_V8QI_V8QI:
33574 case V8QI_FTYPE_V4HI_V4HI:
33575 case V8HI_FTYPE_V8HI_V8HI:
33576 case V8HI_FTYPE_V16QI_V16QI:
33577 case V8HI_FTYPE_V4SI_V4SI:
33578 case V8SF_FTYPE_V8SF_V8SF:
33579 case V8SF_FTYPE_V8SF_V8SI:
33580 case V8DF_FTYPE_V8DF_V8DF:
33581 case V4SI_FTYPE_V4SI_V4SI:
33582 case V4SI_FTYPE_V8HI_V8HI:
33583 case V4SI_FTYPE_V2DF_V2DF:
33584 case V4HI_FTYPE_V4HI_V4HI:
33585 case V4HI_FTYPE_V8QI_V8QI:
33586 case V4HI_FTYPE_V2SI_V2SI:
33587 case V4DF_FTYPE_V4DF_V4DF:
33588 case V4DF_FTYPE_V4DF_V4DI:
33589 case V4SF_FTYPE_V4SF_V4SF:
33590 case V4SF_FTYPE_V4SF_V4SI:
33591 case V4SF_FTYPE_V4SF_V2SI:
33592 case V4SF_FTYPE_V4SF_V2DF:
33593 case V4SF_FTYPE_V4SF_UINT:
33594 case V4SF_FTYPE_V4SF_DI:
33595 case V4SF_FTYPE_V4SF_SI:
33596 case V2DI_FTYPE_V2DI_V2DI:
33597 case V2DI_FTYPE_V16QI_V16QI:
33598 case V2DI_FTYPE_V4SI_V4SI:
33599 case V2DI_FTYPE_V2DI_V16QI:
33600 case V2SI_FTYPE_V2SI_V2SI:
33601 case V2SI_FTYPE_V4HI_V4HI:
33602 case V2SI_FTYPE_V2SF_V2SF:
33603 case V2DF_FTYPE_V2DF_V2DF:
33604 case V2DF_FTYPE_V2DF_V4SF:
33605 case V2DF_FTYPE_V2DF_V2DI:
33606 case V2DF_FTYPE_V2DF_DI:
33607 case V2DF_FTYPE_V2DF_SI:
33608 case V2DF_FTYPE_V2DF_UINT:
33609 case V2SF_FTYPE_V2SF_V2SF:
33610 case V1DI_FTYPE_V1DI_V1DI:
33611 case V1DI_FTYPE_V8QI_V8QI:
33612 case V1DI_FTYPE_V2SI_V2SI:
33613 case V32QI_FTYPE_V16HI_V16HI:
33614 case V16HI_FTYPE_V8SI_V8SI:
33615 case V64QI_FTYPE_V64QI_V64QI:
33616 case V32QI_FTYPE_V32QI_V32QI:
33617 case V16HI_FTYPE_V32QI_V32QI:
33618 case V16HI_FTYPE_V16HI_V16HI:
33619 case V8SI_FTYPE_V4DF_V4DF:
33620 case V8SI_FTYPE_V8SI_V8SI:
33621 case V8SI_FTYPE_V16HI_V16HI:
33622 case V4DI_FTYPE_V4DI_V4DI:
33623 case V4DI_FTYPE_V8SI_V8SI:
33624 case V8DI_FTYPE_V64QI_V64QI:
33625 if (comparison == UNKNOWN)
33626 return ix86_expand_binop_builtin (icode, exp, target);
33627 nargs = 2;
33628 break;
33629 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33630 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33631 gcc_assert (comparison != UNKNOWN);
33632 nargs = 2;
33633 swap = true;
33634 break;
33635 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33636 case V16HI_FTYPE_V16HI_SI_COUNT:
33637 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33638 case V8SI_FTYPE_V8SI_SI_COUNT:
33639 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33640 case V4DI_FTYPE_V4DI_INT_COUNT:
33641 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33642 case V8HI_FTYPE_V8HI_SI_COUNT:
33643 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33644 case V4SI_FTYPE_V4SI_SI_COUNT:
33645 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33646 case V4HI_FTYPE_V4HI_SI_COUNT:
33647 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33648 case V2DI_FTYPE_V2DI_SI_COUNT:
33649 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33650 case V2SI_FTYPE_V2SI_SI_COUNT:
33651 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33652 case V1DI_FTYPE_V1DI_SI_COUNT:
33653 nargs = 2;
33654 second_arg_count = true;
33655 break;
33656 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33657 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33658 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33659 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33660 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33661 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33662 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33663 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33664 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33665 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33666 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33667 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33668 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33669 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33670 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33671 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33672 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33673 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33674 nargs = 4;
33675 second_arg_count = true;
33676 break;
33677 case UINT64_FTYPE_UINT64_UINT64:
33678 case UINT_FTYPE_UINT_UINT:
33679 case UINT_FTYPE_UINT_USHORT:
33680 case UINT_FTYPE_UINT_UCHAR:
33681 case UINT16_FTYPE_UINT16_INT:
33682 case UINT8_FTYPE_UINT8_INT:
33683 case UQI_FTYPE_UQI_UQI:
33684 case UHI_FTYPE_UHI_UHI:
33685 case USI_FTYPE_USI_USI:
33686 case UDI_FTYPE_UDI_UDI:
33687 case V16SI_FTYPE_V8DF_V8DF:
33688 nargs = 2;
33689 break;
33690 case V2DI_FTYPE_V2DI_INT_CONVERT:
33691 nargs = 2;
33692 rmode = V1TImode;
33693 nargs_constant = 1;
33694 break;
33695 case V4DI_FTYPE_V4DI_INT_CONVERT:
33696 nargs = 2;
33697 rmode = V2TImode;
33698 nargs_constant = 1;
33699 break;
33700 case V8DI_FTYPE_V8DI_INT_CONVERT:
33701 nargs = 2;
33702 rmode = V4TImode;
33703 nargs_constant = 1;
33704 break;
33705 case V8HI_FTYPE_V8HI_INT:
33706 case V8HI_FTYPE_V8SF_INT:
33707 case V16HI_FTYPE_V16SF_INT:
33708 case V8HI_FTYPE_V4SF_INT:
33709 case V8SF_FTYPE_V8SF_INT:
33710 case V4SF_FTYPE_V16SF_INT:
33711 case V16SF_FTYPE_V16SF_INT:
33712 case V4SI_FTYPE_V4SI_INT:
33713 case V4SI_FTYPE_V8SI_INT:
33714 case V4HI_FTYPE_V4HI_INT:
33715 case V4DF_FTYPE_V4DF_INT:
33716 case V4DF_FTYPE_V8DF_INT:
33717 case V4SF_FTYPE_V4SF_INT:
33718 case V4SF_FTYPE_V8SF_INT:
33719 case V2DI_FTYPE_V2DI_INT:
33720 case V2DF_FTYPE_V2DF_INT:
33721 case V2DF_FTYPE_V4DF_INT:
33722 case V16HI_FTYPE_V16HI_INT:
33723 case V8SI_FTYPE_V8SI_INT:
33724 case V16SI_FTYPE_V16SI_INT:
33725 case V4SI_FTYPE_V16SI_INT:
33726 case V4DI_FTYPE_V4DI_INT:
33727 case V2DI_FTYPE_V4DI_INT:
33728 case V4DI_FTYPE_V8DI_INT:
33729 case QI_FTYPE_V4SF_INT:
33730 case QI_FTYPE_V2DF_INT:
33731 case UQI_FTYPE_UQI_UQI_CONST:
33732 case UHI_FTYPE_UHI_UQI:
33733 case USI_FTYPE_USI_UQI:
33734 case UDI_FTYPE_UDI_UQI:
33735 nargs = 2;
33736 nargs_constant = 1;
33737 break;
33738 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33739 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33740 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33741 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33742 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33743 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33744 case UHI_FTYPE_V16SI_V16SI_UHI:
33745 case UQI_FTYPE_V8DI_V8DI_UQI:
33746 case V16HI_FTYPE_V16SI_V16HI_UHI:
33747 case V16QI_FTYPE_V16SI_V16QI_UHI:
33748 case V16QI_FTYPE_V8DI_V16QI_UQI:
33749 case V16SF_FTYPE_V16SF_V16SF_UHI:
33750 case V16SF_FTYPE_V4SF_V16SF_UHI:
33751 case V16SI_FTYPE_SI_V16SI_UHI:
33752 case V16SI_FTYPE_V16HI_V16SI_UHI:
33753 case V16SI_FTYPE_V16QI_V16SI_UHI:
33754 case V8SF_FTYPE_V4SF_V8SF_UQI:
33755 case V4DF_FTYPE_V2DF_V4DF_UQI:
33756 case V8SI_FTYPE_V4SI_V8SI_UQI:
33757 case V8SI_FTYPE_SI_V8SI_UQI:
33758 case V4SI_FTYPE_V4SI_V4SI_UQI:
33759 case V4SI_FTYPE_SI_V4SI_UQI:
33760 case V4DI_FTYPE_V2DI_V4DI_UQI:
33761 case V4DI_FTYPE_DI_V4DI_UQI:
33762 case V2DI_FTYPE_V2DI_V2DI_UQI:
33763 case V2DI_FTYPE_DI_V2DI_UQI:
33764 case V64QI_FTYPE_V64QI_V64QI_UDI:
33765 case V64QI_FTYPE_V16QI_V64QI_UDI:
33766 case V64QI_FTYPE_QI_V64QI_UDI:
33767 case V32QI_FTYPE_V32QI_V32QI_USI:
33768 case V32QI_FTYPE_V16QI_V32QI_USI:
33769 case V32QI_FTYPE_QI_V32QI_USI:
33770 case V16QI_FTYPE_V16QI_V16QI_UHI:
33771 case V16QI_FTYPE_QI_V16QI_UHI:
33772 case V32HI_FTYPE_V8HI_V32HI_USI:
33773 case V32HI_FTYPE_HI_V32HI_USI:
33774 case V16HI_FTYPE_V8HI_V16HI_UHI:
33775 case V16HI_FTYPE_HI_V16HI_UHI:
33776 case V8HI_FTYPE_V8HI_V8HI_UQI:
33777 case V8HI_FTYPE_HI_V8HI_UQI:
33778 case V8SF_FTYPE_V8HI_V8SF_UQI:
33779 case V4SF_FTYPE_V8HI_V4SF_UQI:
33780 case V8SI_FTYPE_V8SF_V8SI_UQI:
33781 case V4SI_FTYPE_V4SF_V4SI_UQI:
33782 case V4DI_FTYPE_V4SF_V4DI_UQI:
33783 case V2DI_FTYPE_V4SF_V2DI_UQI:
33784 case V4SF_FTYPE_V4DI_V4SF_UQI:
33785 case V4SF_FTYPE_V2DI_V4SF_UQI:
33786 case V4DF_FTYPE_V4DI_V4DF_UQI:
33787 case V2DF_FTYPE_V2DI_V2DF_UQI:
33788 case V16QI_FTYPE_V8HI_V16QI_UQI:
33789 case V16QI_FTYPE_V16HI_V16QI_UHI:
33790 case V16QI_FTYPE_V4SI_V16QI_UQI:
33791 case V16QI_FTYPE_V8SI_V16QI_UQI:
33792 case V8HI_FTYPE_V4SI_V8HI_UQI:
33793 case V8HI_FTYPE_V8SI_V8HI_UQI:
33794 case V16QI_FTYPE_V2DI_V16QI_UQI:
33795 case V16QI_FTYPE_V4DI_V16QI_UQI:
33796 case V8HI_FTYPE_V2DI_V8HI_UQI:
33797 case V8HI_FTYPE_V4DI_V8HI_UQI:
33798 case V4SI_FTYPE_V2DI_V4SI_UQI:
33799 case V4SI_FTYPE_V4DI_V4SI_UQI:
33800 case V32QI_FTYPE_V32HI_V32QI_USI:
33801 case UHI_FTYPE_V16QI_V16QI_UHI:
33802 case USI_FTYPE_V32QI_V32QI_USI:
33803 case UDI_FTYPE_V64QI_V64QI_UDI:
33804 case UQI_FTYPE_V8HI_V8HI_UQI:
33805 case UHI_FTYPE_V16HI_V16HI_UHI:
33806 case USI_FTYPE_V32HI_V32HI_USI:
33807 case UQI_FTYPE_V4SI_V4SI_UQI:
33808 case UQI_FTYPE_V8SI_V8SI_UQI:
33809 case UQI_FTYPE_V2DI_V2DI_UQI:
33810 case UQI_FTYPE_V4DI_V4DI_UQI:
33811 case V4SF_FTYPE_V2DF_V4SF_UQI:
33812 case V4SF_FTYPE_V4DF_V4SF_UQI:
33813 case V16SI_FTYPE_V16SI_V16SI_UHI:
33814 case V16SI_FTYPE_V4SI_V16SI_UHI:
33815 case V2DI_FTYPE_V4SI_V2DI_UQI:
33816 case V2DI_FTYPE_V8HI_V2DI_UQI:
33817 case V2DI_FTYPE_V16QI_V2DI_UQI:
33818 case V4DI_FTYPE_V4DI_V4DI_UQI:
33819 case V4DI_FTYPE_V4SI_V4DI_UQI:
33820 case V4DI_FTYPE_V8HI_V4DI_UQI:
33821 case V4DI_FTYPE_V16QI_V4DI_UQI:
33822 case V4DI_FTYPE_V4DF_V4DI_UQI:
33823 case V2DI_FTYPE_V2DF_V2DI_UQI:
33824 case V4SI_FTYPE_V4DF_V4SI_UQI:
33825 case V4SI_FTYPE_V2DF_V4SI_UQI:
33826 case V4SI_FTYPE_V8HI_V4SI_UQI:
33827 case V4SI_FTYPE_V16QI_V4SI_UQI:
33828 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33829 case V8DF_FTYPE_V2DF_V8DF_UQI:
33830 case V8DF_FTYPE_V4DF_V8DF_UQI:
33831 case V8DF_FTYPE_V8DF_V8DF_UQI:
33832 case V8SF_FTYPE_V8SF_V8SF_UQI:
33833 case V8SF_FTYPE_V8SI_V8SF_UQI:
33834 case V4DF_FTYPE_V4DF_V4DF_UQI:
33835 case V4SF_FTYPE_V4SF_V4SF_UQI:
33836 case V2DF_FTYPE_V2DF_V2DF_UQI:
33837 case V2DF_FTYPE_V4SF_V2DF_UQI:
33838 case V2DF_FTYPE_V4SI_V2DF_UQI:
33839 case V4SF_FTYPE_V4SI_V4SF_UQI:
33840 case V4DF_FTYPE_V4SF_V4DF_UQI:
33841 case V4DF_FTYPE_V4SI_V4DF_UQI:
33842 case V8SI_FTYPE_V8SI_V8SI_UQI:
33843 case V8SI_FTYPE_V8HI_V8SI_UQI:
33844 case V8SI_FTYPE_V16QI_V8SI_UQI:
33845 case V8DF_FTYPE_V8SI_V8DF_UQI:
33846 case V8DI_FTYPE_DI_V8DI_UQI:
33847 case V16SF_FTYPE_V8SF_V16SF_UHI:
33848 case V16SI_FTYPE_V8SI_V16SI_UHI:
33849 case V16HI_FTYPE_V16HI_V16HI_UHI:
33850 case V8HI_FTYPE_V16QI_V8HI_UQI:
33851 case V16HI_FTYPE_V16QI_V16HI_UHI:
33852 case V32HI_FTYPE_V32HI_V32HI_USI:
33853 case V32HI_FTYPE_V32QI_V32HI_USI:
33854 case V8DI_FTYPE_V16QI_V8DI_UQI:
33855 case V8DI_FTYPE_V2DI_V8DI_UQI:
33856 case V8DI_FTYPE_V4DI_V8DI_UQI:
33857 case V8DI_FTYPE_V8DI_V8DI_UQI:
33858 case V8DI_FTYPE_V8HI_V8DI_UQI:
33859 case V8DI_FTYPE_V8SI_V8DI_UQI:
33860 case V8HI_FTYPE_V8DI_V8HI_UQI:
33861 case V8SI_FTYPE_V8DI_V8SI_UQI:
33862 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33863 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33864 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33865 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33866 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33867 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33868 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33869 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33870 nargs = 3;
33871 break;
33872 case V32QI_FTYPE_V32QI_V32QI_INT:
33873 case V16HI_FTYPE_V16HI_V16HI_INT:
33874 case V16QI_FTYPE_V16QI_V16QI_INT:
33875 case V4DI_FTYPE_V4DI_V4DI_INT:
33876 case V8HI_FTYPE_V8HI_V8HI_INT:
33877 case V8SI_FTYPE_V8SI_V8SI_INT:
33878 case V8SI_FTYPE_V8SI_V4SI_INT:
33879 case V8SF_FTYPE_V8SF_V8SF_INT:
33880 case V8SF_FTYPE_V8SF_V4SF_INT:
33881 case V4SI_FTYPE_V4SI_V4SI_INT:
33882 case V4DF_FTYPE_V4DF_V4DF_INT:
33883 case V16SF_FTYPE_V16SF_V16SF_INT:
33884 case V16SF_FTYPE_V16SF_V4SF_INT:
33885 case V16SI_FTYPE_V16SI_V4SI_INT:
33886 case V4DF_FTYPE_V4DF_V2DF_INT:
33887 case V4SF_FTYPE_V4SF_V4SF_INT:
33888 case V2DI_FTYPE_V2DI_V2DI_INT:
33889 case V4DI_FTYPE_V4DI_V2DI_INT:
33890 case V2DF_FTYPE_V2DF_V2DF_INT:
33891 case UQI_FTYPE_V8DI_V8UDI_INT:
33892 case UQI_FTYPE_V8DF_V8DF_INT:
33893 case UQI_FTYPE_V2DF_V2DF_INT:
33894 case UQI_FTYPE_V4SF_V4SF_INT:
33895 case UHI_FTYPE_V16SI_V16SI_INT:
33896 case UHI_FTYPE_V16SF_V16SF_INT:
33897 case V64QI_FTYPE_V64QI_V64QI_INT:
33898 case V32HI_FTYPE_V32HI_V32HI_INT:
33899 case V16SI_FTYPE_V16SI_V16SI_INT:
33900 case V8DI_FTYPE_V8DI_V8DI_INT:
33901 nargs = 3;
33902 nargs_constant = 1;
33903 break;
33904 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33905 nargs = 3;
33906 rmode = V4DImode;
33907 nargs_constant = 1;
33908 break;
33909 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33910 nargs = 3;
33911 rmode = V2DImode;
33912 nargs_constant = 1;
33913 break;
33914 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33915 nargs = 3;
33916 rmode = DImode;
33917 nargs_constant = 1;
33918 break;
33919 case V2DI_FTYPE_V2DI_UINT_UINT:
33920 nargs = 3;
33921 nargs_constant = 2;
33922 break;
33923 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33924 nargs = 3;
33925 rmode = V8DImode;
33926 nargs_constant = 1;
33927 break;
33928 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33929 nargs = 5;
33930 rmode = V8DImode;
33931 mask_pos = 2;
33932 nargs_constant = 1;
33933 break;
33934 case QI_FTYPE_V8DF_INT_UQI:
33935 case QI_FTYPE_V4DF_INT_UQI:
33936 case QI_FTYPE_V2DF_INT_UQI:
33937 case HI_FTYPE_V16SF_INT_UHI:
33938 case QI_FTYPE_V8SF_INT_UQI:
33939 case QI_FTYPE_V4SF_INT_UQI:
33940 nargs = 3;
33941 mask_pos = 1;
33942 nargs_constant = 1;
33943 break;
33944 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33945 nargs = 5;
33946 rmode = V4DImode;
33947 mask_pos = 2;
33948 nargs_constant = 1;
33949 break;
33950 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33951 nargs = 5;
33952 rmode = V2DImode;
33953 mask_pos = 2;
33954 nargs_constant = 1;
33955 break;
33956 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33957 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33958 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33959 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33960 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33961 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33962 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33963 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33964 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33965 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33966 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33967 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33968 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33969 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33970 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33971 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33972 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33973 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33974 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33975 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33976 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33977 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33978 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33979 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33980 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33981 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33982 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33983 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33984 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33985 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33986 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33987 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33988 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33989 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33990 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33991 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33992 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33993 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33994 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33995 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33996 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33997 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33998 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
33999 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34000 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34001 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34002 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34003 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34004 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34005 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34006 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34007 nargs = 4;
34008 break;
34009 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34010 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34011 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34012 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34013 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34014 nargs = 4;
34015 nargs_constant = 1;
34016 break;
34017 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34018 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34019 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34020 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34021 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34022 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34023 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34024 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34025 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34026 case USI_FTYPE_V32QI_V32QI_INT_USI:
34027 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34028 case USI_FTYPE_V32HI_V32HI_INT_USI:
34029 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34030 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34031 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34032 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34033 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34034 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34035 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34036 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34037 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34038 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34039 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34040 nargs = 4;
34041 mask_pos = 1;
34042 nargs_constant = 1;
34043 break;
34044 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34045 nargs = 4;
34046 nargs_constant = 2;
34047 break;
34048 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34049 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34050 nargs = 4;
34051 break;
34052 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34053 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34054 mask_pos = 1;
34055 nargs = 4;
34056 nargs_constant = 1;
34057 break;
34058 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34059 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34060 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34061 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34062 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34063 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34064 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34065 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34066 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34067 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34068 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34069 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34070 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34071 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34072 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34073 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34074 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34075 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34076 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34077 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34078 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34079 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34080 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34081 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34082 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34083 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34084 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34085 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34086 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34087 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34088 nargs = 4;
34089 mask_pos = 2;
34090 nargs_constant = 1;
34091 break;
34092 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34093 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34094 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34095 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34096 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34097 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34098 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34099 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34100 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34101 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34102 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34103 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34104 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34105 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34106 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34107 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34108 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34109 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34110 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34111 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34112 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34113 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34114 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34115 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34116 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34117 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34118 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34119 nargs = 5;
34120 mask_pos = 2;
34121 nargs_constant = 1;
34122 break;
34123 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34124 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34125 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34126 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34127 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34128 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34129 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34130 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34131 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34132 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34133 nargs = 5;
34134 mask_pos = 1;
34135 nargs_constant = 1;
34136 break;
34137 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34138 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34139 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34140 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34141 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34142 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34143 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34144 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34145 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34146 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34147 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34148 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34149 nargs = 5;
34150 mask_pos = 1;
34151 nargs_constant = 2;
34152 break;
34154 default:
34155 gcc_unreachable ();
34158 gcc_assert (nargs <= ARRAY_SIZE (args));
34160 if (comparison != UNKNOWN)
34162 gcc_assert (nargs == 2);
34163 return ix86_expand_sse_compare (d, exp, target, swap);
34166 if (rmode == VOIDmode || rmode == tmode)
34168 if (optimize
34169 || target == 0
34170 || GET_MODE (target) != tmode
34171 || !insn_p->operand[0].predicate (target, tmode))
34172 target = gen_reg_rtx (tmode);
34173 else if (memory_operand (target, tmode))
34174 num_memory++;
34175 real_target = target;
34177 else
34179 real_target = gen_reg_rtx (tmode);
34180 target = lowpart_subreg (rmode, real_target, tmode);
34183 for (i = 0; i < nargs; i++)
34185 tree arg = CALL_EXPR_ARG (exp, i);
34186 rtx op = expand_normal (arg);
34187 machine_mode mode = insn_p->operand[i + 1].mode;
34188 bool match = insn_p->operand[i + 1].predicate (op, mode);
34190 if (second_arg_count && i == 1)
34192 /* SIMD shift insns take either an 8-bit immediate or a
34193 register as the count operand, but the builtin functions
34194 take an int, so if the count doesn't match we put it in a
34195 register. The instructions use a 64-bit count; if op is
34196 only 32-bit, zero-extend it, since negative shift counts
34197 are undefined behavior and zero-extension is more
34198 efficient. */
34199 if (!match)
34201 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34202 op = convert_modes (mode, GET_MODE (op), op, 1);
34203 else
34204 op = lowpart_subreg (mode, op, GET_MODE (op));
34205 if (!insn_p->operand[i + 1].predicate (op, mode))
34206 op = copy_to_reg (op);
34209 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34210 (!mask_pos && (nargs - i) <= nargs_constant))
34212 if (!match)
34213 switch (icode)
34215 case CODE_FOR_avx_vinsertf128v4di:
34216 case CODE_FOR_avx_vextractf128v4di:
34217 error ("the last argument must be a 1-bit immediate");
34218 return const0_rtx;
34220 case CODE_FOR_avx512f_cmpv8di3_mask:
34221 case CODE_FOR_avx512f_cmpv16si3_mask:
34222 case CODE_FOR_avx512f_ucmpv8di3_mask:
34223 case CODE_FOR_avx512f_ucmpv16si3_mask:
34224 case CODE_FOR_avx512vl_cmpv4di3_mask:
34225 case CODE_FOR_avx512vl_cmpv8si3_mask:
34226 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34227 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34228 case CODE_FOR_avx512vl_cmpv2di3_mask:
34229 case CODE_FOR_avx512vl_cmpv4si3_mask:
34230 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34231 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34232 error ("the last argument must be a 3-bit immediate");
34233 return const0_rtx;
34235 case CODE_FOR_sse4_1_roundsd:
34236 case CODE_FOR_sse4_1_roundss:
34238 case CODE_FOR_sse4_1_roundpd:
34239 case CODE_FOR_sse4_1_roundps:
34240 case CODE_FOR_avx_roundpd256:
34241 case CODE_FOR_avx_roundps256:
34243 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34244 case CODE_FOR_sse4_1_roundps_sfix:
34245 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34246 case CODE_FOR_avx_roundps_sfix256:
34248 case CODE_FOR_sse4_1_blendps:
34249 case CODE_FOR_avx_blendpd256:
34250 case CODE_FOR_avx_vpermilv4df:
34251 case CODE_FOR_avx_vpermilv4df_mask:
34252 case CODE_FOR_avx512f_getmantv8df_mask:
34253 case CODE_FOR_avx512f_getmantv16sf_mask:
34254 case CODE_FOR_avx512vl_getmantv8sf_mask:
34255 case CODE_FOR_avx512vl_getmantv4df_mask:
34256 case CODE_FOR_avx512vl_getmantv4sf_mask:
34257 case CODE_FOR_avx512vl_getmantv2df_mask:
34258 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34259 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34260 case CODE_FOR_avx512dq_rangepv4df_mask:
34261 case CODE_FOR_avx512dq_rangepv8sf_mask:
34262 case CODE_FOR_avx512dq_rangepv2df_mask:
34263 case CODE_FOR_avx512dq_rangepv4sf_mask:
34264 case CODE_FOR_avx_shufpd256_mask:
34265 error ("the last argument must be a 4-bit immediate");
34266 return const0_rtx;
34268 case CODE_FOR_sha1rnds4:
34269 case CODE_FOR_sse4_1_blendpd:
34270 case CODE_FOR_avx_vpermilv2df:
34271 case CODE_FOR_avx_vpermilv2df_mask:
34272 case CODE_FOR_xop_vpermil2v2df3:
34273 case CODE_FOR_xop_vpermil2v4sf3:
34274 case CODE_FOR_xop_vpermil2v4df3:
34275 case CODE_FOR_xop_vpermil2v8sf3:
34276 case CODE_FOR_avx512f_vinsertf32x4_mask:
34277 case CODE_FOR_avx512f_vinserti32x4_mask:
34278 case CODE_FOR_avx512f_vextractf32x4_mask:
34279 case CODE_FOR_avx512f_vextracti32x4_mask:
34280 case CODE_FOR_sse2_shufpd:
34281 case CODE_FOR_sse2_shufpd_mask:
34282 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34283 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34284 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34285 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34286 error ("the last argument must be a 2-bit immediate");
34287 return const0_rtx;
34289 case CODE_FOR_avx_vextractf128v4df:
34290 case CODE_FOR_avx_vextractf128v8sf:
34291 case CODE_FOR_avx_vextractf128v8si:
34292 case CODE_FOR_avx_vinsertf128v4df:
34293 case CODE_FOR_avx_vinsertf128v8sf:
34294 case CODE_FOR_avx_vinsertf128v8si:
34295 case CODE_FOR_avx512f_vinsertf64x4_mask:
34296 case CODE_FOR_avx512f_vinserti64x4_mask:
34297 case CODE_FOR_avx512f_vextractf64x4_mask:
34298 case CODE_FOR_avx512f_vextracti64x4_mask:
34299 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34300 case CODE_FOR_avx512dq_vinserti32x8_mask:
34301 case CODE_FOR_avx512vl_vinsertv4df:
34302 case CODE_FOR_avx512vl_vinsertv4di:
34303 case CODE_FOR_avx512vl_vinsertv8sf:
34304 case CODE_FOR_avx512vl_vinsertv8si:
34305 error ("the last argument must be a 1-bit immediate");
34306 return const0_rtx;
34308 case CODE_FOR_avx_vmcmpv2df3:
34309 case CODE_FOR_avx_vmcmpv4sf3:
34310 case CODE_FOR_avx_cmpv2df3:
34311 case CODE_FOR_avx_cmpv4sf3:
34312 case CODE_FOR_avx_cmpv4df3:
34313 case CODE_FOR_avx_cmpv8sf3:
34314 case CODE_FOR_avx512f_cmpv8df3_mask:
34315 case CODE_FOR_avx512f_cmpv16sf3_mask:
34316 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34317 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34318 error ("the last argument must be a 5-bit immediate");
34319 return const0_rtx;
34321 default:
34322 switch (nargs_constant)
34324 case 2:
34325 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34326 (!mask_pos && (nargs - i) == nargs_constant))
34328 error ("the next to last argument must be an 8-bit immediate");
34329 break;
34331 /* FALLTHRU */
34332 case 1:
34333 error ("the last argument must be an 8-bit immediate");
34334 break;
34335 default:
34336 gcc_unreachable ();
34338 return const0_rtx;
34341 else
34343 if (VECTOR_MODE_P (mode))
34344 op = safe_vector_operand (op, mode);
34346 /* If we aren't optimizing, only allow one memory operand to
34347 be generated. */
34348 if (memory_operand (op, mode))
34349 num_memory++;
34351 op = fixup_modeless_constant (op, mode);
34353 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34355 if (optimize || !match || num_memory > 1)
34356 op = copy_to_mode_reg (mode, op);
34358 else
34360 op = copy_to_reg (op);
34361 op = lowpart_subreg (mode, op, GET_MODE (op));
34365 args[i].op = op;
34366 args[i].mode = mode;
34369 switch (nargs)
34371 case 1:
34372 pat = GEN_FCN (icode) (real_target, args[0].op);
34373 break;
34374 case 2:
34375 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34376 break;
34377 case 3:
34378 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34379 args[2].op);
34380 break;
34381 case 4:
34382 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34383 args[2].op, args[3].op);
34384 break;
34385 case 5:
34386 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34387 args[2].op, args[3].op, args[4].op);
34388 break;
34389 case 6:
34390 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34391 args[2].op, args[3].op, args[4].op,
34392 args[5].op);
34393 break;
34394 default:
34395 gcc_unreachable ();
34398 if (! pat)
34399 return 0;
34401 emit_insn (pat);
34402 return target;
34405 /* Transform a pattern of the following layout:
34406 (set A
34407 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34409 into:
34410 (set A B) */
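/* For example (an illustrative sketch; the register numbers and the
   inner operation are arbitrary):
     (set (reg:V2DF 100)
          (unspec [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                   (const_int 8)] UNSPEC_EMBEDDED_ROUNDING))
   is rewritten to
     (set (reg:V2DF 100) (plus:V2DF (reg:V2DF 101) (reg:V2DF 102))).  */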
34412 static rtx
34413 ix86_erase_embedded_rounding (rtx pat)
34415 if (GET_CODE (pat) == INSN)
34416 pat = PATTERN (pat);
34418 gcc_assert (GET_CODE (pat) == SET);
34419 rtx src = SET_SRC (pat);
34420 gcc_assert (XVECLEN (src, 0) == 2);
34421 rtx p0 = XVECEXP (src, 0, 0);
34422 gcc_assert (GET_CODE (src) == UNSPEC
34423 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34424 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34425 return res;
34428 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34429 with rounding. */
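/* Illustrative use (a sketch; the intrinsic and macro names are assumed
   from avx512fintrin.h/avxintrin.h, they are not defined in this file):
     int r = _mm_comi_round_sd (a, b, _CMP_GE_OS, _MM_FROUND_NO_EXC);
   The third intrinsic argument selects the comparison via the tables
   below and the fourth is the rounding/SAE operand.  */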
34430 static rtx
34431 ix86_expand_sse_comi_round (const struct builtin_description *d,
34432 tree exp, rtx target)
34434 rtx pat, set_dst;
34435 tree arg0 = CALL_EXPR_ARG (exp, 0);
34436 tree arg1 = CALL_EXPR_ARG (exp, 1);
34437 tree arg2 = CALL_EXPR_ARG (exp, 2);
34438 tree arg3 = CALL_EXPR_ARG (exp, 3);
34439 rtx op0 = expand_normal (arg0);
34440 rtx op1 = expand_normal (arg1);
34441 rtx op2 = expand_normal (arg2);
34442 rtx op3 = expand_normal (arg3);
34443 enum insn_code icode = d->icode;
34444 const struct insn_data_d *insn_p = &insn_data[icode];
34445 machine_mode mode0 = insn_p->operand[0].mode;
34446 machine_mode mode1 = insn_p->operand[1].mode;
34447 enum rtx_code comparison = UNEQ;
34448 bool need_ucomi = false;
34450 /* See avxintrin.h for values. */
34451 enum rtx_code comi_comparisons[32] =
34453 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34454 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34455 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34457 bool need_ucomi_values[32] =
34459 true, false, false, true, true, false, false, true,
34460 true, false, false, true, true, false, false, true,
34461 false, true, true, false, false, true, true, false,
34462 false, true, true, false, false, true, true, false
34465 if (!CONST_INT_P (op2))
34467 error ("the third argument must be a comparison constant");
34468 return const0_rtx;
34470 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34472 error ("incorrect comparison mode");
34473 return const0_rtx;
34476 if (!insn_p->operand[2].predicate (op3, SImode))
34478 error ("incorrect rounding operand");
34479 return const0_rtx;
34482 comparison = comi_comparisons[INTVAL (op2)];
34483 need_ucomi = need_ucomi_values[INTVAL (op2)];
34485 if (VECTOR_MODE_P (mode0))
34486 op0 = safe_vector_operand (op0, mode0);
34487 if (VECTOR_MODE_P (mode1))
34488 op1 = safe_vector_operand (op1, mode1);
34490 target = gen_reg_rtx (SImode);
34491 emit_move_insn (target, const0_rtx);
34492 target = gen_rtx_SUBREG (QImode, target, 0);
34494 if ((optimize && !register_operand (op0, mode0))
34495 || !insn_p->operand[0].predicate (op0, mode0))
34496 op0 = copy_to_mode_reg (mode0, op0);
34497 if ((optimize && !register_operand (op1, mode1))
34498 || !insn_p->operand[1].predicate (op1, mode1))
34499 op1 = copy_to_mode_reg (mode1, op1);
34501 if (need_ucomi)
34502 icode = icode == CODE_FOR_sse_comi_round
34503 ? CODE_FOR_sse_ucomi_round
34504 : CODE_FOR_sse2_ucomi_round;
34506 pat = GEN_FCN (icode) (op0, op1, op3);
34507 if (! pat)
34508 return 0;
34510 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34511 if (INTVAL (op3) == NO_ROUND)
34513 pat = ix86_erase_embedded_rounding (pat);
34514 if (! pat)
34515 return 0;
34517 set_dst = SET_DEST (pat);
34519 else
34521 gcc_assert (GET_CODE (pat) == SET);
34522 set_dst = SET_DEST (pat);
34525 emit_insn (pat);
34526 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34527 gen_rtx_fmt_ee (comparison, QImode,
34528 set_dst,
34529 const0_rtx)));
34531 return SUBREG_REG (target);
34534 static rtx
34535 ix86_expand_round_builtin (const struct builtin_description *d,
34536 tree exp, rtx target)
34538 rtx pat;
34539 unsigned int i, nargs;
34540 struct
34542 rtx op;
34543 machine_mode mode;
34544 } args[6];
34545 enum insn_code icode = d->icode;
34546 const struct insn_data_d *insn_p = &insn_data[icode];
34547 machine_mode tmode = insn_p->operand[0].mode;
34548 unsigned int nargs_constant = 0;
34549 unsigned int redundant_embed_rnd = 0;
34551 switch ((enum ix86_builtin_func_type) d->flag)
34553 case UINT64_FTYPE_V2DF_INT:
34554 case UINT64_FTYPE_V4SF_INT:
34555 case UINT_FTYPE_V2DF_INT:
34556 case UINT_FTYPE_V4SF_INT:
34557 case INT64_FTYPE_V2DF_INT:
34558 case INT64_FTYPE_V4SF_INT:
34559 case INT_FTYPE_V2DF_INT:
34560 case INT_FTYPE_V4SF_INT:
34561 nargs = 2;
34562 break;
34563 case V4SF_FTYPE_V4SF_UINT_INT:
34564 case V4SF_FTYPE_V4SF_UINT64_INT:
34565 case V2DF_FTYPE_V2DF_UINT64_INT:
34566 case V4SF_FTYPE_V4SF_INT_INT:
34567 case V4SF_FTYPE_V4SF_INT64_INT:
34568 case V2DF_FTYPE_V2DF_INT64_INT:
34569 case V4SF_FTYPE_V4SF_V4SF_INT:
34570 case V2DF_FTYPE_V2DF_V2DF_INT:
34571 case V4SF_FTYPE_V4SF_V2DF_INT:
34572 case V2DF_FTYPE_V2DF_V4SF_INT:
34573 nargs = 3;
34574 break;
34575 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34576 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34577 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34578 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34579 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34580 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34581 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34582 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34583 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34584 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34585 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34586 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34587 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34588 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34589 nargs = 4;
34590 break;
34591 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34592 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34593 nargs_constant = 2;
34594 nargs = 4;
34595 break;
34596 case INT_FTYPE_V4SF_V4SF_INT_INT:
34597 case INT_FTYPE_V2DF_V2DF_INT_INT:
34598 return ix86_expand_sse_comi_round (d, exp, target);
34599 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34600 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34601 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34602 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34603 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34604 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34605 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34606 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34607 nargs = 5;
34608 break;
34609 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34610 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34611 nargs_constant = 4;
34612 nargs = 5;
34613 break;
34614 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34615 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34616 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34617 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34618 nargs_constant = 3;
34619 nargs = 5;
34620 break;
34621 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34622 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34623 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34624 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34625 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34626 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34627 nargs = 6;
34628 nargs_constant = 4;
34629 break;
34630 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34631 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34632 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34633 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34634 nargs = 6;
34635 nargs_constant = 3;
34636 break;
34637 default:
34638 gcc_unreachable ();
34640 gcc_assert (nargs <= ARRAY_SIZE (args));
34642 if (optimize
34643 || target == 0
34644 || GET_MODE (target) != tmode
34645 || !insn_p->operand[0].predicate (target, tmode))
34646 target = gen_reg_rtx (tmode);
34648 for (i = 0; i < nargs; i++)
34650 tree arg = CALL_EXPR_ARG (exp, i);
34651 rtx op = expand_normal (arg);
34652 machine_mode mode = insn_p->operand[i + 1].mode;
34653 bool match = insn_p->operand[i + 1].predicate (op, mode);
34655 if (i == nargs - nargs_constant)
34657 if (!match)
34659 switch (icode)
34661 case CODE_FOR_avx512f_getmantv8df_mask_round:
34662 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34663 case CODE_FOR_avx512f_vgetmantv2df_round:
34664 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34665 case CODE_FOR_avx512f_vgetmantv4sf_round:
34666 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34667 error ("the immediate argument must be a 4-bit immediate");
34668 return const0_rtx;
34669 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34670 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34671 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34672 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34673 error ("the immediate argument must be a 5-bit immediate");
34674 return const0_rtx;
34675 default:
34676 error ("the immediate argument must be an 8-bit immediate");
34677 return const0_rtx;
34681 else if (i == nargs-1)
34683 if (!insn_p->operand[nargs].predicate (op, SImode))
34685 error ("incorrect rounding operand");
34686 return const0_rtx;
34689 /* If there is no rounding, use the normal version of the pattern. */
34690 if (INTVAL (op) == NO_ROUND)
34691 redundant_embed_rnd = 1;
34693 else
34695 if (VECTOR_MODE_P (mode))
34696 op = safe_vector_operand (op, mode);
34698 op = fixup_modeless_constant (op, mode);
34700 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34702 if (optimize || !match)
34703 op = copy_to_mode_reg (mode, op);
34705 else
34707 op = copy_to_reg (op);
34708 op = lowpart_subreg (mode, op, GET_MODE (op));
34712 args[i].op = op;
34713 args[i].mode = mode;
34716 switch (nargs)
34718 case 1:
34719 pat = GEN_FCN (icode) (target, args[0].op);
34720 break;
34721 case 2:
34722 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34723 break;
34724 case 3:
34725 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34726 args[2].op);
34727 break;
34728 case 4:
34729 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34730 args[2].op, args[3].op);
34731 break;
34732 case 5:
34733 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34734 args[2].op, args[3].op, args[4].op);
34735 break;
34736 case 6:
34737 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34738 args[2].op, args[3].op, args[4].op,
34739 args[5].op);
34740 break;
34741 default:
34742 gcc_unreachable ();
34745 if (!pat)
34746 return 0;
34748 if (redundant_embed_rnd)
34749 pat = ix86_erase_embedded_rounding (pat);
34751 emit_insn (pat);
34752 return target;
34755 /* Subroutine of ix86_expand_builtin to take care of special insns
34756 with variable number of operands. */
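/* Illustrative use (a sketch; the intrinsic name is assumed from
   smmintrin.h):
     __m128i v = _mm_stream_load_si128 (p);
   goes through __builtin_ia32_movntdqa and is expanded here as a
   single-operand load whose memory operand must be aligned (see the
   CODE_FOR_sse4_1_movntdqa handling below).  */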
34758 static rtx
34759 ix86_expand_special_args_builtin (const struct builtin_description *d,
34760 tree exp, rtx target)
34762 tree arg;
34763 rtx pat, op;
34764 unsigned int i, nargs, arg_adjust, memory;
34765 bool aligned_mem = false;
34766 struct
34768 rtx op;
34769 machine_mode mode;
34770 } args[3];
34771 enum insn_code icode = d->icode;
34772 bool last_arg_constant = false;
34773 const struct insn_data_d *insn_p = &insn_data[icode];
34774 machine_mode tmode = insn_p->operand[0].mode;
34775 enum { load, store } klass;
34777 switch ((enum ix86_builtin_func_type) d->flag)
34779 case VOID_FTYPE_VOID:
34780 emit_insn (GEN_FCN (icode) (target));
34781 return 0;
34782 case VOID_FTYPE_UINT64:
34783 case VOID_FTYPE_UNSIGNED:
34784 nargs = 0;
34785 klass = store;
34786 memory = 0;
34787 break;
34789 case INT_FTYPE_VOID:
34790 case USHORT_FTYPE_VOID:
34791 case UINT64_FTYPE_VOID:
34792 case UNSIGNED_FTYPE_VOID:
34793 nargs = 0;
34794 klass = load;
34795 memory = 0;
34796 break;
34797 case UINT64_FTYPE_PUNSIGNED:
34798 case V2DI_FTYPE_PV2DI:
34799 case V4DI_FTYPE_PV4DI:
34800 case V32QI_FTYPE_PCCHAR:
34801 case V16QI_FTYPE_PCCHAR:
34802 case V8SF_FTYPE_PCV4SF:
34803 case V8SF_FTYPE_PCFLOAT:
34804 case V4SF_FTYPE_PCFLOAT:
34805 case V4DF_FTYPE_PCV2DF:
34806 case V4DF_FTYPE_PCDOUBLE:
34807 case V2DF_FTYPE_PCDOUBLE:
34808 case VOID_FTYPE_PVOID:
34809 case V8DI_FTYPE_PV8DI:
34810 nargs = 1;
34811 klass = load;
34812 memory = 0;
34813 switch (icode)
34815 case CODE_FOR_sse4_1_movntdqa:
34816 case CODE_FOR_avx2_movntdqa:
34817 case CODE_FOR_avx512f_movntdqa:
34818 aligned_mem = true;
34819 break;
34820 default:
34821 break;
34823 break;
34824 case VOID_FTYPE_PV2SF_V4SF:
34825 case VOID_FTYPE_PV8DI_V8DI:
34826 case VOID_FTYPE_PV4DI_V4DI:
34827 case VOID_FTYPE_PV2DI_V2DI:
34828 case VOID_FTYPE_PCHAR_V32QI:
34829 case VOID_FTYPE_PCHAR_V16QI:
34830 case VOID_FTYPE_PFLOAT_V16SF:
34831 case VOID_FTYPE_PFLOAT_V8SF:
34832 case VOID_FTYPE_PFLOAT_V4SF:
34833 case VOID_FTYPE_PDOUBLE_V8DF:
34834 case VOID_FTYPE_PDOUBLE_V4DF:
34835 case VOID_FTYPE_PDOUBLE_V2DF:
34836 case VOID_FTYPE_PLONGLONG_LONGLONG:
34837 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34838 case VOID_FTYPE_PINT_INT:
34839 nargs = 1;
34840 klass = store;
34841 /* Reserve memory operand for target. */
34842 memory = ARRAY_SIZE (args);
34843 switch (icode)
34845 /* These builtins and instructions require the memory
34846 to be properly aligned. */
34847 case CODE_FOR_avx_movntv4di:
34848 case CODE_FOR_sse2_movntv2di:
34849 case CODE_FOR_avx_movntv8sf:
34850 case CODE_FOR_sse_movntv4sf:
34851 case CODE_FOR_sse4a_vmmovntv4sf:
34852 case CODE_FOR_avx_movntv4df:
34853 case CODE_FOR_sse2_movntv2df:
34854 case CODE_FOR_sse4a_vmmovntv2df:
34855 case CODE_FOR_sse2_movntidi:
34856 case CODE_FOR_sse_movntq:
34857 case CODE_FOR_sse2_movntisi:
34858 case CODE_FOR_avx512f_movntv16sf:
34859 case CODE_FOR_avx512f_movntv8df:
34860 case CODE_FOR_avx512f_movntv8di:
34861 aligned_mem = true;
34862 break;
34863 default:
34864 break;
34866 break;
34867 case V4SF_FTYPE_V4SF_PCV2SF:
34868 case V2DF_FTYPE_V2DF_PCDOUBLE:
34869 nargs = 2;
34870 klass = load;
34871 memory = 1;
34872 break;
34873 case V8SF_FTYPE_PCV8SF_V8SI:
34874 case V4DF_FTYPE_PCV4DF_V4DI:
34875 case V4SF_FTYPE_PCV4SF_V4SI:
34876 case V2DF_FTYPE_PCV2DF_V2DI:
34877 case V8SI_FTYPE_PCV8SI_V8SI:
34878 case V4DI_FTYPE_PCV4DI_V4DI:
34879 case V4SI_FTYPE_PCV4SI_V4SI:
34880 case V2DI_FTYPE_PCV2DI_V2DI:
34881 case VOID_FTYPE_INT_INT64:
34882 nargs = 2;
34883 klass = load;
34884 memory = 0;
34885 break;
34886 case VOID_FTYPE_PV8DF_V8DF_UQI:
34887 case VOID_FTYPE_PV4DF_V4DF_UQI:
34888 case VOID_FTYPE_PV2DF_V2DF_UQI:
34889 case VOID_FTYPE_PV16SF_V16SF_UHI:
34890 case VOID_FTYPE_PV8SF_V8SF_UQI:
34891 case VOID_FTYPE_PV4SF_V4SF_UQI:
34892 case VOID_FTYPE_PV8DI_V8DI_UQI:
34893 case VOID_FTYPE_PV4DI_V4DI_UQI:
34894 case VOID_FTYPE_PV2DI_V2DI_UQI:
34895 case VOID_FTYPE_PV16SI_V16SI_UHI:
34896 case VOID_FTYPE_PV8SI_V8SI_UQI:
34897 case VOID_FTYPE_PV4SI_V4SI_UQI:
34898 case VOID_FTYPE_PV64QI_V64QI_UDI:
34899 case VOID_FTYPE_PV32HI_V32HI_USI:
34900 case VOID_FTYPE_PV32QI_V32QI_USI:
34901 case VOID_FTYPE_PV16QI_V16QI_UHI:
34902 case VOID_FTYPE_PV16HI_V16HI_UHI:
34903 case VOID_FTYPE_PV8HI_V8HI_UQI:
34904 switch (icode)
34906 /* These builtins and instructions require the memory
34907 to be properly aligned. */
34908 case CODE_FOR_avx512f_storev16sf_mask:
34909 case CODE_FOR_avx512f_storev16si_mask:
34910 case CODE_FOR_avx512f_storev8df_mask:
34911 case CODE_FOR_avx512f_storev8di_mask:
34912 case CODE_FOR_avx512vl_storev8sf_mask:
34913 case CODE_FOR_avx512vl_storev8si_mask:
34914 case CODE_FOR_avx512vl_storev4df_mask:
34915 case CODE_FOR_avx512vl_storev4di_mask:
34916 case CODE_FOR_avx512vl_storev4sf_mask:
34917 case CODE_FOR_avx512vl_storev4si_mask:
34918 case CODE_FOR_avx512vl_storev2df_mask:
34919 case CODE_FOR_avx512vl_storev2di_mask:
34920 aligned_mem = true;
34921 break;
34922 default:
34923 break;
34925 /* FALLTHRU */
34926 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34927 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34928 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34929 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34930 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34931 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34932 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34933 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34934 case VOID_FTYPE_PV8SI_V8DI_UQI:
34935 case VOID_FTYPE_PV8HI_V8DI_UQI:
34936 case VOID_FTYPE_PV16HI_V16SI_UHI:
34937 case VOID_FTYPE_PV16QI_V8DI_UQI:
34938 case VOID_FTYPE_PV16QI_V16SI_UHI:
34939 case VOID_FTYPE_PV4SI_V4DI_UQI:
34940 case VOID_FTYPE_PV4SI_V2DI_UQI:
34941 case VOID_FTYPE_PV8HI_V4DI_UQI:
34942 case VOID_FTYPE_PV8HI_V2DI_UQI:
34943 case VOID_FTYPE_PV8HI_V8SI_UQI:
34944 case VOID_FTYPE_PV8HI_V4SI_UQI:
34945 case VOID_FTYPE_PV16QI_V4DI_UQI:
34946 case VOID_FTYPE_PV16QI_V2DI_UQI:
34947 case VOID_FTYPE_PV16QI_V8SI_UQI:
34948 case VOID_FTYPE_PV16QI_V4SI_UQI:
34949 case VOID_FTYPE_PCHAR_V64QI_UDI:
34950 case VOID_FTYPE_PCHAR_V32QI_USI:
34951 case VOID_FTYPE_PCHAR_V16QI_UHI:
34952 case VOID_FTYPE_PSHORT_V32HI_USI:
34953 case VOID_FTYPE_PSHORT_V16HI_UHI:
34954 case VOID_FTYPE_PSHORT_V8HI_UQI:
34955 case VOID_FTYPE_PINT_V16SI_UHI:
34956 case VOID_FTYPE_PINT_V8SI_UQI:
34957 case VOID_FTYPE_PINT_V4SI_UQI:
34958 case VOID_FTYPE_PINT64_V8DI_UQI:
34959 case VOID_FTYPE_PINT64_V4DI_UQI:
34960 case VOID_FTYPE_PINT64_V2DI_UQI:
34961 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34962 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34963 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34964 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34965 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34966 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34967 case VOID_FTYPE_PV32QI_V32HI_USI:
34968 case VOID_FTYPE_PV16QI_V16HI_UHI:
34969 case VOID_FTYPE_PV8QI_V8HI_UQI:
34970 nargs = 2;
34971 klass = store;
34972 /* Reserve memory operand for target. */
34973 memory = ARRAY_SIZE (args);
34974 break;
34975 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34976 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34977 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34978 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34979 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34980 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34981 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34982 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34983 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34984 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34985 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34986 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34987 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
34988 case V32HI_FTYPE_PCV32HI_V32HI_USI:
34989 case V32QI_FTYPE_PCV32QI_V32QI_USI:
34990 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
34991 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
34992 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
34993 switch (icode)
34995 /* These builtins and instructions require the memory
34996 to be properly aligned. */
34997 case CODE_FOR_avx512f_loadv16sf_mask:
34998 case CODE_FOR_avx512f_loadv16si_mask:
34999 case CODE_FOR_avx512f_loadv8df_mask:
35000 case CODE_FOR_avx512f_loadv8di_mask:
35001 case CODE_FOR_avx512vl_loadv8sf_mask:
35002 case CODE_FOR_avx512vl_loadv8si_mask:
35003 case CODE_FOR_avx512vl_loadv4df_mask:
35004 case CODE_FOR_avx512vl_loadv4di_mask:
35005 case CODE_FOR_avx512vl_loadv4sf_mask:
35006 case CODE_FOR_avx512vl_loadv4si_mask:
35007 case CODE_FOR_avx512vl_loadv2df_mask:
35008 case CODE_FOR_avx512vl_loadv2di_mask:
35009 case CODE_FOR_avx512bw_loadv64qi_mask:
35010 case CODE_FOR_avx512vl_loadv32qi_mask:
35011 case CODE_FOR_avx512vl_loadv16qi_mask:
35012 case CODE_FOR_avx512bw_loadv32hi_mask:
35013 case CODE_FOR_avx512vl_loadv16hi_mask:
35014 case CODE_FOR_avx512vl_loadv8hi_mask:
35015 aligned_mem = true;
35016 break;
35017 default:
35018 break;
35020 /* FALLTHRU */
35021 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35022 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35023 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35024 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35025 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35026 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35027 case V16SI_FTYPE_PCINT_V16SI_UHI:
35028 case V8SI_FTYPE_PCINT_V8SI_UQI:
35029 case V4SI_FTYPE_PCINT_V4SI_UQI:
35030 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35031 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35032 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35033 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35034 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35035 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35036 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35037 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35038 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35039 nargs = 3;
35040 klass = load;
35041 memory = 0;
35042 break;
35043 case VOID_FTYPE_UINT_UINT_UINT:
35044 case VOID_FTYPE_UINT64_UINT_UINT:
35045 case UCHAR_FTYPE_UINT_UINT_UINT:
35046 case UCHAR_FTYPE_UINT64_UINT_UINT:
35047 nargs = 3;
35048 klass = load;
35049 memory = ARRAY_SIZE (args);
35050 last_arg_constant = true;
35051 break;
35052 default:
35053 gcc_unreachable ();
35056 gcc_assert (nargs <= ARRAY_SIZE (args));
35058 if (klass == store)
35060 arg = CALL_EXPR_ARG (exp, 0);
35061 op = expand_normal (arg);
35062 gcc_assert (target == 0);
35063 if (memory)
35065 op = ix86_zero_extend_to_Pmode (op);
35066 target = gen_rtx_MEM (tmode, op);
35067 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35068 on it. Try to improve it using get_pointer_alignment,
35069 and if the special builtin is one that requires strict
35070 mode alignment, also from its GET_MODE_ALIGNMENT.
35071 Failure to do so could lead to ix86_legitimate_combined_insn
35072 rejecting all changes to such insns. */
35073 unsigned int align = get_pointer_alignment (arg);
35074 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35075 align = GET_MODE_ALIGNMENT (tmode);
35076 if (MEM_ALIGN (target) < align)
35077 set_mem_align (target, align);
35079 else
35080 target = force_reg (tmode, op);
35081 arg_adjust = 1;
35083 else
35085 arg_adjust = 0;
35086 if (optimize
35087 || target == 0
35088 || !register_operand (target, tmode)
35089 || GET_MODE (target) != tmode)
35090 target = gen_reg_rtx (tmode);
35093 for (i = 0; i < nargs; i++)
35095 machine_mode mode = insn_p->operand[i + 1].mode;
35096 bool match;
35098 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35099 op = expand_normal (arg);
35100 match = insn_p->operand[i + 1].predicate (op, mode);
35102 if (last_arg_constant && (i + 1) == nargs)
35104 if (!match)
35106 if (icode == CODE_FOR_lwp_lwpvalsi3
35107 || icode == CODE_FOR_lwp_lwpinssi3
35108 || icode == CODE_FOR_lwp_lwpvaldi3
35109 || icode == CODE_FOR_lwp_lwpinsdi3)
35110 error ("the last argument must be a 32-bit immediate");
35111 else
35112 error ("the last argument must be an 8-bit immediate");
35113 return const0_rtx;
35116 else
35118 if (i == memory)
35120 /* This must be the memory operand. */
35121 op = ix86_zero_extend_to_Pmode (op);
35122 op = gen_rtx_MEM (mode, op);
35123 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35124 on it. Try to improve it using get_pointer_alignment,
35125 and if the special builtin is one that requires strict
35126 mode alignment, also from its GET_MODE_ALIGNMENT.
35127 Failure to do so could lead to ix86_legitimate_combined_insn
35128 rejecting all changes to such insns. */
35129 unsigned int align = get_pointer_alignment (arg);
35130 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35131 align = GET_MODE_ALIGNMENT (mode);
35132 if (MEM_ALIGN (op) < align)
35133 set_mem_align (op, align);
35135 else
35137 /* This must be a register. */
35138 if (VECTOR_MODE_P (mode))
35139 op = safe_vector_operand (op, mode);
35141 op = fixup_modeless_constant (op, mode);
35143 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35144 op = copy_to_mode_reg (mode, op);
35145 else
35147 op = copy_to_reg (op);
35148 op = lowpart_subreg (mode, op, GET_MODE (op));
35153 args[i].op = op;
35154 args[i].mode = mode;
35157 switch (nargs)
35159 case 0:
35160 pat = GEN_FCN (icode) (target);
35161 break;
35162 case 1:
35163 pat = GEN_FCN (icode) (target, args[0].op);
35164 break;
35165 case 2:
35166 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35167 break;
35168 case 3:
35169 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35170 break;
35171 default:
35172 gcc_unreachable ();
35175 if (! pat)
35176 return 0;
35177 emit_insn (pat);
35178 return klass == store ? 0 : target;
35181 /* Return the integer constant in ARG. Constrain it to be in the range
35182 of the subparts of VEC_TYPE; issue an error if not. */
35184 static int
35185 get_element_number (tree vec_type, tree arg)
35187 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35189 if (!tree_fits_uhwi_p (arg)
35190 || (elt = tree_to_uhwi (arg), elt > max))
35192 error ("selector must be an integer constant in the range 0..%wi", max);
35193 return 0;
35196 return elt;
35199 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35200 ix86_expand_vector_init. We DO have language-level syntax for this, in
35201 the form of (type){ init-list }. Except that since we can't place emms
35202 instructions from inside the compiler, we can't allow the use of MMX
35203 registers unless the user explicitly asks for it. So we do *not* define
35204 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
35205 we have builtins invoked by mmintrin.h that give us license to emit
35206 these sorts of instructions. */
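/* Illustrative use (a sketch; the intrinsic name is assumed from
   mmintrin.h):
     __m64 v = _mm_setr_pi32 (1, 2);
   is implemented on top of __builtin_ia32_vec_init_v2si and is
   dispatched to this function from ix86_expand_builtin below.  */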
35208 static rtx
35209 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35211 machine_mode tmode = TYPE_MODE (type);
35212 machine_mode inner_mode = GET_MODE_INNER (tmode);
35213 int i, n_elt = GET_MODE_NUNITS (tmode);
35214 rtvec v = rtvec_alloc (n_elt);
35216 gcc_assert (VECTOR_MODE_P (tmode));
35217 gcc_assert (call_expr_nargs (exp) == n_elt);
35219 for (i = 0; i < n_elt; ++i)
35221 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35222 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35225 if (!target || !register_operand (target, tmode))
35226 target = gen_reg_rtx (tmode);
35228 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35229 return target;
35232 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35233 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35234 had a language-level syntax for referencing vector elements. */
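/* Illustrative use (a sketch; the intrinsic name is assumed from
   emmintrin.h):
     int e = _mm_extract_epi16 (x, 3);
   is implemented on top of __builtin_ia32_vec_ext_v8hi and ends up
   here, with the selector checked by get_element_number above.  */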
35236 static rtx
35237 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35239 machine_mode tmode, mode0;
35240 tree arg0, arg1;
35241 int elt;
35242 rtx op0;
35244 arg0 = CALL_EXPR_ARG (exp, 0);
35245 arg1 = CALL_EXPR_ARG (exp, 1);
35247 op0 = expand_normal (arg0);
35248 elt = get_element_number (TREE_TYPE (arg0), arg1);
35250 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35251 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35252 gcc_assert (VECTOR_MODE_P (mode0));
35254 op0 = force_reg (mode0, op0);
35256 if (optimize || !target || !register_operand (target, tmode))
35257 target = gen_reg_rtx (tmode);
35259 ix86_expand_vector_extract (true, target, op0, elt);
35261 return target;
35264 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35265 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35266 a language-level syntax for referencing vector elements. */
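/* Illustrative use (a sketch; the intrinsic name is assumed from
   emmintrin.h):
     __m128i r = _mm_insert_epi16 (x, 42, 2);
   is implemented on top of __builtin_ia32_vec_set_v8hi and reaches
   this function, which copies X and sets element 2 in the copy.  */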
35268 static rtx
35269 ix86_expand_vec_set_builtin (tree exp)
35271 machine_mode tmode, mode1;
35272 tree arg0, arg1, arg2;
35273 int elt;
35274 rtx op0, op1, target;
35276 arg0 = CALL_EXPR_ARG (exp, 0);
35277 arg1 = CALL_EXPR_ARG (exp, 1);
35278 arg2 = CALL_EXPR_ARG (exp, 2);
35280 tmode = TYPE_MODE (TREE_TYPE (arg0));
35281 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35282 gcc_assert (VECTOR_MODE_P (tmode));
35284 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35285 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35286 elt = get_element_number (TREE_TYPE (arg0), arg2);
35288 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35289 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35291 op0 = force_reg (tmode, op0);
35292 op1 = force_reg (mode1, op1);
35294 /* OP0 is the source of these builtin functions and shouldn't be
35295 modified. Create a copy, use it and return it as target. */
35296 target = gen_reg_rtx (tmode);
35297 emit_move_insn (target, op0);
35298 ix86_expand_vector_set (true, target, op1, elt);
35300 return target;
35303 /* Emit conditional move of SRC to DST with condition
35304 OP1 CODE OP2. */
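/* Roughly equivalent to:
     if (op1 CODE op2)
       dst = src;
   using CMOV when available and a compare-and-branch fallback
   otherwise.  */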
35305 static void
35306 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35308 rtx t;
35310 if (TARGET_CMOVE)
35312 t = ix86_expand_compare (code, op1, op2);
35313 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35314 src, dst)));
35316 else
35318 rtx_code_label *nomove = gen_label_rtx ();
35319 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35320 const0_rtx, GET_MODE (op1), 1, nomove);
35321 emit_move_insn (dst, src);
35322 emit_label (nomove);
35326 /* Choose the unsigned max of DST and SRC and put it in DST. */
35327 static void
35328 ix86_emit_move_max (rtx dst, rtx src)
35330 ix86_emit_cmove (dst, src, LTU, dst, src);
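/* This unsigned max is what the MPX bound builtins below rely on:
   lower bounds are compared directly, and upper bounds are kept in
   one's complement form, so taking the maximum tightens both.  */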
35333 /* Expand an expression EXP that calls a built-in function,
35334 with result going to TARGET if that's convenient
35335 (and in mode MODE if that's convenient).
35336 SUBTARGET may be used as the target for computing one of EXP's operands.
35337 IGNORE is nonzero if the value is to be ignored. */
35339 static rtx
35340 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35341 machine_mode mode, int ignore)
35343 size_t i;
35344 enum insn_code icode, icode2;
35345 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35346 tree arg0, arg1, arg2, arg3, arg4;
35347 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35348 machine_mode mode0, mode1, mode2, mode3, mode4;
35349 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35351 /* For CPU builtins that can be folded, fold first and expand the fold. */
35352 switch (fcode)
35354 case IX86_BUILTIN_CPU_INIT:
35356 /* Make it call __cpu_indicator_init in libgcc. */
35357 tree call_expr, fndecl, type;
35358 type = build_function_type_list (integer_type_node, NULL_TREE);
35359 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35360 call_expr = build_call_expr (fndecl, 0);
35361 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35363 case IX86_BUILTIN_CPU_IS:
35364 case IX86_BUILTIN_CPU_SUPPORTS:
35366 tree arg0 = CALL_EXPR_ARG (exp, 0);
35367 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35368 gcc_assert (fold_expr != NULL_TREE);
35369 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
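/* Illustrative use (a sketch):
     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       ...
   CPU_IS and CPU_SUPPORTS are folded by fold_builtin_cpu and only
   the folded expression is expanded.  */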
35373 /* Determine whether the builtin function is available under the current ISA.
35374 Originally the builtin was not created if it wasn't applicable to the
35375 current ISA based on the command line switches. With function specific
35376 options, we need to check in the context of the function making the call
35377 whether it is supported. Treat AVX512VL and MMX specially. For other flags,
35378 if isa includes more than one ISA bit, treat those as requiring any
35379 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
35380 ISAs. Likewise for MMX, require both MMX and the non-MMX ISAs.
35381 Similarly for 64BIT, but we shouldn't be building such builtins
35382 at all, -m64 is a whole TU option. */
35383 if (((ix86_builtins_isa[fcode].isa
35384 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35385 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI))
35386 && !(ix86_builtins_isa[fcode].isa
35387 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35388 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI)
35389 & ix86_isa_flags))
35390 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35391 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35392 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_GFNI)
35393 && !(ix86_isa_flags & OPTION_MASK_ISA_GFNI))
35394 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35395 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35396 || (ix86_builtins_isa[fcode].isa2
35397 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35399 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35400 ix86_builtins_isa[fcode].isa2, 0, 0,
35401 NULL, NULL, (enum fpmath_unit) 0,
35402 false);
35403 if (!opts)
35404 error ("%qE needs unknown isa option", fndecl);
35405 else
35407 gcc_assert (opts != NULL);
35408 error ("%qE needs isa option %s", fndecl, opts);
35409 free (opts);
35411 return expand_call (exp, target, ignore);
35414 switch (fcode)
35416 case IX86_BUILTIN_BNDMK:
35417 if (!target
35418 || GET_MODE (target) != BNDmode
35419 || !register_operand (target, BNDmode))
35420 target = gen_reg_rtx (BNDmode);
35422 arg0 = CALL_EXPR_ARG (exp, 0);
35423 arg1 = CALL_EXPR_ARG (exp, 1);
35425 op0 = expand_normal (arg0);
35426 op1 = expand_normal (arg1);
35428 if (!register_operand (op0, Pmode))
35429 op0 = ix86_zero_extend_to_Pmode (op0);
35430 if (!register_operand (op1, Pmode))
35431 op1 = ix86_zero_extend_to_Pmode (op1);
35433 /* Builtin arg1 is the size of the block, but instruction op1
35434 should be (size - 1). */
35435 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35436 NULL_RTX, 1, OPTAB_DIRECT);
35438 emit_insn (BNDmode == BND64mode
35439 ? gen_bnd64_mk (target, op0, op1)
35440 : gen_bnd32_mk (target, op0, op1));
35441 return target;
35443 case IX86_BUILTIN_BNDSTX:
35444 arg0 = CALL_EXPR_ARG (exp, 0);
35445 arg1 = CALL_EXPR_ARG (exp, 1);
35446 arg2 = CALL_EXPR_ARG (exp, 2);
35448 op0 = expand_normal (arg0);
35449 op1 = expand_normal (arg1);
35450 op2 = expand_normal (arg2);
35452 if (!register_operand (op0, Pmode))
35453 op0 = ix86_zero_extend_to_Pmode (op0);
35454 if (!register_operand (op1, BNDmode))
35455 op1 = copy_to_mode_reg (BNDmode, op1);
35456 if (!register_operand (op2, Pmode))
35457 op2 = ix86_zero_extend_to_Pmode (op2);
35459 emit_insn (BNDmode == BND64mode
35460 ? gen_bnd64_stx (op2, op0, op1)
35461 : gen_bnd32_stx (op2, op0, op1));
35462 return 0;
35464 case IX86_BUILTIN_BNDLDX:
35465 if (!target
35466 || GET_MODE (target) != BNDmode
35467 || !register_operand (target, BNDmode))
35468 target = gen_reg_rtx (BNDmode);
35470 arg0 = CALL_EXPR_ARG (exp, 0);
35471 arg1 = CALL_EXPR_ARG (exp, 1);
35473 op0 = expand_normal (arg0);
35474 op1 = expand_normal (arg1);
35476 if (!register_operand (op0, Pmode))
35477 op0 = ix86_zero_extend_to_Pmode (op0);
35478 if (!register_operand (op1, Pmode))
35479 op1 = ix86_zero_extend_to_Pmode (op1);
35481 emit_insn (BNDmode == BND64mode
35482 ? gen_bnd64_ldx (target, op0, op1)
35483 : gen_bnd32_ldx (target, op0, op1));
35484 return target;
35486 case IX86_BUILTIN_BNDCL:
35487 arg0 = CALL_EXPR_ARG (exp, 0);
35488 arg1 = CALL_EXPR_ARG (exp, 1);
35490 op0 = expand_normal (arg0);
35491 op1 = expand_normal (arg1);
35493 if (!register_operand (op0, Pmode))
35494 op0 = ix86_zero_extend_to_Pmode (op0);
35495 if (!register_operand (op1, BNDmode))
35496 op1 = copy_to_mode_reg (BNDmode, op1);
35498 emit_insn (BNDmode == BND64mode
35499 ? gen_bnd64_cl (op1, op0)
35500 : gen_bnd32_cl (op1, op0));
35501 return 0;
35503 case IX86_BUILTIN_BNDCU:
35504 arg0 = CALL_EXPR_ARG (exp, 0);
35505 arg1 = CALL_EXPR_ARG (exp, 1);
35507 op0 = expand_normal (arg0);
35508 op1 = expand_normal (arg1);
35510 if (!register_operand (op0, Pmode))
35511 op0 = ix86_zero_extend_to_Pmode (op0);
35512 if (!register_operand (op1, BNDmode))
35513 op1 = copy_to_mode_reg (BNDmode, op1);
35515 emit_insn (BNDmode == BND64mode
35516 ? gen_bnd64_cu (op1, op0)
35517 : gen_bnd32_cu (op1, op0));
35518 return 0;
35520 case IX86_BUILTIN_BNDRET:
35521 arg0 = CALL_EXPR_ARG (exp, 0);
35522 target = chkp_get_rtl_bounds (arg0);
35524 /* If no bounds were specified for the returned value,
35525 then use INIT bounds. This usually happens when
35526 some built-in function is expanded. */
35527 if (!target)
35529 rtx t1 = gen_reg_rtx (Pmode);
35530 rtx t2 = gen_reg_rtx (Pmode);
35531 target = gen_reg_rtx (BNDmode);
35532 emit_move_insn (t1, const0_rtx);
35533 emit_move_insn (t2, constm1_rtx);
35534 emit_insn (BNDmode == BND64mode
35535 ? gen_bnd64_mk (target, t1, t2)
35536 : gen_bnd32_mk (target, t1, t2));
35539 gcc_assert (target && REG_P (target));
35540 return target;
35542 case IX86_BUILTIN_BNDNARROW:
35544 rtx m1, m1h1, m1h2, lb, ub, t1;
35546 /* Return value and lb. */
35547 arg0 = CALL_EXPR_ARG (exp, 0);
35548 /* Bounds. */
35549 arg1 = CALL_EXPR_ARG (exp, 1);
35550 /* Size. */
35551 arg2 = CALL_EXPR_ARG (exp, 2);
35553 lb = expand_normal (arg0);
35554 op1 = expand_normal (arg1);
35555 op2 = expand_normal (arg2);
35557 /* The size was passed, but we need to use (size - 1), as for bndmk. */
35558 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35559 NULL_RTX, 1, OPTAB_DIRECT);
35561 /* Add LB to the size and invert the result to get UB. */
35562 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35563 op2, 1, OPTAB_DIRECT);
35564 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35566 if (!register_operand (lb, Pmode))
35567 lb = ix86_zero_extend_to_Pmode (lb);
35568 if (!register_operand (ub, Pmode))
35569 ub = ix86_zero_extend_to_Pmode (ub);
35571 /* We need to move bounds to memory before any computations. */
35572 if (MEM_P (op1))
35573 m1 = op1;
35574 else
35576 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35577 emit_move_insn (m1, op1);
35580 /* Generate mem expression to be used for access to LB and UB. */
35581 m1h1 = adjust_address (m1, Pmode, 0);
35582 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35584 t1 = gen_reg_rtx (Pmode);
35586 /* Compute LB. */
35587 emit_move_insn (t1, m1h1);
35588 ix86_emit_move_max (t1, lb);
35589 emit_move_insn (m1h1, t1);
35591 /* Compute UB. UB is stored in 1's complement form. Therefore
35592 we also use max here. */
35593 emit_move_insn (t1, m1h2);
35594 ix86_emit_move_max (t1, ub);
35595 emit_move_insn (m1h2, t1);
35597 op2 = gen_reg_rtx (BNDmode);
35598 emit_move_insn (op2, m1);
35600 return chkp_join_splitted_slot (lb, op2);
35603 case IX86_BUILTIN_BNDINT:
35605 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35607 if (!target
35608 || GET_MODE (target) != BNDmode
35609 || !register_operand (target, BNDmode))
35610 target = gen_reg_rtx (BNDmode);
35612 arg0 = CALL_EXPR_ARG (exp, 0);
35613 arg1 = CALL_EXPR_ARG (exp, 1);
35615 op0 = expand_normal (arg0);
35616 op1 = expand_normal (arg1);
35618 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35619 rh1 = adjust_address (res, Pmode, 0);
35620 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35622 /* Put first bounds to temporaries. */
35623 lb1 = gen_reg_rtx (Pmode);
35624 ub1 = gen_reg_rtx (Pmode);
35625 if (MEM_P (op0))
35627 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35628 emit_move_insn (ub1, adjust_address (op0, Pmode,
35629 GET_MODE_SIZE (Pmode)));
35631 else
35633 emit_move_insn (res, op0);
35634 emit_move_insn (lb1, rh1);
35635 emit_move_insn (ub1, rh2);
35638 /* Put second bounds to temporaries. */
35639 lb2 = gen_reg_rtx (Pmode);
35640 ub2 = gen_reg_rtx (Pmode);
35641 if (MEM_P (op1))
35643 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35644 emit_move_insn (ub2, adjust_address (op1, Pmode,
35645 GET_MODE_SIZE (Pmode)));
35647 else
35649 emit_move_insn (res, op1);
35650 emit_move_insn (lb2, rh1);
35651 emit_move_insn (ub2, rh2);
35654 /* Compute LB. */
35655 ix86_emit_move_max (lb1, lb2);
35656 emit_move_insn (rh1, lb1);
35658 /* Compute UB. UB is stored in 1's complement form. Therefore
35659 we also use max here. */
35660 ix86_emit_move_max (ub1, ub2);
35661 emit_move_insn (rh2, ub1);
35663 emit_move_insn (target, res);
35665 return target;
35668 case IX86_BUILTIN_SIZEOF:
35670 tree name;
35671 rtx symbol;
35673 if (!target
35674 || GET_MODE (target) != Pmode
35675 || !register_operand (target, Pmode))
35676 target = gen_reg_rtx (Pmode);
35678 arg0 = CALL_EXPR_ARG (exp, 0);
35679 gcc_assert (VAR_P (arg0));
35681 name = DECL_ASSEMBLER_NAME (arg0);
35682 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35684 emit_insn (Pmode == SImode
35685 ? gen_move_size_reloc_si (target, symbol)
35686 : gen_move_size_reloc_di (target, symbol));
35688 return target;
35691 case IX86_BUILTIN_BNDLOWER:
35693 rtx mem, hmem;
35695 if (!target
35696 || GET_MODE (target) != Pmode
35697 || !register_operand (target, Pmode))
35698 target = gen_reg_rtx (Pmode);
35700 arg0 = CALL_EXPR_ARG (exp, 0);
35701 op0 = expand_normal (arg0);
35703 /* We need to move bounds to memory first. */
35704 if (MEM_P (op0))
35705 mem = op0;
35706 else
35708 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35709 emit_move_insn (mem, op0);
35712 /* Generate mem expression to access LB and load it. */
35713 hmem = adjust_address (mem, Pmode, 0);
35714 emit_move_insn (target, hmem);
35716 return target;
35719 case IX86_BUILTIN_BNDUPPER:
35721 rtx mem, hmem, res;
35723 if (!target
35724 || GET_MODE (target) != Pmode
35725 || !register_operand (target, Pmode))
35726 target = gen_reg_rtx (Pmode);
35728 arg0 = CALL_EXPR_ARG (exp, 0);
35729 op0 = expand_normal (arg0);
35731 /* We need to move bounds to memory first. */
35732 if (MEM_P (op0))
35733 mem = op0;
35734 else
35736 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35737 emit_move_insn (mem, op0);
35740 /* Generate mem expression to access UB. */
35741 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35743 /* We need to invert all bits of UB. */
35744 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35746 if (res != target)
35747 emit_move_insn (target, res);
35749 return target;
35752 case IX86_BUILTIN_MASKMOVQ:
35753 case IX86_BUILTIN_MASKMOVDQU:
35754 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35755 ? CODE_FOR_mmx_maskmovq
35756 : CODE_FOR_sse2_maskmovdqu);
35757 /* Note the arg order is different from the operand order. */
35758 arg1 = CALL_EXPR_ARG (exp, 0);
35759 arg2 = CALL_EXPR_ARG (exp, 1);
35760 arg0 = CALL_EXPR_ARG (exp, 2);
35761 op0 = expand_normal (arg0);
35762 op1 = expand_normal (arg1);
35763 op2 = expand_normal (arg2);
35764 mode0 = insn_data[icode].operand[0].mode;
35765 mode1 = insn_data[icode].operand[1].mode;
35766 mode2 = insn_data[icode].operand[2].mode;
35768 op0 = ix86_zero_extend_to_Pmode (op0);
35769 op0 = gen_rtx_MEM (mode1, op0);
35771 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35772 op0 = copy_to_mode_reg (mode0, op0);
35773 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35774 op1 = copy_to_mode_reg (mode1, op1);
35775 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35776 op2 = copy_to_mode_reg (mode2, op2);
35777 pat = GEN_FCN (icode) (op0, op1, op2);
35778 if (! pat)
35779 return 0;
35780 emit_insn (pat);
35781 return 0;
35783 case IX86_BUILTIN_LDMXCSR:
35784 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35785 target = assign_386_stack_local (SImode, SLOT_TEMP);
35786 emit_move_insn (target, op0);
35787 emit_insn (gen_sse_ldmxcsr (target));
35788 return 0;
35790 case IX86_BUILTIN_STMXCSR:
35791 target = assign_386_stack_local (SImode, SLOT_TEMP);
35792 emit_insn (gen_sse_stmxcsr (target));
35793 return copy_to_mode_reg (SImode, target);
35795 case IX86_BUILTIN_CLFLUSH:
35796 arg0 = CALL_EXPR_ARG (exp, 0);
35797 op0 = expand_normal (arg0);
35798 icode = CODE_FOR_sse2_clflush;
35799 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35800 op0 = ix86_zero_extend_to_Pmode (op0);
35802 emit_insn (gen_sse2_clflush (op0));
35803 return 0;
35805 case IX86_BUILTIN_CLWB:
35806 arg0 = CALL_EXPR_ARG (exp, 0);
35807 op0 = expand_normal (arg0);
35808 icode = CODE_FOR_clwb;
35809 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35810 op0 = ix86_zero_extend_to_Pmode (op0);
35812 emit_insn (gen_clwb (op0));
35813 return 0;
35815 case IX86_BUILTIN_CLFLUSHOPT:
35816 arg0 = CALL_EXPR_ARG (exp, 0);
35817 op0 = expand_normal (arg0);
35818 icode = CODE_FOR_clflushopt;
35819 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35820 op0 = ix86_zero_extend_to_Pmode (op0);
35822 emit_insn (gen_clflushopt (op0));
35823 return 0;
35825 case IX86_BUILTIN_MONITOR:
35826 case IX86_BUILTIN_MONITORX:
35827 arg0 = CALL_EXPR_ARG (exp, 0);
35828 arg1 = CALL_EXPR_ARG (exp, 1);
35829 arg2 = CALL_EXPR_ARG (exp, 2);
35830 op0 = expand_normal (arg0);
35831 op1 = expand_normal (arg1);
35832 op2 = expand_normal (arg2);
35833 if (!REG_P (op0))
35834 op0 = ix86_zero_extend_to_Pmode (op0);
35835 if (!REG_P (op1))
35836 op1 = copy_to_mode_reg (SImode, op1);
35837 if (!REG_P (op2))
35838 op2 = copy_to_mode_reg (SImode, op2);
35840 emit_insn (fcode == IX86_BUILTIN_MONITOR
35841 ? ix86_gen_monitor (op0, op1, op2)
35842 : ix86_gen_monitorx (op0, op1, op2));
35843 return 0;
35845 case IX86_BUILTIN_MWAIT:
35846 arg0 = CALL_EXPR_ARG (exp, 0);
35847 arg1 = CALL_EXPR_ARG (exp, 1);
35848 op0 = expand_normal (arg0);
35849 op1 = expand_normal (arg1);
35850 if (!REG_P (op0))
35851 op0 = copy_to_mode_reg (SImode, op0);
35852 if (!REG_P (op1))
35853 op1 = copy_to_mode_reg (SImode, op1);
35854 emit_insn (gen_sse3_mwait (op0, op1));
35855 return 0;
35857 case IX86_BUILTIN_MWAITX:
35858 arg0 = CALL_EXPR_ARG (exp, 0);
35859 arg1 = CALL_EXPR_ARG (exp, 1);
35860 arg2 = CALL_EXPR_ARG (exp, 2);
35861 op0 = expand_normal (arg0);
35862 op1 = expand_normal (arg1);
35863 op2 = expand_normal (arg2);
35864 if (!REG_P (op0))
35865 op0 = copy_to_mode_reg (SImode, op0);
35866 if (!REG_P (op1))
35867 op1 = copy_to_mode_reg (SImode, op1);
35868 if (!REG_P (op2))
35869 op2 = copy_to_mode_reg (SImode, op2);
35870 emit_insn (gen_mwaitx (op0, op1, op2));
35871 return 0;
35873 case IX86_BUILTIN_CLZERO:
35874 arg0 = CALL_EXPR_ARG (exp, 0);
35875 op0 = expand_normal (arg0);
35876 if (!REG_P (op0))
35877 op0 = ix86_zero_extend_to_Pmode (op0);
35878 emit_insn (ix86_gen_clzero (op0));
35879 return 0;
35881 case IX86_BUILTIN_VEC_INIT_V2SI:
35882 case IX86_BUILTIN_VEC_INIT_V4HI:
35883 case IX86_BUILTIN_VEC_INIT_V8QI:
35884 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35886 case IX86_BUILTIN_VEC_EXT_V2DF:
35887 case IX86_BUILTIN_VEC_EXT_V2DI:
35888 case IX86_BUILTIN_VEC_EXT_V4SF:
35889 case IX86_BUILTIN_VEC_EXT_V4SI:
35890 case IX86_BUILTIN_VEC_EXT_V8HI:
35891 case IX86_BUILTIN_VEC_EXT_V2SI:
35892 case IX86_BUILTIN_VEC_EXT_V4HI:
35893 case IX86_BUILTIN_VEC_EXT_V16QI:
35894 return ix86_expand_vec_ext_builtin (exp, target);
35896 case IX86_BUILTIN_VEC_SET_V2DI:
35897 case IX86_BUILTIN_VEC_SET_V4SF:
35898 case IX86_BUILTIN_VEC_SET_V4SI:
35899 case IX86_BUILTIN_VEC_SET_V8HI:
35900 case IX86_BUILTIN_VEC_SET_V4HI:
35901 case IX86_BUILTIN_VEC_SET_V16QI:
35902 return ix86_expand_vec_set_builtin (exp);
35904 case IX86_BUILTIN_NANQ:
35905 case IX86_BUILTIN_NANSQ:
35906 return expand_call (exp, target, ignore);
35908 case IX86_BUILTIN_RDPMC:
35909 case IX86_BUILTIN_RDTSC:
35910 case IX86_BUILTIN_RDTSCP:
35911 case IX86_BUILTIN_XGETBV:
35913 op0 = gen_reg_rtx (DImode);
35914 op1 = gen_reg_rtx (DImode);
35916 if (fcode == IX86_BUILTIN_RDPMC)
35918 arg0 = CALL_EXPR_ARG (exp, 0);
35919 op2 = expand_normal (arg0);
35920 if (!register_operand (op2, SImode))
35921 op2 = copy_to_mode_reg (SImode, op2);
35923 insn = (TARGET_64BIT
35924 ? gen_rdpmc_rex64 (op0, op1, op2)
35925 : gen_rdpmc (op0, op2));
35926 emit_insn (insn);
35928 else if (fcode == IX86_BUILTIN_XGETBV)
35930 arg0 = CALL_EXPR_ARG (exp, 0);
35931 op2 = expand_normal (arg0);
35932 if (!register_operand (op2, SImode))
35933 op2 = copy_to_mode_reg (SImode, op2);
35935 insn = (TARGET_64BIT
35936 ? gen_xgetbv_rex64 (op0, op1, op2)
35937 : gen_xgetbv (op0, op2));
35938 emit_insn (insn);
35940 else if (fcode == IX86_BUILTIN_RDTSC)
35942 insn = (TARGET_64BIT
35943 ? gen_rdtsc_rex64 (op0, op1)
35944 : gen_rdtsc (op0));
35945 emit_insn (insn);
35947 else
35949 op2 = gen_reg_rtx (SImode);
35951 insn = (TARGET_64BIT
35952 ? gen_rdtscp_rex64 (op0, op1, op2)
35953 : gen_rdtscp (op0, op2));
35954 emit_insn (insn);
35956 arg0 = CALL_EXPR_ARG (exp, 0);
35957 op4 = expand_normal (arg0);
35958 if (!address_operand (op4, VOIDmode))
35960 op4 = convert_memory_address (Pmode, op4);
35961 op4 = copy_addr_to_reg (op4);
35963 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35966 if (target == 0)
35968 /* mode is VOIDmode if __builtin_rd* has been called
35969 without lhs. */
35970 if (mode == VOIDmode)
35971 return target;
35972 target = gen_reg_rtx (mode);
35975 if (TARGET_64BIT)
35977 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35978 op1, 1, OPTAB_DIRECT);
35979 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35980 op0, 1, OPTAB_DIRECT);
35983 emit_move_insn (target, op0);
35984 return target;
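/* Illustrative use (a sketch; __rdtsc is assumed from ia32intrin.h):
     unsigned long long t = __rdtsc ();
   maps to IX86_BUILTIN_RDTSC; on 64-bit targets the two DImode
   result registers are recombined by the shift/IOR sequence above.  */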
35986 case IX86_BUILTIN_FXSAVE:
35987 case IX86_BUILTIN_FXRSTOR:
35988 case IX86_BUILTIN_FXSAVE64:
35989 case IX86_BUILTIN_FXRSTOR64:
35990 case IX86_BUILTIN_FNSTENV:
35991 case IX86_BUILTIN_FLDENV:
35992 mode0 = BLKmode;
35993 switch (fcode)
35995 case IX86_BUILTIN_FXSAVE:
35996 icode = CODE_FOR_fxsave;
35997 break;
35998 case IX86_BUILTIN_FXRSTOR:
35999 icode = CODE_FOR_fxrstor;
36000 break;
36001 case IX86_BUILTIN_FXSAVE64:
36002 icode = CODE_FOR_fxsave64;
36003 break;
36004 case IX86_BUILTIN_FXRSTOR64:
36005 icode = CODE_FOR_fxrstor64;
36006 break;
36007 case IX86_BUILTIN_FNSTENV:
36008 icode = CODE_FOR_fnstenv;
36009 break;
36010 case IX86_BUILTIN_FLDENV:
36011 icode = CODE_FOR_fldenv;
36012 break;
36013 default:
36014 gcc_unreachable ();
36017 arg0 = CALL_EXPR_ARG (exp, 0);
36018 op0 = expand_normal (arg0);
36020 if (!address_operand (op0, VOIDmode))
36022 op0 = convert_memory_address (Pmode, op0);
36023 op0 = copy_addr_to_reg (op0);
36025 op0 = gen_rtx_MEM (mode0, op0);
36027 pat = GEN_FCN (icode) (op0);
36028 if (pat)
36029 emit_insn (pat);
36030 return 0;
36032 case IX86_BUILTIN_XSETBV:
36033 arg0 = CALL_EXPR_ARG (exp, 0);
36034 arg1 = CALL_EXPR_ARG (exp, 1);
36035 op0 = expand_normal (arg0);
36036 op1 = expand_normal (arg1);
36038 if (!REG_P (op0))
36039 op0 = copy_to_mode_reg (SImode, op0);
36041 if (TARGET_64BIT)
36043 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36044 NULL, 1, OPTAB_DIRECT);
36046 op2 = gen_lowpart (SImode, op2);
36047 op1 = gen_lowpart (SImode, op1);
36048 if (!REG_P (op1))
36049 op1 = copy_to_mode_reg (SImode, op1);
36050 if (!REG_P (op2))
36051 op2 = copy_to_mode_reg (SImode, op2);
36052 icode = CODE_FOR_xsetbv_rex64;
36053 pat = GEN_FCN (icode) (op0, op1, op2);
36055 else
36057 if (!REG_P (op1))
36058 op1 = copy_to_mode_reg (DImode, op1);
36059 icode = CODE_FOR_xsetbv;
36060 pat = GEN_FCN (icode) (op0, op1);
36062 if (pat)
36063 emit_insn (pat);
36064 return 0;
36066 case IX86_BUILTIN_XSAVE:
36067 case IX86_BUILTIN_XRSTOR:
36068 case IX86_BUILTIN_XSAVE64:
36069 case IX86_BUILTIN_XRSTOR64:
36070 case IX86_BUILTIN_XSAVEOPT:
36071 case IX86_BUILTIN_XSAVEOPT64:
36072 case IX86_BUILTIN_XSAVES:
36073 case IX86_BUILTIN_XRSTORS:
36074 case IX86_BUILTIN_XSAVES64:
36075 case IX86_BUILTIN_XRSTORS64:
36076 case IX86_BUILTIN_XSAVEC:
36077 case IX86_BUILTIN_XSAVEC64:
36078 arg0 = CALL_EXPR_ARG (exp, 0);
36079 arg1 = CALL_EXPR_ARG (exp, 1);
36080 op0 = expand_normal (arg0);
36081 op1 = expand_normal (arg1);
36083 if (!address_operand (op0, VOIDmode))
36085 op0 = convert_memory_address (Pmode, op0);
36086 op0 = copy_addr_to_reg (op0);
36088 op0 = gen_rtx_MEM (BLKmode, op0);
36090 op1 = force_reg (DImode, op1);
36092 if (TARGET_64BIT)
36094 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36095 NULL, 1, OPTAB_DIRECT);
36096 switch (fcode)
36098 case IX86_BUILTIN_XSAVE:
36099 icode = CODE_FOR_xsave_rex64;
36100 break;
36101 case IX86_BUILTIN_XRSTOR:
36102 icode = CODE_FOR_xrstor_rex64;
36103 break;
36104 case IX86_BUILTIN_XSAVE64:
36105 icode = CODE_FOR_xsave64;
36106 break;
36107 case IX86_BUILTIN_XRSTOR64:
36108 icode = CODE_FOR_xrstor64;
36109 break;
36110 case IX86_BUILTIN_XSAVEOPT:
36111 icode = CODE_FOR_xsaveopt_rex64;
36112 break;
36113 case IX86_BUILTIN_XSAVEOPT64:
36114 icode = CODE_FOR_xsaveopt64;
36115 break;
36116 case IX86_BUILTIN_XSAVES:
36117 icode = CODE_FOR_xsaves_rex64;
36118 break;
36119 case IX86_BUILTIN_XRSTORS:
36120 icode = CODE_FOR_xrstors_rex64;
36121 break;
36122 case IX86_BUILTIN_XSAVES64:
36123 icode = CODE_FOR_xsaves64;
36124 break;
36125 case IX86_BUILTIN_XRSTORS64:
36126 icode = CODE_FOR_xrstors64;
36127 break;
36128 case IX86_BUILTIN_XSAVEC:
36129 icode = CODE_FOR_xsavec_rex64;
36130 break;
36131 case IX86_BUILTIN_XSAVEC64:
36132 icode = CODE_FOR_xsavec64;
36133 break;
36134 default:
36135 gcc_unreachable ();
36138 op2 = gen_lowpart (SImode, op2);
36139 op1 = gen_lowpart (SImode, op1);
36140 pat = GEN_FCN (icode) (op0, op1, op2);
36142 else
36144 switch (fcode)
36146 case IX86_BUILTIN_XSAVE:
36147 icode = CODE_FOR_xsave;
36148 break;
36149 case IX86_BUILTIN_XRSTOR:
36150 icode = CODE_FOR_xrstor;
36151 break;
36152 case IX86_BUILTIN_XSAVEOPT:
36153 icode = CODE_FOR_xsaveopt;
36154 break;
36155 case IX86_BUILTIN_XSAVES:
36156 icode = CODE_FOR_xsaves;
36157 break;
36158 case IX86_BUILTIN_XRSTORS:
36159 icode = CODE_FOR_xrstors;
36160 break;
36161 case IX86_BUILTIN_XSAVEC:
36162 icode = CODE_FOR_xsavec;
36163 break;
36164 default:
36165 gcc_unreachable ();
36167 pat = GEN_FCN (icode) (op0, op1);
36170 if (pat)
36171 emit_insn (pat);
36172 return 0;
36174 case IX86_BUILTIN_LLWPCB:
36175 arg0 = CALL_EXPR_ARG (exp, 0);
36176 op0 = expand_normal (arg0);
36177 icode = CODE_FOR_lwp_llwpcb;
36178 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36179 op0 = ix86_zero_extend_to_Pmode (op0);
36180 emit_insn (gen_lwp_llwpcb (op0));
36181 return 0;
36183 case IX86_BUILTIN_SLWPCB:
36184 icode = CODE_FOR_lwp_slwpcb;
36185 if (!target
36186 || !insn_data[icode].operand[0].predicate (target, Pmode))
36187 target = gen_reg_rtx (Pmode);
36188 emit_insn (gen_lwp_slwpcb (target));
36189 return target;
36191 case IX86_BUILTIN_BEXTRI32:
36192 case IX86_BUILTIN_BEXTRI64:
36193 arg0 = CALL_EXPR_ARG (exp, 0);
36194 arg1 = CALL_EXPR_ARG (exp, 1);
36195 op0 = expand_normal (arg0);
36196 op1 = expand_normal (arg1);
36197 icode = (fcode == IX86_BUILTIN_BEXTRI32
36198 ? CODE_FOR_tbm_bextri_si
36199 : CODE_FOR_tbm_bextri_di);
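/* The control word must be a constant: bits 7:0 give the starting bit
   of the field and bits 15:8 its length.  Split it into the two
   immediate operands the tbm_bextri pattern expects.  */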
36200 if (!CONST_INT_P (op1))
36202 error ("last argument must be an immediate");
36203 return const0_rtx;
36205 else
36207 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36208 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36209 op1 = GEN_INT (length);
36210 op2 = GEN_INT (lsb_index);
36211 pat = GEN_FCN (icode) (target, op0, op1, op2);
36212 if (pat)
36213 emit_insn (pat);
36214 return target;
36217 case IX86_BUILTIN_RDRAND16_STEP:
36218 icode = CODE_FOR_rdrandhi_1;
36219 mode0 = HImode;
36220 goto rdrand_step;
36222 case IX86_BUILTIN_RDRAND32_STEP:
36223 icode = CODE_FOR_rdrandsi_1;
36224 mode0 = SImode;
36225 goto rdrand_step;
36227 case IX86_BUILTIN_RDRAND64_STEP:
36228 icode = CODE_FOR_rdranddi_1;
36229 mode0 = DImode;
36231 rdrand_step:
36232 arg0 = CALL_EXPR_ARG (exp, 0);
36233 op1 = expand_normal (arg0);
36234 if (!address_operand (op1, VOIDmode))
36236 op1 = convert_memory_address (Pmode, op1);
36237 op1 = copy_addr_to_reg (op1);
36240 op0 = gen_reg_rtx (mode0);
36241 emit_insn (GEN_FCN (icode) (op0));
36243 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
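/* The random value has been stored through the pointer argument.
   Now build the return value: 1 when the carry flag set by rdrand
   signals success, otherwise the destination value (the hardware
   clears it when no random number was available), which yields the
   0/1 result without a branch.  */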
36245 op1 = gen_reg_rtx (SImode);
36246 emit_move_insn (op1, CONST1_RTX (SImode));
36248 /* Emit SImode conditional move. */
36249 if (mode0 == HImode)
36251 if (TARGET_ZERO_EXTEND_WITH_AND
36252 && optimize_function_for_speed_p (cfun))
36254 op2 = force_reg (SImode, const0_rtx);
36256 emit_insn (gen_movstricthi
36257 (gen_lowpart (HImode, op2), op0));
36259 else
36261 op2 = gen_reg_rtx (SImode);
36263 emit_insn (gen_zero_extendhisi2 (op2, op0));
36266 else if (mode0 == SImode)
36267 op2 = op0;
36268 else
36269 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36271 if (target == 0
36272 || !register_operand (target, SImode))
36273 target = gen_reg_rtx (SImode);
36275 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36276 const0_rtx);
36277 emit_insn (gen_rtx_SET (target,
36278 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36279 return target;
36281 case IX86_BUILTIN_RDSEED16_STEP:
36282 icode = CODE_FOR_rdseedhi_1;
36283 mode0 = HImode;
36284 goto rdseed_step;
36286 case IX86_BUILTIN_RDSEED32_STEP:
36287 icode = CODE_FOR_rdseedsi_1;
36288 mode0 = SImode;
36289 goto rdseed_step;
36291 case IX86_BUILTIN_RDSEED64_STEP:
36292 icode = CODE_FOR_rdseeddi_1;
36293 mode0 = DImode;
36295 rdseed_step:
36296 arg0 = CALL_EXPR_ARG (exp, 0);
36297 op1 = expand_normal (arg0);
36298 if (!address_operand (op1, VOIDmode))
36300 op1 = convert_memory_address (Pmode, op1);
36301 op1 = copy_addr_to_reg (op1);
36304 op0 = gen_reg_rtx (mode0);
36305 emit_insn (GEN_FCN (icode) (op0));
36307 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36309 op2 = gen_reg_rtx (QImode);
36311 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36312 const0_rtx);
36313 emit_insn (gen_rtx_SET (op2, pat));
36315 if (target == 0
36316 || !register_operand (target, SImode))
36317 target = gen_reg_rtx (SImode);
36319 emit_insn (gen_zero_extendqisi2 (target, op2));
36320 return target;
36322 case IX86_BUILTIN_SBB32:
36323 icode = CODE_FOR_subborrowsi;
36324 icode2 = CODE_FOR_subborrowsi_0;
36325 mode0 = SImode;
36326 mode1 = DImode;
36327 mode2 = CCmode;
36328 goto handlecarry;
36330 case IX86_BUILTIN_SBB64:
36331 icode = CODE_FOR_subborrowdi;
36332 icode2 = CODE_FOR_subborrowdi_0;
36333 mode0 = DImode;
36334 mode1 = TImode;
36335 mode2 = CCmode;
36336 goto handlecarry;
36338 case IX86_BUILTIN_ADDCARRYX32:
36339 icode = CODE_FOR_addcarrysi;
36340 icode2 = CODE_FOR_addcarrysi_0;
36341 mode0 = SImode;
36342 mode1 = DImode;
36343 mode2 = CCCmode;
36344 goto handlecarry;
36346 case IX86_BUILTIN_ADDCARRYX64:
36347 icode = CODE_FOR_addcarrydi;
36348 icode2 = CODE_FOR_addcarrydi_0;
36349 mode0 = DImode;
36350 mode1 = TImode;
36351 mode2 = CCCmode;
36353 handlecarry:
36354 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36355 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36356 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36357 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36359 op1 = expand_normal (arg0);
36360 if (!integer_zerop (arg0))
36361 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36363 op2 = expand_normal (arg1);
36364 if (!register_operand (op2, mode0))
36365 op2 = copy_to_mode_reg (mode0, op2);
36367 op3 = expand_normal (arg2);
36368 if (!register_operand (op3, mode0))
36369 op3 = copy_to_mode_reg (mode0, op3);
36371 op4 = expand_normal (arg3);
36372 if (!address_operand (op4, VOIDmode))
36374 op4 = convert_memory_address (Pmode, op4);
36375 op4 = copy_addr_to_reg (op4);
36378 op0 = gen_reg_rtx (mode0);
36379 if (integer_zerop (arg0))
36381 /* If arg0 is 0, optimize right away into add or sub
36382 instruction that sets CCCmode flags. */
36383 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36384 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36386 else
36388 /* Generate CF from input operand. */
36389 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
36391 /* Generate instruction that consumes CF. */
36392 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36393 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36394 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36395 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36398 /* Return current CF value. */
36399 if (target == 0)
36400 target = gen_reg_rtx (QImode);
36402 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36403 emit_insn (gen_rtx_SET (target, pat));
36405 /* Store the result. */
36406 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36408 return target;
36410 case IX86_BUILTIN_READ_FLAGS:
36411 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36413 if (optimize
36414 || target == NULL_RTX
36415 || !nonimmediate_operand (target, word_mode)
36416 || GET_MODE (target) != word_mode)
36417 target = gen_reg_rtx (word_mode);
36419 emit_insn (gen_pop (target));
36420 return target;
36422 case IX86_BUILTIN_WRITE_FLAGS:
36424 arg0 = CALL_EXPR_ARG (exp, 0);
36425 op0 = expand_normal (arg0);
36426 if (!general_no_elim_operand (op0, word_mode))
36427 op0 = copy_to_mode_reg (word_mode, op0);
36429 emit_insn (gen_push (op0));
36430 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36431 return 0;
36433 case IX86_BUILTIN_KTESTC8:
36434 icode = CODE_FOR_ktestqi;
36435 mode3 = CCCmode;
36436 goto kortest;
36438 case IX86_BUILTIN_KTESTZ8:
36439 icode = CODE_FOR_ktestqi;
36440 mode3 = CCZmode;
36441 goto kortest;
36443 case IX86_BUILTIN_KTESTC16:
36444 icode = CODE_FOR_ktesthi;
36445 mode3 = CCCmode;
36446 goto kortest;
36448 case IX86_BUILTIN_KTESTZ16:
36449 icode = CODE_FOR_ktesthi;
36450 mode3 = CCZmode;
36451 goto kortest;
36453 case IX86_BUILTIN_KTESTC32:
36454 icode = CODE_FOR_ktestsi;
36455 mode3 = CCCmode;
36456 goto kortest;
36458 case IX86_BUILTIN_KTESTZ32:
36459 icode = CODE_FOR_ktestsi;
36460 mode3 = CCZmode;
36461 goto kortest;
36463 case IX86_BUILTIN_KTESTC64:
36464 icode = CODE_FOR_ktestdi;
36465 mode3 = CCCmode;
36466 goto kortest;
36468 case IX86_BUILTIN_KTESTZ64:
36469 icode = CODE_FOR_ktestdi;
36470 mode3 = CCZmode;
36471 goto kortest;
36473 case IX86_BUILTIN_KORTESTC8:
36474 icode = CODE_FOR_kortestqi;
36475 mode3 = CCCmode;
36476 goto kortest;
36478 case IX86_BUILTIN_KORTESTZ8:
36479 icode = CODE_FOR_kortestqi;
36480 mode3 = CCZmode;
36481 goto kortest;
36483 case IX86_BUILTIN_KORTESTC16:
36484 icode = CODE_FOR_kortesthi;
36485 mode3 = CCCmode;
36486 goto kortest;
36488 case IX86_BUILTIN_KORTESTZ16:
36489 icode = CODE_FOR_kortesthi;
36490 mode3 = CCZmode;
36491 goto kortest;
36493 case IX86_BUILTIN_KORTESTC32:
36494 icode = CODE_FOR_kortestsi;
36495 mode3 = CCCmode;
36496 goto kortest;
36498 case IX86_BUILTIN_KORTESTZ32:
36499 icode = CODE_FOR_kortestsi;
36500 mode3 = CCZmode;
36501 goto kortest;
36503 case IX86_BUILTIN_KORTESTC64:
36504 icode = CODE_FOR_kortestdi;
36505 mode3 = CCCmode;
36506 goto kortest;
36508 case IX86_BUILTIN_KORTESTZ64:
36509 icode = CODE_FOR_kortestdi;
36510 mode3 = CCZmode;
36512 kortest:
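/* mode3 selects which flag the setcc below reads back: CCZmode (ZF)
   for the *Z builtins and CCCmode (CF) for the *C builtins.  */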
36513 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36514 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36515 op0 = expand_normal (arg0);
36516 op1 = expand_normal (arg1);
36518 mode0 = insn_data[icode].operand[0].mode;
36519 mode1 = insn_data[icode].operand[1].mode;
36521 if (GET_MODE (op0) != VOIDmode)
36522 op0 = force_reg (GET_MODE (op0), op0);
36524 op0 = gen_lowpart (mode0, op0);
36526 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36527 op0 = copy_to_mode_reg (mode0, op0);
36529 if (GET_MODE (op1) != VOIDmode)
36530 op1 = force_reg (GET_MODE (op1), op1);
36532 op1 = gen_lowpart (mode1, op1);
36534 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36535 op1 = copy_to_mode_reg (mode1, op1);
36537 target = gen_reg_rtx (QImode);
36539 /* Emit kortest. */
36540 emit_insn (GEN_FCN (icode) (op0, op1));
36541 /* And use setcc to return result from flags. */
36542 ix86_expand_setcc (target, EQ,
36543 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36544 return target;
36546 case IX86_BUILTIN_GATHERSIV2DF:
36547 icode = CODE_FOR_avx2_gathersiv2df;
36548 goto gather_gen;
36549 case IX86_BUILTIN_GATHERSIV4DF:
36550 icode = CODE_FOR_avx2_gathersiv4df;
36551 goto gather_gen;
36552 case IX86_BUILTIN_GATHERDIV2DF:
36553 icode = CODE_FOR_avx2_gatherdiv2df;
36554 goto gather_gen;
36555 case IX86_BUILTIN_GATHERDIV4DF:
36556 icode = CODE_FOR_avx2_gatherdiv4df;
36557 goto gather_gen;
36558 case IX86_BUILTIN_GATHERSIV4SF:
36559 icode = CODE_FOR_avx2_gathersiv4sf;
36560 goto gather_gen;
36561 case IX86_BUILTIN_GATHERSIV8SF:
36562 icode = CODE_FOR_avx2_gathersiv8sf;
36563 goto gather_gen;
36564 case IX86_BUILTIN_GATHERDIV4SF:
36565 icode = CODE_FOR_avx2_gatherdiv4sf;
36566 goto gather_gen;
36567 case IX86_BUILTIN_GATHERDIV8SF:
36568 icode = CODE_FOR_avx2_gatherdiv8sf;
36569 goto gather_gen;
36570 case IX86_BUILTIN_GATHERSIV2DI:
36571 icode = CODE_FOR_avx2_gathersiv2di;
36572 goto gather_gen;
36573 case IX86_BUILTIN_GATHERSIV4DI:
36574 icode = CODE_FOR_avx2_gathersiv4di;
36575 goto gather_gen;
36576 case IX86_BUILTIN_GATHERDIV2DI:
36577 icode = CODE_FOR_avx2_gatherdiv2di;
36578 goto gather_gen;
36579 case IX86_BUILTIN_GATHERDIV4DI:
36580 icode = CODE_FOR_avx2_gatherdiv4di;
36581 goto gather_gen;
36582 case IX86_BUILTIN_GATHERSIV4SI:
36583 icode = CODE_FOR_avx2_gathersiv4si;
36584 goto gather_gen;
36585 case IX86_BUILTIN_GATHERSIV8SI:
36586 icode = CODE_FOR_avx2_gathersiv8si;
36587 goto gather_gen;
36588 case IX86_BUILTIN_GATHERDIV4SI:
36589 icode = CODE_FOR_avx2_gatherdiv4si;
36590 goto gather_gen;
36591 case IX86_BUILTIN_GATHERDIV8SI:
36592 icode = CODE_FOR_avx2_gatherdiv8si;
36593 goto gather_gen;
36594 case IX86_BUILTIN_GATHERALTSIV4DF:
36595 icode = CODE_FOR_avx2_gathersiv4df;
36596 goto gather_gen;
36597 case IX86_BUILTIN_GATHERALTDIV8SF:
36598 icode = CODE_FOR_avx2_gatherdiv8sf;
36599 goto gather_gen;
36600 case IX86_BUILTIN_GATHERALTSIV4DI:
36601 icode = CODE_FOR_avx2_gathersiv4di;
36602 goto gather_gen;
36603 case IX86_BUILTIN_GATHERALTDIV8SI:
36604 icode = CODE_FOR_avx2_gatherdiv8si;
36605 goto gather_gen;
36606 case IX86_BUILTIN_GATHER3SIV16SF:
36607 icode = CODE_FOR_avx512f_gathersiv16sf;
36608 goto gather_gen;
36609 case IX86_BUILTIN_GATHER3SIV8DF:
36610 icode = CODE_FOR_avx512f_gathersiv8df;
36611 goto gather_gen;
36612 case IX86_BUILTIN_GATHER3DIV16SF:
36613 icode = CODE_FOR_avx512f_gatherdiv16sf;
36614 goto gather_gen;
36615 case IX86_BUILTIN_GATHER3DIV8DF:
36616 icode = CODE_FOR_avx512f_gatherdiv8df;
36617 goto gather_gen;
36618 case IX86_BUILTIN_GATHER3SIV16SI:
36619 icode = CODE_FOR_avx512f_gathersiv16si;
36620 goto gather_gen;
36621 case IX86_BUILTIN_GATHER3SIV8DI:
36622 icode = CODE_FOR_avx512f_gathersiv8di;
36623 goto gather_gen;
36624 case IX86_BUILTIN_GATHER3DIV16SI:
36625 icode = CODE_FOR_avx512f_gatherdiv16si;
36626 goto gather_gen;
36627 case IX86_BUILTIN_GATHER3DIV8DI:
36628 icode = CODE_FOR_avx512f_gatherdiv8di;
36629 goto gather_gen;
36630 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36631 icode = CODE_FOR_avx512f_gathersiv8df;
36632 goto gather_gen;
36633 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36634 icode = CODE_FOR_avx512f_gatherdiv16sf;
36635 goto gather_gen;
36636 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36637 icode = CODE_FOR_avx512f_gathersiv8di;
36638 goto gather_gen;
36639 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36640 icode = CODE_FOR_avx512f_gatherdiv16si;
36641 goto gather_gen;
36642 case IX86_BUILTIN_GATHER3SIV2DF:
36643 icode = CODE_FOR_avx512vl_gathersiv2df;
36644 goto gather_gen;
36645 case IX86_BUILTIN_GATHER3SIV4DF:
36646 icode = CODE_FOR_avx512vl_gathersiv4df;
36647 goto gather_gen;
36648 case IX86_BUILTIN_GATHER3DIV2DF:
36649 icode = CODE_FOR_avx512vl_gatherdiv2df;
36650 goto gather_gen;
36651 case IX86_BUILTIN_GATHER3DIV4DF:
36652 icode = CODE_FOR_avx512vl_gatherdiv4df;
36653 goto gather_gen;
36654 case IX86_BUILTIN_GATHER3SIV4SF:
36655 icode = CODE_FOR_avx512vl_gathersiv4sf;
36656 goto gather_gen;
36657 case IX86_BUILTIN_GATHER3SIV8SF:
36658 icode = CODE_FOR_avx512vl_gathersiv8sf;
36659 goto gather_gen;
36660 case IX86_BUILTIN_GATHER3DIV4SF:
36661 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36662 goto gather_gen;
36663 case IX86_BUILTIN_GATHER3DIV8SF:
36664 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36665 goto gather_gen;
36666 case IX86_BUILTIN_GATHER3SIV2DI:
36667 icode = CODE_FOR_avx512vl_gathersiv2di;
36668 goto gather_gen;
36669 case IX86_BUILTIN_GATHER3SIV4DI:
36670 icode = CODE_FOR_avx512vl_gathersiv4di;
36671 goto gather_gen;
36672 case IX86_BUILTIN_GATHER3DIV2DI:
36673 icode = CODE_FOR_avx512vl_gatherdiv2di;
36674 goto gather_gen;
36675 case IX86_BUILTIN_GATHER3DIV4DI:
36676 icode = CODE_FOR_avx512vl_gatherdiv4di;
36677 goto gather_gen;
36678 case IX86_BUILTIN_GATHER3SIV4SI:
36679 icode = CODE_FOR_avx512vl_gathersiv4si;
36680 goto gather_gen;
36681 case IX86_BUILTIN_GATHER3SIV8SI:
36682 icode = CODE_FOR_avx512vl_gathersiv8si;
36683 goto gather_gen;
36684 case IX86_BUILTIN_GATHER3DIV4SI:
36685 icode = CODE_FOR_avx512vl_gatherdiv4si;
36686 goto gather_gen;
36687 case IX86_BUILTIN_GATHER3DIV8SI:
36688 icode = CODE_FOR_avx512vl_gatherdiv8si;
36689 goto gather_gen;
36690 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36691 icode = CODE_FOR_avx512vl_gathersiv4df;
36692 goto gather_gen;
36693 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36694 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36695 goto gather_gen;
36696 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36697 icode = CODE_FOR_avx512vl_gathersiv4di;
36698 goto gather_gen;
36699 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36700 icode = CODE_FOR_avx512vl_gatherdiv8si;
36701 goto gather_gen;
36702 case IX86_BUILTIN_SCATTERSIV16SF:
36703 icode = CODE_FOR_avx512f_scattersiv16sf;
36704 goto scatter_gen;
36705 case IX86_BUILTIN_SCATTERSIV8DF:
36706 icode = CODE_FOR_avx512f_scattersiv8df;
36707 goto scatter_gen;
36708 case IX86_BUILTIN_SCATTERDIV16SF:
36709 icode = CODE_FOR_avx512f_scatterdiv16sf;
36710 goto scatter_gen;
36711 case IX86_BUILTIN_SCATTERDIV8DF:
36712 icode = CODE_FOR_avx512f_scatterdiv8df;
36713 goto scatter_gen;
36714 case IX86_BUILTIN_SCATTERSIV16SI:
36715 icode = CODE_FOR_avx512f_scattersiv16si;
36716 goto scatter_gen;
36717 case IX86_BUILTIN_SCATTERSIV8DI:
36718 icode = CODE_FOR_avx512f_scattersiv8di;
36719 goto scatter_gen;
36720 case IX86_BUILTIN_SCATTERDIV16SI:
36721 icode = CODE_FOR_avx512f_scatterdiv16si;
36722 goto scatter_gen;
36723 case IX86_BUILTIN_SCATTERDIV8DI:
36724 icode = CODE_FOR_avx512f_scatterdiv8di;
36725 goto scatter_gen;
36726 case IX86_BUILTIN_SCATTERSIV8SF:
36727 icode = CODE_FOR_avx512vl_scattersiv8sf;
36728 goto scatter_gen;
36729 case IX86_BUILTIN_SCATTERSIV4SF:
36730 icode = CODE_FOR_avx512vl_scattersiv4sf;
36731 goto scatter_gen;
36732 case IX86_BUILTIN_SCATTERSIV4DF:
36733 icode = CODE_FOR_avx512vl_scattersiv4df;
36734 goto scatter_gen;
36735 case IX86_BUILTIN_SCATTERSIV2DF:
36736 icode = CODE_FOR_avx512vl_scattersiv2df;
36737 goto scatter_gen;
36738 case IX86_BUILTIN_SCATTERDIV8SF:
36739 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36740 goto scatter_gen;
36741 case IX86_BUILTIN_SCATTERDIV4SF:
36742 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36743 goto scatter_gen;
36744 case IX86_BUILTIN_SCATTERDIV4DF:
36745 icode = CODE_FOR_avx512vl_scatterdiv4df;
36746 goto scatter_gen;
36747 case IX86_BUILTIN_SCATTERDIV2DF:
36748 icode = CODE_FOR_avx512vl_scatterdiv2df;
36749 goto scatter_gen;
36750 case IX86_BUILTIN_SCATTERSIV8SI:
36751 icode = CODE_FOR_avx512vl_scattersiv8si;
36752 goto scatter_gen;
36753 case IX86_BUILTIN_SCATTERSIV4SI:
36754 icode = CODE_FOR_avx512vl_scattersiv4si;
36755 goto scatter_gen;
36756 case IX86_BUILTIN_SCATTERSIV4DI:
36757 icode = CODE_FOR_avx512vl_scattersiv4di;
36758 goto scatter_gen;
36759 case IX86_BUILTIN_SCATTERSIV2DI:
36760 icode = CODE_FOR_avx512vl_scattersiv2di;
36761 goto scatter_gen;
36762 case IX86_BUILTIN_SCATTERDIV8SI:
36763 icode = CODE_FOR_avx512vl_scatterdiv8si;
36764 goto scatter_gen;
36765 case IX86_BUILTIN_SCATTERDIV4SI:
36766 icode = CODE_FOR_avx512vl_scatterdiv4si;
36767 goto scatter_gen;
36768 case IX86_BUILTIN_SCATTERDIV4DI:
36769 icode = CODE_FOR_avx512vl_scatterdiv4di;
36770 goto scatter_gen;
36771 case IX86_BUILTIN_SCATTERDIV2DI:
36772 icode = CODE_FOR_avx512vl_scatterdiv2di;
36773 goto scatter_gen;
36774 case IX86_BUILTIN_GATHERPFDPD:
36775 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36776 goto vec_prefetch_gen;
36777 case IX86_BUILTIN_SCATTERALTSIV8DF:
36778 icode = CODE_FOR_avx512f_scattersiv8df;
36779 goto scatter_gen;
36780 case IX86_BUILTIN_SCATTERALTDIV16SF:
36781 icode = CODE_FOR_avx512f_scatterdiv16sf;
36782 goto scatter_gen;
36783 case IX86_BUILTIN_SCATTERALTSIV8DI:
36784 icode = CODE_FOR_avx512f_scattersiv8di;
36785 goto scatter_gen;
36786 case IX86_BUILTIN_SCATTERALTDIV16SI:
36787 icode = CODE_FOR_avx512f_scatterdiv16si;
36788 goto scatter_gen;
36789 case IX86_BUILTIN_GATHERPFDPS:
36790 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36791 goto vec_prefetch_gen;
36792 case IX86_BUILTIN_GATHERPFQPD:
36793 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36794 goto vec_prefetch_gen;
36795 case IX86_BUILTIN_GATHERPFQPS:
36796 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36797 goto vec_prefetch_gen;
36798 case IX86_BUILTIN_SCATTERPFDPD:
36799 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36800 goto vec_prefetch_gen;
36801 case IX86_BUILTIN_SCATTERPFDPS:
36802 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36803 goto vec_prefetch_gen;
36804 case IX86_BUILTIN_SCATTERPFQPD:
36805 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36806 goto vec_prefetch_gen;
36807 case IX86_BUILTIN_SCATTERPFQPS:
36808 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36809 goto vec_prefetch_gen;
36811 gather_gen:
36812 rtx half;
36813 rtx (*gen) (rtx, rtx);
36815 arg0 = CALL_EXPR_ARG (exp, 0);
36816 arg1 = CALL_EXPR_ARG (exp, 1);
36817 arg2 = CALL_EXPR_ARG (exp, 2);
36818 arg3 = CALL_EXPR_ARG (exp, 3);
36819 arg4 = CALL_EXPR_ARG (exp, 4);
36820 op0 = expand_normal (arg0);
36821 op1 = expand_normal (arg1);
36822 op2 = expand_normal (arg2);
36823 op3 = expand_normal (arg3);
36824 op4 = expand_normal (arg4);
36825 /* Note the arg order is different from the operand order. */
36826 mode0 = insn_data[icode].operand[1].mode;
36827 mode2 = insn_data[icode].operand[3].mode;
36828 mode3 = insn_data[icode].operand[4].mode;
36829 mode4 = insn_data[icode].operand[5].mode;
36831 if (target == NULL_RTX
36832 || GET_MODE (target) != insn_data[icode].operand[0].mode
36833 || !insn_data[icode].operand[0].predicate (target,
36834 GET_MODE (target)))
36835 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36836 else
36837 subtarget = target;
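/* For the *ALT* gather variants either the index vector or the
   source/mask vector has twice as many elements as the underlying
   gather pattern expects; extract the low half of the wider operand
   so the operand modes match.  */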
36839 switch (fcode)
36841 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36842 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36843 half = gen_reg_rtx (V8SImode);
36844 if (!nonimmediate_operand (op2, V16SImode))
36845 op2 = copy_to_mode_reg (V16SImode, op2);
36846 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36847 op2 = half;
36848 break;
36849 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36850 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36851 case IX86_BUILTIN_GATHERALTSIV4DF:
36852 case IX86_BUILTIN_GATHERALTSIV4DI:
36853 half = gen_reg_rtx (V4SImode);
36854 if (!nonimmediate_operand (op2, V8SImode))
36855 op2 = copy_to_mode_reg (V8SImode, op2);
36856 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36857 op2 = half;
36858 break;
36859 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36860 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36861 half = gen_reg_rtx (mode0);
36862 if (mode0 == V8SFmode)
36863 gen = gen_vec_extract_lo_v16sf;
36864 else
36865 gen = gen_vec_extract_lo_v16si;
36866 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36867 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36868 emit_insn (gen (half, op0));
36869 op0 = half;
36870 if (GET_MODE (op3) != VOIDmode)
36872 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36873 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36874 emit_insn (gen (half, op3));
36875 op3 = half;
36877 break;
36878 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36879 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36880 case IX86_BUILTIN_GATHERALTDIV8SF:
36881 case IX86_BUILTIN_GATHERALTDIV8SI:
36882 half = gen_reg_rtx (mode0);
36883 if (mode0 == V4SFmode)
36884 gen = gen_vec_extract_lo_v8sf;
36885 else
36886 gen = gen_vec_extract_lo_v8si;
36887 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36888 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36889 emit_insn (gen (half, op0));
36890 op0 = half;
36891 if (GET_MODE (op3) != VOIDmode)
36893 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36894 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36895 emit_insn (gen (half, op3));
36896 op3 = half;
36898 break;
36899 default:
36900 break;
36903 /* Force the memory operand to use only a base register here. We
36904 don't want to do that for the memory operands of other builtin
36905 functions. */
36906 op1 = ix86_zero_extend_to_Pmode (op1);
36908 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36909 op0 = copy_to_mode_reg (mode0, op0);
36910 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36911 op1 = copy_to_mode_reg (Pmode, op1);
36912 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36913 op2 = copy_to_mode_reg (mode2, op2);
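/* The mask operand may arrive as a modeless constant or as a value in
   a mode other than the expected mask mode; give constants the
   expected mode, otherwise load the value into a register and use a
   lowpart subreg of it.  */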
36915 op3 = fixup_modeless_constant (op3, mode3);
36917 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36919 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36920 op3 = copy_to_mode_reg (mode3, op3);
36922 else
36924 op3 = copy_to_reg (op3);
36925 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36927 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36929 error ("the last argument must be scale 1, 2, 4, 8");
36930 return const0_rtx;
36933 /* Optimize. If mask is known to have all high bits set,
36934 replace op0 with pc_rtx to signal that the instruction
36935 overwrites the whole destination and doesn't use its
36936 previous contents. */
36937 if (optimize)
36939 if (TREE_CODE (arg3) == INTEGER_CST)
36941 if (integer_all_onesp (arg3))
36942 op0 = pc_rtx;
36944 else if (TREE_CODE (arg3) == VECTOR_CST)
36946 unsigned int negative = 0;
36947 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36949 tree cst = VECTOR_CST_ELT (arg3, i);
36950 if (TREE_CODE (cst) == INTEGER_CST
36951 && tree_int_cst_sign_bit (cst))
36952 negative++;
36953 else if (TREE_CODE (cst) == REAL_CST
36954 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36955 negative++;
36957 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36958 op0 = pc_rtx;
36960 else if (TREE_CODE (arg3) == SSA_NAME
36961 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36963 /* Recognize also when mask is like:
36964 __v2df src = _mm_setzero_pd ();
36965 __v2df mask = _mm_cmpeq_pd (src, src);
36967 __v8sf src = _mm256_setzero_ps ();
36968 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36969 as that is a cheaper way to load all ones into
36970 a register than having to load a constant from
36971 memory. */
36972 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36973 if (is_gimple_call (def_stmt))
36975 tree fndecl = gimple_call_fndecl (def_stmt);
36976 if (fndecl
36977 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36978 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36980 case IX86_BUILTIN_CMPPD:
36981 case IX86_BUILTIN_CMPPS:
36982 case IX86_BUILTIN_CMPPD256:
36983 case IX86_BUILTIN_CMPPS256:
36984 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36985 break;
36986 /* FALLTHRU */
36987 case IX86_BUILTIN_CMPEQPD:
36988 case IX86_BUILTIN_CMPEQPS:
36989 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36990 && initializer_zerop (gimple_call_arg (def_stmt,
36991 1)))
36992 op0 = pc_rtx;
36993 break;
36994 default:
36995 break;
37001 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37002 if (! pat)
37003 return const0_rtx;
37004 emit_insn (pat);
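/* Gathers with 64-bit indices and 32-bit elements fill only half of
   the pattern's destination mode; for those return just the low half
   of SUBTARGET.  */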
37006 switch (fcode)
37008 case IX86_BUILTIN_GATHER3DIV16SF:
37009 if (target == NULL_RTX)
37010 target = gen_reg_rtx (V8SFmode);
37011 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37012 break;
37013 case IX86_BUILTIN_GATHER3DIV16SI:
37014 if (target == NULL_RTX)
37015 target = gen_reg_rtx (V8SImode);
37016 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37017 break;
37018 case IX86_BUILTIN_GATHER3DIV8SF:
37019 case IX86_BUILTIN_GATHERDIV8SF:
37020 if (target == NULL_RTX)
37021 target = gen_reg_rtx (V4SFmode);
37022 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37023 break;
37024 case IX86_BUILTIN_GATHER3DIV8SI:
37025 case IX86_BUILTIN_GATHERDIV8SI:
37026 if (target == NULL_RTX)
37027 target = gen_reg_rtx (V4SImode);
37028 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37029 break;
37030 default:
37031 target = subtarget;
37032 break;
37034 return target;
37036 scatter_gen:
37037 arg0 = CALL_EXPR_ARG (exp, 0);
37038 arg1 = CALL_EXPR_ARG (exp, 1);
37039 arg2 = CALL_EXPR_ARG (exp, 2);
37040 arg3 = CALL_EXPR_ARG (exp, 3);
37041 arg4 = CALL_EXPR_ARG (exp, 4);
37042 op0 = expand_normal (arg0);
37043 op1 = expand_normal (arg1);
37044 op2 = expand_normal (arg2);
37045 op3 = expand_normal (arg3);
37046 op4 = expand_normal (arg4);
37047 mode1 = insn_data[icode].operand[1].mode;
37048 mode2 = insn_data[icode].operand[2].mode;
37049 mode3 = insn_data[icode].operand[3].mode;
37050 mode4 = insn_data[icode].operand[4].mode;
37052 /* The scatter instruction stores operand op3 to memory using
37053 indices from op2 and the scale from op4 under writemask op1.
37054 If index operand op2 has more elements than source operand
37055 op3, only its low half is used, and vice versa. */
37056 switch (fcode)
37058 case IX86_BUILTIN_SCATTERALTSIV8DF:
37059 case IX86_BUILTIN_SCATTERALTSIV8DI:
37060 half = gen_reg_rtx (V8SImode);
37061 if (!nonimmediate_operand (op2, V16SImode))
37062 op2 = copy_to_mode_reg (V16SImode, op2);
37063 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37064 op2 = half;
37065 break;
37066 case IX86_BUILTIN_SCATTERALTDIV16SF:
37067 case IX86_BUILTIN_SCATTERALTDIV16SI:
37068 half = gen_reg_rtx (mode3);
37069 if (mode3 == V8SFmode)
37070 gen = gen_vec_extract_lo_v16sf;
37071 else
37072 gen = gen_vec_extract_lo_v16si;
37073 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37074 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37075 emit_insn (gen (half, op3));
37076 op3 = half;
37077 break;
37078 default:
37079 break;
37082 /* Force the memory operand to use only a base register here. We
37083 don't want to do that for the memory operands of other builtin
37084 functions. */
37085 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37087 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37088 op0 = copy_to_mode_reg (Pmode, op0);
37090 op1 = fixup_modeless_constant (op1, mode1);
37092 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37094 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37095 op1 = copy_to_mode_reg (mode1, op1);
37097 else
37099 op1 = copy_to_reg (op1);
37100 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37103 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37104 op2 = copy_to_mode_reg (mode2, op2);
37106 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37107 op3 = copy_to_mode_reg (mode3, op3);
37109 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37111 error ("the last argument must be scale 1, 2, 4, 8");
37112 return const0_rtx;
37115 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37116 if (! pat)
37117 return const0_rtx;
37119 emit_insn (pat);
37120 return 0;
37122 vec_prefetch_gen:
37123 arg0 = CALL_EXPR_ARG (exp, 0);
37124 arg1 = CALL_EXPR_ARG (exp, 1);
37125 arg2 = CALL_EXPR_ARG (exp, 2);
37126 arg3 = CALL_EXPR_ARG (exp, 3);
37127 arg4 = CALL_EXPR_ARG (exp, 4);
37128 op0 = expand_normal (arg0);
37129 op1 = expand_normal (arg1);
37130 op2 = expand_normal (arg2);
37131 op3 = expand_normal (arg3);
37132 op4 = expand_normal (arg4);
37133 mode0 = insn_data[icode].operand[0].mode;
37134 mode1 = insn_data[icode].operand[1].mode;
37135 mode3 = insn_data[icode].operand[3].mode;
37136 mode4 = insn_data[icode].operand[4].mode;
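/* For the gather/scatter prefetch patterns op0 is the mask, op1 the
   index vector, op2 the base address, op3 the scale and op4 the
   locality hint.  */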
37138 op0 = fixup_modeless_constant (op0, mode0);
37140 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37142 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37143 op0 = copy_to_mode_reg (mode0, op0);
37145 else
37147 op0 = copy_to_reg (op0);
37148 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37151 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37152 op1 = copy_to_mode_reg (mode1, op1);
37154 /* Force the memory operand to use only a base register here. We
37155 don't want to do that for the memory operands of other builtin
37156 functions. */
37157 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37159 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37160 op2 = copy_to_mode_reg (Pmode, op2);
37162 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37164 error ("the forth argument must be scale 1, 2, 4, 8");
37165 return const0_rtx;
37168 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37170 error ("incorrect hint operand");
37171 return const0_rtx;
37174 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37175 if (! pat)
37176 return const0_rtx;
37178 emit_insn (pat);
37180 return 0;
37182 case IX86_BUILTIN_XABORT:
37183 icode = CODE_FOR_xabort;
37184 arg0 = CALL_EXPR_ARG (exp, 0);
37185 op0 = expand_normal (arg0);
37186 mode0 = insn_data[icode].operand[0].mode;
37187 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37189 error ("the xabort's argument must be an 8-bit immediate");
37190 return const0_rtx;
37192 emit_insn (gen_xabort (op0));
37193 return 0;
37195 case IX86_BUILTIN_RSTORSSP:
37196 case IX86_BUILTIN_CLRSSBSY:
37197 arg0 = CALL_EXPR_ARG (exp, 0);
37198 op0 = expand_normal (arg0);
37199 icode = (fcode == IX86_BUILTIN_RSTORSSP
37200 ? CODE_FOR_rstorssp
37201 : CODE_FOR_clrssbsy);
37202 if (!address_operand (op0, VOIDmode))
37204 op1 = convert_memory_address (Pmode, op0);
37205 op0 = copy_addr_to_reg (op1);
37207 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37208 return 0;
37210 case IX86_BUILTIN_WRSSD:
37211 case IX86_BUILTIN_WRSSQ:
37212 case IX86_BUILTIN_WRUSSD:
37213 case IX86_BUILTIN_WRUSSQ:
37214 arg0 = CALL_EXPR_ARG (exp, 0);
37215 op0 = expand_normal (arg0);
37216 arg1 = CALL_EXPR_ARG (exp, 1);
37217 op1 = expand_normal (arg1);
37218 switch (fcode)
37220 case IX86_BUILTIN_WRSSD:
37221 icode = CODE_FOR_wrsssi;
37222 mode = SImode;
37223 break;
37224 case IX86_BUILTIN_WRSSQ:
37225 icode = CODE_FOR_wrssdi;
37226 mode = DImode;
37227 break;
37228 case IX86_BUILTIN_WRUSSD:
37229 icode = CODE_FOR_wrusssi;
37230 mode = SImode;
37231 break;
37232 case IX86_BUILTIN_WRUSSQ:
37233 icode = CODE_FOR_wrussdi;
37234 mode = DImode;
37235 break;
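/* op0 is the value to be written and op1 the shadow-stack address;
   wrap the address in a MEM of the value's mode for the wrss/wruss
   patterns.  */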
37237 op0 = force_reg (mode, op0);
37238 if (!address_operand (op1, VOIDmode))
37240 op2 = convert_memory_address (Pmode, op1);
37241 op1 = copy_addr_to_reg (op2);
37243 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37244 return 0;
37246 default:
37247 break;
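/* Builtins without bespoke handling above are expanded through the
   generic descriptor tables below, indexed by FCODE's offset within
   the range covered by each table.  */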
37250 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37251 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37253 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37254 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37255 target);
37258 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37259 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37261 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37262 switch (fcode)
37264 case IX86_BUILTIN_FABSQ:
37265 case IX86_BUILTIN_COPYSIGNQ:
37266 if (!TARGET_SSE)
37267 /* Emit a normal call if SSE isn't available. */
37268 return expand_call (exp, target, ignore);
37269 /* FALLTHRU */
37270 default:
37271 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37275 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37276 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37278 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37279 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37280 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37281 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37282 int masked = 1;
37283 machine_mode mode, wide_mode, nar_mode;
37285 nar_mode = V4SFmode;
37286 mode = V16SFmode;
37287 wide_mode = V64SFmode;
37288 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37289 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37291 switch (fcode)
37293 case IX86_BUILTIN_4FMAPS:
37294 fcn = gen_avx5124fmaddps_4fmaddps;
37295 masked = 0;
37296 goto v4fma_expand;
37298 case IX86_BUILTIN_4DPWSSD:
37299 nar_mode = V4SImode;
37300 mode = V16SImode;
37301 wide_mode = V64SImode;
37302 fcn = gen_avx5124vnniw_vp4dpwssd;
37303 masked = 0;
37304 goto v4fma_expand;
37306 case IX86_BUILTIN_4DPWSSDS:
37307 nar_mode = V4SImode;
37308 mode = V16SImode;
37309 wide_mode = V64SImode;
37310 fcn = gen_avx5124vnniw_vp4dpwssds;
37311 masked = 0;
37312 goto v4fma_expand;
37314 case IX86_BUILTIN_4FNMAPS:
37315 fcn = gen_avx5124fmaddps_4fnmaddps;
37316 masked = 0;
37317 goto v4fma_expand;
37319 case IX86_BUILTIN_4FNMAPS_MASK:
37320 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37321 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37322 goto v4fma_expand;
37324 case IX86_BUILTIN_4DPWSSD_MASK:
37325 nar_mode = V4SImode;
37326 mode = V16SImode;
37327 wide_mode = V64SImode;
37328 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37329 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37330 goto v4fma_expand;
37332 case IX86_BUILTIN_4DPWSSDS_MASK:
37333 nar_mode = V4SImode;
37334 mode = V16SImode;
37335 wide_mode = V64SImode;
37336 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37337 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37338 goto v4fma_expand;
37340 case IX86_BUILTIN_4FMAPS_MASK:
37342 tree args[4];
37343 rtx ops[4];
37344 rtx wide_reg;
37345 rtx accum;
37346 rtx addr;
37347 rtx mem;
37349 v4fma_expand:
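/* The 4FMADDPS/4DPWSSD instructions read a block of four consecutive
   vector registers; model it as one wide V64SF/V64SI register
   assembled from the four vector arguments, with a narrow V4SF/V4SI
   memory operand as the shared source and the fifth argument as the
   accumulator.  */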
37350 wide_reg = gen_reg_rtx (wide_mode);
37351 for (i = 0; i < 4; i++)
37353 args[i] = CALL_EXPR_ARG (exp, i);
37354 ops[i] = expand_normal (args[i]);
37356 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37357 ops[i]);
37360 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37361 accum = force_reg (mode, accum);
37363 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37364 addr = force_reg (Pmode, addr);
37366 mem = gen_rtx_MEM (nar_mode, addr);
37368 target = gen_reg_rtx (mode);
37370 emit_move_insn (target, accum);
37372 if (! masked)
37373 emit_insn (fcn (target, accum, wide_reg, mem));
37374 else
37376 rtx merge, mask;
37377 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37379 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37381 if (CONST_INT_P (mask))
37382 mask = fixup_modeless_constant (mask, HImode);
37384 mask = force_reg (HImode, mask);
37386 if (GET_MODE (mask) != HImode)
37387 mask = gen_rtx_SUBREG (HImode, mask, 0);
37389 /* If merge is 0 then we're about to emit z-masked variant. */
37390 if (const0_operand (merge, mode))
37391 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37392 /* If merge is the same as accum then emit merge-masked variant. */
37393 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37395 merge = force_reg (mode, merge);
37396 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37398 /* Merging with an unknown value can happen if we z-mask with -O0. */
37399 else
37401 target = gen_reg_rtx (mode);
37402 emit_move_insn (target, merge);
37403 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37406 return target;
37409 case IX86_BUILTIN_4FNMASS:
37410 fcn = gen_avx5124fmaddps_4fnmaddss;
37411 masked = 0;
37412 goto s4fma_expand;
37414 case IX86_BUILTIN_4FMASS:
37415 fcn = gen_avx5124fmaddps_4fmaddss;
37416 masked = 0;
37417 goto s4fma_expand;
37419 case IX86_BUILTIN_4FNMASS_MASK:
37420 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37421 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37422 goto s4fma_expand;
37424 case IX86_BUILTIN_4FMASS_MASK:
37426 tree args[4];
37427 rtx ops[4];
37428 rtx wide_reg;
37429 rtx accum;
37430 rtx addr;
37431 rtx mem;
37433 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37434 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37436 s4fma_expand:
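/* Scalar forms: only the low SFmode element of each V4SF argument
   matters, so copy it through an SFmode temporary into the
   corresponding slice of the wide V64SF register.  */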
37437 mode = V4SFmode;
37438 wide_reg = gen_reg_rtx (V64SFmode);
37439 for (i = 0; i < 4; i++)
37441 rtx tmp;
37442 args[i] = CALL_EXPR_ARG (exp, i);
37443 ops[i] = expand_normal (args[i]);
37445 tmp = gen_reg_rtx (SFmode);
37446 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37448 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37449 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37452 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37453 accum = force_reg (V4SFmode, accum);
37455 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37456 addr = force_reg (Pmode, addr);
37458 mem = gen_rtx_MEM (V4SFmode, addr);
37460 target = gen_reg_rtx (V4SFmode);
37462 emit_move_insn (target, accum);
37464 if (! masked)
37465 emit_insn (fcn (target, accum, wide_reg, mem));
37466 else
37468 rtx merge, mask;
37469 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37471 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37473 if (CONST_INT_P (mask))
37474 mask = fixup_modeless_constant (mask, QImode);
37476 mask = force_reg (QImode, mask);
37478 if (GET_MODE (mask) != QImode)
37479 mask = gen_rtx_SUBREG (QImode, mask, 0);
37481 /* If merge is 0 then we're about to emit z-masked variant. */
37482 if (const0_operand (merge, mode))
37483 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37484 /* If merge is the same as accum then emit merge-masked
37485 variant. */
37486 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37488 merge = force_reg (mode, merge);
37489 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37491 /* Merging with an unknown value can happen if we z-mask
37492 with -O0. */
37493 else
37495 target = gen_reg_rtx (mode);
37496 emit_move_insn (target, merge);
37497 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37500 return target;
37502 case IX86_BUILTIN_RDPID:
37503 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37504 target);
37505 default:
37506 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37510 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
37511 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
37513 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
37514 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
37515 target);
37518 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37519 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37521 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37522 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37525 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37526 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37528 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37529 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37532 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37533 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37535 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37536 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37539 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37540 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37542 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37543 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37546 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37547 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37549 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37550 const struct builtin_description *d = bdesc_multi_arg + i;
37551 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37552 (enum ix86_builtin_func_type)
37553 d->flag, d->comparison);
37556 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37557 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37559 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37560 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37561 target);
37564 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37565 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37567 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37568 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37569 target);
37572 gcc_unreachable ();
37575 /* This returns the target-specific builtin with code CODE if
37576 current_function_decl has visibility on this builtin, which is checked
37577 using isa flags. Returns NULL_TREE otherwise. */
37579 static tree ix86_get_builtin (enum ix86_builtins code)
37581 struct cl_target_option *opts;
37582 tree target_tree = NULL_TREE;
37584 /* Determine the isa flags of current_function_decl. */
37586 if (current_function_decl)
37587 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37589 if (target_tree == NULL)
37590 target_tree = target_option_default_node;
37592 opts = TREE_TARGET_OPTION (target_tree);
37594 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37595 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37596 return ix86_builtin_decl (code, true);
37597 else
37598 return NULL_TREE;
37601 /* Return the function decl for the target-specific builtin
37602 for the given MPX builtin passed in FCODE. */
37603 static tree
37604 ix86_builtin_mpx_function (unsigned fcode)
37606 switch (fcode)
37608 case BUILT_IN_CHKP_BNDMK:
37609 return ix86_builtins[IX86_BUILTIN_BNDMK];
37611 case BUILT_IN_CHKP_BNDSTX:
37612 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37614 case BUILT_IN_CHKP_BNDLDX:
37615 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37617 case BUILT_IN_CHKP_BNDCL:
37618 return ix86_builtins[IX86_BUILTIN_BNDCL];
37620 case BUILT_IN_CHKP_BNDCU:
37621 return ix86_builtins[IX86_BUILTIN_BNDCU];
37623 case BUILT_IN_CHKP_BNDRET:
37624 return ix86_builtins[IX86_BUILTIN_BNDRET];
37626 case BUILT_IN_CHKP_INTERSECT:
37627 return ix86_builtins[IX86_BUILTIN_BNDINT];
37629 case BUILT_IN_CHKP_NARROW:
37630 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37632 case BUILT_IN_CHKP_SIZEOF:
37633 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37635 case BUILT_IN_CHKP_EXTRACT_LOWER:
37636 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37638 case BUILT_IN_CHKP_EXTRACT_UPPER:
37639 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37641 default:
37642 return NULL_TREE;
37645 gcc_unreachable ();
37648 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37650 Return an address to be used to load/store bounds for pointer
37651 passed in SLOT.
37653 SLOT_NO is an integer constant holding the number of a target
37654 dependent special slot to be used in case SLOT is not a memory.
37656 SPECIAL_BASE is a pointer to be used as a base of fake address
37657 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37658 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37660 static rtx
37661 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37663 rtx addr = NULL;
37665 /* A NULL slot means we pass bounds for a pointer that is not passed
37666 to the function at all. A register slot means we pass the pointer
37667 in a register. In both cases bounds are passed via the Bounds
37668 Table. Since we do not have an actual pointer stored in memory,
37669 we have to use fake addresses to access the Bounds Table. We
37670 start with (special_base - sizeof (void*)) and decrease this
37671 address by the pointer size to get addresses for the other slots. */
37672 if (!slot || REG_P (slot))
37674 gcc_assert (CONST_INT_P (slot_no));
37675 addr = plus_constant (Pmode, special_base,
37676 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37678 /* If the pointer is passed in memory then its address is used to
37679 access the Bounds Table. */
37680 else if (MEM_P (slot))
37682 addr = XEXP (slot, 0);
37683 if (!register_operand (addr, Pmode))
37684 addr = copy_addr_to_reg (addr);
37686 else
37687 gcc_unreachable ();
37689 return addr;
37692 /* Expand pass uses this hook to load bounds for function parameter
37693 PTR passed in SLOT in case its bounds are not passed in a register.
37695 If SLOT is a memory, then bounds are loaded as for regular pointer
37696 loaded from memory. PTR may be NULL in case SLOT is a memory.
37697 In such case value of PTR (if required) may be loaded from SLOT.
37699 If SLOT is NULL or a register then SLOT_NO is an integer constant
37700 holding the number of the target dependent special slot which should be
37701 used to obtain bounds.
37703 Return loaded bounds. */
37705 static rtx
37706 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37708 rtx reg = gen_reg_rtx (BNDmode);
37709 rtx addr;
37711 /* Get address to be used to access Bounds Table. Special slots start
37712 at the location of return address of the current function. */
37713 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37715 /* Load pointer value from a memory if we don't have it. */
37716 if (!ptr)
37718 gcc_assert (MEM_P (slot));
37719 ptr = copy_addr_to_reg (slot);
37722 if (!register_operand (ptr, Pmode))
37723 ptr = ix86_zero_extend_to_Pmode (ptr);
37725 emit_insn (BNDmode == BND64mode
37726 ? gen_bnd64_ldx (reg, addr, ptr)
37727 : gen_bnd32_ldx (reg, addr, ptr));
37729 return reg;
37732 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37733 passed in SLOT in case BOUNDS are not passed in a register.
37735 If SLOT is a memory, then BOUNDS are stored as for regular pointer
37736 stored in memory. PTR may be NULL in case SLOT is a memory.
37737 In such case value of PTR (if required) may be loaded from SLOT.
37739 If SLOT is NULL or a register then SLOT_NO is an integer constant
37740 holding the number of the target dependent special slot which should be
37741 used to store BOUNDS. */
37743 static void
37744 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37746 rtx addr;
37748 /* Get address to be used to access Bounds Table. Special slots start
37749 at the location of return address of a called function. */
37750 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37752 /* Load pointer value from a memory if we don't have it. */
37753 if (!ptr)
37755 gcc_assert (MEM_P (slot));
37756 ptr = copy_addr_to_reg (slot);
37759 if (!register_operand (ptr, Pmode))
37760 ptr = ix86_zero_extend_to_Pmode (ptr);
37762 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37763 if (!register_operand (bounds, BNDmode))
37764 bounds = copy_to_mode_reg (BNDmode, bounds);
37766 emit_insn (BNDmode == BND64mode
37767 ? gen_bnd64_stx (addr, ptr, bounds)
37768 : gen_bnd32_stx (addr, ptr, bounds));
37771 /* Load and return bounds returned by function in SLOT. */
37773 static rtx
37774 ix86_load_returned_bounds (rtx slot)
37776 rtx res;
37778 gcc_assert (REG_P (slot));
37779 res = gen_reg_rtx (BNDmode);
37780 emit_move_insn (res, slot);
37782 return res;
37785 /* Store BOUNDS returned by function into SLOT. */
37787 static void
37788 ix86_store_returned_bounds (rtx slot, rtx bounds)
37790 gcc_assert (REG_P (slot));
37791 emit_move_insn (slot, bounds);
37794 /* Returns a function decl for a vectorized version of the combined function
37795 with combined_fn code FN and the result vector type TYPE_OUT, or NULL_TREE
37796 if it is not available. */
37798 static tree
37799 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37800 tree type_in)
37802 machine_mode in_mode, out_mode;
37803 int in_n, out_n;
37805 if (TREE_CODE (type_out) != VECTOR_TYPE
37806 || TREE_CODE (type_in) != VECTOR_TYPE)
37807 return NULL_TREE;
37809 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37810 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37811 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37812 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37814 switch (fn)
37816 CASE_CFN_EXP2:
37817 if (out_mode == SFmode && in_mode == SFmode)
37819 if (out_n == 16 && in_n == 16)
37820 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37822 break;
37824 CASE_CFN_IFLOOR:
37825 CASE_CFN_LFLOOR:
37826 CASE_CFN_LLFLOOR:
37827 /* The round insn does not trap on denormals. */
37828 if (flag_trapping_math || !TARGET_SSE4_1)
37829 break;
37831 if (out_mode == SImode && in_mode == DFmode)
37833 if (out_n == 4 && in_n == 2)
37834 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37835 else if (out_n == 8 && in_n == 4)
37836 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37837 else if (out_n == 16 && in_n == 8)
37838 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37840 if (out_mode == SImode && in_mode == SFmode)
37842 if (out_n == 4 && in_n == 4)
37843 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37844 else if (out_n == 8 && in_n == 8)
37845 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37846 else if (out_n == 16 && in_n == 16)
37847 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37849 break;
37851 CASE_CFN_ICEIL:
37852 CASE_CFN_LCEIL:
37853 CASE_CFN_LLCEIL:
37854 /* The round insn does not trap on denormals. */
37855 if (flag_trapping_math || !TARGET_SSE4_1)
37856 break;
37858 if (out_mode == SImode && in_mode == DFmode)
37860 if (out_n == 4 && in_n == 2)
37861 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37862 else if (out_n == 8 && in_n == 4)
37863 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37864 else if (out_n == 16 && in_n == 8)
37865 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37867 if (out_mode == SImode && in_mode == SFmode)
37869 if (out_n == 4 && in_n == 4)
37870 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37871 else if (out_n == 8 && in_n == 8)
37872 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37873 else if (out_n == 16 && in_n == 16)
37874 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37876 break;
37878 CASE_CFN_IRINT:
37879 CASE_CFN_LRINT:
37880 CASE_CFN_LLRINT:
37881 if (out_mode == SImode && in_mode == DFmode)
37883 if (out_n == 4 && in_n == 2)
37884 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37885 else if (out_n == 8 && in_n == 4)
37886 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37887 else if (out_n == 16 && in_n == 8)
37888 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37890 if (out_mode == SImode && in_mode == SFmode)
37892 if (out_n == 4 && in_n == 4)
37893 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37894 else if (out_n == 8 && in_n == 8)
37895 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37896 else if (out_n == 16 && in_n == 16)
37897 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37899 break;
37901 CASE_CFN_IROUND:
37902 CASE_CFN_LROUND:
37903 CASE_CFN_LLROUND:
37904 /* The round insn does not trap on denormals. */
37905 if (flag_trapping_math || !TARGET_SSE4_1)
37906 break;
37908 if (out_mode == SImode && in_mode == DFmode)
37910 if (out_n == 4 && in_n == 2)
37911 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37912 else if (out_n == 8 && in_n == 4)
37913 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37914 else if (out_n == 16 && in_n == 8)
37915 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37917 if (out_mode == SImode && in_mode == SFmode)
37919 if (out_n == 4 && in_n == 4)
37920 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37921 else if (out_n == 8 && in_n == 8)
37922 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37923 else if (out_n == 16 && in_n == 16)
37924 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37926 break;
37928 CASE_CFN_FLOOR:
37929 /* The round insn does not trap on denormals. */
37930 if (flag_trapping_math || !TARGET_SSE4_1)
37931 break;
37933 if (out_mode == DFmode && in_mode == DFmode)
37935 if (out_n == 2 && in_n == 2)
37936 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37937 else if (out_n == 4 && in_n == 4)
37938 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37939 else if (out_n == 8 && in_n == 8)
37940 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37942 if (out_mode == SFmode && in_mode == SFmode)
37944 if (out_n == 4 && in_n == 4)
37945 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37946 else if (out_n == 8 && in_n == 8)
37947 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37948 else if (out_n == 16 && in_n == 16)
37949 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37951 break;
37953 CASE_CFN_CEIL:
37954 /* The round insn does not trap on denormals. */
37955 if (flag_trapping_math || !TARGET_SSE4_1)
37956 break;
37958 if (out_mode == DFmode && in_mode == DFmode)
37960 if (out_n == 2 && in_n == 2)
37961 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37962 else if (out_n == 4 && in_n == 4)
37963 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37964 else if (out_n == 8 && in_n == 8)
37965 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37967 if (out_mode == SFmode && in_mode == SFmode)
37969 if (out_n == 4 && in_n == 4)
37970 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37971 else if (out_n == 8 && in_n == 8)
37972 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37973 else if (out_n == 16 && in_n == 16)
37974 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37976 break;
37978 CASE_CFN_TRUNC:
37979 /* The round insn does not trap on denormals. */
37980 if (flag_trapping_math || !TARGET_SSE4_1)
37981 break;
37983 if (out_mode == DFmode && in_mode == DFmode)
37985 if (out_n == 2 && in_n == 2)
37986 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37987 else if (out_n == 4 && in_n == 4)
37988 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37989 else if (out_n == 8 && in_n == 8)
37990 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37992 if (out_mode == SFmode && in_mode == SFmode)
37994 if (out_n == 4 && in_n == 4)
37995 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37996 else if (out_n == 8 && in_n == 8)
37997 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37998 else if (out_n == 16 && in_n == 16)
37999 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38001 break;
38003 CASE_CFN_RINT:
38004 /* The round insn does not trap on denormals. */
38005 if (flag_trapping_math || !TARGET_SSE4_1)
38006 break;
38008 if (out_mode == DFmode && in_mode == DFmode)
38010 if (out_n == 2 && in_n == 2)
38011 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38012 else if (out_n == 4 && in_n == 4)
38013 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38015 if (out_mode == SFmode && in_mode == SFmode)
38017 if (out_n == 4 && in_n == 4)
38018 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38019 else if (out_n == 8 && in_n == 8)
38020 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38022 break;
38024 CASE_CFN_FMA:
38025 if (out_mode == DFmode && in_mode == DFmode)
38027 if (out_n == 2 && in_n == 2)
38028 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38029 if (out_n == 4 && in_n == 4)
38030 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38032 if (out_mode == SFmode && in_mode == SFmode)
38034 if (out_n == 4 && in_n == 4)
38035 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38036 if (out_n == 8 && in_n == 8)
38037 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38039 break;
38041 default:
38042 break;
38045 /* Dispatch to a handler for a vectorization library. */
38046 if (ix86_veclib_handler)
38047 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38049 return NULL_TREE;
38052 /* Handler for an SVML-style interface to
38053 a library with vectorized intrinsics. */
38055 static tree
38056 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38058 char name[20];
38059 tree fntype, new_fndecl, args;
38060 unsigned arity;
38061 const char *bname;
38062 machine_mode el_mode, in_mode;
38063 int n, in_n;
38065 /* The SVML is suitable for unsafe math only. */
38066 if (!flag_unsafe_math_optimizations)
38067 return NULL_TREE;
38069 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38070 n = TYPE_VECTOR_SUBPARTS (type_out);
38071 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38072 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38073 if (el_mode != in_mode
38074 || n != in_n)
38075 return NULL_TREE;
38077 switch (fn)
38079 CASE_CFN_EXP:
38080 CASE_CFN_LOG:
38081 CASE_CFN_LOG10:
38082 CASE_CFN_POW:
38083 CASE_CFN_TANH:
38084 CASE_CFN_TAN:
38085 CASE_CFN_ATAN:
38086 CASE_CFN_ATAN2:
38087 CASE_CFN_ATANH:
38088 CASE_CFN_CBRT:
38089 CASE_CFN_SINH:
38090 CASE_CFN_SIN:
38091 CASE_CFN_ASINH:
38092 CASE_CFN_ASIN:
38093 CASE_CFN_COSH:
38094 CASE_CFN_COS:
38095 CASE_CFN_ACOSH:
38096 CASE_CFN_ACOS:
38097 if ((el_mode != DFmode || n != 2)
38098 && (el_mode != SFmode || n != 4))
38099 return NULL_TREE;
38100 break;
38102 default:
38103 return NULL_TREE;
38106 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38107 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38109 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38110 strcpy (name, "vmlsLn4");
38111 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38112 strcpy (name, "vmldLn2");
38113 else if (n == 4)
38115 sprintf (name, "vmls%s", bname+10);
38116 name[strlen (name)-1] = '4';
38118 else
38119 sprintf (name, "vmld%s2", bname+10);
38121 /* Convert the first letter of the math function name to uppercase. */
38122 name[4] &= ~0x20;
38124 arity = 0;
38125 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38126 arity++;
38128 if (arity == 1)
38129 fntype = build_function_type_list (type_out, type_in, NULL);
38130 else
38131 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38133 /* Build a function declaration for the vectorized function. */
38134 new_fndecl = build_decl (BUILTINS_LOCATION,
38135 FUNCTION_DECL, get_identifier (name), fntype);
38136 TREE_PUBLIC (new_fndecl) = 1;
38137 DECL_EXTERNAL (new_fndecl) = 1;
38138 DECL_IS_NOVOPS (new_fndecl) = 1;
38139 TREE_READONLY (new_fndecl) = 1;
38141 return new_fndecl;
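/* Editor's sketch, not part of the original source: a standalone
   illustration of the SVML name mangling performed above.  It assumes
   BNAME still carries the "__builtin_" prefix, exactly as DECL_NAME does
   for the scalar builtins handled here, and that sprintf/strlen are
   available (in this file they come in via system.h).

     "__builtin_sinf" with n == 4  ->  "vmlsSin4"
     "__builtin_sin"  with n == 2  ->  "vmldSin2"  */

static void
example_svml_name (char name[20], const char *bname, int n)
{
  if (n == 4)
    {
      /* The trailing 'f' of the float variant is overwritten by '4'.  */
      sprintf (name, "vmls%s", bname + 10);
      name[strlen (name) - 1] = '4';
    }
  else
    sprintf (name, "vmld%s2", bname + 10);
  /* Upper-case the first letter of the math function name.  */
  name[4] &= ~0x20;
}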
38144 /* Handler for an ACML-style interface to
38145 a library with vectorized intrinsics. */
38147 static tree
38148 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38150 char name[20] = "__vr.._";
38151 tree fntype, new_fndecl, args;
38152 unsigned arity;
38153 const char *bname;
38154 machine_mode el_mode, in_mode;
38155 int n, in_n;
38157 /* The ACML is 64-bit only and suitable for unsafe math only, as
38158 it does not correctly support parts of IEEE (such as denormals)
38159 with the required precision. */
38160 if (!TARGET_64BIT
38161 || !flag_unsafe_math_optimizations)
38162 return NULL_TREE;
38164 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38165 n = TYPE_VECTOR_SUBPARTS (type_out);
38166 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38167 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38168 if (el_mode != in_mode
38169 || n != in_n)
38170 return NULL_TREE;
38172 switch (fn)
38174 CASE_CFN_SIN:
38175 CASE_CFN_COS:
38176 CASE_CFN_EXP:
38177 CASE_CFN_LOG:
38178 CASE_CFN_LOG2:
38179 CASE_CFN_LOG10:
38180 if (el_mode == DFmode && n == 2)
38182 name[4] = 'd';
38183 name[5] = '2';
38185 else if (el_mode == SFmode && n == 4)
38187 name[4] = 's';
38188 name[5] = '4';
38190 else
38191 return NULL_TREE;
38192 break;
38194 default:
38195 return NULL_TREE;
38198 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38199 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38200 sprintf (name + 7, "%s", bname+10);
38202 arity = 0;
38203 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38204 arity++;
38206 if (arity == 1)
38207 fntype = build_function_type_list (type_out, type_in, NULL);
38208 else
38209 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38211 /* Build a function declaration for the vectorized function. */
38212 new_fndecl = build_decl (BUILTINS_LOCATION,
38213 FUNCTION_DECL, get_identifier (name), fntype);
38214 TREE_PUBLIC (new_fndecl) = 1;
38215 DECL_EXTERNAL (new_fndecl) = 1;
38216 DECL_IS_NOVOPS (new_fndecl) = 1;
38217 TREE_READONLY (new_fndecl) = 1;
38219 return new_fndecl;
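/* Editor's worked example, not part of the original source: the ACML
   mangling above only patches the "__vr.._" template and appends the
   scalar builtin name without its "__builtin_" prefix, e.g.

     __builtin_sin  with V2DFmode  ->  "__vrd2_sin"
     __builtin_sinf with V4SFmode  ->  "__vrs4_sinf"

   i.e. name[4] selects single/double precision, name[5] the vector
   length, and the rest is copied verbatim from BNAME + 10.  */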
38222 /* Returns a decl of a function that implements gather load with
38223 memory type MEM_VECTYPE, index type INDEX_TYPE and scale factor SCALE.
38224 Return NULL_TREE if it is not available. */
38226 static tree
38227 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38228 const_tree index_type, int scale)
38230 bool si;
38231 enum ix86_builtins code;
38233 if (! TARGET_AVX2)
38234 return NULL_TREE;
38236 if ((TREE_CODE (index_type) != INTEGER_TYPE
38237 && !POINTER_TYPE_P (index_type))
38238 || (TYPE_MODE (index_type) != SImode
38239 && TYPE_MODE (index_type) != DImode))
38240 return NULL_TREE;
38242 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38243 return NULL_TREE;
38245 /* The v*gather* insns sign-extend the index to pointer mode. */
38246 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38247 && TYPE_UNSIGNED (index_type))
38248 return NULL_TREE;
38250 if (scale <= 0
38251 || scale > 8
38252 || (scale & (scale - 1)) != 0)
38253 return NULL_TREE;
38255 si = TYPE_MODE (index_type) == SImode;
38256 switch (TYPE_MODE (mem_vectype))
38258 case E_V2DFmode:
38259 if (TARGET_AVX512VL)
38260 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38261 else
38262 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38263 break;
38264 case E_V4DFmode:
38265 if (TARGET_AVX512VL)
38266 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38267 else
38268 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38269 break;
38270 case E_V2DImode:
38271 if (TARGET_AVX512VL)
38272 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38273 else
38274 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38275 break;
38276 case E_V4DImode:
38277 if (TARGET_AVX512VL)
38278 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38279 else
38280 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38281 break;
38282 case E_V4SFmode:
38283 if (TARGET_AVX512VL)
38284 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38285 else
38286 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38287 break;
38288 case E_V8SFmode:
38289 if (TARGET_AVX512VL)
38290 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38291 else
38292 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38293 break;
38294 case E_V4SImode:
38295 if (TARGET_AVX512VL)
38296 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38297 else
38298 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38299 break;
38300 case E_V8SImode:
38301 if (TARGET_AVX512VL)
38302 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38303 else
38304 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38305 break;
38306 case E_V8DFmode:
38307 if (TARGET_AVX512F)
38308 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38309 else
38310 return NULL_TREE;
38311 break;
38312 case E_V8DImode:
38313 if (TARGET_AVX512F)
38314 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38315 else
38316 return NULL_TREE;
38317 break;
38318 case E_V16SFmode:
38319 if (TARGET_AVX512F)
38320 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38321 else
38322 return NULL_TREE;
38323 break;
38324 case E_V16SImode:
38325 if (TARGET_AVX512F)
38326 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38327 else
38328 return NULL_TREE;
38329 break;
38330 default:
38331 return NULL_TREE;
38334 return ix86_get_builtin (code);
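/* Editor's sketch, not part of the original source: the scale test above
   is the usual power-of-two check restricted to the 1/2/4/8 range accepted
   by the v*gather* addressing modes.  A hypothetical standalone form:  */

static bool
example_valid_gather_scale (int scale)
{
  /* Accepts exactly 1, 2, 4 and 8; rejects 0, 3, 6, 16, ...  */
  return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
}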
38337 /* Returns a decl of a function that implements scatter store with
38338 register type VECTYPE and index type INDEX_TYPE and SCALE.
38339 Return NULL_TREE if it is not available. */
38341 static tree
38342 ix86_vectorize_builtin_scatter (const_tree vectype,
38343 const_tree index_type, int scale)
38345 bool si;
38346 enum ix86_builtins code;
38348 if (!TARGET_AVX512F)
38349 return NULL_TREE;
38351 if ((TREE_CODE (index_type) != INTEGER_TYPE
38352 && !POINTER_TYPE_P (index_type))
38353 || (TYPE_MODE (index_type) != SImode
38354 && TYPE_MODE (index_type) != DImode))
38355 return NULL_TREE;
38357 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38358 return NULL_TREE;
38360 /* The v*scatter* insns sign-extend the index to pointer mode. */
38361 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38362 && TYPE_UNSIGNED (index_type))
38363 return NULL_TREE;
38365 /* Scale can be 1, 2, 4 or 8. */
38366 if (scale <= 0
38367 || scale > 8
38368 || (scale & (scale - 1)) != 0)
38369 return NULL_TREE;
38371 si = TYPE_MODE (index_type) == SImode;
38372 switch (TYPE_MODE (vectype))
38374 case E_V8DFmode:
38375 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38376 break;
38377 case E_V8DImode:
38378 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38379 break;
38380 case E_V16SFmode:
38381 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38382 break;
38383 case E_V16SImode:
38384 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38385 break;
38386 default:
38387 return NULL_TREE;
38390 return ix86_builtins[code];
38393 /* Return true if it is safe to use the rsqrt optabs to optimize
38394 1.0/sqrt. */
38396 static bool
38397 use_rsqrt_p ()
38399 return (TARGET_SSE_MATH
38400 && flag_finite_math_only
38401 && !flag_trapping_math
38402 && flag_unsafe_math_optimizations);
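/* Editor's note, not part of the original source: the guard above matches
   what -ffast-math style options provide.  For example, a loop such as

     void f (float *r, const float *x, int n)
     {
       for (int i = 0; i < n; i++)
         r[i] = 1.0f / __builtin_sqrtf (x[i]);
     }

   compiled with -O2 -ffast-math (which implies -ffinite-math-only,
   -fno-trapping-math and -funsafe-math-optimizations) may use rsqrtps plus
   a Newton-Raphson correction step instead of sqrtps followed by divps;
   the exact sequence depends on the selected tuning.  */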
38405 /* Returns a decl of a target-specific builtin that implements the
38406 reciprocal of the function FNDECL, or NULL_TREE if not available. */
38408 static tree
38409 ix86_builtin_reciprocal (tree fndecl)
38411 switch (DECL_FUNCTION_CODE (fndecl))
38413 /* Vectorized version of sqrt to rsqrt conversion. */
38414 case IX86_BUILTIN_SQRTPS_NR:
38415 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38417 case IX86_BUILTIN_SQRTPS_NR256:
38418 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38420 default:
38421 return NULL_TREE;
38425 /* Helper for avx_vpermilps256_operand et al. This is also used by
38426 the expansion functions to turn the parallel back into a mask.
38427 The return value is 0 for no match and the imm8+1 for a match. */
38429 int
38430 avx_vpermilp_parallel (rtx par, machine_mode mode)
38432 unsigned i, nelt = GET_MODE_NUNITS (mode);
38433 unsigned mask = 0;
38434 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38436 if (XVECLEN (par, 0) != (int) nelt)
38437 return 0;
38439 /* Validate that all of the elements are constants, and not totally
38440 out of range. Copy the data into an integral array to make the
38441 subsequent checks easier. */
38442 for (i = 0; i < nelt; ++i)
38444 rtx er = XVECEXP (par, 0, i);
38445 unsigned HOST_WIDE_INT ei;
38447 if (!CONST_INT_P (er))
38448 return 0;
38449 ei = INTVAL (er);
38450 if (ei >= nelt)
38451 return 0;
38452 ipar[i] = ei;
38455 switch (mode)
38457 case E_V8DFmode:
38458 /* In the 512-bit DFmode case, we can only move elements within
38459 a 128-bit lane. First fill the second part of the mask,
38460 then fallthru. */
38461 for (i = 4; i < 6; ++i)
38463 if (ipar[i] < 4 || ipar[i] >= 6)
38464 return 0;
38465 mask |= (ipar[i] - 4) << i;
38467 for (i = 6; i < 8; ++i)
38469 if (ipar[i] < 6)
38470 return 0;
38471 mask |= (ipar[i] - 6) << i;
38473 /* FALLTHRU */
38475 case E_V4DFmode:
38476 /* In the 256-bit DFmode case, we can only move elements within
38477 a 128-bit lane. */
38478 for (i = 0; i < 2; ++i)
38480 if (ipar[i] >= 2)
38481 return 0;
38482 mask |= ipar[i] << i;
38484 for (i = 2; i < 4; ++i)
38486 if (ipar[i] < 2)
38487 return 0;
38488 mask |= (ipar[i] - 2) << i;
38490 break;
38492 case E_V16SFmode:
38493 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38494 must mirror the permutation in the lower 256 bits. */
38495 for (i = 0; i < 8; ++i)
38496 if (ipar[i] + 8 != ipar[i + 8])
38497 return 0;
38498 /* FALLTHRU */
38500 case E_V8SFmode:
38501 /* In the 256-bit SFmode case, we have full freedom of
38502 movement within the low 128-bit lane, but the high 128-bit
38503 lane must mirror the exact same pattern. */
38504 for (i = 0; i < 4; ++i)
38505 if (ipar[i] + 4 != ipar[i + 4])
38506 return 0;
38507 nelt = 4;
38508 /* FALLTHRU */
38510 case E_V2DFmode:
38511 case E_V4SFmode:
38512 /* In the 128-bit case, we have full freedom in the placement of
38513 the elements from the source operand. */
38514 for (i = 0; i < nelt; ++i)
38515 mask |= ipar[i] << (i * (nelt / 2));
38516 break;
38518 default:
38519 gcc_unreachable ();
38522 /* Make sure success has a non-zero value by adding one. */
38523 return mask + 1;
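/* Editor's sketch, not part of the original source: a worked example of the
   V4SFmode arm above.  A vec_select parallel of (1 0 3 2) packs each index
   into a 2-bit field of the vpermilps immediate, giving 0xB1; the helper
   returns 0xB2 because of the final "mask + 1".  Hypothetical standalone
   form of just that arm:  */

static unsigned
example_v4sf_vpermilps_imm (const unsigned char ipar[4])
{
  unsigned mask = 0;
  for (unsigned i = 0; i < 4; ++i)
    mask |= ipar[i] << (i * 2);   /* nelt / 2 == 2 bits per element.  */
  return mask;                    /* {1, 0, 3, 2} -> 0xB1.  */
}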
38526 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38527 the expansion functions to turn the parallel back into a mask.
38528 The return value is 0 for no match and the imm8+1 for a match. */
38530 int
38531 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38533 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38534 unsigned mask = 0;
38535 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38537 if (XVECLEN (par, 0) != (int) nelt)
38538 return 0;
38540 /* Validate that all of the elements are constants, and not totally
38541 out of range. Copy the data into an integral array to make the
38542 subsequent checks easier. */
38543 for (i = 0; i < nelt; ++i)
38545 rtx er = XVECEXP (par, 0, i);
38546 unsigned HOST_WIDE_INT ei;
38548 if (!CONST_INT_P (er))
38549 return 0;
38550 ei = INTVAL (er);
38551 if (ei >= 2 * nelt)
38552 return 0;
38553 ipar[i] = ei;
38556 /* Validate that each half of the permute consists of consecutive elements. */
38557 for (i = 0; i < nelt2 - 1; ++i)
38558 if (ipar[i] + 1 != ipar[i + 1])
38559 return 0;
38560 for (i = nelt2; i < nelt - 1; ++i)
38561 if (ipar[i] + 1 != ipar[i + 1])
38562 return 0;
38564 /* Reconstruct the mask. */
38565 for (i = 0; i < 2; ++i)
38567 unsigned e = ipar[i * nelt2];
38568 if (e % nelt2)
38569 return 0;
38570 e /= nelt2;
38571 mask |= e << (i * 4);
38574 /* Make sure success has a non-zero value by adding one. */
38575 return mask + 1;
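/* Editor's worked example, not part of the original source: for V4DFmode a
   parallel of (2 3 4 5) describes "high lane of the first operand, low lane
   of the second".  Both halves are consecutive, the low half starts at
   element 2 (lane 1 of operand 0) and the high half at element 4 (lane 0 of
   operand 1), so the reconstructed vperm2f128 immediate is

     mask = (2 / 2) << 0 | (4 / 2) << 4 = 0x21

   and the helper returns 0x22 because of the trailing "mask + 1".  */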
38578 /* Return a register priority for hard reg HARD_REGNO. */
38579 static int
38580 ix86_register_priority (int hard_regno)
38582 /* ebp and r13 as a base always want a displacement, and r12 as a
38583 base always wants an index, so discourage their use in an
38584 address. */
38585 if (hard_regno == R12_REG || hard_regno == R13_REG)
38586 return 0;
38587 if (hard_regno == BP_REG)
38588 return 1;
38589 /* New x86-64 int registers result in bigger code size. Discourage
38590 them. */
38591 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38592 return 2;
38593 /* New x86-64 SSE registers result in bigger code size. Discourage
38594 them. */
38595 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38596 return 2;
38597 /* Usage of AX register results in smaller code. Prefer it. */
38598 if (hard_regno == AX_REG)
38599 return 4;
38600 return 3;
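/* Editor's note, not part of the original source: the ordering reflects
   instruction encoding; e.g. "mov (%rbp),%rax" and "mov (%r13),%rax" need
   an explicit zero displacement byte, "mov (%r12),%rax" needs a SIB byte,
   any use of r8-r15 or xmm8-xmm15 needs a REX prefix, and some
   instructions have shorter eax-specific immediate encodings.  The
   priorities steer the allocator toward the cheaper encodings when several
   registers are equally available.  */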
38603 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38605 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38606 QImode must go into class Q_REGS.
38607 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38608 movdf to do mem-to-mem moves through integer regs. */
38610 static reg_class_t
38611 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38613 machine_mode mode = GET_MODE (x);
38615 /* We're only allowed to return a subclass of CLASS. Many of the
38616 following checks fail for NO_REGS, so eliminate that early. */
38617 if (regclass == NO_REGS)
38618 return NO_REGS;
38620 /* All classes can load zeros. */
38621 if (x == CONST0_RTX (mode))
38622 return regclass;
38624 /* Force constants into memory if we are loading a (nonzero) constant into
38625 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38626 instructions to load from a constant. */
38627 if (CONSTANT_P (x)
38628 && (MAYBE_MMX_CLASS_P (regclass)
38629 || MAYBE_SSE_CLASS_P (regclass)
38630 || MAYBE_MASK_CLASS_P (regclass)))
38631 return NO_REGS;
38633 /* Floating-point constants need more complex checks. */
38634 if (CONST_DOUBLE_P (x))
38636 /* General regs can load everything. */
38637 if (INTEGER_CLASS_P (regclass))
38638 return regclass;
38640 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38641 zero above. We only want to wind up preferring 80387 registers if
38642 we plan on doing computation with them. */
38643 if (IS_STACK_MODE (mode)
38644 && standard_80387_constant_p (x) > 0)
38646 /* Limit class to FP regs. */
38647 if (FLOAT_CLASS_P (regclass))
38648 return FLOAT_REGS;
38649 else if (regclass == FP_TOP_SSE_REGS)
38650 return FP_TOP_REG;
38651 else if (regclass == FP_SECOND_SSE_REGS)
38652 return FP_SECOND_REG;
38655 return NO_REGS;
38658 /* Prefer SSE regs only, if we can use them for math. */
38659 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38660 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38662 /* Generally when we see PLUS here, it's the function invariant
38663 (plus soft-fp const_int). Which can only be computed into general
38664 regs. */
38665 if (GET_CODE (x) == PLUS)
38666 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38668 /* QImode constants are easy to load, but non-constant QImode data
38669 must go into Q_REGS. */
38670 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38672 if (Q_CLASS_P (regclass))
38673 return regclass;
38674 else if (reg_class_subset_p (Q_REGS, regclass))
38675 return Q_REGS;
38676 else
38677 return NO_REGS;
38680 return regclass;
38683 /* Discourage putting floating-point values in SSE registers unless
38684 SSE math is being used, and likewise for the 387 registers. */
38685 static reg_class_t
38686 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38688 machine_mode mode = GET_MODE (x);
38690 /* Restrict the output reload class to the register bank that we are doing
38691 math on. If we would like not to return a subset of CLASS, reject this
38692 alternative: if reload cannot do this, it will still use its choice. */
38694 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38695 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38697 if (IS_STACK_MODE (mode))
38699 if (regclass == FP_TOP_SSE_REGS)
38700 return FP_TOP_REG;
38701 else if (regclass == FP_SECOND_SSE_REGS)
38702 return FP_SECOND_REG;
38703 else
38704 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38707 return regclass;
38710 static reg_class_t
38711 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38712 machine_mode mode, secondary_reload_info *sri)
38714 /* Double-word spills from general registers to non-offsettable memory
38715 references (zero-extended addresses) require special handling. */
38716 if (TARGET_64BIT
38717 && MEM_P (x)
38718 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38719 && INTEGER_CLASS_P (rclass)
38720 && !offsettable_memref_p (x))
38722 sri->icode = (in_p
38723 ? CODE_FOR_reload_noff_load
38724 : CODE_FOR_reload_noff_store);
38725 /* Add the cost of moving address to a temporary. */
38726 sri->extra_cost = 1;
38728 return NO_REGS;
38731 /* QImode spills from non-QI registers require
38732 an intermediate register on 32-bit targets. */
38733 if (mode == QImode
38734 && ((!TARGET_64BIT && !in_p
38735 && INTEGER_CLASS_P (rclass)
38736 && MAYBE_NON_Q_CLASS_P (rclass))
38737 || (!TARGET_AVX512DQ
38738 && MAYBE_MASK_CLASS_P (rclass))))
38740 int regno = true_regnum (x);
38742 /* Return Q_REGS if the operand is in memory. */
38743 if (regno == -1)
38744 return Q_REGS;
38746 return NO_REGS;
38749 /* This condition handles the corner case where an expression involving
38750 pointers gets vectorized. We're trying to use the address of a
38751 stack slot as a vector initializer.
38753 (set (reg:V2DI 74 [ vect_cst_.2 ])
38754 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38756 Eventually frame gets turned into sp+offset like this:
38758 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38759 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38760 (const_int 392 [0x188]))))
38762 That later gets turned into:
38764 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38765 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38766 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38768 We'll have the following reload recorded:
38770 Reload 0: reload_in (DI) =
38771 (plus:DI (reg/f:DI 7 sp)
38772 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38773 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38774 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38775 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38776 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38777 reload_reg_rtx: (reg:V2DI 22 xmm1)
38779 That isn't going to work, since SSE instructions can't handle scalar
38780 additions. Returning GENERAL_REGS forces the addition into an integer
38781 register, and reload can handle subsequent reloads without problems. */
38783 if (in_p && GET_CODE (x) == PLUS
38784 && SSE_CLASS_P (rclass)
38785 && SCALAR_INT_MODE_P (mode))
38786 return GENERAL_REGS;
38788 return NO_REGS;
38791 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38793 static bool
38794 ix86_class_likely_spilled_p (reg_class_t rclass)
38796 switch (rclass)
38798 case AREG:
38799 case DREG:
38800 case CREG:
38801 case BREG:
38802 case AD_REGS:
38803 case SIREG:
38804 case DIREG:
38805 case SSE_FIRST_REG:
38806 case FP_TOP_REG:
38807 case FP_SECOND_REG:
38808 case BND_REGS:
38809 return true;
38811 default:
38812 break;
38815 return false;
38818 /* If we are copying between registers from different register sets
38819 (e.g. FP and integer), we may need a memory location.
38821 The function can't work reliably when one of the CLASSES is a class
38822 containing registers from multiple sets. We avoid this by never combining
38823 different sets in a single alternative in the machine description.
38824 Ensure that this constraint holds to avoid unexpected surprises.
38826 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38827 so do not enforce these sanity checks.
38829 To optimize register_move_cost performance, define inline variant. */
38831 static inline bool
38832 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38833 reg_class_t class2, int strict)
38835 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38836 return false;
38838 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38839 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38840 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38841 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38842 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38843 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38844 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38845 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38847 gcc_assert (!strict || lra_in_progress);
38848 return true;
38851 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38852 return true;
38854 /* Between mask and general, we have moves no larger than word size. */
38855 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38856 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38857 return true;
38859 /* ??? This is a lie. We do have moves between mmx/general, and for
38860 mmx/sse2. But by saying we need secondary memory we discourage the
38861 register allocator from using the mmx registers unless needed. */
38862 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38863 return true;
38865 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38867 /* SSE1 doesn't have any direct moves from other classes. */
38868 if (!TARGET_SSE2)
38869 return true;
38871 /* If the target says that inter-unit moves are more expensive
38872 than moving through memory, then don't generate them. */
38873 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38874 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38875 return true;
38877 /* Between SSE and general, we have moves no larger than word size. */
38878 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38879 return true;
38882 return false;
38885 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38887 static bool
38888 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38889 reg_class_t class2)
38891 return inline_secondary_memory_needed (mode, class1, class2, true);
38894 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38896 get_secondary_mem widens integral modes to BITS_PER_WORD.
38897 There is no need to emit a full 64-bit move on 64-bit targets
38898 for integral modes that can be moved using a 32-bit move. */
38900 static machine_mode
38901 ix86_secondary_memory_needed_mode (machine_mode mode)
38903 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38904 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38905 return mode;
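/* Editor's worked example, not part of the original source: a QImode or
   HImode value that must go through memory is widened to SImode here, so
   the spill slot is accessed with a single 32-bit load/store; SImode and
   wider integral modes, and all non-integral modes, are returned
   unchanged.  */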
38908 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38910 On the 80386, this is the size of MODE in words,
38911 except in the FP regs, where a single reg is always enough. */
38913 static unsigned char
38914 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38916 if (MAYBE_INTEGER_CLASS_P (rclass))
38918 if (mode == XFmode)
38919 return (TARGET_64BIT ? 2 : 3);
38920 else if (mode == XCmode)
38921 return (TARGET_64BIT ? 4 : 6);
38922 else
38923 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38925 else
38927 if (COMPLEX_MODE_P (mode))
38928 return 2;
38929 else
38930 return 1;
38934 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38936 static bool
38937 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38938 reg_class_t regclass)
38940 if (from == to)
38941 return true;
38943 /* x87 registers can't do subreg at all, as all values are reformatted
38944 to extended precision. */
38945 if (MAYBE_FLOAT_CLASS_P (regclass))
38946 return false;
38948 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38950 /* Vector registers do not support QI or HImode loads. If we don't
38951 disallow a change to these modes, reload will assume it's ok to
38952 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38953 the vec_dupv4hi pattern. */
38954 if (GET_MODE_SIZE (from) < 4)
38955 return false;
38958 return true;
38961 /* Return index of MODE in the sse load/store tables. */
38963 static inline int
38964 sse_store_index (machine_mode mode)
38966 switch (GET_MODE_SIZE (mode))
38968 case 4:
38969 return 0;
38970 case 8:
38971 return 1;
38972 case 16:
38973 return 2;
38974 case 32:
38975 return 3;
38976 case 64:
38977 return 4;
38978 default:
38979 return -1;
38983 /* Return the cost of moving data of mode M between a
38984 register and memory. A value of 2 is the default; this cost is
38985 relative to those in `REGISTER_MOVE_COST'.
38987 This function is used extensively by register_move_cost, which is used
38988 to build tables at startup. Make it inline in this case.
38989 When IN is 2, return maximum of in and out move cost.
38991 If moving between registers and memory is more expensive than
38992 between two registers, you should define this macro to express the
38993 relative cost.
38995 Model also increased moving costs of QImode registers in non
38996 Q_REGS classes. */
38998 static inline int
38999 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39000 int in)
39002 int cost;
39003 if (FLOAT_CLASS_P (regclass))
39005 int index;
39006 switch (mode)
39008 case E_SFmode:
39009 index = 0;
39010 break;
39011 case E_DFmode:
39012 index = 1;
39013 break;
39014 case E_XFmode:
39015 index = 2;
39016 break;
39017 default:
39018 return 100;
39020 if (in == 2)
39021 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39022 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39024 if (SSE_CLASS_P (regclass))
39026 int index = sse_store_index (mode);
39027 if (index == -1)
39028 return 100;
39029 if (in == 2)
39030 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39031 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39033 if (MMX_CLASS_P (regclass))
39035 int index;
39036 switch (GET_MODE_SIZE (mode))
39038 case 4:
39039 index = 0;
39040 break;
39041 case 8:
39042 index = 1;
39043 break;
39044 default:
39045 return 100;
39047 if (in == 2)
39048 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39049 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39051 switch (GET_MODE_SIZE (mode))
39053 case 1:
39054 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39056 if (!in)
39057 return ix86_cost->int_store[0];
39058 if (TARGET_PARTIAL_REG_DEPENDENCY
39059 && optimize_function_for_speed_p (cfun))
39060 cost = ix86_cost->movzbl_load;
39061 else
39062 cost = ix86_cost->int_load[0];
39063 if (in == 2)
39064 return MAX (cost, ix86_cost->int_store[0]);
39065 return cost;
39067 else
39069 if (in == 2)
39070 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39071 if (in)
39072 return ix86_cost->movzbl_load;
39073 else
39074 return ix86_cost->int_store[0] + 4;
39076 break;
39077 case 2:
39078 if (in == 2)
39079 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39080 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39081 default:
39082 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39083 if (mode == TFmode)
39084 mode = XFmode;
39085 if (in == 2)
39086 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39087 else if (in)
39088 cost = ix86_cost->int_load[2];
39089 else
39090 cost = ix86_cost->int_store[2];
39091 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
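/* Editor's worked example, not part of the original source: with IN == 2
   the helper returns the dearer of the load and store costs, so e.g. a
   DImode spill on ia32 is priced as two 32-bit moves,
   MAX (int_load[2], int_store[2]) * 2, matching the CEIL computation
   above.  */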
39095 static int
39096 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39097 bool in)
39099 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39103 /* Return the cost of moving data from a register in class CLASS1 to
39104 one in class CLASS2.
39106 It is not required that the cost always equal 2 when FROM is the same as TO;
39107 on some machines it is expensive to move between registers if they are not
39108 general registers. */
39110 static int
39111 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39112 reg_class_t class2_i)
39114 enum reg_class class1 = (enum reg_class) class1_i;
39115 enum reg_class class2 = (enum reg_class) class2_i;
39117 /* In case we require secondary memory, compute cost of the store followed
39118 by load. In order to avoid bad register allocation choices, we need
39119 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39121 if (inline_secondary_memory_needed (mode, class1, class2, false))
39123 int cost = 1;
39125 cost += inline_memory_move_cost (mode, class1, 2);
39126 cost += inline_memory_move_cost (mode, class2, 2);
39128 /* In case of copying from general_purpose_register we may emit multiple
39129 stores followed by single load causing memory size mismatch stall.
39130 Count this as arbitrarily high cost of 20. */
39131 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39132 && TARGET_MEMORY_MISMATCH_STALL
39133 && targetm.class_max_nregs (class1, mode)
39134 > targetm.class_max_nregs (class2, mode))
39135 cost += 20;
39137 /* In the case of FP/MMX moves, the registers actually overlap, and we
39138 have to switch modes in order to treat them differently. */
39139 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39140 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39141 cost += 20;
39143 return cost;
39146 /* Moves between SSE/MMX and integer unit are expensive. */
39147 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39148 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39150 /* ??? By keeping returned value relatively high, we limit the number
39151 of moves between integer and MMX/SSE registers for all targets.
39152 Additionally, high value prevents problem with x86_modes_tieable_p(),
39153 where integer modes in MMX/SSE registers are not tieable
39154 because of missing QImode and HImode moves to, from or between
39155 MMX/SSE registers. */
39156 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39157 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39159 if (MAYBE_FLOAT_CLASS_P (class1))
39160 return ix86_cost->fp_move;
39161 if (MAYBE_SSE_CLASS_P (class1))
39163 if (GET_MODE_BITSIZE (mode) <= 128)
39164 return ix86_cost->xmm_move;
39165 if (GET_MODE_BITSIZE (mode) <= 256)
39166 return ix86_cost->ymm_move;
39167 return ix86_cost->zmm_move;
39169 if (MAYBE_MMX_CLASS_P (class1))
39170 return ix86_cost->mmx_move;
39171 return 2;
39174 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39175 words of a value of mode MODE but can be less for certain modes in
39176 special long registers.
39178 Actually there are no two-word move instructions for consecutive
39179 registers, and only registers 0-3 may have mov byte instructions
39180 applied to them. */
39182 static unsigned int
39183 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39185 if (GENERAL_REGNO_P (regno))
39187 if (mode == XFmode)
39188 return TARGET_64BIT ? 2 : 3;
39189 if (mode == XCmode)
39190 return TARGET_64BIT ? 4 : 6;
39191 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39193 if (COMPLEX_MODE_P (mode))
39194 return 2;
39195 if (mode == V64SFmode || mode == V64SImode)
39196 return 4;
39197 return 1;
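/* Editor's worked example, not part of the original source: an XFmode
   value held in general registers takes 12 bytes on ia32 (3 word-sized
   regs) but 16 bytes on x86-64 (2 regs), which is what the explicit
   XFmode/XCmode cases above encode; a value living in an x87, SSE or mask
   register always counts as one register (two for complex modes, four for
   the V64SF/V64SI register groups used by AVX5124FMAPS/4VNNIW).  */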
39200 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39202 static bool
39203 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39205 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
39206 if (CC_REGNO_P (regno))
39207 return GET_MODE_CLASS (mode) == MODE_CC;
39208 if (GET_MODE_CLASS (mode) == MODE_CC
39209 || GET_MODE_CLASS (mode) == MODE_RANDOM
39210 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39211 return false;
39212 if (STACK_REGNO_P (regno))
39213 return VALID_FP_MODE_P (mode);
39214 if (MASK_REGNO_P (regno))
39215 return (VALID_MASK_REG_MODE (mode)
39216 || (TARGET_AVX512BW
39217 && VALID_MASK_AVX512BW_MODE (mode)));
39218 if (BND_REGNO_P (regno))
39219 return VALID_BND_REG_MODE (mode);
39220 if (SSE_REGNO_P (regno))
39222 /* We implement the move patterns for all vector modes into and
39223 out of SSE registers, even when no operation instructions
39224 are available. */
39226 /* For AVX-512 we allow, regardless of regno:
39227 - XI mode
39228 - any of 512-bit wide vector mode
39229 - any scalar mode. */
39230 if (TARGET_AVX512F
39231 && (mode == XImode
39232 || VALID_AVX512F_REG_MODE (mode)
39233 || VALID_AVX512F_SCALAR_MODE (mode)))
39234 return true;
39236 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39237 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39238 && MOD4_SSE_REGNO_P (regno)
39239 && mode == V64SFmode)
39240 return true;
39242 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39243 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39244 && MOD4_SSE_REGNO_P (regno)
39245 && mode == V64SImode)
39246 return true;
39248 /* TODO check for QI/HI scalars. */
39249 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
39250 if (TARGET_AVX512VL
39251 && (mode == OImode
39252 || mode == TImode
39253 || VALID_AVX256_REG_MODE (mode)
39254 || VALID_AVX512VL_128_REG_MODE (mode)))
39255 return true;
39257 /* xmm16-xmm31 are only available for AVX-512. */
39258 if (EXT_REX_SSE_REGNO_P (regno))
39259 return false;
39261 /* OImode and AVX modes are available only when AVX is enabled. */
39262 return ((TARGET_AVX
39263 && VALID_AVX256_REG_OR_OI_MODE (mode))
39264 || VALID_SSE_REG_MODE (mode)
39265 || VALID_SSE2_REG_MODE (mode)
39266 || VALID_MMX_REG_MODE (mode)
39267 || VALID_MMX_REG_MODE_3DNOW (mode));
39269 if (MMX_REGNO_P (regno))
39271 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39272 so if the register is available at all, then we can move data of
39273 the given mode into or out of it. */
39274 return (VALID_MMX_REG_MODE (mode)
39275 || VALID_MMX_REG_MODE_3DNOW (mode));
39278 if (mode == QImode)
39280 /* Take care for QImode values - they can be in non-QI regs,
39281 but then they do cause partial register stalls. */
39282 if (ANY_QI_REGNO_P (regno))
39283 return true;
39284 if (!TARGET_PARTIAL_REG_STALL)
39285 return true;
39286 /* LRA checks if the hard register is OK for the given mode.
39287 QImode values can live in non-QI regs, so we allow all
39288 registers here. */
39289 if (lra_in_progress)
39290 return true;
39291 return !can_create_pseudo_p ();
39293 /* We handle both integer and floats in the general purpose registers. */
39294 else if (VALID_INT_MODE_P (mode))
39295 return true;
39296 else if (VALID_FP_MODE_P (mode))
39297 return true;
39298 else if (VALID_DFP_MODE_P (mode))
39299 return true;
39300 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39301 on to use that value in smaller contexts, this can easily force a
39302 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39303 supporting DImode, allow it. */
39304 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39305 return true;
39307 return false;
39310 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39311 saves SSE registers across calls is Win64 (thus no need to check the
39312 current ABI here), and with AVX enabled Win64 only guarantees that
39313 the low 16 bytes are saved. */
39315 static bool
39316 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39318 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39321 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39322 tieable integer mode. */
39324 static bool
39325 ix86_tieable_integer_mode_p (machine_mode mode)
39327 switch (mode)
39329 case E_HImode:
39330 case E_SImode:
39331 return true;
39333 case E_QImode:
39334 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39336 case E_DImode:
39337 return TARGET_64BIT;
39339 default:
39340 return false;
39344 /* Implement TARGET_MODES_TIEABLE_P.
39346 Return true if MODE1 is accessible in a register that can hold MODE2
39347 without copying. That is, all register classes that can hold MODE2
39348 can also hold MODE1. */
39350 static bool
39351 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39353 if (mode1 == mode2)
39354 return true;
39356 if (ix86_tieable_integer_mode_p (mode1)
39357 && ix86_tieable_integer_mode_p (mode2))
39358 return true;
39360 /* MODE2 being XFmode implies fp stack or general regs, which means we
39361 can tie any smaller floating point modes to it. Note that we do not
39362 tie this with TFmode. */
39363 if (mode2 == XFmode)
39364 return mode1 == SFmode || mode1 == DFmode;
39366 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39367 that we can tie it with SFmode. */
39368 if (mode2 == DFmode)
39369 return mode1 == SFmode;
39371 /* If MODE2 is only appropriate for an SSE register, then tie with
39372 any other mode acceptable to SSE registers. */
39373 if (GET_MODE_SIZE (mode2) == 32
39374 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39375 return (GET_MODE_SIZE (mode1) == 32
39376 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39377 if (GET_MODE_SIZE (mode2) == 16
39378 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39379 return (GET_MODE_SIZE (mode1) == 16
39380 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39382 /* If MODE2 is appropriate for an MMX register, then tie
39383 with any other mode acceptable to MMX registers. */
39384 if (GET_MODE_SIZE (mode2) == 8
39385 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39386 return (GET_MODE_SIZE (mode1) == 8
39387 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39389 return false;
39392 /* Return the cost of moving between two registers of mode MODE. */
39394 static int
39395 ix86_set_reg_reg_cost (machine_mode mode)
39397 unsigned int units = UNITS_PER_WORD;
39399 switch (GET_MODE_CLASS (mode))
39401 default:
39402 break;
39404 case MODE_CC:
39405 units = GET_MODE_SIZE (CCmode);
39406 break;
39408 case MODE_FLOAT:
39409 if ((TARGET_SSE && mode == TFmode)
39410 || (TARGET_80387 && mode == XFmode)
39411 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39412 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39413 units = GET_MODE_SIZE (mode);
39414 break;
39416 case MODE_COMPLEX_FLOAT:
39417 if ((TARGET_SSE && mode == TCmode)
39418 || (TARGET_80387 && mode == XCmode)
39419 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39420 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39421 units = GET_MODE_SIZE (mode);
39422 break;
39424 case MODE_VECTOR_INT:
39425 case MODE_VECTOR_FLOAT:
39426 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39427 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39428 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39429 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39430 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39431 units = GET_MODE_SIZE (mode);
39434 /* Return the cost of moving between two registers of mode MODE,
39435 assuming that the move will be in pieces of at most UNITS bytes. */
39436 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39439 /* Return the cost of a vector operation in MODE, given that the scalar
39440 version has cost COST. If PARALLEL is true, assume that the CPU has
39441 more than one unit performing the operation. */
39443 static int
39444 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39446 if (!VECTOR_MODE_P (mode))
39447 return cost;
39449 if (!parallel)
39450 return cost * GET_MODE_NUNITS (mode);
39451 if (GET_MODE_BITSIZE (mode) == 128
39452 && TARGET_SSE_SPLIT_REGS)
39453 return cost * 2;
39454 if (GET_MODE_BITSIZE (mode) > 128
39455 && TARGET_AVX128_OPTIMAL)
39456 return cost * GET_MODE_BITSIZE (mode) / 128;
39457 return cost;
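/* Editor's worked example, not part of the original source: on a tuning
   with TARGET_AVX128_OPTIMAL a 256-bit op is priced as two 128-bit ops
   (COST * 256 / 128), with TARGET_SSE_SPLIT_REGS a 128-bit op is priced as
   two halves (COST * 2), and when PARALLEL is false the scalar COST is
   simply multiplied by the number of vector elements; otherwise the vector
   op is assumed to be as cheap as the scalar one.  */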
39460 /* Return cost of multiplication in MODE. */
39462 static int
39463 ix86_multiplication_cost (const struct processor_costs *cost,
39464 enum machine_mode mode)
39466 machine_mode inner_mode = mode;
39467 if (VECTOR_MODE_P (mode))
39468 inner_mode = GET_MODE_INNER (mode);
39470 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39471 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39472 else if (X87_FLOAT_MODE_P (mode))
39473 return cost->fmul;
39474 else if (FLOAT_MODE_P (mode))
39475 return ix86_vec_cost (mode,
39476 inner_mode == DFmode
39477 ? cost->mulsd : cost->mulss, true);
39478 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39480 /* V*QImode is emulated with 7-13 insns. */
39481 if (mode == V16QImode || mode == V32QImode)
39483 int extra = 11;
39484 if (TARGET_XOP && mode == V16QImode)
39485 extra = 5;
39486 else if (TARGET_SSSE3)
39487 extra = 6;
39488 return ix86_vec_cost (mode,
39489 cost->mulss * 2 + cost->sse_op * extra,
39490 true);
39492 /* V*DImode is emulated with 5-8 insns. */
39493 else if (mode == V2DImode || mode == V4DImode)
39495 if (TARGET_XOP && mode == V2DImode)
39496 return ix86_vec_cost (mode,
39497 cost->mulss * 2 + cost->sse_op * 3,
39498 true);
39499 else
39500 return ix86_vec_cost (mode,
39501 cost->mulss * 3 + cost->sse_op * 5,
39502 true);
39504 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39505 insns, including two PMULUDQ. */
39506 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39507 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39508 true);
39509 else
39510 return ix86_vec_cost (mode, cost->mulss, true);
39512 else
39513 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
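/* Editor's worked example, not part of the original source: for a scalar
   multiply by a compile-time constant, the MULT case of ix86_rtx_costs
   below counts the set bits of the constant, so a multiply by 10 (two bits
   set) is priced as mult_init + 2 * mult_bit plus the operand costs, while
   the scalar fallback in this helper assumes 7 bits for an unknown
   multiplier.  */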
39516 /* Return the cost of a division in MODE. */
39518 static int
39519 ix86_division_cost (const struct processor_costs *cost,
39520 enum machine_mode mode)
39522 machine_mode inner_mode = mode;
39523 if (VECTOR_MODE_P (mode))
39524 inner_mode = GET_MODE_INNER (mode);
39526 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39527 return inner_mode == DFmode ? cost->divsd : cost->divss;
39528 else if (X87_FLOAT_MODE_P (mode))
39529 return cost->fdiv;
39530 else if (FLOAT_MODE_P (mode))
39531 return ix86_vec_cost (mode,
39532 inner_mode == DFmode ? cost->divsd : cost->divss,
39533 true);
39534 else
39535 return cost->divide[MODE_INDEX (mode)];
39538 /* Return the cost of a shift or rotate in MODE.
39539 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
39540 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39541 SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND.
39543 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
39545 static int
39546 ix86_shift_rotate_cost (const struct processor_costs *cost,
39547 enum machine_mode mode, bool constant_op1,
39548 HOST_WIDE_INT op1_val,
39549 bool speed,
39550 bool and_in_op1,
39551 bool shift_and_truncate,
39552 bool *skip_op0, bool *skip_op1)
39554 if (skip_op0)
39555 *skip_op0 = *skip_op1 = false;
39556 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39558 /* V*QImode is emulated with 1-11 insns. */
39559 if (mode == V16QImode || mode == V32QImode)
39561 int count = 11;
39562 if (TARGET_XOP && mode == V16QImode)
39564 /* For XOP we use vpshab, which requires a broadcast of the
39565 value to the variable shift insn. For constants this
39566 means a V16Q const in mem; even when we can perform the
39567 shift with one insn set the cost to prefer paddb. */
39568 if (constant_op1)
39570 if (skip_op1)
39571 *skip_op1 = true;
39572 return ix86_vec_cost (mode,
39573 cost->sse_op
39574 + (speed
39575 ? COSTS_N_INSNS (1)
39576 : COSTS_N_BYTES
39577 (GET_MODE_UNIT_SIZE (mode))), true);
39579 count = 3;
39581 else if (TARGET_SSSE3)
39582 count = 7;
39583 return ix86_vec_cost (mode, cost->sse_op * count, true);
39585 else
39586 return ix86_vec_cost (mode, cost->sse_op, true);
39588 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39590 if (constant_op1)
39592 if (op1_val > 32)
39593 return cost->shift_const + COSTS_N_INSNS (2);
39594 else
39595 return cost->shift_const * 2;
39597 else
39599 if (and_in_op1)
39600 return cost->shift_var * 2;
39601 else
39602 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39605 else
39607 if (constant_op1)
39608 return cost->shift_const;
39609 else if (shift_and_truncate)
39611 if (skip_op0)
39612 *skip_op0 = *skip_op1 = true;
39613 /* Return the cost after shift-and truncation. */
39614 return cost->shift_var;
39616 else
39617 return cost->shift_var;
39619 return cost->shift_const;
39622 /* Compute a (partial) cost for rtx X. Return true if the complete
39623 cost has been computed, and false if subexpressions should be
39624 scanned. In either case, *TOTAL contains the cost result. */
39626 static bool
39627 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39628 int *total, bool speed)
39630 rtx mask;
39631 enum rtx_code code = GET_CODE (x);
39632 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39633 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39634 int src_cost;
39636 switch (code)
39638 case SET:
39639 if (register_operand (SET_DEST (x), VOIDmode)
39640 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39642 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39643 return true;
39646 if (register_operand (SET_SRC (x), VOIDmode))
39647 /* Avoid potentially incorrect high cost from rtx_costs
39648 for non-tieable SUBREGs. */
39649 src_cost = 0;
39650 else
39652 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39654 if (CONSTANT_P (SET_SRC (x)))
39655 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39656 a small value, possibly zero for cheap constants. */
39657 src_cost += COSTS_N_INSNS (1);
39660 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39661 return true;
39663 case CONST_INT:
39664 case CONST:
39665 case LABEL_REF:
39666 case SYMBOL_REF:
39667 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39668 *total = 3;
39669 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39670 *total = 2;
39671 else if (flag_pic && SYMBOLIC_CONST (x)
39672 && !(TARGET_64BIT
39673 && (GET_CODE (x) == LABEL_REF
39674 || (GET_CODE (x) == SYMBOL_REF
39675 && SYMBOL_REF_LOCAL_P (x))))
39676 /* Use 0 cost for CONST to improve its propagation. */
39677 && (TARGET_64BIT || GET_CODE (x) != CONST))
39678 *total = 1;
39679 else
39680 *total = 0;
39681 return true;
39683 case CONST_DOUBLE:
39684 if (IS_STACK_MODE (mode))
39685 switch (standard_80387_constant_p (x))
39687 case -1:
39688 case 0:
39689 break;
39690 case 1: /* 0.0 */
39691 *total = 1;
39692 return true;
39693 default: /* Other constants */
39694 *total = 2;
39695 return true;
39697 /* FALLTHRU */
39699 case CONST_VECTOR:
39700 switch (standard_sse_constant_p (x, mode))
39702 case 0:
39703 break;
39704 case 1: /* 0: xor eliminates false dependency */
39705 *total = 0;
39706 return true;
39707 default: /* -1: cmp contains false dependency */
39708 *total = 1;
39709 return true;
39711 /* FALLTHRU */
39713 case CONST_WIDE_INT:
39714 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39715 it'll probably end up. Add a penalty for size. */
39716 *total = (COSTS_N_INSNS (1)
39717 + (!TARGET_64BIT && flag_pic)
39718 + (GET_MODE_SIZE (mode) <= 4
39719 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39720 return true;
39722 case ZERO_EXTEND:
39723 /* The zero extension is often completely free on x86_64, so make
39724 it as cheap as possible. */
39725 if (TARGET_64BIT && mode == DImode
39726 && GET_MODE (XEXP (x, 0)) == SImode)
39727 *total = 1;
39728 else if (TARGET_ZERO_EXTEND_WITH_AND)
39729 *total = cost->add;
39730 else
39731 *total = cost->movzx;
39732 return false;
39734 case SIGN_EXTEND:
39735 *total = cost->movsx;
39736 return false;
39738 case ASHIFT:
39739 if (SCALAR_INT_MODE_P (mode)
39740 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39741 && CONST_INT_P (XEXP (x, 1)))
39743 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39744 if (value == 1)
39746 *total = cost->add;
39747 return false;
39749 if ((value == 2 || value == 3)
39750 && cost->lea <= cost->shift_const)
39752 *total = cost->lea;
39753 return false;
39756 /* FALLTHRU */
39758 case ROTATE:
39759 case ASHIFTRT:
39760 case LSHIFTRT:
39761 case ROTATERT:
39762 bool skip_op0, skip_op1;
39763 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39764 CONST_INT_P (XEXP (x, 1))
39765 ? INTVAL (XEXP (x, 1)) : -1,
39766 speed,
39767 GET_CODE (XEXP (x, 1)) == AND,
39768 SUBREG_P (XEXP (x, 1))
39769 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39770 &skip_op0, &skip_op1);
39771 if (skip_op0 || skip_op1)
39773 if (!skip_op0)
39774 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39775 if (!skip_op1)
39776 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39777 return true;
39779 return false;
39781 case FMA:
39783 rtx sub;
39785 gcc_assert (FLOAT_MODE_P (mode));
39786 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39788 *total = ix86_vec_cost (mode,
39789 mode == SFmode ? cost->fmass : cost->fmasd,
39790 true);
39791 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39793 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39794 sub = XEXP (x, 0);
39795 if (GET_CODE (sub) == NEG)
39796 sub = XEXP (sub, 0);
39797 *total += rtx_cost (sub, mode, FMA, 0, speed);
39799 sub = XEXP (x, 2);
39800 if (GET_CODE (sub) == NEG)
39801 sub = XEXP (sub, 0);
39802 *total += rtx_cost (sub, mode, FMA, 2, speed);
39803 return true;
39806 case MULT:
39807 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39809 rtx op0 = XEXP (x, 0);
39810 rtx op1 = XEXP (x, 1);
39811 int nbits;
39812 if (CONST_INT_P (XEXP (x, 1)))
39814 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39815 for (nbits = 0; value != 0; value &= value - 1)
39816 nbits++;
39818 else
39819 /* This is arbitrary. */
39820 nbits = 7;
39822 /* Compute costs correctly for widening multiplication. */
39823 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39824 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39825 == GET_MODE_SIZE (mode))
39827 int is_mulwiden = 0;
39828 machine_mode inner_mode = GET_MODE (op0);
39830 if (GET_CODE (op0) == GET_CODE (op1))
39831 is_mulwiden = 1, op1 = XEXP (op1, 0);
39832 else if (CONST_INT_P (op1))
39834 if (GET_CODE (op0) == SIGN_EXTEND)
39835 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39836 == INTVAL (op1);
39837 else
39838 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39841 if (is_mulwiden)
39842 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39845 *total = (cost->mult_init[MODE_INDEX (mode)]
39846 + nbits * cost->mult_bit
39847 + rtx_cost (op0, mode, outer_code, opno, speed)
39848 + rtx_cost (op1, mode, outer_code, opno, speed));
39850 return true;
39852 *total = ix86_multiplication_cost (cost, mode);
39853 return false;
39855 case DIV:
39856 case UDIV:
39857 case MOD:
39858 case UMOD:
39859 *total = ix86_division_cost (cost, mode);
39860 return false;
39862 case PLUS:
39863 if (GET_MODE_CLASS (mode) == MODE_INT
39864 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39866 if (GET_CODE (XEXP (x, 0)) == PLUS
39867 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39868 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39869 && CONSTANT_P (XEXP (x, 1)))
39871 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39872 if (val == 2 || val == 4 || val == 8)
39874 *total = cost->lea;
39875 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39876 outer_code, opno, speed);
39877 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39878 outer_code, opno, speed);
39879 *total += rtx_cost (XEXP (x, 1), mode,
39880 outer_code, opno, speed);
39881 return true;
39884 else if (GET_CODE (XEXP (x, 0)) == MULT
39885 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39887 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39888 if (val == 2 || val == 4 || val == 8)
39890 *total = cost->lea;
39891 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39892 outer_code, opno, speed);
39893 *total += rtx_cost (XEXP (x, 1), mode,
39894 outer_code, opno, speed);
39895 return true;
39898 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39900 /* Add with carry, ignore the cost of adding a carry flag. */
39901 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39902 *total = cost->add;
39903 else
39905 *total = cost->lea;
39906 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39907 outer_code, opno, speed);
39910 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39911 outer_code, opno, speed);
39912 *total += rtx_cost (XEXP (x, 1), mode,
39913 outer_code, opno, speed);
39914 return true;
39917 /* FALLTHRU */
39919 case MINUS:
39920 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39921 if (GET_MODE_CLASS (mode) == MODE_INT
39922 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39923 && GET_CODE (XEXP (x, 0)) == MINUS
39924 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39926 *total = cost->add;
39927 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39928 outer_code, opno, speed);
39929 *total += rtx_cost (XEXP (x, 1), mode,
39930 outer_code, opno, speed);
39931 return true;
39934 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39936 *total = cost->addss;
39937 return false;
39939 else if (X87_FLOAT_MODE_P (mode))
39941 *total = cost->fadd;
39942 return false;
39944 else if (FLOAT_MODE_P (mode))
39946 *total = ix86_vec_cost (mode, cost->addss, true);
39947 return false;
39949 /* FALLTHRU */
39951 case AND:
39952 case IOR:
39953 case XOR:
39954 if (GET_MODE_CLASS (mode) == MODE_INT
39955 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39957 *total = (cost->add * 2
39958 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39959 << (GET_MODE (XEXP (x, 0)) != DImode))
39960 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39961 << (GET_MODE (XEXP (x, 1)) != DImode)));
39962 return true;
39964 /* FALLTHRU */
39966 case NEG:
39967 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39969 *total = cost->sse_op;
39970 return false;
39972 else if (X87_FLOAT_MODE_P (mode))
39974 *total = cost->fchs;
39975 return false;
39977 else if (FLOAT_MODE_P (mode))
39979 *total = ix86_vec_cost (mode, cost->sse_op, true);
39980 return false;
39982 /* FALLTHRU */
39984 case NOT:
39985 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39986 *total = ix86_vec_cost (mode, cost->sse_op, true);
39987 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39988 *total = cost->add * 2;
39989 else
39990 *total = cost->add;
39991 return false;
39993 case COMPARE:
39994 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39995 && XEXP (XEXP (x, 0), 1) == const1_rtx
39996 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39997 && XEXP (x, 1) == const0_rtx)
39999 /* This kind of construct is implemented using test[bwl].
40000 Treat it as if we had an AND. */
40001 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40002 *total = (cost->add
40003 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40004 opno, speed)
40005 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40006 return true;
40009 /* The embedded comparison operand is completely free. */
40010 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40011 && XEXP (x, 1) == const0_rtx)
40012 *total = 0;
40014 return false;
40016 case FLOAT_EXTEND:
40017 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40018 *total = 0;
40019 else
40020 *total = ix86_vec_cost (mode, cost->addss, true);
40021 return false;
40023 case FLOAT_TRUNCATE:
40024 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40025 *total = cost->fadd;
40026 else
40027 *total = ix86_vec_cost (mode, cost->addss, true);
40028 return false;
40030 case ABS:
40031 /* SSE requires memory load for the constant operand. It may make
40032 sense to account for this. Of course the constant operand may or
40033 may not be reused. */
40034 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40035 *total = cost->sse_op;
40036 else if (X87_FLOAT_MODE_P (mode))
40037 *total = cost->fabs;
40038 else if (FLOAT_MODE_P (mode))
40039 *total = ix86_vec_cost (mode, cost->sse_op, true);
40040 return false;
40042 case SQRT:
40043 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40044 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40045 else if (X87_FLOAT_MODE_P (mode))
40046 *total = cost->fsqrt;
40047 else if (FLOAT_MODE_P (mode))
40048 *total = ix86_vec_cost (mode,
40049 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40050 true);
40051 return false;
40053 case UNSPEC:
40054 if (XINT (x, 1) == UNSPEC_TP)
40055 *total = 0;
40056 return false;
40058 case VEC_SELECT:
40059 case VEC_CONCAT:
40060 case VEC_DUPLICATE:
40061 /* ??? Assume all of these vector manipulation patterns are
40062 recognizable, in which case they all have pretty much the
40063 same cost. */
40064 *total = cost->sse_op;
40065 return true;
40066 case VEC_MERGE:
40067 mask = XEXP (x, 2);
40068 /* This is a masked instruction; assume the same cost
40069 as the non-masked variant. */
40070 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40071 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40072 else
40073 *total = cost->sse_op;
40074 return true;
40076 default:
40077 return false;
40081 #if TARGET_MACHO
40083 static int current_machopic_label_num;
40085 /* Given a symbol name and its associated stub, write out the
40086 definition of the stub. */
40088 void
40089 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40091 unsigned int length;
40092 char *binder_name, *symbol_name, lazy_ptr_name[32];
40093 int label = ++current_machopic_label_num;
40095 /* For 64-bit we shouldn't get here. */
40096 gcc_assert (!TARGET_64BIT);
40098 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40099 symb = targetm.strip_name_encoding (symb);
40101 length = strlen (stub);
40102 binder_name = XALLOCAVEC (char, length + 32);
40103 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40105 length = strlen (symb);
40106 symbol_name = XALLOCAVEC (char, length + 32);
40107 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40109 sprintf (lazy_ptr_name, "L%d$lz", label);
40111 if (MACHOPIC_ATT_STUB)
40112 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40113 else if (MACHOPIC_PURE)
40114 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40115 else
40116 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40118 fprintf (file, "%s:\n", stub);
40119 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40121 if (MACHOPIC_ATT_STUB)
40123 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40125 else if (MACHOPIC_PURE)
40127 /* PIC stub. */
40128 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40129 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40130 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40131 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40132 label, lazy_ptr_name, label);
40133 fprintf (file, "\tjmp\t*%%ecx\n");
40135 else
40136 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40138 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40139 it needs no stub-binding-helper. */
40140 if (MACHOPIC_ATT_STUB)
40141 return;
40143 fprintf (file, "%s:\n", binder_name);
40145 if (MACHOPIC_PURE)
40147 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40148 fprintf (file, "\tpushl\t%%ecx\n");
40150 else
40151 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40153 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40155 /* N.B. Keep the correspondence of these
40156 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40157 old-pic/new-pic/non-pic stubs; altering this will break
40158 compatibility with existing dylibs. */
40159 if (MACHOPIC_PURE)
40161 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40162 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40164 else
40165 /* 16-byte -mdynamic-no-pic stub. */
40166 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40168 fprintf (file, "%s:\n", lazy_ptr_name);
40169 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40170 fprintf (file, ASM_LONG "%s\n", binder_name);
40172 #endif /* TARGET_MACHO */
40174 /* Order the registers for register allocator. */
40176 void
40177 x86_order_regs_for_local_alloc (void)
40179 int pos = 0;
40180 int i;
40182 /* First allocate the local general purpose registers. */
40183 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40184 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40185 reg_alloc_order [pos++] = i;
40187 /* Global general purpose registers. */
40188 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40189 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40190 reg_alloc_order [pos++] = i;
40192 /* x87 registers come first in case we are doing FP math
40193 using them. */
40194 if (!TARGET_SSE_MATH)
40195 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40196 reg_alloc_order [pos++] = i;
40198 /* SSE registers. */
40199 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40200 reg_alloc_order [pos++] = i;
40201 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40202 reg_alloc_order [pos++] = i;
40204 /* Extended REX SSE registers. */
40205 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40206 reg_alloc_order [pos++] = i;
40208 /* Mask registers. */
40209 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40210 reg_alloc_order [pos++] = i;
40212 /* MPX bound registers. */
40213 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40214 reg_alloc_order [pos++] = i;
40216 /* x87 registers. */
40217 if (TARGET_SSE_MATH)
40218 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40219 reg_alloc_order [pos++] = i;
40221 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40222 reg_alloc_order [pos++] = i;
40224 /* Initialize the rest of the array, since some registers are never
40225 allocated at all. */
40226 while (pos < FIRST_PSEUDO_REGISTER)
40227 reg_alloc_order [pos++] = 0;
40230 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40231 in struct attribute_spec handler. */
40232 static tree
40233 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
40234 bool *no_add_attrs)
40236 if (TREE_CODE (*node) != FUNCTION_TYPE
40237 && TREE_CODE (*node) != METHOD_TYPE
40238 && TREE_CODE (*node) != FIELD_DECL
40239 && TREE_CODE (*node) != TYPE_DECL)
40241 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40242 name);
40243 *no_add_attrs = true;
40244 return NULL_TREE;
40246 if (TARGET_64BIT)
40248 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40249 name);
40250 *no_add_attrs = true;
40251 return NULL_TREE;
40253 if (is_attribute_p ("callee_pop_aggregate_return", name))
40255 tree cst;
40257 cst = TREE_VALUE (args);
40258 if (TREE_CODE (cst) != INTEGER_CST)
40260 warning (OPT_Wattributes,
40261 "%qE attribute requires an integer constant argument",
40262 name);
40263 *no_add_attrs = true;
40265 else if (compare_tree_int (cst, 0) != 0
40266 && compare_tree_int (cst, 1) != 0)
40268 warning (OPT_Wattributes,
40269 "argument to %qE attribute is neither zero, nor one",
40270 name);
40271 *no_add_attrs = true;
40274 return NULL_TREE;
40277 return NULL_TREE;
40280 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
40281 struct attribute_spec.handler. */
40282 static tree
40283 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40284 bool *no_add_attrs)
40286 if (TREE_CODE (*node) != FUNCTION_TYPE
40287 && TREE_CODE (*node) != METHOD_TYPE
40288 && TREE_CODE (*node) != FIELD_DECL
40289 && TREE_CODE (*node) != TYPE_DECL)
40291 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40292 name);
40293 *no_add_attrs = true;
40294 return NULL_TREE;
40297 /* Can combine regparm with all attributes but fastcall. */
40298 if (is_attribute_p ("ms_abi", name))
40300 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40302 error ("ms_abi and sysv_abi attributes are not compatible");
40305 return NULL_TREE;
40307 else if (is_attribute_p ("sysv_abi", name))
40309 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40311 error ("ms_abi and sysv_abi attributes are not compatible");
40314 return NULL_TREE;
40317 return NULL_TREE;
40320 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40321 struct attribute_spec.handler. */
40322 static tree
40323 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40324 bool *no_add_attrs)
40326 tree *type = NULL;
40327 if (DECL_P (*node))
40329 if (TREE_CODE (*node) == TYPE_DECL)
40330 type = &TREE_TYPE (*node);
40332 else
40333 type = node;
40335 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40337 warning (OPT_Wattributes, "%qE attribute ignored",
40338 name);
40339 *no_add_attrs = true;
40342 else if ((is_attribute_p ("ms_struct", name)
40343 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40344 || ((is_attribute_p ("gcc_struct", name)
40345 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40347 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40348 name);
40349 *no_add_attrs = true;
40352 return NULL_TREE;
40355 static tree
40356 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40357 bool *no_add_attrs)
40359 if (TREE_CODE (*node) != FUNCTION_DECL)
40361 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40362 name);
40363 *no_add_attrs = true;
40365 return NULL_TREE;
40368 static tree
40369 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40370 int, bool *)
40372 return NULL_TREE;
40375 static tree
40376 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40378 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
40379 but the function type contains args and return type data. */
40380 tree func_type = *node;
40381 tree return_type = TREE_TYPE (func_type);
40383 int nargs = 0;
40384 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40385 while (current_arg_type
40386 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40388 if (nargs == 0)
40390 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40391 error ("interrupt service routine should have a pointer "
40392 "as the first argument");
40394 else if (nargs == 1)
40396 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40397 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40398 error ("interrupt service routine should have unsigned %s"
40399 "int as the second argument",
40400 TARGET_64BIT
40401 ? (TARGET_X32 ? "long long " : "long ")
40402 : "");
40404 nargs++;
40405 current_arg_type = TREE_CHAIN (current_arg_type);
40407 if (!nargs || nargs > 2)
40408 error ("interrupt service routine can only have a pointer argument "
40409 "and an optional integer argument");
40410 if (! VOID_TYPE_P (return_type))
40411 error ("interrupt service routine can't have non-void return value");
40413 return NULL_TREE;
40416 static bool
40417 ix86_ms_bitfield_layout_p (const_tree record_type)
40419 return ((TARGET_MS_BITFIELD_LAYOUT
40420 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40421 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40424 /* Returns an expression indicating where the this parameter is
40425 located on entry to the FUNCTION. */
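/* Rough summary of the cases handled below: under the 64-bit SysV ABI
   `this' arrives in %rdi, or in %rsi when the return value is passed in
   memory (the hidden return slot then occupies %rdi); under the MS ABI
   it is %rcx, or %rdx for aggregate returns.  For 32-bit fastcall and
   thiscall it is normally %ecx, and with the plain stack convention it
   sits at 4(%esp), or 8(%esp) when an aggregate return pointer is
   pushed first.  */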
40427 static rtx
40428 x86_this_parameter (tree function)
40430 tree type = TREE_TYPE (function);
40431 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40432 int nregs;
40434 if (TARGET_64BIT)
40436 const int *parm_regs;
40438 if (ix86_function_type_abi (type) == MS_ABI)
40439 parm_regs = x86_64_ms_abi_int_parameter_registers;
40440 else
40441 parm_regs = x86_64_int_parameter_registers;
40442 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40445 nregs = ix86_function_regparm (type, function);
40447 if (nregs > 0 && !stdarg_p (type))
40449 int regno;
40450 unsigned int ccvt = ix86_get_callcvt (type);
40452 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40453 regno = aggr ? DX_REG : CX_REG;
40454 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40456 regno = CX_REG;
40457 if (aggr)
40458 return gen_rtx_MEM (SImode,
40459 plus_constant (Pmode, stack_pointer_rtx, 4));
40461 else
40463 regno = AX_REG;
40464 if (aggr)
40466 regno = DX_REG;
40467 if (nregs == 1)
40468 return gen_rtx_MEM (SImode,
40469 plus_constant (Pmode,
40470 stack_pointer_rtx, 4));
40473 return gen_rtx_REG (SImode, regno);
40476 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40477 aggr ? 8 : 4));
40480 /* Determine whether x86_output_mi_thunk can succeed. */
40482 static bool
40483 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40484 const_tree function)
40486 /* 64-bit can handle anything. */
40487 if (TARGET_64BIT)
40488 return true;
40490 /* For 32-bit, everything's fine if we have one free register. */
40491 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40492 return true;
40494 /* Need a free register for vcall_offset. */
40495 if (vcall_offset)
40496 return false;
40498 /* Need a free register for GOT references. */
40499 if (flag_pic && !targetm.binds_local_p (function))
40500 return false;
40502 /* Otherwise ok. */
40503 return true;
40506 /* Output the assembler code for a thunk function. THUNK_DECL is the
40507 declaration for the thunk function itself, FUNCTION is the decl for
40508 the target function. DELTA is an immediate constant offset to be
40509 added to THIS. If VCALL_OFFSET is nonzero, the word at
40510 *(*this + vcall_offset) should be added to THIS. */
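/* In C-like pseudo code the emitted thunk behaves roughly as follows,
   with the call in the last line emitted as a sibling call/jump:

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
     return FUNCTION (this, ...);

   The code below open-codes these adjustments in RTL.  */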
40512 static void
40513 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40514 HOST_WIDE_INT vcall_offset, tree function)
40516 rtx this_param = x86_this_parameter (function);
40517 rtx this_reg, tmp, fnaddr;
40518 unsigned int tmp_regno;
40519 rtx_insn *insn;
40521 if (TARGET_64BIT)
40522 tmp_regno = R10_REG;
40523 else
40525 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40526 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40527 tmp_regno = AX_REG;
40528 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40529 tmp_regno = DX_REG;
40530 else
40531 tmp_regno = CX_REG;
40534 emit_note (NOTE_INSN_PROLOGUE_END);
40536 /* If CET branch tracking is enabled, insert an ENDBR instruction. */
40537 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40538 emit_insn (gen_nop_endbr ());
40540 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40541 pull it in now and let DELTA benefit. */
40542 if (REG_P (this_param))
40543 this_reg = this_param;
40544 else if (vcall_offset)
40546 /* Put the this parameter into %eax. */
40547 this_reg = gen_rtx_REG (Pmode, AX_REG);
40548 emit_move_insn (this_reg, this_param);
40550 else
40551 this_reg = NULL_RTX;
40553 /* Adjust the this parameter by a fixed constant. */
40554 if (delta)
40556 rtx delta_rtx = GEN_INT (delta);
40557 rtx delta_dst = this_reg ? this_reg : this_param;
40559 if (TARGET_64BIT)
40561 if (!x86_64_general_operand (delta_rtx, Pmode))
40563 tmp = gen_rtx_REG (Pmode, tmp_regno);
40564 emit_move_insn (tmp, delta_rtx);
40565 delta_rtx = tmp;
40569 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40572 /* Adjust the this parameter by a value stored in the vtable. */
40573 if (vcall_offset)
40575 rtx vcall_addr, vcall_mem, this_mem;
40577 tmp = gen_rtx_REG (Pmode, tmp_regno);
40579 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40580 if (Pmode != ptr_mode)
40581 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40582 emit_move_insn (tmp, this_mem);
40584 /* Adjust the this parameter. */
40585 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40586 if (TARGET_64BIT
40587 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40589 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40590 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40591 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40594 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40595 if (Pmode != ptr_mode)
40596 emit_insn (gen_addsi_1_zext (this_reg,
40597 gen_rtx_REG (ptr_mode,
40598 REGNO (this_reg)),
40599 vcall_mem));
40600 else
40601 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40604 /* If necessary, drop THIS back to its stack slot. */
40605 if (this_reg && this_reg != this_param)
40606 emit_move_insn (this_param, this_reg);
40608 fnaddr = XEXP (DECL_RTL (function), 0);
40609 if (TARGET_64BIT)
40611 if (!flag_pic || targetm.binds_local_p (function)
40612 || TARGET_PECOFF)
40614 else
40616 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40617 tmp = gen_rtx_CONST (Pmode, tmp);
40618 fnaddr = gen_const_mem (Pmode, tmp);
40621 else
40623 if (!flag_pic || targetm.binds_local_p (function))
40625 #if TARGET_MACHO
40626 else if (TARGET_MACHO)
40628 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40629 fnaddr = XEXP (fnaddr, 0);
40631 #endif /* TARGET_MACHO */
40632 else
40634 tmp = gen_rtx_REG (Pmode, CX_REG);
40635 output_set_got (tmp, NULL_RTX);
40637 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40638 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40639 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40640 fnaddr = gen_const_mem (Pmode, fnaddr);
40644 /* Our sibling call patterns do not allow memories, because we have no
40645 predicate that can distinguish between frame and non-frame memory.
40646 For our purposes here, we can get away with (ab)using a jump pattern,
40647 because we're going to do no optimization. */
40648 if (MEM_P (fnaddr))
40650 if (sibcall_insn_operand (fnaddr, word_mode))
40652 fnaddr = XEXP (DECL_RTL (function), 0);
40653 tmp = gen_rtx_MEM (QImode, fnaddr);
40654 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40655 tmp = emit_call_insn (tmp);
40656 SIBLING_CALL_P (tmp) = 1;
40658 else
40659 emit_jump_insn (gen_indirect_jump (fnaddr));
40661 else
40663 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40665 // CM_LARGE_PIC always uses a pseudo PIC register, which is
40666 // uninitialized here.  Since FUNCTION is local and calling it
40667 // doesn't go through the PLT, we use the scratch register %r11 as
40668 // the PIC register and initialize it here.
40669 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40670 ix86_init_large_pic_reg (tmp_regno);
40671 fnaddr = legitimize_pic_address (fnaddr,
40672 gen_rtx_REG (Pmode, tmp_regno));
40675 if (!sibcall_insn_operand (fnaddr, word_mode))
40677 tmp = gen_rtx_REG (word_mode, tmp_regno);
40678 if (GET_MODE (fnaddr) != word_mode)
40679 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40680 emit_move_insn (tmp, fnaddr);
40681 fnaddr = tmp;
40684 tmp = gen_rtx_MEM (QImode, fnaddr);
40685 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40686 tmp = emit_call_insn (tmp);
40687 SIBLING_CALL_P (tmp) = 1;
40689 emit_barrier ();
40691 /* Emit just enough of rest_of_compilation to get the insns emitted.
40692 Note that use_thunk calls assemble_start_function et al. */
40693 insn = get_insns ();
40694 shorten_branches (insn);
40695 final_start_function (insn, file, 1);
40696 final (insn, file, 1);
40697 final_end_function ();
40700 static void
40701 x86_file_start (void)
40703 default_file_start ();
40704 if (TARGET_16BIT)
40705 fputs ("\t.code16gcc\n", asm_out_file);
40706 #if TARGET_MACHO
40707 darwin_file_start ();
40708 #endif
40709 if (X86_FILE_START_VERSION_DIRECTIVE)
40710 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40711 if (X86_FILE_START_FLTUSED)
40712 fputs ("\t.global\t__fltused\n", asm_out_file);
40713 if (ix86_asm_dialect == ASM_INTEL)
40714 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40718 x86_field_alignment (tree type, int computed)
40720 machine_mode mode;
40722 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40723 return computed;
40724 if (TARGET_IAMCU)
40725 return iamcu_alignment (type, computed);
40726 mode = TYPE_MODE (strip_array_types (type));
40727 if (mode == DFmode || mode == DCmode
40728 || GET_MODE_CLASS (mode) == MODE_INT
40729 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40730 return MIN (32, computed);
40731 return computed;
40734 /* Print call to TARGET to FILE. */
40736 static void
40737 x86_print_call_or_nop (FILE *file, const char *target)
40739 if (flag_nop_mcount)
40740 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40741 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40742 else
40743 fprintf (file, "1:\tcall\t%s\n", target);
40746 /* Output assembler code to FILE to increment profiler label # LABELNO
40747 for profiling a function entry. */
40748 void
40749 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40751 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40752 : MCOUNT_NAME);
40753 if (TARGET_64BIT)
40755 #ifndef NO_PROFILE_COUNTERS
40756 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40757 #endif
40759 if (!TARGET_PECOFF && flag_pic)
40760 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40761 else
40762 x86_print_call_or_nop (file, mcount_name);
40764 else if (flag_pic)
40766 #ifndef NO_PROFILE_COUNTERS
40767 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40768 LPREFIX, labelno);
40769 #endif
40770 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40772 else
40774 #ifndef NO_PROFILE_COUNTERS
40775 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40776 LPREFIX, labelno);
40777 #endif
40778 x86_print_call_or_nop (file, mcount_name);
40781 if (flag_record_mcount)
40783 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40784 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40785 fprintf (file, "\t.previous\n");
40789 /* We don't have exact information about insn sizes, but we may quite
40790 safely assume that we are informed about all one-byte insns and about
40791 memory address sizes. This is enough to eliminate unnecessary padding
40792 in 99% of cases. */
40795 ix86_min_insn_size (rtx_insn *insn)
40797 int l = 0, len;
40799 if (!INSN_P (insn) || !active_insn_p (insn))
40800 return 0;
40802 /* Discard alignments we've emitted and jump instructions. */
40803 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40804 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40805 return 0;
40807 /* Important case: calls are always 5 bytes.
40808 It is common to have many calls in a row. */
40809 if (CALL_P (insn)
40810 && symbolic_reference_mentioned_p (PATTERN (insn))
40811 && !SIBLING_CALL_P (insn))
40812 return 5;
40813 len = get_attr_length (insn);
40814 if (len <= 1)
40815 return 1;
40817 /* For normal instructions we rely on get_attr_length being exact,
40818 with a few exceptions. */
40819 if (!JUMP_P (insn))
40821 enum attr_type type = get_attr_type (insn);
40823 switch (type)
40825 case TYPE_MULTI:
40826 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40827 || asm_noperands (PATTERN (insn)) >= 0)
40828 return 0;
40829 break;
40830 case TYPE_OTHER:
40831 case TYPE_FCMP:
40832 break;
40833 default:
40834 /* Otherwise trust get_attr_length. */
40835 return len;
40838 l = get_attr_length_address (insn);
40839 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40840 l = 4;
40842 if (l)
40843 return 1+l;
40844 else
40845 return 2;
40848 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40850 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
40851 window. */
40853 static void
40854 ix86_avoid_jump_mispredicts (void)
40856 rtx_insn *insn, *start = get_insns ();
40857 int nbytes = 0, njumps = 0;
40858 bool isjump = false;
40860 /* Look for all minimal intervals of instructions containing 4 jumps.
40861 The intervals are bounded by START and INSN. NBYTES is the total
40862 size of the instructions in the interval, including INSN but not
40863 including START. When NBYTES is smaller than 16, it is possible
40864 that START and INSN end up in the same 16-byte fetch window.
40866 The smallest offset at which INSN can start is the case where START
40867 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
40868 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
40870 Don't treat an asm goto as a jump: while it can contain a jump, it need
40871 not, since control can reach its label(s) through other means; we also
40872 estimate the minimum length of all asm stmts as 0. */
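/* Worked example, following the description above: if the interval ends
   in a jump, already holds the maximum of jumps, and NBYTES is, say, 12,
   all of them could share one 16-byte fetch window, so a pad of up to
   15 - 12 + sizeof (INSN) bytes is emitted before INSN, enough to force
   INSN out of any window that also contains the end of START.  */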
40873 for (insn = start; insn; insn = NEXT_INSN (insn))
40875 int min_size;
40877 if (LABEL_P (insn))
40879 int align = label_to_alignment (insn);
40880 int max_skip = label_to_max_skip (insn);
40882 if (max_skip > 15)
40883 max_skip = 15;
40884 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40885 already in the current 16 byte page, because otherwise
40886 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40887 bytes to reach 16 byte boundary. */
40888 if (align <= 0
40889 || (align <= 3 && max_skip != (1 << align) - 1))
40890 max_skip = 0;
40891 if (dump_file)
40892 fprintf (dump_file, "Label %i with max_skip %i\n",
40893 INSN_UID (insn), max_skip);
40894 if (max_skip)
40896 while (nbytes + max_skip >= 16)
40898 start = NEXT_INSN (start);
40899 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40900 || CALL_P (start))
40901 njumps--, isjump = true;
40902 else
40903 isjump = false;
40904 nbytes -= ix86_min_insn_size (start);
40907 continue;
40910 min_size = ix86_min_insn_size (insn);
40911 nbytes += min_size;
40912 if (dump_file)
40913 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40914 INSN_UID (insn), min_size);
40915 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40916 || CALL_P (insn))
40917 njumps++;
40918 else
40919 continue;
40921 while (njumps > 3)
40923 start = NEXT_INSN (start);
40924 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40925 || CALL_P (start))
40926 njumps--, isjump = true;
40927 else
40928 isjump = false;
40929 nbytes -= ix86_min_insn_size (start);
40931 gcc_assert (njumps >= 0);
40932 if (dump_file)
40933 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40934 INSN_UID (start), INSN_UID (insn), nbytes);
40936 if (njumps == 3 && isjump && nbytes < 16)
40938 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40940 if (dump_file)
40941 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40942 INSN_UID (insn), padsize);
40943 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40947 #endif
40949 /* AMD Athlon works faster
40950 when RET is not the destination of a conditional jump or directly preceded
40951 by another jump instruction. We avoid the penalty by inserting a NOP just
40952 before the RET instruction in such cases. */
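/* Note: the replacement emitted below is simple_return_internal_long,
   i.e. the longer return encoding (conventionally `rep ret' on these
   CPUs), rather than a literal NOP in front of the RET.  */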
40953 static void
40954 ix86_pad_returns (void)
40956 edge e;
40957 edge_iterator ei;
40959 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40961 basic_block bb = e->src;
40962 rtx_insn *ret = BB_END (bb);
40963 rtx_insn *prev;
40964 bool replace = false;
40966 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40967 || optimize_bb_for_size_p (bb))
40968 continue;
40969 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40970 if (active_insn_p (prev) || LABEL_P (prev))
40971 break;
40972 if (prev && LABEL_P (prev))
40974 edge e;
40975 edge_iterator ei;
40977 FOR_EACH_EDGE (e, ei, bb->preds)
40978 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40979 && !(e->flags & EDGE_FALLTHRU))
40981 replace = true;
40982 break;
40985 if (!replace)
40987 prev = prev_active_insn (ret);
40988 if (prev
40989 && ((JUMP_P (prev) && any_condjump_p (prev))
40990 || CALL_P (prev)))
40991 replace = true;
40992 /* Empty functions get a branch mispredict even when
40993 the jump destination is not visible to us. */
40994 if (!prev && !optimize_function_for_size_p (cfun))
40995 replace = true;
40997 if (replace)
40999 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41000 delete_insn (ret);
41005 /* Count the minimum number of instructions in BB. Return 4 if the
41006 number of instructions >= 4. */
41008 static int
41009 ix86_count_insn_bb (basic_block bb)
41011 rtx_insn *insn;
41012 int insn_count = 0;
41014 /* Count number of instructions in this block. Return 4 if the number
41015 of instructions >= 4. */
41016 FOR_BB_INSNS (bb, insn)
41019 /* This only happens in exit blocks. */
41019 if (JUMP_P (insn)
41020 && ANY_RETURN_P (PATTERN (insn)))
41021 break;
41023 if (NONDEBUG_INSN_P (insn)
41024 && GET_CODE (PATTERN (insn)) != USE
41025 && GET_CODE (PATTERN (insn)) != CLOBBER)
41027 insn_count++;
41028 if (insn_count >= 4)
41029 return insn_count;
41033 return insn_count;
41037 /* Count the minimum number of instructions in code path in BB.
41038 Return 4 if the number of instructions >= 4. */
41040 static int
41041 ix86_count_insn (basic_block bb)
41043 edge e;
41044 edge_iterator ei;
41045 int min_prev_count;
41047 /* Only bother counting instructions along paths with no
41048 more than 2 basic blocks between entry and exit. Given
41049 that BB has an edge to exit, determine if a predecessor
41050 of BB has an edge from entry. If so, compute the number
41051 of instructions in the predecessor block. If there
41052 happen to be multiple such blocks, compute the minimum. */
41053 min_prev_count = 4;
41054 FOR_EACH_EDGE (e, ei, bb->preds)
41056 edge prev_e;
41057 edge_iterator prev_ei;
41059 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41061 min_prev_count = 0;
41062 break;
41064 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41066 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41068 int count = ix86_count_insn_bb (e->src);
41069 if (count < min_prev_count)
41070 min_prev_count = count;
41071 break;
41076 if (min_prev_count < 4)
41077 min_prev_count += ix86_count_insn_bb (bb);
41079 return min_prev_count;
41082 /* Pad short function to 4 instructions. */
41084 static void
41085 ix86_pad_short_function (void)
41087 edge e;
41088 edge_iterator ei;
41090 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41092 rtx_insn *ret = BB_END (e->src);
41093 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41095 int insn_count = ix86_count_insn (e->src);
41097 /* Pad short function. */
41098 if (insn_count < 4)
41100 rtx_insn *insn = ret;
41102 /* Find epilogue. */
41103 while (insn
41104 && (!NOTE_P (insn)
41105 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41106 insn = PREV_INSN (insn);
41108 if (!insn)
41109 insn = ret;
41111 /* Two NOPs count as one instruction. */
41112 insn_count = 2 * (4 - insn_count);
41113 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41119 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41120 the epilogue, the Windows system unwinder will apply epilogue logic and
41121 produce incorrect offsets. This can be avoided by adding a nop between
41122 the last insn that can throw and the first insn of the epilogue. */
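/* Sketch of the resulting insn stream:

     call foo                   ; last insn that can throw
     <debug/arg-location notes>
     nop                        ; inserted by this fixup
     NOTE_INSN_EPILOGUE_BEG
     <epilogue>

   so the address right after the call no longer falls at the start of
   the epilogue.  */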
41124 static void
41125 ix86_seh_fixup_eh_fallthru (void)
41127 edge e;
41128 edge_iterator ei;
41130 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41132 rtx_insn *insn, *next;
41134 /* Find the beginning of the epilogue. */
41135 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41136 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41137 break;
41138 if (insn == NULL)
41139 continue;
41141 /* We only care about preceding insns that can throw. */
41142 insn = prev_active_insn (insn);
41143 if (insn == NULL || !can_throw_internal (insn))
41144 continue;
41146 /* Do not separate calls from their debug information. */
41147 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41148 if (NOTE_P (next)
41149 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41150 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41151 insn = next;
41152 else
41153 break;
41155 emit_insn_after (gen_nops (const1_rtx), insn);
41159 /* Given a register number BASE, the lowest of a group of registers, update
41160 regsets IN and OUT with the registers that should be avoided in input
41161 and output operands respectively when trying to avoid generating a modr/m
41162 byte for -fmitigate-rop. */
41164 static void
41165 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41167 SET_HARD_REG_BIT (out, base);
41168 SET_HARD_REG_BIT (out, base + 1);
41169 SET_HARD_REG_BIT (in, base + 2);
41170 SET_HARD_REG_BIT (in, base + 3);
41173 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41174 that certain encodings of modr/m bytes do not occur. */
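/* Rough outline of the pass as implemented below: run the regrename
   analysis, collect the du-chains whose operands would produce a
   "risky" modr/m byte (per ix86_get_modrm_for_rop and
   ix86_rop_should_change_byte_p), try to rename those chains to safer
   hard registers, and finally, for any insn that is still problematic,
   insert a register-to-register copy so the offending operand can use
   a different register.  */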
41175 static void
41176 ix86_mitigate_rop (void)
41178 HARD_REG_SET input_risky;
41179 HARD_REG_SET output_risky;
41180 HARD_REG_SET inout_risky;
41182 CLEAR_HARD_REG_SET (output_risky);
41183 CLEAR_HARD_REG_SET (input_risky);
41184 SET_HARD_REG_BIT (output_risky, AX_REG);
41185 SET_HARD_REG_BIT (output_risky, CX_REG);
41186 SET_HARD_REG_BIT (input_risky, BX_REG);
41187 SET_HARD_REG_BIT (input_risky, DX_REG);
41188 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41189 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41190 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41191 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41192 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41193 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41194 COPY_HARD_REG_SET (inout_risky, input_risky);
41195 IOR_HARD_REG_SET (inout_risky, output_risky);
41197 df_note_add_problem ();
41198 /* Fix up what stack-regs did. */
41199 df_insn_rescan_all ();
41200 df_analyze ();
41202 regrename_init (true);
41203 regrename_analyze (NULL);
41205 auto_vec<du_head_p> cands;
41207 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41209 if (!NONDEBUG_INSN_P (insn))
41210 continue;
41212 if (GET_CODE (PATTERN (insn)) == USE
41213 || GET_CODE (PATTERN (insn)) == CLOBBER)
41214 continue;
41216 extract_insn (insn);
41218 int opno0, opno1;
41219 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41220 recog_data.n_operands, &opno0,
41221 &opno1);
41223 if (!ix86_rop_should_change_byte_p (modrm))
41224 continue;
41226 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41228 /* This happens when regrename has to fail a block. */
41229 if (!info->op_info)
41230 continue;
41232 if (info->op_info[opno0].n_chains != 0)
41234 gcc_assert (info->op_info[opno0].n_chains == 1);
41235 du_head_p op0c;
41236 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41237 if (op0c->target_data_1 + op0c->target_data_2 == 0
41238 && !op0c->cannot_rename)
41239 cands.safe_push (op0c);
41241 op0c->target_data_1++;
41243 if (info->op_info[opno1].n_chains != 0)
41245 gcc_assert (info->op_info[opno1].n_chains == 1);
41246 du_head_p op1c;
41247 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41248 if (op1c->target_data_1 + op1c->target_data_2 == 0
41249 && !op1c->cannot_rename)
41250 cands.safe_push (op1c);
41252 op1c->target_data_2++;
41256 int i;
41257 du_head_p head;
41258 FOR_EACH_VEC_ELT (cands, i, head)
41260 int old_reg, best_reg;
41261 HARD_REG_SET unavailable;
41263 CLEAR_HARD_REG_SET (unavailable);
41264 if (head->target_data_1)
41265 IOR_HARD_REG_SET (unavailable, output_risky);
41266 if (head->target_data_2)
41267 IOR_HARD_REG_SET (unavailable, input_risky);
41269 int n_uses;
41270 reg_class superclass = regrename_find_superclass (head, &n_uses,
41271 &unavailable);
41272 old_reg = head->regno;
41273 best_reg = find_rename_reg (head, superclass, &unavailable,
41274 old_reg, false);
41275 bool ok = regrename_do_replace (head, best_reg);
41276 gcc_assert (ok);
41277 if (dump_file)
41278 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41279 reg_names[best_reg], reg_class_names[superclass]);
41283 regrename_finish ();
41285 df_analyze ();
41287 basic_block bb;
41288 regset_head live;
41290 INIT_REG_SET (&live);
41292 FOR_EACH_BB_FN (bb, cfun)
41294 rtx_insn *insn;
41296 COPY_REG_SET (&live, DF_LR_OUT (bb));
41297 df_simulate_initialize_backwards (bb, &live);
41299 FOR_BB_INSNS_REVERSE (bb, insn)
41301 if (!NONDEBUG_INSN_P (insn))
41302 continue;
41304 df_simulate_one_insn_backwards (bb, insn, &live);
41306 if (GET_CODE (PATTERN (insn)) == USE
41307 || GET_CODE (PATTERN (insn)) == CLOBBER)
41308 continue;
41310 extract_insn (insn);
41311 constrain_operands_cached (insn, reload_completed);
41312 int opno0, opno1;
41313 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41314 recog_data.n_operands, &opno0,
41315 &opno1);
41316 if (modrm < 0
41317 || !ix86_rop_should_change_byte_p (modrm)
41318 || opno0 == opno1)
41319 continue;
41321 rtx oldreg = recog_data.operand[opno1];
41322 preprocess_constraints (insn);
41323 const operand_alternative *alt = which_op_alt ();
41325 int i;
41326 for (i = 0; i < recog_data.n_operands; i++)
41327 if (i != opno1
41328 && alt[i].earlyclobber
41329 && reg_overlap_mentioned_p (recog_data.operand[i],
41330 oldreg))
41331 break;
41333 if (i < recog_data.n_operands)
41334 continue;
41336 if (dump_file)
41337 fprintf (dump_file,
41338 "attempting to fix modrm byte in insn %d:"
41339 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41340 reg_class_names[alt[opno1].cl]);
41342 HARD_REG_SET unavailable;
41343 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41344 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41345 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41346 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41347 IOR_HARD_REG_SET (unavailable, output_risky);
41348 IOR_COMPL_HARD_REG_SET (unavailable,
41349 reg_class_contents[alt[opno1].cl]);
41351 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41352 if (!TEST_HARD_REG_BIT (unavailable, i))
41353 break;
41354 if (i == FIRST_PSEUDO_REGISTER)
41356 if (dump_file)
41357 fprintf (dump_file, ", none available\n");
41358 continue;
41360 if (dump_file)
41361 fprintf (dump_file, " -> %d\n", i);
41362 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41363 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41364 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41369 /* Implement machine-specific optimizations. We implement padding of returns
41370 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
41371 static void
41372 ix86_reorg (void)
41374 /* We are freeing block_for_insn in the toplev to keep compatibility
41375 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41376 compute_bb_for_insn ();
41378 if (flag_mitigate_rop)
41379 ix86_mitigate_rop ();
41381 if (TARGET_SEH && current_function_has_exception_handlers ())
41382 ix86_seh_fixup_eh_fallthru ();
41384 if (optimize && optimize_function_for_speed_p (cfun))
41386 if (TARGET_PAD_SHORT_FUNCTION)
41387 ix86_pad_short_function ();
41388 else if (TARGET_PAD_RETURNS)
41389 ix86_pad_returns ();
41390 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41391 if (TARGET_FOUR_JUMP_LIMIT)
41392 ix86_avoid_jump_mispredicts ();
41393 #endif
41397 /* Return nonzero when a QImode register that must be represented via a REX
41398 prefix is used. */
41399 bool
41400 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41402 int i;
41403 extract_insn_cached (insn);
41404 for (i = 0; i < recog_data.n_operands; i++)
41405 if (GENERAL_REG_P (recog_data.operand[i])
41406 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41407 return true;
41408 return false;
41411 /* Return true when INSN mentions a register that must be encoded using a
41412 REX prefix. */
41413 bool
41414 x86_extended_reg_mentioned_p (rtx insn)
41416 subrtx_iterator::array_type array;
41417 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41419 const_rtx x = *iter;
41420 if (REG_P (x)
41421 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41422 return true;
41424 return false;
41427 /* If profitable, negate (without causing overflow) integer constant
41428 of mode MODE at location LOC. Return true in this case. */
41429 bool
41430 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41432 HOST_WIDE_INT val;
41434 if (!CONST_INT_P (*loc))
41435 return false;
41437 switch (mode)
41439 case E_DImode:
41440 /* DImode x86_64 constants must fit in 32 bits. */
41441 gcc_assert (x86_64_immediate_operand (*loc, mode));
41443 mode = SImode;
41444 break;
41446 case E_SImode:
41447 case E_HImode:
41448 case E_QImode:
41449 break;
41451 default:
41452 gcc_unreachable ();
41455 /* Avoid overflows. */
41456 if (mode_signbit_p (mode, *loc))
41457 return false;
41459 val = INTVAL (*loc);
41461 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
41462 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
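/* Example: `addl $128, %eax' needs a 32-bit immediate, but after negating
   the constant the caller can emit `subl $-128, %eax' with an 8-bit
   immediate.  Conversely, -128 itself is never negated, since +128 would
   no longer fit in a signed byte.  */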
41463 if ((val < 0 && val != -128)
41464 || val == 128)
41466 *loc = GEN_INT (-val);
41467 return true;
41470 return false;
41473 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41474 optabs would emit if we didn't have TFmode patterns. */
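/* The sequence emitted below is the usual trick, roughly:

     if ((signed) in >= 0)
       out = (FP) in;
     else
       {
         tmp = (in >> 1) | (in & 1);   (halve, keeping the low bit sticky)
         out = (FP) tmp;
         out = out + out;              (undo the halving)
       }

   OR-ing the low bit back in keeps the final rounding correct.  */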
41476 void
41477 x86_emit_floatuns (rtx operands[2])
41479 rtx_code_label *neglab, *donelab;
41480 rtx i0, i1, f0, in, out;
41481 machine_mode mode, inmode;
41483 inmode = GET_MODE (operands[1]);
41484 gcc_assert (inmode == SImode || inmode == DImode);
41486 out = operands[0];
41487 in = force_reg (inmode, operands[1]);
41488 mode = GET_MODE (out);
41489 neglab = gen_label_rtx ();
41490 donelab = gen_label_rtx ();
41491 f0 = gen_reg_rtx (mode);
41493 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41495 expand_float (out, in, 0);
41497 emit_jump_insn (gen_jump (donelab));
41498 emit_barrier ();
41500 emit_label (neglab);
41502 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41503 1, OPTAB_DIRECT);
41504 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41505 1, OPTAB_DIRECT);
41506 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41508 expand_float (f0, i0, 0);
41510 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41512 emit_label (donelab);
41515 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41516 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41517 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41518 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41520 /* Get a vector mode of the same size as the original but with elements
41521 twice as wide. This is only guaranteed to apply to integral vectors. */
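/* For example, V16QImode widens to V8HImode and V8HImode to V4SImode:
   the same total size with half as many, twice-as-wide elements.  */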
41523 static inline machine_mode
41524 get_mode_wider_vector (machine_mode o)
41526 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41527 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41528 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41529 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41530 return n;
41533 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41534 fill target with val via vec_duplicate. */
41536 static bool
41537 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41539 bool ok;
41540 rtx_insn *insn;
41541 rtx dup;
41543 /* First attempt to recognize VAL as-is. */
41544 dup = gen_vec_duplicate (mode, val);
41545 insn = emit_insn (gen_rtx_SET (target, dup));
41546 if (recog_memoized (insn) < 0)
41548 rtx_insn *seq;
41549 machine_mode innermode = GET_MODE_INNER (mode);
41550 rtx reg;
41552 /* If that fails, force VAL into a register. */
41554 start_sequence ();
41555 reg = force_reg (innermode, val);
41556 if (GET_MODE (reg) != innermode)
41557 reg = gen_lowpart (innermode, reg);
41558 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41559 seq = get_insns ();
41560 end_sequence ();
41561 if (seq)
41562 emit_insn_before (seq, insn);
41564 ok = recog_memoized (insn) >= 0;
41565 gcc_assert (ok);
41567 return true;
41570 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41571 with all elements equal to VAR. Return true if successful. */
41573 static bool
41574 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41575 rtx target, rtx val)
41577 bool ok;
41579 switch (mode)
41581 case E_V2SImode:
41582 case E_V2SFmode:
41583 if (!mmx_ok)
41584 return false;
41585 /* FALLTHRU */
41587 case E_V4DFmode:
41588 case E_V4DImode:
41589 case E_V8SFmode:
41590 case E_V8SImode:
41591 case E_V2DFmode:
41592 case E_V2DImode:
41593 case E_V4SFmode:
41594 case E_V4SImode:
41595 case E_V16SImode:
41596 case E_V8DImode:
41597 case E_V16SFmode:
41598 case E_V8DFmode:
41599 return ix86_vector_duplicate_value (mode, target, val);
41601 case E_V4HImode:
41602 if (!mmx_ok)
41603 return false;
41604 if (TARGET_SSE || TARGET_3DNOW_A)
41606 rtx x;
41608 val = gen_lowpart (SImode, val);
41609 x = gen_rtx_TRUNCATE (HImode, val);
41610 x = gen_rtx_VEC_DUPLICATE (mode, x);
41611 emit_insn (gen_rtx_SET (target, x));
41612 return true;
41614 goto widen;
41616 case E_V8QImode:
41617 if (!mmx_ok)
41618 return false;
41619 goto widen;
41621 case E_V8HImode:
41622 if (TARGET_AVX2)
41623 return ix86_vector_duplicate_value (mode, target, val);
41625 if (TARGET_SSE2)
41627 struct expand_vec_perm_d dperm;
41628 rtx tmp1, tmp2;
41630 permute:
41631 memset (&dperm, 0, sizeof (dperm));
41632 dperm.target = target;
41633 dperm.vmode = mode;
41634 dperm.nelt = GET_MODE_NUNITS (mode);
41635 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41636 dperm.one_operand_p = true;
41638 /* Extend to SImode using a paradoxical SUBREG. */
41639 tmp1 = gen_reg_rtx (SImode);
41640 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41642 /* Insert the SImode value as low element of a V4SImode vector. */
41643 tmp2 = gen_reg_rtx (V4SImode);
41644 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41645 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41647 ok = (expand_vec_perm_1 (&dperm)
41648 || expand_vec_perm_broadcast_1 (&dperm));
41649 gcc_assert (ok);
41650 return ok;
41652 goto widen;
41654 case E_V16QImode:
41655 if (TARGET_AVX2)
41656 return ix86_vector_duplicate_value (mode, target, val);
41658 if (TARGET_SSE2)
41659 goto permute;
41660 goto widen;
41662 widen:
41663 /* Replicate the value once into the next wider mode and recurse. */
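/* E.g. for V8QImode: the QImode value is extended to HImode, OR-ed with
   itself shifted left by 8 so that both bytes hold the value, broadcast
   as a V4HImode vector, and finally viewed back in the original mode.  */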
41665 machine_mode smode, wsmode, wvmode;
41666 rtx x;
41668 smode = GET_MODE_INNER (mode);
41669 wvmode = get_mode_wider_vector (mode);
41670 wsmode = GET_MODE_INNER (wvmode);
41672 val = convert_modes (wsmode, smode, val, true);
41673 x = expand_simple_binop (wsmode, ASHIFT, val,
41674 GEN_INT (GET_MODE_BITSIZE (smode)),
41675 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41676 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41678 x = gen_reg_rtx (wvmode);
41679 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41680 gcc_assert (ok);
41681 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41682 return ok;
41685 case E_V16HImode:
41686 case E_V32QImode:
41687 if (TARGET_AVX2)
41688 return ix86_vector_duplicate_value (mode, target, val);
41689 else
41691 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41692 rtx x = gen_reg_rtx (hvmode);
41694 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41695 gcc_assert (ok);
41697 x = gen_rtx_VEC_CONCAT (mode, x, x);
41698 emit_insn (gen_rtx_SET (target, x));
41700 return true;
41702 case E_V64QImode:
41703 case E_V32HImode:
41704 if (TARGET_AVX512BW)
41705 return ix86_vector_duplicate_value (mode, target, val);
41706 else
41708 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41709 rtx x = gen_reg_rtx (hvmode);
41711 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41712 gcc_assert (ok);
41714 x = gen_rtx_VEC_CONCAT (mode, x, x);
41715 emit_insn (gen_rtx_SET (target, x));
41717 return true;
41719 default:
41720 return false;
41724 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41725 whose ONE_VAR element is VAR, and other elements are zero. Return true
41726 if successful. */
41728 static bool
41729 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41730 rtx target, rtx var, int one_var)
41732 machine_mode vsimode;
41733 rtx new_target;
41734 rtx x, tmp;
41735 bool use_vector_set = false;
41737 switch (mode)
41739 case E_V2DImode:
41740 /* For SSE4.1, we normally use vector set. But if the second
41741 element is zero and inter-unit moves are OK, we use movq
41742 instead. */
41743 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41744 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41745 && one_var == 0));
41746 break;
41747 case E_V16QImode:
41748 case E_V4SImode:
41749 case E_V4SFmode:
41750 use_vector_set = TARGET_SSE4_1;
41751 break;
41752 case E_V8HImode:
41753 use_vector_set = TARGET_SSE2;
41754 break;
41755 case E_V4HImode:
41756 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41757 break;
41758 case E_V32QImode:
41759 case E_V16HImode:
41760 case E_V8SImode:
41761 case E_V8SFmode:
41762 case E_V4DFmode:
41763 use_vector_set = TARGET_AVX;
41764 break;
41765 case E_V4DImode:
41766 /* Use ix86_expand_vector_set in 64bit mode only. */
41767 use_vector_set = TARGET_AVX && TARGET_64BIT;
41768 break;
41769 default:
41770 break;
41773 if (use_vector_set)
41775 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41776 var = force_reg (GET_MODE_INNER (mode), var);
41777 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41778 return true;
41781 switch (mode)
41783 case E_V2SFmode:
41784 case E_V2SImode:
41785 if (!mmx_ok)
41786 return false;
41787 /* FALLTHRU */
41789 case E_V2DFmode:
41790 case E_V2DImode:
41791 if (one_var != 0)
41792 return false;
41793 var = force_reg (GET_MODE_INNER (mode), var);
41794 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41795 emit_insn (gen_rtx_SET (target, x));
41796 return true;
41798 case E_V4SFmode:
41799 case E_V4SImode:
41800 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41801 new_target = gen_reg_rtx (mode);
41802 else
41803 new_target = target;
41804 var = force_reg (GET_MODE_INNER (mode), var);
41805 x = gen_rtx_VEC_DUPLICATE (mode, var);
41806 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41807 emit_insn (gen_rtx_SET (new_target, x));
41808 if (one_var != 0)
41810 /* We need to shuffle the value to the correct position, so
41811 create a new pseudo to store the intermediate result. */
41813 /* With SSE2, we can use the integer shuffle insns. */
41814 if (mode != V4SFmode && TARGET_SSE2)
41816 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41817 const1_rtx,
41818 GEN_INT (one_var == 1 ? 0 : 1),
41819 GEN_INT (one_var == 2 ? 0 : 1),
41820 GEN_INT (one_var == 3 ? 0 : 1)));
41821 if (target != new_target)
41822 emit_move_insn (target, new_target);
41823 return true;
41826 /* Otherwise convert the intermediate result to V4SFmode and
41827 use the SSE1 shuffle instructions. */
41828 if (mode != V4SFmode)
41830 tmp = gen_reg_rtx (V4SFmode);
41831 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41833 else
41834 tmp = new_target;
41836 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41837 const1_rtx,
41838 GEN_INT (one_var == 1 ? 0 : 1),
41839 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41840 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41842 if (mode != V4SFmode)
41843 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41844 else if (tmp != target)
41845 emit_move_insn (target, tmp);
41847 else if (target != new_target)
41848 emit_move_insn (target, new_target);
41849 return true;
41851 case E_V8HImode:
41852 case E_V16QImode:
41853 vsimode = V4SImode;
41854 goto widen;
41855 case E_V4HImode:
41856 case E_V8QImode:
41857 if (!mmx_ok)
41858 return false;
41859 vsimode = V2SImode;
41860 goto widen;
41861 widen:
41862 if (one_var != 0)
41863 return false;
41865 /* Zero extend the variable element to SImode and recurse. */
41866 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41868 x = gen_reg_rtx (vsimode);
41869 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41870 var, one_var))
41871 gcc_unreachable ();
41873 emit_move_insn (target, gen_lowpart (mode, x));
41874 return true;
41876 default:
41877 return false;
41881 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41882 consisting of the values in VALS. It is known that all elements
41883 except ONE_VAR are constants. Return true if successful. */
41885 static bool
41886 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41887 rtx target, rtx vals, int one_var)
41889 rtx var = XVECEXP (vals, 0, one_var);
41890 machine_mode wmode;
41891 rtx const_vec, x;
41893 const_vec = copy_rtx (vals);
41894 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41895 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41897 switch (mode)
41899 case E_V2DFmode:
41900 case E_V2DImode:
41901 case E_V2SFmode:
41902 case E_V2SImode:
41903 /* For the two element vectors, it's just as easy to use
41904 the general case. */
41905 return false;
41907 case E_V4DImode:
41908 /* Use ix86_expand_vector_set in 64bit mode only. */
41909 if (!TARGET_64BIT)
41910 return false;
41911 /* FALLTHRU */
41912 case E_V4DFmode:
41913 case E_V8SFmode:
41914 case E_V8SImode:
41915 case E_V16HImode:
41916 case E_V32QImode:
41917 case E_V4SFmode:
41918 case E_V4SImode:
41919 case E_V8HImode:
41920 case E_V4HImode:
41921 break;
41923 case E_V16QImode:
41924 if (TARGET_SSE4_1)
41925 break;
41926 wmode = V8HImode;
41927 goto widen;
41928 case E_V8QImode:
41929 wmode = V4HImode;
41930 goto widen;
41931 widen:
41932 /* There's no way to set one QImode entry easily. Combine
41933 the variable value with its adjacent constant value, and
41934 promote to an HImode set. */
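/* Example: to set byte 5 of a V16QImode vector, byte 4 is a known
   constant, so the variable byte is shifted into the high half of an
   HImode value, IOR-ed with that constant, and stored with a single
   V8HImode element set at index 2 (one_var >> 1).  */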
41935 x = XVECEXP (vals, 0, one_var ^ 1);
41936 if (one_var & 1)
41938 var = convert_modes (HImode, QImode, var, true);
41939 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41940 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41941 x = GEN_INT (INTVAL (x) & 0xff);
41943 else
41945 var = convert_modes (HImode, QImode, var, true);
41946 x = gen_int_mode (INTVAL (x) << 8, HImode);
41948 if (x != const0_rtx)
41949 var = expand_simple_binop (HImode, IOR, var, x, var,
41950 1, OPTAB_LIB_WIDEN);
41952 x = gen_reg_rtx (wmode);
41953 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41954 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41956 emit_move_insn (target, gen_lowpart (mode, x));
41957 return true;
41959 default:
41960 return false;
41963 emit_move_insn (target, const_vec);
41964 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41965 return true;
41968 /* A subroutine of ix86_expand_vector_init_general. Use vector
41969 concatenate to handle the most general case: all values variable,
41970 and none identical. */
41972 static void
41973 ix86_expand_vector_init_concat (machine_mode mode,
41974 rtx target, rtx *ops, int n)
41976 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41977 rtx first[16], second[8], third[4];
41978 rtvec v;
41979 int i, j;
41981 switch (n)
41983 case 2:
41984 switch (mode)
41986 case E_V16SImode:
41987 cmode = V8SImode;
41988 break;
41989 case E_V16SFmode:
41990 cmode = V8SFmode;
41991 break;
41992 case E_V8DImode:
41993 cmode = V4DImode;
41994 break;
41995 case E_V8DFmode:
41996 cmode = V4DFmode;
41997 break;
41998 case E_V8SImode:
41999 cmode = V4SImode;
42000 break;
42001 case E_V8SFmode:
42002 cmode = V4SFmode;
42003 break;
42004 case E_V4DImode:
42005 cmode = V2DImode;
42006 break;
42007 case E_V4DFmode:
42008 cmode = V2DFmode;
42009 break;
42010 case E_V4SImode:
42011 cmode = V2SImode;
42012 break;
42013 case E_V4SFmode:
42014 cmode = V2SFmode;
42015 break;
42016 case E_V2DImode:
42017 cmode = DImode;
42018 break;
42019 case E_V2SImode:
42020 cmode = SImode;
42021 break;
42022 case E_V2DFmode:
42023 cmode = DFmode;
42024 break;
42025 case E_V2SFmode:
42026 cmode = SFmode;
42027 break;
42028 default:
42029 gcc_unreachable ();
42032 if (!register_operand (ops[1], cmode))
42033 ops[1] = force_reg (cmode, ops[1]);
42034 if (!register_operand (ops[0], cmode))
42035 ops[0] = force_reg (cmode, ops[0]);
42036 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42037 ops[1])));
42038 break;
42040 case 4:
42041 switch (mode)
42043 case E_V4DImode:
42044 cmode = V2DImode;
42045 break;
42046 case E_V4DFmode:
42047 cmode = V2DFmode;
42048 break;
42049 case E_V4SImode:
42050 cmode = V2SImode;
42051 break;
42052 case E_V4SFmode:
42053 cmode = V2SFmode;
42054 break;
42055 default:
42056 gcc_unreachable ();
42058 goto half;
42060 case 8:
42061 switch (mode)
42063 case E_V8DImode:
42064 cmode = V2DImode;
42065 hmode = V4DImode;
42066 break;
42067 case E_V8DFmode:
42068 cmode = V2DFmode;
42069 hmode = V4DFmode;
42070 break;
42071 case E_V8SImode:
42072 cmode = V2SImode;
42073 hmode = V4SImode;
42074 break;
42075 case E_V8SFmode:
42076 cmode = V2SFmode;
42077 hmode = V4SFmode;
42078 break;
42079 default:
42080 gcc_unreachable ();
42082 goto half;
42084 case 16:
42085 switch (mode)
42087 case E_V16SImode:
42088 cmode = V2SImode;
42089 hmode = V4SImode;
42090 gmode = V8SImode;
42091 break;
42092 case E_V16SFmode:
42093 cmode = V2SFmode;
42094 hmode = V4SFmode;
42095 gmode = V8SFmode;
42096 break;
42097 default:
42098 gcc_unreachable ();
42100 goto half;
42102 half:
42103 /* FIXME: We process inputs backward to help RA. PR 36222. */
42104 i = n - 1;
42105 j = (n >> 1) - 1;
42106 for (; i > 0; i -= 2, j--)
42108 first[j] = gen_reg_rtx (cmode);
42109 v = gen_rtvec (2, ops[i - 1], ops[i]);
42110 ix86_expand_vector_init (false, first[j],
42111 gen_rtx_PARALLEL (cmode, v));
42114 n >>= 1;
42115 if (n > 4)
42117 gcc_assert (hmode != VOIDmode);
42118 gcc_assert (gmode != VOIDmode);
42119 for (i = j = 0; i < n; i += 2, j++)
42121 second[j] = gen_reg_rtx (hmode);
42122 ix86_expand_vector_init_concat (hmode, second [j],
42123 &first [i], 2);
42125 n >>= 1;
42126 for (i = j = 0; i < n; i += 2, j++)
42128 third[j] = gen_reg_rtx (gmode);
42129 ix86_expand_vector_init_concat (gmode, third[j],
42130 &second[i], 2);
42132 n >>= 1;
42133 ix86_expand_vector_init_concat (mode, target, third, n);
42135 else if (n > 2)
42137 gcc_assert (hmode != VOIDmode);
42138 for (i = j = 0; i < n; i += 2, j++)
42140 second[j] = gen_reg_rtx (hmode);
42141 ix86_expand_vector_init_concat (hmode, second [j],
42142 &first [i], 2);
42144 n >>= 1;
42145 ix86_expand_vector_init_concat (mode, target, second, n);
42147 else
42148 ix86_expand_vector_init_concat (mode, target, first, n);
42149 break;
42151 default:
42152 gcc_unreachable ();
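/* A rough illustration of the pairwise concatenation above, for a
   hypothetical V8SFmode build from scalars a0..a7: the first pass forms
   four V2SFmode registers (a0,a1) (a2,a3) (a4,a5) (a6,a7), the recursive
   calls then concatenate those into two V4SFmode halves, and a final
   VEC_CONCAT yields the V8SFmode result.  The inputs are walked backward
   only to give the register allocator a friendlier ordering, per the
   FIXME above.  */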
42156 /* A subroutine of ix86_expand_vector_init_general. Use vector
42157 interleave to handle the most general case: all values variable,
42158 and none identical. */
42160 static void
42161 ix86_expand_vector_init_interleave (machine_mode mode,
42162 rtx target, rtx *ops, int n)
42164 machine_mode first_imode, second_imode, third_imode, inner_mode;
42165 int i, j;
42166 rtx op0, op1;
42167 rtx (*gen_load_even) (rtx, rtx, rtx);
42168 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42169 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42171 switch (mode)
42173 case E_V8HImode:
42174 gen_load_even = gen_vec_setv8hi;
42175 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42176 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42177 inner_mode = HImode;
42178 first_imode = V4SImode;
42179 second_imode = V2DImode;
42180 third_imode = VOIDmode;
42181 break;
42182 case E_V16QImode:
42183 gen_load_even = gen_vec_setv16qi;
42184 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42185 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42186 inner_mode = QImode;
42187 first_imode = V8HImode;
42188 second_imode = V4SImode;
42189 third_imode = V2DImode;
42190 break;
42191 default:
42192 gcc_unreachable ();
42195 for (i = 0; i < n; i++)
42197 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42198 op0 = gen_reg_rtx (SImode);
42199 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42201 /* Insert the SImode value as low element of V4SImode vector. */
42202 op1 = gen_reg_rtx (V4SImode);
42203 op0 = gen_rtx_VEC_MERGE (V4SImode,
42204 gen_rtx_VEC_DUPLICATE (V4SImode,
42205 op0),
42206 CONST0_RTX (V4SImode),
42207 const1_rtx);
42208 emit_insn (gen_rtx_SET (op1, op0));
42210 /* Cast the V4SImode vector back to a vector in original mode. */
42211 op0 = gen_reg_rtx (mode);
42212 emit_move_insn (op0, gen_lowpart (mode, op1));
42214 /* Load even elements into the second position. */
42215 emit_insn (gen_load_even (op0,
42216 force_reg (inner_mode,
42217 ops [i + i + 1]),
42218 const1_rtx));
42220 /* Cast vector to FIRST_IMODE vector. */
42221 ops[i] = gen_reg_rtx (first_imode);
42222 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42225 /* Interleave low FIRST_IMODE vectors. */
42226 for (i = j = 0; i < n; i += 2, j++)
42228 op0 = gen_reg_rtx (first_imode);
42229 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42231 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42232 ops[j] = gen_reg_rtx (second_imode);
42233 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42236 /* Interleave low SECOND_IMODE vectors. */
42237 switch (second_imode)
42239 case E_V4SImode:
42240 for (i = j = 0; i < n / 2; i += 2, j++)
42242 op0 = gen_reg_rtx (second_imode);
42243 emit_insn (gen_interleave_second_low (op0, ops[i],
42244 ops[i + 1]));
42246 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42247 vector. */
42248 ops[j] = gen_reg_rtx (third_imode);
42249 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42251 second_imode = V2DImode;
42252 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42253 /* FALLTHRU */
42255 case E_V2DImode:
42256 op0 = gen_reg_rtx (second_imode);
42257 emit_insn (gen_interleave_second_low (op0, ops[0],
42258 ops[1]));
42260 /* Cast the SECOND_IMODE vector back to a vector in the original
42261 mode. */
42262 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42263 break;
42265 default:
42266 gcc_unreachable ();
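/* A sketch of the interleave strategy above for a hypothetical V16QImode
   build: each pair of input bytes is packed into the low HImode lane of a
   fresh vector, the eight results are viewed as V8HImode and merged two at
   a time with the low-interleave pattern (typically punpcklwd), those are
   merged again as V4SImode (punpckldq), and one last V2DImode interleave
   (punpcklqdq) produces the full 128-bit vector.  */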
42270 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42271 all values variable, and none identical. */
42273 static void
42274 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42275 rtx target, rtx vals)
42277 rtx ops[64], op0, op1, op2, op3, op4, op5;
42278 machine_mode half_mode = VOIDmode;
42279 machine_mode quarter_mode = VOIDmode;
42280 int n, i;
42282 switch (mode)
42284 case E_V2SFmode:
42285 case E_V2SImode:
42286 if (!mmx_ok && !TARGET_SSE)
42287 break;
42288 /* FALLTHRU */
42290 case E_V16SImode:
42291 case E_V16SFmode:
42292 case E_V8DFmode:
42293 case E_V8DImode:
42294 case E_V8SFmode:
42295 case E_V8SImode:
42296 case E_V4DFmode:
42297 case E_V4DImode:
42298 case E_V4SFmode:
42299 case E_V4SImode:
42300 case E_V2DFmode:
42301 case E_V2DImode:
42302 n = GET_MODE_NUNITS (mode);
42303 for (i = 0; i < n; i++)
42304 ops[i] = XVECEXP (vals, 0, i);
42305 ix86_expand_vector_init_concat (mode, target, ops, n);
42306 return;
42308 case E_V2TImode:
42309 for (i = 0; i < 2; i++)
42310 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42311 op0 = gen_reg_rtx (V4DImode);
42312 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42313 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42314 return;
42316 case E_V4TImode:
42317 for (i = 0; i < 4; i++)
42318 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42319 ops[4] = gen_reg_rtx (V4DImode);
42320 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42321 ops[5] = gen_reg_rtx (V4DImode);
42322 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42323 op0 = gen_reg_rtx (V8DImode);
42324 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42325 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42326 return;
42328 case E_V32QImode:
42329 half_mode = V16QImode;
42330 goto half;
42332 case E_V16HImode:
42333 half_mode = V8HImode;
42334 goto half;
42336 half:
42337 n = GET_MODE_NUNITS (mode);
42338 for (i = 0; i < n; i++)
42339 ops[i] = XVECEXP (vals, 0, i);
42340 op0 = gen_reg_rtx (half_mode);
42341 op1 = gen_reg_rtx (half_mode);
42342 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42343 n >> 2);
42344 ix86_expand_vector_init_interleave (half_mode, op1,
42345 &ops [n >> 1], n >> 2);
42346 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42347 return;
42349 case E_V64QImode:
42350 quarter_mode = V16QImode;
42351 half_mode = V32QImode;
42352 goto quarter;
42354 case E_V32HImode:
42355 quarter_mode = V8HImode;
42356 half_mode = V16HImode;
42357 goto quarter;
42359 quarter:
42360 n = GET_MODE_NUNITS (mode);
42361 for (i = 0; i < n; i++)
42362 ops[i] = XVECEXP (vals, 0, i);
42363 op0 = gen_reg_rtx (quarter_mode);
42364 op1 = gen_reg_rtx (quarter_mode);
42365 op2 = gen_reg_rtx (quarter_mode);
42366 op3 = gen_reg_rtx (quarter_mode);
42367 op4 = gen_reg_rtx (half_mode);
42368 op5 = gen_reg_rtx (half_mode);
42369 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42370 n >> 3);
42371 ix86_expand_vector_init_interleave (quarter_mode, op1,
42372 &ops [n >> 2], n >> 3);
42373 ix86_expand_vector_init_interleave (quarter_mode, op2,
42374 &ops [n >> 1], n >> 3);
42375 ix86_expand_vector_init_interleave (quarter_mode, op3,
42376 &ops [(n >> 1) | (n >> 2)], n >> 3);
42377 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42378 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42379 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42380 return;
42382 case E_V16QImode:
42383 if (!TARGET_SSE4_1)
42384 break;
42385 /* FALLTHRU */
42387 case E_V8HImode:
42388 if (!TARGET_SSE2)
42389 break;
42391 /* Don't use ix86_expand_vector_init_interleave if we can't
42392 move from GPR to SSE register directly. */
42393 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42394 break;
42396 n = GET_MODE_NUNITS (mode);
42397 for (i = 0; i < n; i++)
42398 ops[i] = XVECEXP (vals, 0, i);
42399 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42400 return;
42402 case E_V4HImode:
42403 case E_V8QImode:
42404 break;
42406 default:
42407 gcc_unreachable ();
42411 int i, j, n_elts, n_words, n_elt_per_word;
42412 machine_mode inner_mode;
42413 rtx words[4], shift;
42415 inner_mode = GET_MODE_INNER (mode);
42416 n_elts = GET_MODE_NUNITS (mode);
42417 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42418 n_elt_per_word = n_elts / n_words;
42419 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42421 for (i = 0; i < n_words; ++i)
42423 rtx word = NULL_RTX;
42425 for (j = 0; j < n_elt_per_word; ++j)
42427 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42428 elt = convert_modes (word_mode, inner_mode, elt, true);
42430 if (j == 0)
42431 word = elt;
42432 else
42434 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42435 word, 1, OPTAB_LIB_WIDEN);
42436 word = expand_simple_binop (word_mode, IOR, word, elt,
42437 word, 1, OPTAB_LIB_WIDEN);
42441 words[i] = word;
42444 if (n_words == 1)
42445 emit_move_insn (target, gen_lowpart (mode, words[0]));
42446 else if (n_words == 2)
42448 rtx tmp = gen_reg_rtx (mode);
42449 emit_clobber (tmp);
42450 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42451 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42452 emit_move_insn (target, tmp);
42454 else if (n_words == 4)
42456 rtx tmp = gen_reg_rtx (V4SImode);
42457 gcc_assert (word_mode == SImode);
42458 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42459 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42460 emit_move_insn (target, gen_lowpart (mode, tmp));
42462 else
42463 gcc_unreachable ();
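/* A small worked example of the word-packing fallback above, assuming a
   hypothetical V4HImode build with word_mode == SImode: n_words is 2 and
   n_elt_per_word is 2, so each SImode word is assembled as
   (odd_element << 16) | even_element, the two words are written into the
   low and high halves of a clobbered temporary, and the temporary is
   copied to TARGET.  */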
42467 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42468 instructions unless MMX_OK is true. */
42470 void
42471 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42473 machine_mode mode = GET_MODE (target);
42474 machine_mode inner_mode = GET_MODE_INNER (mode);
42475 int n_elts = GET_MODE_NUNITS (mode);
42476 int n_var = 0, one_var = -1;
42477 bool all_same = true, all_const_zero = true;
42478 int i;
42479 rtx x;
42481 /* First handle initialization from vector elements: VALS may contain two
half-width vectors rather than individual scalar elements. */
42482 if (n_elts != XVECLEN (vals, 0))
42484 rtx subtarget = target;
42485 x = XVECEXP (vals, 0, 0);
42486 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42487 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42489 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42490 if (inner_mode == QImode || inner_mode == HImode)
42492 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42493 mode = mode_for_vector (SImode, n_bits / 4).require ();
42494 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42495 ops[0] = gen_lowpart (inner_mode, ops[0]);
42496 ops[1] = gen_lowpart (inner_mode, ops[1]);
42497 subtarget = gen_reg_rtx (mode);
42499 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42500 if (subtarget != target)
42501 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42502 return;
42504 gcc_unreachable ();
42507 for (i = 0; i < n_elts; ++i)
42509 x = XVECEXP (vals, 0, i);
42510 if (!(CONST_SCALAR_INT_P (x)
42511 || CONST_DOUBLE_P (x)
42512 || CONST_FIXED_P (x)))
42513 n_var++, one_var = i;
42514 else if (x != CONST0_RTX (inner_mode))
42515 all_const_zero = false;
42516 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42517 all_same = false;
42520 /* Constants are best loaded from the constant pool. */
42521 if (n_var == 0)
42523 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42524 return;
42527 /* If all values are identical, broadcast the value. */
42528 if (all_same
42529 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42530 XVECEXP (vals, 0, 0)))
42531 return;
42533 /* Values where only one field is non-constant are best loaded from
42534 the pool and overwritten via move later. */
42535 if (n_var == 1)
42537 if (all_const_zero
42538 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42539 XVECEXP (vals, 0, one_var),
42540 one_var))
42541 return;
42543 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42544 return;
42547 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
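/* In rough order of preference, the expansion above tries a constant-pool
   load, a broadcast of a single repeated value, a one-element overwrite of
   an otherwise constant vector, and finally the fully general build.  A
   typical caller is a vec_init expander doing something like
     ix86_expand_vector_init (false, operands[0], operands[1]);
   (sketch only; the exact operand names depend on the pattern).  */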
42550 void
42551 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42553 machine_mode mode = GET_MODE (target);
42554 machine_mode inner_mode = GET_MODE_INNER (mode);
42555 machine_mode half_mode;
42556 bool use_vec_merge = false;
42557 rtx tmp;
42558 static rtx (*gen_extract[6][2]) (rtx, rtx)
42560 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42561 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42562 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42563 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42564 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42565 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42567 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42569 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42570 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42571 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42572 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42573 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42574 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42576 int i, j, n;
42577 machine_mode mmode = VOIDmode;
42578 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42580 switch (mode)
42582 case E_V2SFmode:
42583 case E_V2SImode:
42584 if (mmx_ok)
42586 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42587 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42588 if (elt == 0)
42589 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42590 else
42591 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42592 emit_insn (gen_rtx_SET (target, tmp));
42593 return;
42595 break;
42597 case E_V2DImode:
42598 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42599 if (use_vec_merge)
42600 break;
42602 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42603 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42604 if (elt == 0)
42605 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42606 else
42607 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42608 emit_insn (gen_rtx_SET (target, tmp));
42609 return;
42611 case E_V2DFmode:
42613 rtx op0, op1;
42615 /* For the two element vectors, we implement a VEC_CONCAT with
42616 the extraction of the other element. */
42618 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42619 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42621 if (elt == 0)
42622 op0 = val, op1 = tmp;
42623 else
42624 op0 = tmp, op1 = val;
42626 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42627 emit_insn (gen_rtx_SET (target, tmp));
42629 return;
42631 case E_V4SFmode:
42632 use_vec_merge = TARGET_SSE4_1;
42633 if (use_vec_merge)
42634 break;
42636 switch (elt)
42638 case 0:
42639 use_vec_merge = true;
42640 break;
42642 case 1:
42643 /* tmp = target = A B C D */
42644 tmp = copy_to_reg (target);
42645 /* target = A A B B */
42646 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42647 /* target = X A B B */
42648 ix86_expand_vector_set (false, target, val, 0);
42649 /* target = A X C D */
42650 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42651 const1_rtx, const0_rtx,
42652 GEN_INT (2+4), GEN_INT (3+4)));
42653 return;
42655 case 2:
42656 /* tmp = target = A B C D */
42657 tmp = copy_to_reg (target);
42658 /* tmp = X B C D */
42659 ix86_expand_vector_set (false, tmp, val, 0);
42660 /* target = A B X D */
42661 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42662 const0_rtx, const1_rtx,
42663 GEN_INT (0+4), GEN_INT (3+4)));
42664 return;
42666 case 3:
42667 /* tmp = target = A B C D */
42668 tmp = copy_to_reg (target);
42669 /* tmp = X B C D */
42670 ix86_expand_vector_set (false, tmp, val, 0);
42671 /* target = A B C X */
42672 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42673 const0_rtx, const1_rtx,
42674 GEN_INT (2+4), GEN_INT (0+4)));
42675 return;
42677 default:
42678 gcc_unreachable ();
42680 break;
42682 case E_V4SImode:
42683 use_vec_merge = TARGET_SSE4_1;
42684 if (use_vec_merge)
42685 break;
42687 /* Element 0 handled by vec_merge below. */
42688 if (elt == 0)
42690 use_vec_merge = true;
42691 break;
42694 if (TARGET_SSE2)
42696 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42697 store into element 0, then shuffle them back. */
42699 rtx order[4];
42701 order[0] = GEN_INT (elt);
42702 order[1] = const1_rtx;
42703 order[2] = const2_rtx;
42704 order[3] = GEN_INT (3);
42705 order[elt] = const0_rtx;
42707 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42708 order[1], order[2], order[3]));
42710 ix86_expand_vector_set (false, target, val, 0);
42712 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42713 order[1], order[2], order[3]));
42715 else
42717 /* For SSE1, we have to reuse the V4SF code. */
42718 rtx t = gen_reg_rtx (V4SFmode);
42719 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42720 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42721 emit_move_insn (target, gen_lowpart (mode, t));
42723 return;
42725 case E_V8HImode:
42726 use_vec_merge = TARGET_SSE2;
42727 break;
42728 case E_V4HImode:
42729 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42730 break;
42732 case E_V16QImode:
42733 use_vec_merge = TARGET_SSE4_1;
42734 break;
42736 case E_V8QImode:
42737 break;
42739 case E_V32QImode:
42740 half_mode = V16QImode;
42741 j = 0;
42742 n = 16;
42743 goto half;
42745 case E_V16HImode:
42746 half_mode = V8HImode;
42747 j = 1;
42748 n = 8;
42749 goto half;
42751 case E_V8SImode:
42752 half_mode = V4SImode;
42753 j = 2;
42754 n = 4;
42755 goto half;
42757 case E_V4DImode:
42758 half_mode = V2DImode;
42759 j = 3;
42760 n = 2;
42761 goto half;
42763 case E_V8SFmode:
42764 half_mode = V4SFmode;
42765 j = 4;
42766 n = 4;
42767 goto half;
42769 case E_V4DFmode:
42770 half_mode = V2DFmode;
42771 j = 5;
42772 n = 2;
42773 goto half;
42775 half:
42776 /* Compute offset. */
42777 i = elt / n;
42778 elt %= n;
42780 gcc_assert (i <= 1);
42782 /* Extract the half. */
42783 tmp = gen_reg_rtx (half_mode);
42784 emit_insn (gen_extract[j][i] (tmp, target));
42786 /* Put val in tmp at elt. */
42787 ix86_expand_vector_set (false, tmp, val, elt);
42789 /* Put it back. */
42790 emit_insn (gen_insert[j][i] (target, target, tmp));
42791 return;
42793 case E_V8DFmode:
42794 if (TARGET_AVX512F)
42796 mmode = QImode;
42797 gen_blendm = gen_avx512f_blendmv8df;
42799 break;
42801 case E_V8DImode:
42802 if (TARGET_AVX512F)
42804 mmode = QImode;
42805 gen_blendm = gen_avx512f_blendmv8di;
42807 break;
42809 case E_V16SFmode:
42810 if (TARGET_AVX512F)
42812 mmode = HImode;
42813 gen_blendm = gen_avx512f_blendmv16sf;
42815 break;
42817 case E_V16SImode:
42818 if (TARGET_AVX512F)
42820 mmode = HImode;
42821 gen_blendm = gen_avx512f_blendmv16si;
42823 break;
42825 case E_V32HImode:
42826 if (TARGET_AVX512F && TARGET_AVX512BW)
42828 mmode = SImode;
42829 gen_blendm = gen_avx512bw_blendmv32hi;
42831 break;
42833 case E_V64QImode:
42834 if (TARGET_AVX512F && TARGET_AVX512BW)
42836 mmode = DImode;
42837 gen_blendm = gen_avx512bw_blendmv64qi;
42839 break;
42841 default:
42842 break;
42845 if (mmode != VOIDmode)
42847 tmp = gen_reg_rtx (mode);
42848 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42849 /* The avx512*_blendm<mode> expanders have a different operand order
42850 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42851 elements where the mask is set and the second input operand otherwise;
42852 in {sse,avx}*_*blend* the first input operand is used for elements
42853 where the mask is clear and the second input operand otherwise. */
42854 emit_insn (gen_blendm (target, target, tmp,
42855 force_reg (mmode,
42856 gen_int_mode (1 << elt, mmode))));
42858 else if (use_vec_merge)
42860 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42861 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42862 emit_insn (gen_rtx_SET (target, tmp));
42864 else
42866 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42868 emit_move_insn (mem, target);
42870 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42871 emit_move_insn (tmp, val);
42873 emit_move_insn (target, mem);
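/* An illustrative case for the AVX-512 blend path above, with hypothetical
   operands: setting element 5 of a V16SFmode vector first broadcasts VAL
   into a temporary via VEC_DUPLICATE, then blends that temporary into
   TARGET under the HImode mask (1 << 5) == 0x0020, so only lane 5 is
   replaced.  */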
42877 void
42878 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42880 machine_mode mode = GET_MODE (vec);
42881 machine_mode inner_mode = GET_MODE_INNER (mode);
42882 bool use_vec_extr = false;
42883 rtx tmp;
42885 switch (mode)
42887 case E_V2SImode:
42888 case E_V2SFmode:
42889 if (!mmx_ok)
42890 break;
42891 /* FALLTHRU */
42893 case E_V2DFmode:
42894 case E_V2DImode:
42895 case E_V2TImode:
42896 case E_V4TImode:
42897 use_vec_extr = true;
42898 break;
42900 case E_V4SFmode:
42901 use_vec_extr = TARGET_SSE4_1;
42902 if (use_vec_extr)
42903 break;
42905 switch (elt)
42907 case 0:
42908 tmp = vec;
42909 break;
42911 case 1:
42912 case 3:
42913 tmp = gen_reg_rtx (mode);
42914 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42915 GEN_INT (elt), GEN_INT (elt),
42916 GEN_INT (elt+4), GEN_INT (elt+4)));
42917 break;
42919 case 2:
42920 tmp = gen_reg_rtx (mode);
42921 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42922 break;
42924 default:
42925 gcc_unreachable ();
42927 vec = tmp;
42928 use_vec_extr = true;
42929 elt = 0;
42930 break;
42932 case E_V4SImode:
42933 use_vec_extr = TARGET_SSE4_1;
42934 if (use_vec_extr)
42935 break;
42937 if (TARGET_SSE2)
42939 switch (elt)
42941 case 0:
42942 tmp = vec;
42943 break;
42945 case 1:
42946 case 3:
42947 tmp = gen_reg_rtx (mode);
42948 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42949 GEN_INT (elt), GEN_INT (elt),
42950 GEN_INT (elt), GEN_INT (elt)));
42951 break;
42953 case 2:
42954 tmp = gen_reg_rtx (mode);
42955 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42956 break;
42958 default:
42959 gcc_unreachable ();
42961 vec = tmp;
42962 use_vec_extr = true;
42963 elt = 0;
42965 else
42967 /* For SSE1, we have to reuse the V4SF code. */
42968 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42969 gen_lowpart (V4SFmode, vec), elt);
42970 return;
42972 break;
42974 case E_V8HImode:
42975 use_vec_extr = TARGET_SSE2;
42976 break;
42977 case E_V4HImode:
42978 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42979 break;
42981 case E_V16QImode:
42982 use_vec_extr = TARGET_SSE4_1;
42983 break;
42985 case E_V8SFmode:
42986 if (TARGET_AVX)
42988 tmp = gen_reg_rtx (V4SFmode);
42989 if (elt < 4)
42990 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42991 else
42992 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42993 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42994 return;
42996 break;
42998 case E_V4DFmode:
42999 if (TARGET_AVX)
43001 tmp = gen_reg_rtx (V2DFmode);
43002 if (elt < 2)
43003 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43004 else
43005 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43006 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43007 return;
43009 break;
43011 case E_V32QImode:
43012 if (TARGET_AVX)
43014 tmp = gen_reg_rtx (V16QImode);
43015 if (elt < 16)
43016 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43017 else
43018 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43019 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43020 return;
43022 break;
43024 case E_V16HImode:
43025 if (TARGET_AVX)
43027 tmp = gen_reg_rtx (V8HImode);
43028 if (elt < 8)
43029 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43030 else
43031 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43032 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43033 return;
43035 break;
43037 case E_V8SImode:
43038 if (TARGET_AVX)
43040 tmp = gen_reg_rtx (V4SImode);
43041 if (elt < 4)
43042 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43043 else
43044 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43045 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43046 return;
43048 break;
43050 case E_V4DImode:
43051 if (TARGET_AVX)
43053 tmp = gen_reg_rtx (V2DImode);
43054 if (elt < 2)
43055 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43056 else
43057 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43058 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43059 return;
43061 break;
43063 case E_V32HImode:
43064 if (TARGET_AVX512BW)
43066 tmp = gen_reg_rtx (V16HImode);
43067 if (elt < 16)
43068 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43069 else
43070 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43071 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43072 return;
43074 break;
43076 case E_V64QImode:
43077 if (TARGET_AVX512BW)
43079 tmp = gen_reg_rtx (V32QImode);
43080 if (elt < 32)
43081 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43082 else
43083 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43084 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43085 return;
43087 break;
43089 case E_V16SFmode:
43090 tmp = gen_reg_rtx (V8SFmode);
43091 if (elt < 8)
43092 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43093 else
43094 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43095 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43096 return;
43098 case E_V8DFmode:
43099 tmp = gen_reg_rtx (V4DFmode);
43100 if (elt < 4)
43101 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43102 else
43103 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43104 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43105 return;
43107 case E_V16SImode:
43108 tmp = gen_reg_rtx (V8SImode);
43109 if (elt < 8)
43110 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43111 else
43112 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43113 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43114 return;
43116 case E_V8DImode:
43117 tmp = gen_reg_rtx (V4DImode);
43118 if (elt < 4)
43119 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43120 else
43121 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43122 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43123 return;
43125 case E_V8QImode:
43126 /* ??? Could extract the appropriate HImode element and shift. */
43127 default:
43128 break;
43131 if (use_vec_extr)
43133 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43134 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43136 /* Let the rtl optimizers know about the zero extension performed. */
43137 if (inner_mode == QImode || inner_mode == HImode)
43139 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43140 target = gen_lowpart (SImode, target);
43143 emit_insn (gen_rtx_SET (target, tmp));
43145 else
43147 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43149 emit_move_insn (mem, vec);
43151 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43152 emit_move_insn (target, tmp);
43156 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43157 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43158 The upper bits of DEST are undefined, though they shouldn't cause
43159 exceptions (some bits from src or all zeros are ok). */
43161 static void
43162 emit_reduc_half (rtx dest, rtx src, int i)
43164 rtx tem, d = dest;
43165 switch (GET_MODE (src))
43167 case E_V4SFmode:
43168 if (i == 128)
43169 tem = gen_sse_movhlps (dest, src, src);
43170 else
43171 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43172 GEN_INT (1 + 4), GEN_INT (1 + 4));
43173 break;
43174 case E_V2DFmode:
43175 tem = gen_vec_interleave_highv2df (dest, src, src);
43176 break;
43177 case E_V16QImode:
43178 case E_V8HImode:
43179 case E_V4SImode:
43180 case E_V2DImode:
43181 d = gen_reg_rtx (V1TImode);
43182 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43183 GEN_INT (i / 2));
43184 break;
43185 case E_V8SFmode:
43186 if (i == 256)
43187 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43188 else
43189 tem = gen_avx_shufps256 (dest, src, src,
43190 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43191 break;
43192 case E_V4DFmode:
43193 if (i == 256)
43194 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43195 else
43196 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43197 break;
43198 case E_V32QImode:
43199 case E_V16HImode:
43200 case E_V8SImode:
43201 case E_V4DImode:
43202 if (i == 256)
43204 if (GET_MODE (dest) != V4DImode)
43205 d = gen_reg_rtx (V4DImode);
43206 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43207 gen_lowpart (V4DImode, src),
43208 const1_rtx);
43210 else
43212 d = gen_reg_rtx (V2TImode);
43213 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43214 GEN_INT (i / 2));
43216 break;
43217 case E_V64QImode:
43218 case E_V32HImode:
43219 case E_V16SImode:
43220 case E_V16SFmode:
43221 case E_V8DImode:
43222 case E_V8DFmode:
43223 if (i > 128)
43224 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43225 gen_lowpart (V16SImode, src),
43226 gen_lowpart (V16SImode, src),
43227 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43228 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43229 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43230 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43231 GEN_INT (0xC), GEN_INT (0xD),
43232 GEN_INT (0xE), GEN_INT (0xF),
43233 GEN_INT (0x10), GEN_INT (0x11),
43234 GEN_INT (0x12), GEN_INT (0x13),
43235 GEN_INT (0x14), GEN_INT (0x15),
43236 GEN_INT (0x16), GEN_INT (0x17));
43237 else
43238 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43239 gen_lowpart (V16SImode, src),
43240 GEN_INT (i == 128 ? 0x2 : 0x1),
43241 GEN_INT (0x3),
43242 GEN_INT (0x3),
43243 GEN_INT (0x3),
43244 GEN_INT (i == 128 ? 0x6 : 0x5),
43245 GEN_INT (0x7),
43246 GEN_INT (0x7),
43247 GEN_INT (0x7),
43248 GEN_INT (i == 128 ? 0xA : 0x9),
43249 GEN_INT (0xB),
43250 GEN_INT (0xB),
43251 GEN_INT (0xB),
43252 GEN_INT (i == 128 ? 0xE : 0xD),
43253 GEN_INT (0xF),
43254 GEN_INT (0xF),
43255 GEN_INT (0xF));
43256 break;
43257 default:
43258 gcc_unreachable ();
43260 emit_insn (tem);
43261 if (d != dest)
43262 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43265 /* Expand a vector reduction. FN is the binary pattern to reduce;
43266 DEST is the destination; IN is the input vector. */
43268 void
43269 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43271 rtx half, dst, vec = in;
43272 machine_mode mode = GET_MODE (in);
43273 int i;
43275 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43276 if (TARGET_SSE4_1
43277 && mode == V8HImode
43278 && fn == gen_uminv8hi3)
43280 emit_insn (gen_sse4_1_phminposuw (dest, in));
43281 return;
43284 for (i = GET_MODE_BITSIZE (mode);
43285 i > GET_MODE_UNIT_BITSIZE (mode);
43286 i >>= 1)
43288 half = gen_reg_rtx (mode);
43289 emit_reduc_half (half, vec, i);
43290 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43291 dst = dest;
43292 else
43293 dst = gen_reg_rtx (mode);
43294 emit_insn (fn (dst, half, vec));
43295 vec = dst;
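/* A worked example of the halving loop above, for a hypothetical V8HImode
   minimum reduction without SSE4.1: GET_MODE_BITSIZE is 128, so
   emit_reduc_half runs with i = 128, 64 and 32, each step moving the upper
   half of the still-live bits down and combining with FN; after the last
   step lane 0 of DEST holds the reduced value, which the caller typically
   extracts with a vec_extract.  */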
43299 /* Target hook for scalar_mode_supported_p. */
43300 static bool
43301 ix86_scalar_mode_supported_p (scalar_mode mode)
43303 if (DECIMAL_FLOAT_MODE_P (mode))
43304 return default_decimal_float_supported_p ();
43305 else if (mode == TFmode)
43306 return true;
43307 else
43308 return default_scalar_mode_supported_p (mode);
43311 /* Implements target hook vector_mode_supported_p. */
43312 static bool
43313 ix86_vector_mode_supported_p (machine_mode mode)
43315 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43316 return true;
43317 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43318 return true;
43319 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43320 return true;
43321 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43322 return true;
43323 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43324 return true;
43325 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43326 return true;
43327 return false;
43330 /* Target hook for c_mode_for_suffix. */
43331 static machine_mode
43332 ix86_c_mode_for_suffix (char suffix)
43334 if (suffix == 'q')
43335 return TFmode;
43336 if (suffix == 'w')
43337 return XFmode;
43339 return VOIDmode;
43342 /* Worker function for TARGET_MD_ASM_ADJUST.
43344 We implement asm flag outputs, and maintain source compatibility
43345 with the old cc0-based compiler. */
43347 static rtx_insn *
43348 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43349 vec<const char *> &constraints,
43350 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43352 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43353 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43355 bool saw_asm_flag = false;
43357 start_sequence ();
43358 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43360 const char *con = constraints[i];
43361 if (strncmp (con, "=@cc", 4) != 0)
43362 continue;
43363 con += 4;
43364 if (strchr (con, ',') != NULL)
43366 error ("alternatives not allowed in asm flag output");
43367 continue;
43370 bool invert = false;
43371 if (con[0] == 'n')
43372 invert = true, con++;
43374 machine_mode mode = CCmode;
43375 rtx_code code = UNKNOWN;
43377 switch (con[0])
43379 case 'a':
43380 if (con[1] == 0)
43381 mode = CCAmode, code = EQ;
43382 else if (con[1] == 'e' && con[2] == 0)
43383 mode = CCCmode, code = NE;
43384 break;
43385 case 'b':
43386 if (con[1] == 0)
43387 mode = CCCmode, code = EQ;
43388 else if (con[1] == 'e' && con[2] == 0)
43389 mode = CCAmode, code = NE;
43390 break;
43391 case 'c':
43392 if (con[1] == 0)
43393 mode = CCCmode, code = EQ;
43394 break;
43395 case 'e':
43396 if (con[1] == 0)
43397 mode = CCZmode, code = EQ;
43398 break;
43399 case 'g':
43400 if (con[1] == 0)
43401 mode = CCGCmode, code = GT;
43402 else if (con[1] == 'e' && con[2] == 0)
43403 mode = CCGCmode, code = GE;
43404 break;
43405 case 'l':
43406 if (con[1] == 0)
43407 mode = CCGCmode, code = LT;
43408 else if (con[1] == 'e' && con[2] == 0)
43409 mode = CCGCmode, code = LE;
43410 break;
43411 case 'o':
43412 if (con[1] == 0)
43413 mode = CCOmode, code = EQ;
43414 break;
43415 case 'p':
43416 if (con[1] == 0)
43417 mode = CCPmode, code = EQ;
43418 break;
43419 case 's':
43420 if (con[1] == 0)
43421 mode = CCSmode, code = EQ;
43422 break;
43423 case 'z':
43424 if (con[1] == 0)
43425 mode = CCZmode, code = EQ;
43426 break;
43428 if (code == UNKNOWN)
43430 error ("unknown asm flag output %qs", constraints[i]);
43431 continue;
43433 if (invert)
43434 code = reverse_condition (code);
43436 rtx dest = outputs[i];
43437 if (!saw_asm_flag)
43439 /* This is the first asm flag output. Here we put the flags
43440 register in as the real output and adjust the condition to
43441 allow it. */
43442 constraints[i] = "=Bf";
43443 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43444 saw_asm_flag = true;
43446 else
43448 /* We don't need the flags register as output twice. */
43449 constraints[i] = "=X";
43450 outputs[i] = gen_rtx_SCRATCH (SImode);
43453 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43454 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43456 machine_mode dest_mode = GET_MODE (dest);
43457 if (!SCALAR_INT_MODE_P (dest_mode))
43459 error ("invalid type for asm flag output");
43460 continue;
43463 if (dest_mode == DImode && !TARGET_64BIT)
43464 dest_mode = SImode;
43466 if (dest_mode != QImode)
43468 rtx destqi = gen_reg_rtx (QImode);
43469 emit_insn (gen_rtx_SET (destqi, x));
43471 if (TARGET_ZERO_EXTEND_WITH_AND
43472 && optimize_function_for_speed_p (cfun))
43474 x = force_reg (dest_mode, const0_rtx);
43476 emit_insn (gen_movstrictqi
43477 (gen_lowpart (QImode, x), destqi));
43479 else
43480 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43483 if (dest_mode != GET_MODE (dest))
43485 rtx tmp = gen_reg_rtx (SImode);
43487 emit_insn (gen_rtx_SET (tmp, x));
43488 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43490 else
43491 emit_insn (gen_rtx_SET (dest, x));
43493 rtx_insn *seq = get_insns ();
43494 end_sequence ();
43496 if (saw_asm_flag)
43497 return seq;
43498 else
43500 /* If we had no asm flag outputs, clobber the flags. */
43501 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43502 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43503 return NULL;
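/* A hypothetical usage sketch for the flag-output handling above: a user
   might write

     int c;
     asm ("addl %2, %1" : "=@ccc" (c), "+r" (a) : "r" (b));

   where "=@ccc" requests the carry flag.  The code above rewrites the
   first such output into a "=Bf" flags-register output and emits a QImode
   setcc (zero-extended as needed) into the user's variable.  */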
43507 /* Implements target vector targetm.asm.encode_section_info. */
43509 static void ATTRIBUTE_UNUSED
43510 ix86_encode_section_info (tree decl, rtx rtl, int first)
43512 default_encode_section_info (decl, rtl, first);
43514 if (ix86_in_large_data_p (decl))
43515 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43518 /* Worker function for REVERSE_CONDITION. */
43520 enum rtx_code
43521 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43523 return (mode == CCFPmode
43524 ? reverse_condition_maybe_unordered (code)
43525 : reverse_condition (code));
43528 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43529 to OPERANDS[0]. */
43531 const char *
43532 output_387_reg_move (rtx_insn *insn, rtx *operands)
43534 if (REG_P (operands[0]))
43536 if (REG_P (operands[1])
43537 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43539 if (REGNO (operands[0]) == FIRST_STACK_REG)
43540 return output_387_ffreep (operands, 0);
43541 return "fstp\t%y0";
43543 if (STACK_TOP_P (operands[0]))
43544 return "fld%Z1\t%y1";
43545 return "fst\t%y0";
43547 else if (MEM_P (operands[0]))
43549 gcc_assert (REG_P (operands[1]));
43550 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43551 return "fstp%Z0\t%y0";
43552 else
43554 /* There is no non-popping store to memory for XFmode.
43555 So if we need one, follow the store with a load. */
43556 if (GET_MODE (operands[0]) == XFmode)
43557 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43558 else
43559 return "fst%Z0\t%y0";
43562 else
43563 gcc_unreachable();
43566 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43567 FP status register is set. */
43569 void
43570 ix86_emit_fp_unordered_jump (rtx label)
43572 rtx reg = gen_reg_rtx (HImode);
43573 rtx temp;
43575 emit_insn (gen_x86_fnstsw_1 (reg));
43577 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43579 emit_insn (gen_x86_sahf_1 (reg));
43581 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43582 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43584 else
43586 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43588 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43589 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43592 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43593 gen_rtx_LABEL_REF (VOIDmode, label),
43594 pc_rtx);
43595 temp = gen_rtx_SET (pc_rtx, temp);
43597 emit_jump_insn (temp);
43598 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43601 /* Output code to perform a log1p XFmode calculation. */
43603 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43605 rtx_code_label *label1 = gen_label_rtx ();
43606 rtx_code_label *label2 = gen_label_rtx ();
43608 rtx tmp = gen_reg_rtx (XFmode);
43609 rtx tmp2 = gen_reg_rtx (XFmode);
43610 rtx test;
43612 emit_insn (gen_absxf2 (tmp, op1));
43613 test = gen_rtx_GE (VOIDmode, tmp,
43614 const_double_from_real_value (
43615 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43616 XFmode));
43617 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43619 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43620 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43621 emit_jump (label2);
43623 emit_label (label1);
43624 emit_move_insn (tmp, CONST1_RTX (XFmode));
43625 emit_insn (gen_addxf3 (tmp, op1, tmp));
43626 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43627 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43629 emit_label (label2);
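/* The threshold above appears to be 1 - sqrt(2)/2 ~= 0.29289...: the x87
   fyl2xp1 instruction is only specified for arguments in that range, so
   for larger |op1| the code instead computes log(1 + op1) via fyl2x on the
   explicit sum 1 + op1, where the cancellation that log1p exists to avoid
   is no longer a concern.  */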
43632 /* Output code to round OP1 to the nearest integer (halfway cases away
from zero) and store the result in OP0, using x87 instructions. */
43633 void ix86_emit_i387_round (rtx op0, rtx op1)
43635 machine_mode inmode = GET_MODE (op1);
43636 machine_mode outmode = GET_MODE (op0);
43637 rtx e1, e2, res, tmp, tmp1, half;
43638 rtx scratch = gen_reg_rtx (HImode);
43639 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43640 rtx_code_label *jump_label = gen_label_rtx ();
43641 rtx insn;
43642 rtx (*gen_abs) (rtx, rtx);
43643 rtx (*gen_neg) (rtx, rtx);
43645 switch (inmode)
43647 case E_SFmode:
43648 gen_abs = gen_abssf2;
43649 break;
43650 case E_DFmode:
43651 gen_abs = gen_absdf2;
43652 break;
43653 case E_XFmode:
43654 gen_abs = gen_absxf2;
43655 break;
43656 default:
43657 gcc_unreachable ();
43660 switch (outmode)
43662 case E_SFmode:
43663 gen_neg = gen_negsf2;
43664 break;
43665 case E_DFmode:
43666 gen_neg = gen_negdf2;
43667 break;
43668 case E_XFmode:
43669 gen_neg = gen_negxf2;
43670 break;
43671 case E_HImode:
43672 gen_neg = gen_neghi2;
43673 break;
43674 case E_SImode:
43675 gen_neg = gen_negsi2;
43676 break;
43677 case E_DImode:
43678 gen_neg = gen_negdi2;
43679 break;
43680 default:
43681 gcc_unreachable ();
43684 e1 = gen_reg_rtx (inmode);
43685 e2 = gen_reg_rtx (inmode);
43686 res = gen_reg_rtx (outmode);
43688 half = const_double_from_real_value (dconsthalf, inmode);
43690 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
43692 /* scratch = fxam(op1) */
43693 emit_insn (gen_rtx_SET (scratch,
43694 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43695 UNSPEC_FXAM)));
43696 /* e1 = fabs(op1) */
43697 emit_insn (gen_abs (e1, op1));
43699 /* e2 = e1 + 0.5 */
43700 half = force_reg (inmode, half);
43701 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43703 /* res = floor(e2) */
43704 if (inmode != XFmode)
43706 tmp1 = gen_reg_rtx (XFmode);
43708 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43710 else
43711 tmp1 = e2;
43713 switch (outmode)
43715 case E_SFmode:
43716 case E_DFmode:
43718 rtx tmp0 = gen_reg_rtx (XFmode);
43720 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43722 emit_insn (gen_rtx_SET (res,
43723 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43724 UNSPEC_TRUNC_NOOP)));
43726 break;
43727 case E_XFmode:
43728 emit_insn (gen_frndintxf2_floor (res, tmp1));
43729 break;
43730 case E_HImode:
43731 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43732 break;
43733 case E_SImode:
43734 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43735 break;
43736 case E_DImode:
43737 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43738 break;
43739 default:
43740 gcc_unreachable ();
43743 /* flags = signbit(a) */
43744 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43746 /* if (flags) then res = -res */
43747 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43748 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43749 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43750 pc_rtx);
43751 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43752 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43753 JUMP_LABEL (insn) = jump_label;
43755 emit_insn (gen_neg (res, res));
43757 emit_label (jump_label);
43758 LABEL_NUSES (jump_label) = 1;
43760 emit_move_insn (op0, res);
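/* A small illustration of the scheme above (round halfway cases away from
   zero), with a hypothetical input: for op1 == -2.5, fxam records the sign
   in C1, fabs gives 2.5, adding 0.5 and taking floor yields 3, and because
   the sign bit was set the result is negated to -3.  */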
43763 /* Output code to perform a Newton-Raphson approximation of a single precision
43764 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43766 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43768 rtx x0, x1, e0, e1;
43770 x0 = gen_reg_rtx (mode);
43771 e0 = gen_reg_rtx (mode);
43772 e1 = gen_reg_rtx (mode);
43773 x1 = gen_reg_rtx (mode);
43775 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
43777 b = force_reg (mode, b);
43779 /* x0 = rcp(b) estimate */
43780 if (mode == V16SFmode || mode == V8DFmode)
43782 if (TARGET_AVX512ER)
43784 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43785 UNSPEC_RCP28)));
43786 /* res = a * x0 */
43787 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43788 return;
43790 else
43791 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43792 UNSPEC_RCP14)));
43794 else
43795 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43796 UNSPEC_RCP)));
43798 /* e0 = x0 * b */
43799 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43801 /* e0 = x0 * e0 */
43802 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43804 /* e1 = x0 + x0 */
43805 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43807 /* x1 = e1 - e0 */
43808 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43810 /* res = a * x1 */
43811 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
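/* The sequence above is one Newton-Raphson refinement of the reciprocal:
   with x0 ~= 1/b from the rcp/rcp14 estimate, the improved value is
   x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, computed here as e1 - e0 with
   e1 = x0 + x0 and e0 = x0 * (x0 * b); the quotient is then approximated
   as a * x1.  (The rcp28 path returns early, its estimate being treated
   as accurate enough to skip the refinement.)  */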
43814 /* Output code to perform a Newton-Raphson approximation of a
43815 single precision floating point [reciprocal] square root. */
43817 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43819 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43820 REAL_VALUE_TYPE r;
43821 int unspec;
43823 x0 = gen_reg_rtx (mode);
43824 e0 = gen_reg_rtx (mode);
43825 e1 = gen_reg_rtx (mode);
43826 e2 = gen_reg_rtx (mode);
43827 e3 = gen_reg_rtx (mode);
43829 if (TARGET_AVX512ER && mode == V16SFmode)
43831 if (recip)
43832 /* res = rsqrt28(a) estimate */
43833 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43834 UNSPEC_RSQRT28)));
43835 else
43837 /* x0 = rsqrt28(a) estimate */
43838 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43839 UNSPEC_RSQRT28)));
43840 /* res = rcp28(x0) estimate */
43841 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43842 UNSPEC_RCP28)));
43844 return;
43847 real_from_integer (&r, VOIDmode, -3, SIGNED);
43848 mthree = const_double_from_real_value (r, SFmode);
43850 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43851 mhalf = const_double_from_real_value (r, SFmode);
43852 unspec = UNSPEC_RSQRT;
43854 if (VECTOR_MODE_P (mode))
43856 mthree = ix86_build_const_vector (mode, true, mthree);
43857 mhalf = ix86_build_const_vector (mode, true, mhalf);
43858 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43859 if (GET_MODE_SIZE (mode) == 64)
43860 unspec = UNSPEC_RSQRT14;
43863 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43864 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
43866 a = force_reg (mode, a);
43868 /* x0 = rsqrt(a) estimate */
43869 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43870 unspec)));
43872 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN result for sqrt(0.0). */
43873 if (!recip)
43875 rtx zero = force_reg (mode, CONST0_RTX(mode));
43876 rtx mask;
43878 /* Handle masked compare. */
43879 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43881 mask = gen_reg_rtx (HImode);
43882 /* Imm value 0x4 corresponds to not-equal comparison. */
43883 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43884 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43886 else
43888 mask = gen_reg_rtx (mode);
43889 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43890 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43894 /* e0 = x0 * a */
43895 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43896 /* e1 = e0 * x0 */
43897 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43899 /* e2 = e1 - 3. */
43900 mthree = force_reg (mode, mthree);
43901 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43903 mhalf = force_reg (mode, mhalf);
43904 if (recip)
43905 /* e3 = -.5 * x0 */
43906 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43907 else
43908 /* e3 = -.5 * e0 */
43909 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43910 /* ret = e2 * e3 */
43911 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
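/* Likewise, the tail of this function is one Newton-Raphson step for the
   reciprocal square root: with y0 ~= 1/sqrt(a), the refined value is
   y1 = y0 * (3 - a*y0*y0) / 2 = -0.5 * y0 * (a*y0*y0 - 3), which is the
   e2 * e3 product above; multiplying by a first (e3 based on e0 = a * y0)
   turns the same identity into an approximation of sqrt(a) instead.  */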
43914 #ifdef TARGET_SOLARIS
43915 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43917 static void
43918 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43919 tree decl)
43921 /* With Binutils 2.15, the "@unwind" marker must be specified on
43922 every occurrence of the ".eh_frame" section, not just the first
43923 one. */
43924 if (TARGET_64BIT
43925 && strcmp (name, ".eh_frame") == 0)
43927 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43928 flags & SECTION_WRITE ? "aw" : "a");
43929 return;
43932 #ifndef USE_GAS
43933 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43935 solaris_elf_asm_comdat_section (name, flags, decl);
43936 return;
43938 #endif
43940 default_elf_asm_named_section (name, flags, decl);
43942 #endif /* TARGET_SOLARIS */
43944 /* Return the mangling of TYPE if it is an extended fundamental type. */
43946 static const char *
43947 ix86_mangle_type (const_tree type)
43949 type = TYPE_MAIN_VARIANT (type);
43951 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43952 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43953 return NULL;
43955 switch (TYPE_MODE (type))
43957 case E_TFmode:
43958 /* __float128 is "g". */
43959 return "g";
43960 case E_XFmode:
43961 /* "long double" or __float80 is "e". */
43962 return "e";
43963 default:
43964 return NULL;
43968 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43970 static tree
43971 ix86_stack_protect_guard (void)
43973 if (TARGET_SSP_TLS_GUARD)
43975 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43976 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43977 tree type = build_qualified_type (type_node, qual);
43978 tree t;
43980 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43982 t = ix86_tls_stack_chk_guard_decl;
43984 if (t == NULL)
43986 rtx x;
43988 t = build_decl
43989 (UNKNOWN_LOCATION, VAR_DECL,
43990 get_identifier (ix86_stack_protector_guard_symbol_str),
43991 type);
43992 TREE_STATIC (t) = 1;
43993 TREE_PUBLIC (t) = 1;
43994 DECL_EXTERNAL (t) = 1;
43995 TREE_USED (t) = 1;
43996 TREE_THIS_VOLATILE (t) = 1;
43997 DECL_ARTIFICIAL (t) = 1;
43998 DECL_IGNORED_P (t) = 1;
44000 /* Do not share RTL as the declaration is visible outside of
44001 current function. */
44002 x = DECL_RTL (t);
44003 RTX_FLAG (x, used) = 1;
44005 ix86_tls_stack_chk_guard_decl = t;
44008 else
44010 tree asptrtype = build_pointer_type (type);
44012 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44013 t = build2 (MEM_REF, asptrtype, t,
44014 build_int_cst (asptrtype, 0));
44017 return t;
44020 return default_stack_protect_guard ();
44023 /* For 32-bit code we can save PIC register setup by using
44024 the hidden __stack_chk_fail_local function instead of calling
44025 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44026 register, so it is better to call __stack_chk_fail directly. */
44028 static tree ATTRIBUTE_UNUSED
44029 ix86_stack_protect_fail (void)
44031 return TARGET_64BIT
44032 ? default_external_stack_protect_fail ()
44033 : default_hidden_stack_protect_fail ();
44036 /* Select a format to encode pointers in exception handling data. CODE
44037 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44038 true if the symbol may be affected by dynamic relocations.
44040 ??? All x86 object file formats are capable of representing this.
44041 After all, the relocation needed is the same as for the call insn.
44042 Whether or not a particular assembler allows us to enter such, I
44043 guess we'll have to see. */
44044 int
44045 asm_preferred_eh_data_format (int code, int global)
44047 if (flag_pic)
44049 int type = DW_EH_PE_sdata8;
44050 if (!TARGET_64BIT
44051 || ix86_cmodel == CM_SMALL_PIC
44052 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44053 type = DW_EH_PE_sdata4;
44054 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44056 if (ix86_cmodel == CM_SMALL
44057 || (ix86_cmodel == CM_MEDIUM && code))
44058 return DW_EH_PE_udata4;
44059 return DW_EH_PE_absptr;
44062 /* Expand copysign from SIGN to the positive value ABS_VALUE
44063 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44064 the sign-bit. */
44065 static void
44066 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44068 machine_mode mode = GET_MODE (sign);
44069 rtx sgn = gen_reg_rtx (mode);
44070 if (mask == NULL_RTX)
44072 machine_mode vmode;
44074 if (mode == SFmode)
44075 vmode = V4SFmode;
44076 else if (mode == DFmode)
44077 vmode = V2DFmode;
44078 else
44079 vmode = mode;
44081 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44082 if (!VECTOR_MODE_P (mode))
44084 /* We need to generate a scalar mode mask in this case. */
44085 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44086 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44087 mask = gen_reg_rtx (mode);
44088 emit_insn (gen_rtx_SET (mask, tmp));
44091 else
44092 mask = gen_rtx_NOT (mode, mask);
44093 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44094 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44097 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44098 mask for masking out the sign-bit is stored in *SMASK, if that is
44099 non-null. */
44100 static rtx
44101 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44103 machine_mode vmode, mode = GET_MODE (op0);
44104 rtx xa, mask;
44106 xa = gen_reg_rtx (mode);
44107 if (mode == SFmode)
44108 vmode = V4SFmode;
44109 else if (mode == DFmode)
44110 vmode = V2DFmode;
44111 else
44112 vmode = mode;
44113 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44114 if (!VECTOR_MODE_P (mode))
44116 /* We need to generate a scalar mode mask in this case. */
44117 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44118 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44119 mask = gen_reg_rtx (mode);
44120 emit_insn (gen_rtx_SET (mask, tmp));
44122 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44124 if (smask)
44125 *smask = mask;
44127 return xa;
44130 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44131 swapping the operands if SWAP_OPERANDS is true. The expanded
44132 code is a forward jump to a newly created label in case the
44133 comparison is true. The generated label rtx is returned. */
44134 static rtx_code_label *
44135 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44136 bool swap_operands)
44138 bool unordered_compare = ix86_unordered_fp_compare (code);
44139 rtx_code_label *label;
44140 rtx tmp, reg;
44142 if (swap_operands)
44143 std::swap (op0, op1);
44145 label = gen_label_rtx ();
44146 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44147 if (unordered_compare)
44148 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44149 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44150 emit_insn (gen_rtx_SET (reg, tmp));
44151 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44152 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44153 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44154 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44155 JUMP_LABEL (tmp) = label;
44157 return label;
44160 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44161 using comparison code CODE. Operands are swapped for the comparison if
44162 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44163 static rtx
44164 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44165 bool swap_operands)
44167 rtx (*insn)(rtx, rtx, rtx, rtx);
44168 machine_mode mode = GET_MODE (op0);
44169 rtx mask = gen_reg_rtx (mode);
44171 if (swap_operands)
44172 std::swap (op0, op1);
44174 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44176 emit_insn (insn (mask, op0, op1,
44177 gen_rtx_fmt_ee (code, mode, op0, op1)));
44178 return mask;
44181 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44182 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44183 static rtx
44184 ix86_gen_TWO52 (machine_mode mode)
44186 REAL_VALUE_TYPE TWO52r;
44187 rtx TWO52;
44189 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44190 TWO52 = const_double_from_real_value (TWO52r, mode);
44191 TWO52 = force_reg (mode, TWO52);
44193 return TWO52;
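/* A sketch of why this constant is useful: for DFmode, any |x| < 2**52
   added to 2**52 gives a value whose ulp is 1.0, so the addition itself
   rounds x to an integer in the current rounding mode, and subtracting
   2**52 recovers that integer.  E.g. under round-to-nearest,
   3.7 + 2**52 rounds to 2**52 + 4.0, and subtracting 2**52 yields 4.0.
   The same holds for SFmode with 2**23.  */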
44196 /* Expand SSE sequence for computing lround from OP1 storing
44197 into OP0. */
44198 void
44199 ix86_expand_lround (rtx op0, rtx op1)
44201 /* C code for the stuff we're doing below:
44202 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44203 return (long)tmp;
44205 machine_mode mode = GET_MODE (op1);
44206 const struct real_format *fmt;
44207 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44208 rtx adj;
44210 /* load nextafter (0.5, 0.0) */
44211 fmt = REAL_MODE_FORMAT (mode);
44212 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44213 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44215 /* adj = copysign (0.5, op1) */
44216 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44217 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44219 /* adj = op1 + adj */
44220 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44222 /* op0 = (imode)adj */
44223 expand_fix (op0, adj, 0);
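/* A note on why nextafter (0.5, 0.0) is used rather than 0.5 (an
   illustration, assuming round-to-nearest): for the largest double
   below 0.5, adding exactly 0.5 would round the sum up to 1.0 and
   lround would wrongly return 1; adding the predecessor of 0.5 keeps
   the sum below 1.0, so the truncating conversion returns 0 as
   required.  */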
44226 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44227 into OPERAND0. */
44228 void
44229 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44231 /* C code for the stuff we're doing below (for do_floor):
44232 xi = (long)op1;
44233 xi -= (double)xi > op1 ? 1 : 0;
44234 return xi;
44236 machine_mode fmode = GET_MODE (op1);
44237 machine_mode imode = GET_MODE (op0);
44238 rtx ireg, freg, tmp;
44239 rtx_code_label *label;
44241 /* reg = (long)op1 */
44242 ireg = gen_reg_rtx (imode);
44243 expand_fix (ireg, op1, 0);
44245 /* freg = (double)reg */
44246 freg = gen_reg_rtx (fmode);
44247 expand_float (freg, ireg, 0);
44249 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44250 label = ix86_expand_sse_compare_and_jump (UNLE,
44251 freg, op1, !do_floor);
44252 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44253 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44254 emit_move_insn (ireg, tmp);
44256 emit_label (label);
44257 LABEL_NUSES (label) = 1;
44259 emit_move_insn (op0, ireg);
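/* A worked example of the compensation above (illustrative only):
   for lfloor (-1.5), the truncating conversion gives ireg = -1 and
   freg = -1.0; since -1.0 > -1.5 the UNLE branch is not taken, one
   is subtracted, and the result is -2 as expected.  For lceil the
   comparison operands are swapped and one is added instead.  */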
44262 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
44263 void
44264 ix86_expand_rint (rtx operand0, rtx operand1)
44266 /* C code for the stuff we're doing below:
44267 xa = fabs (operand1);
44268 if (!isless (xa, 2**52))
44269 return operand1;
44270 two52 = 2**52;
44271 if (flag_rounding_math)
44273 two52 = copysign (two52, operand1);
44274 xa = operand1;
44276 xa = xa + two52 - two52;
44277 return copysign (xa, operand1);
44279 machine_mode mode = GET_MODE (operand0);
44280 rtx res, xa, TWO52, two52, mask;
44281 rtx_code_label *label;
44283 res = gen_reg_rtx (mode);
44284 emit_move_insn (res, operand1);
44286 /* xa = abs (operand1) */
44287 xa = ix86_expand_sse_fabs (res, &mask);
44289 /* if (!isless (xa, TWO52)) goto label; */
44290 TWO52 = ix86_gen_TWO52 (mode);
44291 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44293 two52 = TWO52;
44294 if (flag_rounding_math)
44296 two52 = gen_reg_rtx (mode);
44297 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
44298 xa = res;
44301 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
44302 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
44304 ix86_sse_copysign_to_positive (res, xa, res, mask);
44306 emit_label (label);
44307 LABEL_NUSES (label) = 1;
44309 emit_move_insn (operand0, res);
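/* A note on the flag_rounding_math path above (a sketch of the
   reasoning): when the dynamic rounding mode must be honored, rounding
   the absolute value and copying the sign back would misround negative
   inputs under directed modes (e.g. toward -inf, rint (-1.5) is -2.0,
   but rounding 1.5 downward gives 1.0).  Using two52 with the sign of
   the input and adding/subtracting it from the signed value applies
   the active rounding mode to the operand directly.  */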
44312 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44313 into OPERAND0. */
44314 void
44315 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44317 /* C code for the stuff we expand below.
44318 double xa = fabs (x), x2;
44319 if (!isless (xa, TWO52))
44320 return x;
44321 xa = xa + TWO52 - TWO52;
44322 x2 = copysign (xa, x);
44323 Compensate. Floor:
44324 if (x2 > x)
44325 x2 -= 1;
44326 Compensate. Ceil:
44327 if (x2 < x)
44328 x2 -= -1;
44329 return x2;
44331 machine_mode mode = GET_MODE (operand0);
44332 rtx xa, TWO52, tmp, one, res, mask;
44333 rtx_code_label *label;
44335 TWO52 = ix86_gen_TWO52 (mode);
44337 /* Temporary for holding the result, initialized to the input
44338 operand to ease control flow. */
44339 res = gen_reg_rtx (mode);
44340 emit_move_insn (res, operand1);
44342 /* xa = abs (operand1) */
44343 xa = ix86_expand_sse_fabs (res, &mask);
44345 /* if (!isless (xa, TWO52)) goto label; */
44346 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44348 /* xa = xa + TWO52 - TWO52; */
44349 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44350 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44352 /* xa = copysign (xa, operand1) */
44353 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44355 /* generate 1.0 or -1.0 */
44356 one = force_reg (mode,
44357 const_double_from_real_value (do_floor
44358 ? dconst1 : dconstm1, mode));
44360 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44361 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44362 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44363 /* We always need to subtract here to preserve signed zero. */
44364 tmp = expand_simple_binop (mode, MINUS,
44365 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44366 emit_move_insn (res, tmp);
44368 emit_label (label);
44369 LABEL_NUSES (label) = 1;
44371 emit_move_insn (operand0, res);
44374 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44375 into OPERAND0. */
44376 void
44377 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44379 /* C code for the stuff we expand below.
44380 double xa = fabs (x), x2;
44381 if (!isless (xa, TWO52))
44382 return x;
44383 x2 = (double)(long)x;
44384 Compensate. Floor:
44385 if (x2 > x)
44386 x2 -= 1;
44387 Compensate. Ceil:
44388 if (x2 < x)
44389 x2 += 1;
44390 if (HONOR_SIGNED_ZEROS (mode))
44391 return copysign (x2, x);
44392 return x2;
44394 machine_mode mode = GET_MODE (operand0);
44395 rtx xa, xi, TWO52, tmp, one, res, mask;
44396 rtx_code_label *label;
44398 TWO52 = ix86_gen_TWO52 (mode);
44400 /* Temporary for holding the result, initialized to the input
44401 operand to ease control flow. */
44402 res = gen_reg_rtx (mode);
44403 emit_move_insn (res, operand1);
44405 /* xa = abs (operand1) */
44406 xa = ix86_expand_sse_fabs (res, &mask);
44408 /* if (!isless (xa, TWO52)) goto label; */
44409 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44411 /* xa = (double)(long)x */
44412 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44413 expand_fix (xi, res, 0);
44414 expand_float (xa, xi, 0);
44416 /* generate 1.0 */
44417 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44419 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44420 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44421 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44422 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44423 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44424 emit_move_insn (res, tmp);
44426 if (HONOR_SIGNED_ZEROS (mode))
44427 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44429 emit_label (label);
44430 LABEL_NUSES (label) = 1;
44432 emit_move_insn (operand0, res);
44435 /* Expand SSE sequence for computing round from OPERAND1 storing
44436 into OPERAND0, using a sequence that works without relying on DImode truncation
44437 via cvttsd2siq, which is only available on 64-bit targets. */
44438 void
44439 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44441 /* C code for the stuff we expand below.
44442 double xa = fabs (x), xa2, x2;
44443 if (!isless (xa, TWO52))
44444 return x;
44445 Using the absolute value and copying back sign makes
44446 -0.0 -> -0.0 correct.
44447 xa2 = xa + TWO52 - TWO52;
44448 Compensate.
44449 dxa = xa2 - xa;
44450 if (dxa <= -0.5)
44451 xa2 += 1;
44452 else if (dxa > 0.5)
44453 xa2 -= 1;
44454 x2 = copysign (xa2, x);
44455 return x2;
44457 machine_mode mode = GET_MODE (operand0);
44458 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44459 rtx_code_label *label;
44461 TWO52 = ix86_gen_TWO52 (mode);
44463 /* Temporary for holding the result, initialized to the input
44464 operand to ease control flow. */
44465 res = gen_reg_rtx (mode);
44466 emit_move_insn (res, operand1);
44468 /* xa = abs (operand1) */
44469 xa = ix86_expand_sse_fabs (res, &mask);
44471 /* if (!isless (xa, TWO52)) goto label; */
44472 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44474 /* xa2 = xa + TWO52 - TWO52; */
44475 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44476 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44478 /* dxa = xa2 - xa; */
44479 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44481 /* generate 0.5, 1.0 and -0.5 */
44482 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44483 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44484 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44485 0, OPTAB_DIRECT);
44487 /* Compensate. */
44488 tmp = gen_reg_rtx (mode);
44489 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44490 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44491 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44492 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44493 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44494 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44495 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44496 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44498 /* res = copysign (xa2, operand1) */
44499 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44501 emit_label (label);
44502 LABEL_NUSES (label) = 1;
44504 emit_move_insn (operand0, res);
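/* A worked example of the compensation above (illustrative only):
   for x = 2.5, xa = 2.5 and xa2 = 2.0 (the TWO52 add/sub rounds half
   to even), so dxa = -0.5; the dxa <= -0.5 test fires and xa2 becomes
   3.0, matching round()'s halfway-away-from-zero semantics.  For
   x = 3.5, xa2 = 4.0 and dxa = 0.5, neither test fires, and 4.0 is
   already the correct result.  */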
44507 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44508 into OPERAND0. */
44509 void
44510 ix86_expand_trunc (rtx operand0, rtx operand1)
44512 /* C code for the SSE variant we expand below.
44513 double xa = fabs (x), x2;
44514 if (!isless (xa, TWO52))
44515 return x;
44516 x2 = (double)(long)x;
44517 if (HONOR_SIGNED_ZEROS (mode))
44518 return copysign (x2, x);
44519 return x2;
44521 machine_mode mode = GET_MODE (operand0);
44522 rtx xa, xi, TWO52, res, mask;
44523 rtx_code_label *label;
44525 TWO52 = ix86_gen_TWO52 (mode);
44527 /* Temporary for holding the result, initialized to the input
44528 operand to ease control flow. */
44529 res = gen_reg_rtx (mode);
44530 emit_move_insn (res, operand1);
44532 /* xa = abs (operand1) */
44533 xa = ix86_expand_sse_fabs (res, &mask);
44535 /* if (!isless (xa, TWO52)) goto label; */
44536 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44538 /* x = (double)(long)x */
44539 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44540 expand_fix (xi, res, 0);
44541 expand_float (res, xi, 0);
44543 if (HONOR_SIGNED_ZEROS (mode))
44544 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44546 emit_label (label);
44547 LABEL_NUSES (label) = 1;
44549 emit_move_insn (operand0, res);
44552 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44553 into OPERAND0. */
44554 void
44555 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44557 machine_mode mode = GET_MODE (operand0);
44558 rtx xa, mask, TWO52, one, res, smask, tmp;
44559 rtx_code_label *label;
44561 /* C code for the SSE variant we expand below.
44562 double xa = fabs (x), x2;
44563 if (!isless (xa, TWO52))
44564 return x;
44565 xa2 = xa + TWO52 - TWO52;
44566 Compensate:
44567 if (xa2 > xa)
44568 xa2 -= 1.0;
44569 x2 = copysign (xa2, x);
44570 return x2;
44573 TWO52 = ix86_gen_TWO52 (mode);
44575 /* Temporary for holding the result, initialized to the input
44576 operand to ease control flow. */
44577 res = gen_reg_rtx (mode);
44578 emit_move_insn (res, operand1);
44580 /* xa = abs (operand1) */
44581 xa = ix86_expand_sse_fabs (res, &smask);
44583 /* if (!isless (xa, TWO52)) goto label; */
44584 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44586 /* res = xa + TWO52 - TWO52; */
44587 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44588 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44589 emit_move_insn (res, tmp);
44591 /* generate 1.0 */
44592 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44594 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44595 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44596 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44597 tmp = expand_simple_binop (mode, MINUS,
44598 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44599 emit_move_insn (res, tmp);
44601 /* res = copysign (res, operand1) */
44602 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44604 emit_label (label);
44605 LABEL_NUSES (label) = 1;
44607 emit_move_insn (operand0, res);
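/* A worked example of the compensation above (illustrative only):
   for xa = 1.7, the TWO52 add/sub yields res = 2.0 under
   round-to-nearest; since res > xa, 1.0 is masked in and subtracted,
   giving trunc (1.7) = 1.0.  For xa = 1.2, res = 1.0 is not greater
   than xa and nothing is subtracted.  */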
44610 /* Expand SSE sequence for computing round from OPERAND1 storing
44611 into OPERAND0. */
44612 void
44613 ix86_expand_round (rtx operand0, rtx operand1)
44615 /* C code for the stuff we're doing below:
44616 double xa = fabs (x);
44617 if (!isless (xa, TWO52))
44618 return x;
44619 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44620 return copysign (xa, x);
44622 machine_mode mode = GET_MODE (operand0);
44623 rtx res, TWO52, xa, xi, half, mask;
44624 rtx_code_label *label;
44625 const struct real_format *fmt;
44626 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44628 /* Temporary for holding the result, initialized to the input
44629 operand to ease control flow. */
44630 res = gen_reg_rtx (mode);
44631 emit_move_insn (res, operand1);
44633 TWO52 = ix86_gen_TWO52 (mode);
44634 xa = ix86_expand_sse_fabs (res, &mask);
44635 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44637 /* load nextafter (0.5, 0.0) */
44638 fmt = REAL_MODE_FORMAT (mode);
44639 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44640 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44642 /* xa = xa + 0.5 */
44643 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44644 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44646 /* xa = (double)(int64_t)xa */
44647 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44648 expand_fix (xi, xa, 0);
44649 expand_float (xa, xi, 0);
44651 /* res = copysign (xa, operand1) */
44652 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44654 emit_label (label);
44655 LABEL_NUSES (label) = 1;
44657 emit_move_insn (operand0, res);
44660 /* Expand SSE sequence for computing round
44661 from OP1 storing into OP0 using sse4 round insn. */
44662 void
44663 ix86_expand_round_sse4 (rtx op0, rtx op1)
44665 machine_mode mode = GET_MODE (op0);
44666 rtx e1, e2, res, half;
44667 const struct real_format *fmt;
44668 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44669 rtx (*gen_copysign) (rtx, rtx, rtx);
44670 rtx (*gen_round) (rtx, rtx, rtx);
44672 switch (mode)
44674 case E_SFmode:
44675 gen_copysign = gen_copysignsf3;
44676 gen_round = gen_sse4_1_roundsf2;
44677 break;
44678 case E_DFmode:
44679 gen_copysign = gen_copysigndf3;
44680 gen_round = gen_sse4_1_rounddf2;
44681 break;
44682 default:
44683 gcc_unreachable ();
44686 /* round (a) = trunc (a + copysign (0.5, a)) */
44688 /* load nextafter (0.5, 0.0) */
44689 fmt = REAL_MODE_FORMAT (mode);
44690 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44691 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44692 half = const_double_from_real_value (pred_half, mode);
44694 /* e1 = copysign (0.5, op1) */
44695 e1 = gen_reg_rtx (mode);
44696 emit_insn (gen_copysign (e1, half, op1));
44698 /* e2 = op1 + e1 */
44699 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44701 /* res = trunc (e2) */
44702 res = gen_reg_rtx (mode);
44703 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44705 emit_move_insn (op0, res);
44709 /* Table of valid machine attributes. */
44710 static const struct attribute_spec ix86_attribute_table[] =
44712 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
44713 affects_type_identity, handler, exclude } */
44714 /* Stdcall attribute says callee is responsible for popping arguments
44715 if they are not variable. */
44716 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44717 NULL },
44718 /* Fastcall attribute says callee is responsible for popping arguments
44719 if they are not variable. */
44720 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44721 NULL },
44722 /* Thiscall attribute says callee is responsible for popping arguments
44723 if they are not variable. */
44724 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44725 NULL },
44726 /* Cdecl attribute says the callee is a normal C declaration */
44727 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44728 NULL },
44729 /* Regparm attribute specifies how many integer arguments are to be
44730 passed in registers. */
44731 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
44732 NULL },
44733 /* Sseregparm attribute says we are using x86_64 calling conventions
44734 for FP arguments. */
44735 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44736 NULL },
44737 /* The transactional memory builtins are implicitly regparm or fastcall
44738 depending on the ABI. Override the generic do-nothing attribute that
44739 these builtins were declared with. */
44740 { "*tm regparm", 0, 0, false, true, true, true,
44741 ix86_handle_tm_regparm_attribute, NULL },
44742 /* force_align_arg_pointer says this function realigns the stack at entry. */
44743 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44744 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
44745 NULL },
44746 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44747 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
44748 NULL },
44749 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
44750 NULL },
44751 { "shared", 0, 0, true, false, false, false,
44752 ix86_handle_shared_attribute, false, NULL },
44753 #endif
44754 { "ms_struct", 0, 0, false, false, false, false,
44755 ix86_handle_struct_attribute, NULL },
44756 { "gcc_struct", 0, 0, false, false, false, false,
44757 ix86_handle_struct_attribute, NULL },
44758 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44759 SUBTARGET_ATTRIBUTE_TABLE,
44760 #endif
44761 /* ms_abi and sysv_abi calling convention function attributes. */
44762 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
44763 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
44764 NULL },
44765 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44766 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44767 { "ms_hook_prologue", 0, 0, true, false, false, false,
44768 ix86_handle_fndecl_attribute, NULL },
44769 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
44770 ix86_handle_callee_pop_aggregate_return, NULL },
44771 { "interrupt", 0, 0, false, true, true, false,
44772 ix86_handle_interrupt_attribute, NULL },
44773 { "no_caller_saved_registers", 0, 0, false, true, true, false,
44774 ix86_handle_no_caller_saved_registers_attribute, NULL },
44775 { "naked", 0, 0, true, false, false, false,
44776 ix86_handle_fndecl_attribute, NULL },
44778 /* End element. */
44779 { NULL, 0, 0, false, false, false, false, NULL, NULL }
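/* As an illustrative usage sketch (hypothetical declarations, not part
   of the table itself), the calling-convention attributes above are
   written as e.g.

	int callee (int a, int b) __attribute__ ((regparm (2)));
	void cb (int) __attribute__ ((fastcall));  */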
44782 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44783 static int
44784 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44785 tree vectype, int)
44787 bool fp = false;
44788 machine_mode mode = TImode;
44789 int index;
44790 if (vectype != NULL)
44792 fp = FLOAT_TYPE_P (vectype);
44793 mode = TYPE_MODE (vectype);
44796 switch (type_of_cost)
44798 case scalar_stmt:
44799 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44801 case scalar_load:
44802 /* load/store costs are relative to the register move cost, which is 2.
44803 Recompute them to COSTS_N_INSNS units so everything has the same base. */
44804 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44805 : ix86_cost->int_load [2]) / 2;
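      /* For instance, a table entry of 4 (twice the cost of a register
	 move) becomes COSTS_N_INSNS (4) / 2 == COSTS_N_INSNS (2).  */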
44807 case scalar_store:
44808 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44809 : ix86_cost->int_store [2]) / 2;
44811 case vector_stmt:
44812 return ix86_vec_cost (mode,
44813 fp ? ix86_cost->addss : ix86_cost->sse_op,
44814 true);
44816 case vector_load:
44817 index = sse_store_index (mode);
44818 /* See PR82713 - we may end up being called on a non-vector type. */
44819 if (index < 0)
44820 index = 2;
44821 return ix86_vec_cost (mode,
44822 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44823 true);
44825 case vector_store:
44826 index = sse_store_index (mode);
44827 /* See PR82713 - we may end up being called on a non-vector type. */
44828 if (index < 0)
44829 index = 2;
44830 return ix86_vec_cost (mode,
44831 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44832 true);
44834 case vec_to_scalar:
44835 case scalar_to_vec:
44836 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44838 /* We should have separate costs for unaligned loads and gather/scatter.
44839 Do that incrementally. */
44840 case unaligned_load:
44841 index = sse_store_index (mode);
44842 /* See PR82713 - we may end up being called on a non-vector type. */
44843 if (index < 0)
44844 index = 2;
44845 return ix86_vec_cost (mode,
44846 COSTS_N_INSNS
44847 (ix86_cost->sse_unaligned_load[index]) / 2,
44848 true);
44850 case unaligned_store:
44851 index = sse_store_index (mode);
44852 /* See PR82713 - we may end up being called on a non-vector type. */
44853 if (index < 0)
44854 index = 2;
44855 return ix86_vec_cost (mode,
44856 COSTS_N_INSNS
44857 (ix86_cost->sse_unaligned_store[index]) / 2,
44858 true);
44860 case vector_gather_load:
44861 return ix86_vec_cost (mode,
44862 COSTS_N_INSNS
44863 (ix86_cost->gather_static
44864 + ix86_cost->gather_per_elt
44865 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44866 true);
44868 case vector_scatter_store:
44869 return ix86_vec_cost (mode,
44870 COSTS_N_INSNS
44871 (ix86_cost->scatter_static
44872 + ix86_cost->scatter_per_elt
44873 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44874 true);
44876 case cond_branch_taken:
44877 return ix86_cost->cond_taken_branch_cost;
44879 case cond_branch_not_taken:
44880 return ix86_cost->cond_not_taken_branch_cost;
44882 case vec_perm:
44883 case vec_promote_demote:
44884 return ix86_vec_cost (mode,
44885 ix86_cost->sse_op, true);
44887 case vec_construct:
44888 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44890 default:
44891 gcc_unreachable ();
44895 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44896 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44897 insn every time. */
44899 static GTY(()) rtx_insn *vselect_insn;
44901 /* Initialize vselect_insn. */
44903 static void
44904 init_vselect_insn (void)
44906 unsigned i;
44907 rtx x;
44909 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44910 for (i = 0; i < MAX_VECT_LEN; ++i)
44911 XVECEXP (x, 0, i) = const0_rtx;
44912 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44913 const0_rtx), x);
44914 x = gen_rtx_SET (const0_rtx, x);
44915 start_sequence ();
44916 vselect_insn = emit_insn (x);
44917 end_sequence ();
44920 /* Construct (set target (vec_select op0 (parallel perm))) and
44921 return true if that's a valid instruction in the active ISA. */
44923 static bool
44924 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44925 unsigned nelt, bool testing_p)
44927 unsigned int i;
44928 rtx x, save_vconcat;
44929 int icode;
44931 if (vselect_insn == NULL_RTX)
44932 init_vselect_insn ();
44934 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44935 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44936 for (i = 0; i < nelt; ++i)
44937 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44938 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44939 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44940 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44941 SET_DEST (PATTERN (vselect_insn)) = target;
44942 icode = recog_memoized (vselect_insn);
44944 if (icode >= 0 && !testing_p)
44945 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44947 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44948 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44949 INSN_CODE (vselect_insn) = -1;
44951 return icode >= 0;
44954 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44956 static bool
44957 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44958 const unsigned char *perm, unsigned nelt,
44959 bool testing_p)
44961 machine_mode v2mode;
44962 rtx x;
44963 bool ok;
44965 if (vselect_insn == NULL_RTX)
44966 init_vselect_insn ();
44968 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44969 return false;
44970 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44971 PUT_MODE (x, v2mode);
44972 XEXP (x, 0) = op0;
44973 XEXP (x, 1) = op1;
44974 ok = expand_vselect (target, x, perm, nelt, testing_p);
44975 XEXP (x, 0) = const0_rtx;
44976 XEXP (x, 1) = const0_rtx;
44977 return ok;
44980 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44981 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44983 static bool
44984 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44986 machine_mode mmode, vmode = d->vmode;
44987 unsigned i, mask, nelt = d->nelt;
44988 rtx target, op0, op1, maskop, x;
44989 rtx rperm[32], vperm;
44991 if (d->one_operand_p)
44992 return false;
44993 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44994 && (TARGET_AVX512BW
44995 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44997 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44999 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45001 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45003 else
45004 return false;
45006 /* This is a blend, not a permute. Elements must stay in their
45007 respective lanes. */
45008 for (i = 0; i < nelt; ++i)
45010 unsigned e = d->perm[i];
45011 if (!(e == i || e == i + nelt))
45012 return false;
45015 if (d->testing_p)
45016 return true;
45018 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45019 decision should be extracted elsewhere, so that we only try that
45020 sequence once all budget==3 options have been tried. */
45021 target = d->target;
45022 op0 = d->op0;
45023 op1 = d->op1;
45024 mask = 0;
45026 switch (vmode)
45028 case E_V8DFmode:
45029 case E_V16SFmode:
45030 case E_V4DFmode:
45031 case E_V8SFmode:
45032 case E_V2DFmode:
45033 case E_V4SFmode:
45034 case E_V8HImode:
45035 case E_V8SImode:
45036 case E_V32HImode:
45037 case E_V64QImode:
45038 case E_V16SImode:
45039 case E_V8DImode:
45040 for (i = 0; i < nelt; ++i)
45041 mask |= (d->perm[i] >= nelt) << i;
45042 break;
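    /* For example (illustrative), a V4SFmode blend with perm {0, 5, 2, 7}
       takes elements 1 and 3 from op1, so the loop above produces
       mask = 0b1010, which becomes the blendps immediate.  */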
45044 case E_V2DImode:
45045 for (i = 0; i < 2; ++i)
45046 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45047 vmode = V8HImode;
45048 goto do_subreg;
45050 case E_V4SImode:
45051 for (i = 0; i < 4; ++i)
45052 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45053 vmode = V8HImode;
45054 goto do_subreg;
45056 case E_V16QImode:
45057 /* See if bytes move in pairs so we can use pblendw with
45058 an immediate argument, rather than pblendvb with a vector
45059 argument. */
45060 for (i = 0; i < 16; i += 2)
45061 if (d->perm[i] + 1 != d->perm[i + 1])
45063 use_pblendvb:
45064 for (i = 0; i < nelt; ++i)
45065 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45067 finish_pblendvb:
45068 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45069 vperm = force_reg (vmode, vperm);
45071 if (GET_MODE_SIZE (vmode) == 16)
45072 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45073 else
45074 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45075 if (target != d->target)
45076 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45077 return true;
45080 for (i = 0; i < 8; ++i)
45081 mask |= (d->perm[i * 2] >= 16) << i;
45082 vmode = V8HImode;
45083 /* FALLTHRU */
45085 do_subreg:
45086 target = gen_reg_rtx (vmode);
45087 op0 = gen_lowpart (vmode, op0);
45088 op1 = gen_lowpart (vmode, op1);
45089 break;
45091 case E_V32QImode:
45092 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45093 for (i = 0; i < 32; i += 2)
45094 if (d->perm[i] + 1 != d->perm[i + 1])
45095 goto use_pblendvb;
45096 /* See if bytes move in quadruplets. If yes, vpblendd
45097 with immediate can be used. */
45098 for (i = 0; i < 32; i += 4)
45099 if (d->perm[i] + 2 != d->perm[i + 2])
45100 break;
45101 if (i < 32)
45103 /* See if bytes move the same in both lanes. If yes,
45104 vpblendw with immediate can be used. */
45105 for (i = 0; i < 16; i += 2)
45106 if (d->perm[i] + 16 != d->perm[i + 16])
45107 goto use_pblendvb;
45109 /* Use vpblendw. */
45110 for (i = 0; i < 16; ++i)
45111 mask |= (d->perm[i * 2] >= 32) << i;
45112 vmode = V16HImode;
45113 goto do_subreg;
45116 /* Use vpblendd. */
45117 for (i = 0; i < 8; ++i)
45118 mask |= (d->perm[i * 4] >= 32) << i;
45119 vmode = V8SImode;
45120 goto do_subreg;
45122 case E_V16HImode:
45123 /* See if words move in pairs. If yes, vpblendd can be used. */
45124 for (i = 0; i < 16; i += 2)
45125 if (d->perm[i] + 1 != d->perm[i + 1])
45126 break;
45127 if (i < 16)
45129 /* See if words move the same in both lanes. If not,
45130 vpblendvb must be used. */
45131 for (i = 0; i < 8; i++)
45132 if (d->perm[i] + 8 != d->perm[i + 8])
45134 /* Use vpblendvb. */
45135 for (i = 0; i < 32; ++i)
45136 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45138 vmode = V32QImode;
45139 nelt = 32;
45140 target = gen_reg_rtx (vmode);
45141 op0 = gen_lowpart (vmode, op0);
45142 op1 = gen_lowpart (vmode, op1);
45143 goto finish_pblendvb;
45146 /* Use vpblendw. */
45147 for (i = 0; i < 16; ++i)
45148 mask |= (d->perm[i] >= 16) << i;
45149 break;
45152 /* Use vpblendd. */
45153 for (i = 0; i < 8; ++i)
45154 mask |= (d->perm[i * 2] >= 16) << i;
45155 vmode = V8SImode;
45156 goto do_subreg;
45158 case E_V4DImode:
45159 /* Use vpblendd. */
45160 for (i = 0; i < 4; ++i)
45161 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45162 vmode = V8SImode;
45163 goto do_subreg;
45165 default:
45166 gcc_unreachable ();
45169 switch (vmode)
45171 case E_V8DFmode:
45172 case E_V8DImode:
45173 mmode = QImode;
45174 break;
45175 case E_V16SFmode:
45176 case E_V16SImode:
45177 mmode = HImode;
45178 break;
45179 case E_V32HImode:
45180 mmode = SImode;
45181 break;
45182 case E_V64QImode:
45183 mmode = DImode;
45184 break;
45185 default:
45186 mmode = VOIDmode;
45189 if (mmode != VOIDmode)
45190 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45191 else
45192 maskop = GEN_INT (mask);
45194 /* This matches five different patterns, depending on the mode. */
45195 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45196 x = gen_rtx_SET (target, x);
45197 emit_insn (x);
45198 if (target != d->target)
45199 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45201 return true;
45204 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45205 in terms of the variable form of vpermilps.
45207 Note that we will have already failed the immediate input vpermilps,
45208 which requires that the high and low part shuffle be identical; the
45209 variable form doesn't require that. */
45211 static bool
45212 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45214 rtx rperm[8], vperm;
45215 unsigned i;
45217 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45218 return false;
45220 /* We can only permute within the 128-bit lane. */
45221 for (i = 0; i < 8; ++i)
45223 unsigned e = d->perm[i];
45224 if (i < 4 ? e >= 4 : e < 4)
45225 return false;
45228 if (d->testing_p)
45229 return true;
45231 for (i = 0; i < 8; ++i)
45233 unsigned e = d->perm[i];
45235 /* Within each 128-bit lane, the elements of op0 are numbered
45236 from 0 and the elements of op1 are numbered from 4. */
45237 if (e >= 8 + 4)
45238 e -= 8;
45239 else if (e >= 4)
45240 e -= 4;
45242 rperm[i] = GEN_INT (e);
45245 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45246 vperm = force_reg (V8SImode, vperm);
45247 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45249 return true;
45252 /* Return true if permutation D can be performed as VMODE permutation
45253 instead. */
45255 static bool
45256 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45258 unsigned int i, j, chunk;
45260 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45261 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45262 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45263 return false;
45265 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45266 return true;
45268 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45269 for (i = 0; i < d->nelt; i += chunk)
45270 if (d->perm[i] & (chunk - 1))
45271 return false;
45272 else
45273 for (j = 1; j < chunk; ++j)
45274 if (d->perm[i] + j != d->perm[i + j])
45275 return false;
45277 return true;
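/* For example (illustrative), a V16QImode permutation whose indices
   move in aligned pairs, such as {2, 3, 0, 1, 6, 7, 4, 5, ...}, passes
   the check for vmode == V8HImode: every chunk starts on an even index
   and the second element of each pair follows the first.  A permutation
   starting {1, 2, ...} fails, because its first chunk element is not
   aligned.  */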
45280 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45281 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45283 static bool
45284 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45286 unsigned i, nelt, eltsz, mask;
45287 unsigned char perm[64];
45288 machine_mode vmode = V16QImode;
45289 rtx rperm[64], vperm, target, op0, op1;
45291 nelt = d->nelt;
45293 if (!d->one_operand_p)
45295 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45297 if (TARGET_AVX2
45298 && valid_perm_using_mode_p (V2TImode, d))
45300 if (d->testing_p)
45301 return true;
45303 /* Use vperm2i128 insn. The pattern uses
45304 V4DImode instead of V2TImode. */
45305 target = d->target;
45306 if (d->vmode != V4DImode)
45307 target = gen_reg_rtx (V4DImode);
45308 op0 = gen_lowpart (V4DImode, d->op0);
45309 op1 = gen_lowpart (V4DImode, d->op1);
45310 rperm[0]
45311 = GEN_INT ((d->perm[0] / (nelt / 2))
45312 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45313 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45314 if (target != d->target)
45315 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45316 return true;
45318 return false;
45321 else
45323 if (GET_MODE_SIZE (d->vmode) == 16)
45325 if (!TARGET_SSSE3)
45326 return false;
45328 else if (GET_MODE_SIZE (d->vmode) == 32)
45330 if (!TARGET_AVX2)
45331 return false;
45333 /* V4DImode should already be handled through
45334 expand_vselect by the vpermq instruction. */
45335 gcc_assert (d->vmode != V4DImode);
45337 vmode = V32QImode;
45338 if (d->vmode == V8SImode
45339 || d->vmode == V16HImode
45340 || d->vmode == V32QImode)
45342 /* First see if vpermq can be used for
45343 V8SImode/V16HImode/V32QImode. */
45344 if (valid_perm_using_mode_p (V4DImode, d))
45346 for (i = 0; i < 4; i++)
45347 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45348 if (d->testing_p)
45349 return true;
45350 target = gen_reg_rtx (V4DImode);
45351 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45352 perm, 4, false))
45354 emit_move_insn (d->target,
45355 gen_lowpart (d->vmode, target));
45356 return true;
45358 return false;
45361 /* Next see if vpermd can be used. */
45362 if (valid_perm_using_mode_p (V8SImode, d))
45363 vmode = V8SImode;
45365 /* Or if vpermps can be used. */
45366 else if (d->vmode == V8SFmode)
45367 vmode = V8SImode;
45369 if (vmode == V32QImode)
45371 /* vpshufb only works within 128-bit lanes; it cannot
45372 shuffle bytes across lanes. */
45373 for (i = 0; i < nelt; ++i)
45374 if ((d->perm[i] ^ i) & (nelt / 2))
45375 return false;
45378 else if (GET_MODE_SIZE (d->vmode) == 64)
45380 if (!TARGET_AVX512BW)
45381 return false;
45383 /* If vpermq didn't work, vpshufb won't work either. */
45384 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45385 return false;
45387 vmode = V64QImode;
45388 if (d->vmode == V16SImode
45389 || d->vmode == V32HImode
45390 || d->vmode == V64QImode)
45392 /* First see if vpermq can be used for
45393 V16SImode/V32HImode/V64QImode. */
45394 if (valid_perm_using_mode_p (V8DImode, d))
45396 for (i = 0; i < 8; i++)
45397 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45398 if (d->testing_p)
45399 return true;
45400 target = gen_reg_rtx (V8DImode);
45401 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45402 perm, 8, false))
45404 emit_move_insn (d->target,
45405 gen_lowpart (d->vmode, target));
45406 return true;
45408 return false;
45411 /* Next see if vpermd can be used. */
45412 if (valid_perm_using_mode_p (V16SImode, d))
45413 vmode = V16SImode;
45415 /* Or if vpermps can be used. */
45416 else if (d->vmode == V16SFmode)
45417 vmode = V16SImode;
45418 if (vmode == V64QImode)
45420 /* vpshufb only works within 128-bit lanes; it cannot
45421 shuffle bytes across lanes. */
45422 for (i = 0; i < nelt; ++i)
45423 if ((d->perm[i] ^ i) & (nelt / 4))
45424 return false;
45427 else
45428 return false;
45431 if (d->testing_p)
45432 return true;
45434 if (vmode == V8SImode)
45435 for (i = 0; i < 8; ++i)
45436 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45437 else if (vmode == V16SImode)
45438 for (i = 0; i < 16; ++i)
45439 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45440 else
45442 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45443 if (!d->one_operand_p)
45444 mask = 2 * nelt - 1;
45445 else if (vmode == V16QImode)
45446 mask = nelt - 1;
45447 else if (vmode == V64QImode)
45448 mask = nelt / 4 - 1;
45449 else
45450 mask = nelt / 2 - 1;
45452 for (i = 0; i < nelt; ++i)
45454 unsigned j, e = d->perm[i] & mask;
45455 for (j = 0; j < eltsz; ++j)
45456 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45460 vperm = gen_rtx_CONST_VECTOR (vmode,
45461 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45462 vperm = force_reg (vmode, vperm);
45464 target = d->target;
45465 if (d->vmode != vmode)
45466 target = gen_reg_rtx (vmode);
45467 op0 = gen_lowpart (vmode, d->op0);
45468 if (d->one_operand_p)
45470 if (vmode == V16QImode)
45471 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45472 else if (vmode == V32QImode)
45473 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45474 else if (vmode == V64QImode)
45475 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45476 else if (vmode == V8SFmode)
45477 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45478 else if (vmode == V8SImode)
45479 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45480 else if (vmode == V16SFmode)
45481 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45482 else if (vmode == V16SImode)
45483 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45484 else
45485 gcc_unreachable ();
45487 else
45489 op1 = gen_lowpart (vmode, d->op1);
45490 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45492 if (target != d->target)
45493 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45495 return true;
45498 /* For V*[QHS]Imode permutations, check whether the same permutation
45499 can instead be performed in a 2x, 4x or 8x wider inner mode. */
45501 static bool
45502 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45503 struct expand_vec_perm_d *nd)
45505 int i;
45506 machine_mode mode = VOIDmode;
45508 switch (d->vmode)
45510 case E_V16QImode: mode = V8HImode; break;
45511 case E_V32QImode: mode = V16HImode; break;
45512 case E_V64QImode: mode = V32HImode; break;
45513 case E_V8HImode: mode = V4SImode; break;
45514 case E_V16HImode: mode = V8SImode; break;
45515 case E_V32HImode: mode = V16SImode; break;
45516 case E_V4SImode: mode = V2DImode; break;
45517 case E_V8SImode: mode = V4DImode; break;
45518 case E_V16SImode: mode = V8DImode; break;
45519 default: return false;
45521 for (i = 0; i < d->nelt; i += 2)
45522 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45523 return false;
45524 nd->vmode = mode;
45525 nd->nelt = d->nelt / 2;
45526 for (i = 0; i < nd->nelt; i++)
45527 nd->perm[i] = d->perm[2 * i] / 2;
45528 if (GET_MODE_INNER (mode) != DImode)
45529 canonicalize_vector_int_perm (nd, nd);
45530 if (nd != d)
45532 nd->one_operand_p = d->one_operand_p;
45533 nd->testing_p = d->testing_p;
45534 if (d->op0 == d->op1)
45535 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45536 else
45538 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45539 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45541 if (d->testing_p)
45542 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45543 else
45544 nd->target = gen_reg_rtx (nd->vmode);
45546 return true;
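/* For example (illustrative), the V8HImode permutation
   {2, 3, 0, 1, 6, 7, 4, 5} is rewritten as the V4SImode permutation
   {1, 0, 3, 2}; the recursive call then stops because 1 is odd, so no
   further widening to V2DImode is possible.  */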
45549 /* Try to expand one-operand permutation with constant mask. */
45551 static bool
45552 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45554 machine_mode mode = GET_MODE (d->op0);
45555 machine_mode maskmode = mode;
45556 rtx (*gen) (rtx, rtx, rtx) = NULL;
45557 rtx target, op0, mask;
45558 rtx vec[64];
45560 if (!rtx_equal_p (d->op0, d->op1))
45561 return false;
45563 if (!TARGET_AVX512F)
45564 return false;
45566 switch (mode)
45568 case E_V16SImode:
45569 gen = gen_avx512f_permvarv16si;
45570 break;
45571 case E_V16SFmode:
45572 gen = gen_avx512f_permvarv16sf;
45573 maskmode = V16SImode;
45574 break;
45575 case E_V8DImode:
45576 gen = gen_avx512f_permvarv8di;
45577 break;
45578 case E_V8DFmode:
45579 gen = gen_avx512f_permvarv8df;
45580 maskmode = V8DImode;
45581 break;
45582 default:
45583 return false;
45586 target = d->target;
45587 op0 = d->op0;
45588 for (int i = 0; i < d->nelt; ++i)
45589 vec[i] = GEN_INT (d->perm[i]);
45590 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45591 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45592 return true;
45595 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45596 in a single instruction. */
45598 static bool
45599 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45601 unsigned i, nelt = d->nelt;
45602 struct expand_vec_perm_d nd;
45604 /* Check plain VEC_SELECT first, because AVX has instructions that could
45605 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45606 input where SEL+CONCAT may not. */
45607 if (d->one_operand_p)
45609 int mask = nelt - 1;
45610 bool identity_perm = true;
45611 bool broadcast_perm = true;
45613 for (i = 0; i < nelt; i++)
45615 nd.perm[i] = d->perm[i] & mask;
45616 if (nd.perm[i] != i)
45617 identity_perm = false;
45618 if (nd.perm[i])
45619 broadcast_perm = false;
45622 if (identity_perm)
45624 if (!d->testing_p)
45625 emit_move_insn (d->target, d->op0);
45626 return true;
45628 else if (broadcast_perm && TARGET_AVX2)
45630 /* Use vpbroadcast{b,w,d}. */
45631 rtx (*gen) (rtx, rtx) = NULL;
45632 switch (d->vmode)
45634 case E_V64QImode:
45635 if (TARGET_AVX512BW)
45636 gen = gen_avx512bw_vec_dupv64qi_1;
45637 break;
45638 case E_V32QImode:
45639 gen = gen_avx2_pbroadcastv32qi_1;
45640 break;
45641 case E_V32HImode:
45642 if (TARGET_AVX512BW)
45643 gen = gen_avx512bw_vec_dupv32hi_1;
45644 break;
45645 case E_V16HImode:
45646 gen = gen_avx2_pbroadcastv16hi_1;
45647 break;
45648 case E_V16SImode:
45649 if (TARGET_AVX512F)
45650 gen = gen_avx512f_vec_dupv16si_1;
45651 break;
45652 case E_V8SImode:
45653 gen = gen_avx2_pbroadcastv8si_1;
45654 break;
45655 case E_V16QImode:
45656 gen = gen_avx2_pbroadcastv16qi;
45657 break;
45658 case E_V8HImode:
45659 gen = gen_avx2_pbroadcastv8hi;
45660 break;
45661 case E_V16SFmode:
45662 if (TARGET_AVX512F)
45663 gen = gen_avx512f_vec_dupv16sf_1;
45664 break;
45665 case E_V8SFmode:
45666 gen = gen_avx2_vec_dupv8sf_1;
45667 break;
45668 case E_V8DFmode:
45669 if (TARGET_AVX512F)
45670 gen = gen_avx512f_vec_dupv8df_1;
45671 break;
45672 case E_V8DImode:
45673 if (TARGET_AVX512F)
45674 gen = gen_avx512f_vec_dupv8di_1;
45675 break;
45676 /* For other modes, prefer the other shuffles this function creates. */
45677 default: break;
45679 if (gen != NULL)
45681 if (!d->testing_p)
45682 emit_insn (gen (d->target, d->op0));
45683 return true;
45687 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45688 return true;
45690 /* There are plenty of patterns in sse.md that are written for
45691 SEL+CONCAT and are not replicated for a single op. Perhaps
45692 that should be changed, to avoid the nastiness here. */
45694 /* Recognize interleave style patterns, which means incrementing
45695 every other permutation operand. */
45696 for (i = 0; i < nelt; i += 2)
45698 nd.perm[i] = d->perm[i] & mask;
45699 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45701 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45702 d->testing_p))
45703 return true;
45705 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45706 if (nelt >= 4)
45708 for (i = 0; i < nelt; i += 4)
45710 nd.perm[i + 0] = d->perm[i + 0] & mask;
45711 nd.perm[i + 1] = d->perm[i + 1] & mask;
45712 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45713 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45716 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45717 d->testing_p))
45718 return true;
45722 /* Finally, try the fully general two operand permute. */
45723 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45724 d->testing_p))
45725 return true;
45727 /* Recognize interleave style patterns with reversed operands. */
45728 if (!d->one_operand_p)
45730 for (i = 0; i < nelt; ++i)
45732 unsigned e = d->perm[i];
45733 if (e >= nelt)
45734 e -= nelt;
45735 else
45736 e += nelt;
45737 nd.perm[i] = e;
45740 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45741 d->testing_p))
45742 return true;
45745 /* Try the SSE4.1 blend variable merge instructions. */
45746 if (expand_vec_perm_blend (d))
45747 return true;
45749 /* Try one of the AVX vpermil variable permutations. */
45750 if (expand_vec_perm_vpermil (d))
45751 return true;
45753 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45754 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45755 if (expand_vec_perm_pshufb (d))
45756 return true;
45758 /* Try the AVX2 vpalignr instruction. */
45759 if (expand_vec_perm_palignr (d, true))
45760 return true;
45762 /* Try the AVX512F vperm{s,d} instructions. */
45763 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45764 return true;
45766 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45767 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45768 return true;
45770 /* See if we can get the same permutation in different vector integer
45771 mode. */
45772 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45774 if (!d->testing_p)
45775 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45776 return true;
45778 return false;
45781 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45782 in terms of a pair of pshuflw + pshufhw instructions. */
45784 static bool
45785 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45787 unsigned char perm2[MAX_VECT_LEN];
45788 unsigned i;
45789 bool ok;
45791 if (d->vmode != V8HImode || !d->one_operand_p)
45792 return false;
45794 /* The two permutations only operate in 64-bit lanes. */
45795 for (i = 0; i < 4; ++i)
45796 if (d->perm[i] >= 4)
45797 return false;
45798 for (i = 4; i < 8; ++i)
45799 if (d->perm[i] < 4)
45800 return false;
45802 if (d->testing_p)
45803 return true;
45805 /* Emit the pshuflw. */
45806 memcpy (perm2, d->perm, 4);
45807 for (i = 4; i < 8; ++i)
45808 perm2[i] = i;
45809 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45810 gcc_assert (ok);
45812 /* Emit the pshufhw. */
45813 memcpy (perm2 + 4, d->perm + 4, 4);
45814 for (i = 0; i < 4; ++i)
45815 perm2[i] = i;
45816 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45817 gcc_assert (ok);
45819 return true;
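/* For example (illustrative), the V8HImode permutation
   {3, 1, 2, 0, 7, 5, 6, 4} is handled by a pshuflw with
   {3, 1, 2, 0, 4, 5, 6, 7} followed by a pshufhw with
   {0, 1, 2, 3, 7, 5, 6, 4}.  */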
45822 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45823 the permutation using the SSSE3 palignr instruction. This succeeds
45824 when all of the elements in PERM fit within one vector and we merely
45825 need to shift them down so that a single vector permutation has a
45826 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
45827 the vpalignr instruction itself can perform the requested permutation. */
45829 static bool
45830 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45832 unsigned i, nelt = d->nelt;
45833 unsigned min, max, minswap, maxswap;
45834 bool in_order, ok, swap = false;
45835 rtx shift, target;
45836 struct expand_vec_perm_d dcopy;
45838 /* Even with AVX, palignr only operates on 128-bit vectors;
45839 with AVX2, palignr operates on both 128-bit lanes. */
45840 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45841 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45842 return false;
45844 min = 2 * nelt;
45845 max = 0;
45846 minswap = 2 * nelt;
45847 maxswap = 0;
45848 for (i = 0; i < nelt; ++i)
45850 unsigned e = d->perm[i];
45851 unsigned eswap = d->perm[i] ^ nelt;
45852 if (GET_MODE_SIZE (d->vmode) == 32)
45854 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45855 eswap = e ^ (nelt / 2);
45857 if (e < min)
45858 min = e;
45859 if (e > max)
45860 max = e;
45861 if (eswap < minswap)
45862 minswap = eswap;
45863 if (eswap > maxswap)
45864 maxswap = eswap;
45866 if (min == 0
45867 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45869 if (d->one_operand_p
45870 || minswap == 0
45871 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45872 ? nelt / 2 : nelt))
45873 return false;
45874 swap = true;
45875 min = minswap;
45876 max = maxswap;
45879 /* Given that we have SSSE3, we know we'll be able to implement the
45880 single operand permutation after the palignr with pshufb for
45881 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45882 first. */
45883 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45884 return true;
45886 dcopy = *d;
45887 if (swap)
45889 dcopy.op0 = d->op1;
45890 dcopy.op1 = d->op0;
45891 for (i = 0; i < nelt; ++i)
45892 dcopy.perm[i] ^= nelt;
45895 in_order = true;
45896 for (i = 0; i < nelt; ++i)
45898 unsigned e = dcopy.perm[i];
45899 if (GET_MODE_SIZE (d->vmode) == 32
45900 && e >= nelt
45901 && (e & (nelt / 2 - 1)) < min)
45902 e = e - min - (nelt / 2);
45903 else
45904 e = e - min;
45905 if (e != i)
45906 in_order = false;
45907 dcopy.perm[i] = e;
45909 dcopy.one_operand_p = true;
45911 if (single_insn_only_p && !in_order)
45912 return false;
45914 /* For AVX2, test whether we can permute the result in one instruction. */
45915 if (d->testing_p)
45917 if (in_order)
45918 return true;
45919 dcopy.op1 = dcopy.op0;
45920 return expand_vec_perm_1 (&dcopy);
45923 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45924 if (GET_MODE_SIZE (d->vmode) == 16)
45926 target = gen_reg_rtx (TImode);
45927 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45928 gen_lowpart (TImode, dcopy.op0), shift));
45930 else
45932 target = gen_reg_rtx (V2TImode);
45933 emit_insn (gen_avx2_palignrv2ti (target,
45934 gen_lowpart (V2TImode, dcopy.op1),
45935 gen_lowpart (V2TImode, dcopy.op0),
45936 shift));
45939 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45941 /* Test for the degenerate case where the alignment by itself
45942 produces the desired permutation. */
45943 if (in_order)
45945 emit_move_insn (d->target, dcopy.op0);
45946 return true;
45949 ok = expand_vec_perm_1 (&dcopy);
45950 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45952 return ok;
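/* For example (illustrative), a two-operand V4SImode permutation
   {3, 4, 5, 6} has min = 3 and max = 6, so max - min < nelt; palignr
   shifts the concatenated input down by three elements, after which
   the remaining permutation is {0, 1, 2, 3} (in_order), so the
   alignment alone produces the result.  */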
45955 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45956 the permutation using the SSE4_1 pblendv instruction. Potentially
45957 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
45959 static bool
45960 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45962 unsigned i, which, nelt = d->nelt;
45963 struct expand_vec_perm_d dcopy, dcopy1;
45964 machine_mode vmode = d->vmode;
45965 bool ok;
45967 /* Use the same checks as in expand_vec_perm_blend. */
45968 if (d->one_operand_p)
45969 return false;
45970 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45972 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45974 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45976 else
45977 return false;
45979 /* Figure out which permutation elements do not stay in their
45980 respective lanes. */
45981 for (i = 0, which = 0; i < nelt; ++i)
45983 unsigned e = d->perm[i];
45984 if (e != i)
45985 which |= (e < nelt ? 1 : 2);
45987 /* We can pblend the part where elements do not stay in their
45988 respective lanes only when these elements all come from one
45989 half of the permutation.
45990 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
45991 lanes, but both are >= 8.
45992 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
45993 respective lanes, and 8 >= 8 but 2 is not. */
45994 if (which != 1 && which != 2)
45995 return false;
45996 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45997 return true;
45999 /* First we apply a one-operand permutation to the part where
46000 elements do not stay in their respective lanes. */
46001 dcopy = *d;
46002 if (which == 2)
46003 dcopy.op0 = dcopy.op1 = d->op1;
46004 else
46005 dcopy.op0 = dcopy.op1 = d->op0;
46006 if (!d->testing_p)
46007 dcopy.target = gen_reg_rtx (vmode);
46008 dcopy.one_operand_p = true;
46010 for (i = 0; i < nelt; ++i)
46011 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46013 ok = expand_vec_perm_1 (&dcopy);
46014 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46015 return false;
46016 else
46017 gcc_assert (ok);
46018 if (d->testing_p)
46019 return true;
46021 /* Next we put permuted elements into their positions. */
46022 dcopy1 = *d;
46023 if (which == 2)
46024 dcopy1.op1 = dcopy.target;
46025 else
46026 dcopy1.op0 = dcopy.target;
46028 for (i = 0; i < nelt; ++i)
46029 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46031 ok = expand_vec_perm_blend (&dcopy1);
46032 gcc_assert (ok);
46034 return true;
46037 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46039 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46040 a two vector permutation into a single vector permutation by using
46041 an interleave operation to merge the vectors. */
46043 static bool
46044 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46046 struct expand_vec_perm_d dremap, dfinal;
46047 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46048 unsigned HOST_WIDE_INT contents;
46049 unsigned char remap[2 * MAX_VECT_LEN];
46050 rtx_insn *seq;
46051 bool ok, same_halves = false;
46053 if (GET_MODE_SIZE (d->vmode) == 16)
46055 if (d->one_operand_p)
46056 return false;
46058 else if (GET_MODE_SIZE (d->vmode) == 32)
46060 if (!TARGET_AVX)
46061 return false;
46062 /* For 32-byte modes allow even d->one_operand_p.
46063 The lack of cross-lane shuffling in some instructions
46064 might prevent a single insn shuffle. */
46065 dfinal = *d;
46066 dfinal.testing_p = true;
46067 /* If expand_vec_perm_interleave3 can expand this into
46068 a 3-insn sequence, give up and let it be expanded that
46069 way. While that is one insn longer, it doesn't need a
46070 memory operand, and in the common case where both the
46071 interleave-low and interleave-high permutations with the
46072 same operands are adjacent, it needs only 4 insns for
46073 both after CSE. */
46074 if (expand_vec_perm_interleave3 (&dfinal))
46075 return false;
46077 else
46078 return false;
46080 /* Examine from whence the elements come. */
46081 contents = 0;
46082 for (i = 0; i < nelt; ++i)
46083 contents |= HOST_WIDE_INT_1U << d->perm[i];
46085 memset (remap, 0xff, sizeof (remap));
46086 dremap = *d;
46088 if (GET_MODE_SIZE (d->vmode) == 16)
46090 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46092 /* Split the two input vectors into 4 halves. */
46093 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46094 h2 = h1 << nelt2;
46095 h3 = h2 << nelt2;
46096 h4 = h3 << nelt2;
46098 /* If the elements all come from the low halves, use interleave low;
46099 similarly for interleave high. If the elements come from mis-matched
46100 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46101 if ((contents & (h1 | h3)) == contents)
46103 /* punpckl* */
46104 for (i = 0; i < nelt2; ++i)
46106 remap[i] = i * 2;
46107 remap[i + nelt] = i * 2 + 1;
46108 dremap.perm[i * 2] = i;
46109 dremap.perm[i * 2 + 1] = i + nelt;
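/* Without SSE2 there is no integer interleave insn for V4SImode, so do
   the unpack as V4SFmode (unpcklps/unpckhps) instead. */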
46111 if (!TARGET_SSE2 && d->vmode == V4SImode)
46112 dremap.vmode = V4SFmode;
46114 else if ((contents & (h2 | h4)) == contents)
46116 /* punpckh* */
46117 for (i = 0; i < nelt2; ++i)
46119 remap[i + nelt2] = i * 2;
46120 remap[i + nelt + nelt2] = i * 2 + 1;
46121 dremap.perm[i * 2] = i + nelt2;
46122 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46124 if (!TARGET_SSE2 && d->vmode == V4SImode)
46125 dremap.vmode = V4SFmode;
46127 else if ((contents & (h1 | h4)) == contents)
46129 /* shufps */
46130 for (i = 0; i < nelt2; ++i)
46132 remap[i] = i;
46133 remap[i + nelt + nelt2] = i + nelt2;
46134 dremap.perm[i] = i;
46135 dremap.perm[i + nelt2] = i + nelt + nelt2;
46137 if (nelt != 4)
46139 /* shufpd */
46140 dremap.vmode = V2DImode;
46141 dremap.nelt = 2;
46142 dremap.perm[0] = 0;
46143 dremap.perm[1] = 3;
46146 else if ((contents & (h2 | h3)) == contents)
46148 /* shufps */
46149 for (i = 0; i < nelt2; ++i)
46151 remap[i + nelt2] = i;
46152 remap[i + nelt] = i + nelt2;
46153 dremap.perm[i] = i + nelt2;
46154 dremap.perm[i + nelt2] = i + nelt;
46156 if (nelt != 4)
46158 /* shufpd */
46159 dremap.vmode = V2DImode;
46160 dremap.nelt = 2;
46161 dremap.perm[0] = 1;
46162 dremap.perm[1] = 2;
46165 else
46166 return false;
46168 else
46170 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46171 unsigned HOST_WIDE_INT q[8];
46172 unsigned int nonzero_halves[4];
46174 /* Split the two input vectors into 8 quarters. */
46175 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46176 for (i = 1; i < 8; ++i)
46177 q[i] = q[0] << (nelt4 * i);
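/* q[0..3] are the four quarters of op0, q[4..7] those of op1; each pair
   q[2*i], q[2*i+1] covers one 128-bit lane. */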
46178 for (i = 0; i < 4; ++i)
46179 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46181 nonzero_halves[nzcnt] = i;
46182 ++nzcnt;
46185 if (nzcnt == 1)
46187 gcc_assert (d->one_operand_p);
46188 nonzero_halves[1] = nonzero_halves[0];
46189 same_halves = true;
46191 else if (d->one_operand_p)
46193 gcc_assert (nonzero_halves[0] == 0);
46194 gcc_assert (nonzero_halves[1] == 1);
46197 if (nzcnt <= 2)
46199 if (d->perm[0] / nelt2 == nonzero_halves[1])
46201 /* Attempt to increase the likelihood that dfinal
46202 shuffle will be intra-lane. */
46203 std::swap (nonzero_halves[0], nonzero_halves[1]);
46206 /* vperm2f128 or vperm2i128. */
46207 for (i = 0; i < nelt2; ++i)
46209 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46210 remap[i + nonzero_halves[0] * nelt2] = i;
46211 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46212 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46215 if (d->vmode != V8SFmode
46216 && d->vmode != V4DFmode
46217 && d->vmode != V8SImode)
46219 dremap.vmode = V8SImode;
46220 dremap.nelt = 8;
46221 for (i = 0; i < 4; ++i)
46223 dremap.perm[i] = i + nonzero_halves[0] * 4;
46224 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46228 else if (d->one_operand_p)
46229 return false;
46230 else if (TARGET_AVX2
46231 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46233 /* vpunpckl* */
46234 for (i = 0; i < nelt4; ++i)
46236 remap[i] = i * 2;
46237 remap[i + nelt] = i * 2 + 1;
46238 remap[i + nelt2] = i * 2 + nelt2;
46239 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46240 dremap.perm[i * 2] = i;
46241 dremap.perm[i * 2 + 1] = i + nelt;
46242 dremap.perm[i * 2 + nelt2] = i + nelt2;
46243 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46246 else if (TARGET_AVX2
46247 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46249 /* vpunpckh* */
46250 for (i = 0; i < nelt4; ++i)
46252 remap[i + nelt4] = i * 2;
46253 remap[i + nelt + nelt4] = i * 2 + 1;
46254 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46255 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46256 dremap.perm[i * 2] = i + nelt4;
46257 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46258 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46259 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46262 else
46263 return false;
46266 /* Use the remapping array set up above to move the elements from their
46267 swizzled locations into their final destinations. */
46268 dfinal = *d;
46269 for (i = 0; i < nelt; ++i)
46271 unsigned e = remap[d->perm[i]];
46272 gcc_assert (e < nelt);
46273 /* If same_halves is true, both halves of the remapped vector are the
46274 same. Avoid cross-lane accesses if possible. */
46275 if (same_halves && i >= nelt2)
46277 gcc_assert (e < nelt2);
46278 dfinal.perm[i] = e + nelt2;
46280 else
46281 dfinal.perm[i] = e;
46283 if (!d->testing_p)
46285 dremap.target = gen_reg_rtx (dremap.vmode);
46286 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46288 dfinal.op1 = dfinal.op0;
46289 dfinal.one_operand_p = true;
46291 /* Test if the final remap can be done with a single insn. For V4SFmode or
46292 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46293 start_sequence ();
46294 ok = expand_vec_perm_1 (&dfinal);
46295 seq = get_insns ();
46296 end_sequence ();
46298 if (!ok)
46299 return false;
46301 if (d->testing_p)
46302 return true;
46304 if (dremap.vmode != dfinal.vmode)
46306 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46307 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46310 ok = expand_vec_perm_1 (&dremap);
46311 gcc_assert (ok);
46313 emit_insn (seq);
46314 return true;
46317 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46318 a single vector cross-lane permutation into vpermq followed
46319 by any of the single insn permutations. */
46321 static bool
46322 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46324 struct expand_vec_perm_d dremap, dfinal;
46325 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46326 unsigned contents[2];
46327 bool ok;
46329 if (!(TARGET_AVX2
46330 && (d->vmode == V32QImode || d->vmode == V16HImode)
46331 && d->one_operand_p))
46332 return false;
46334 contents[0] = 0;
46335 contents[1] = 0;
46336 for (i = 0; i < nelt2; ++i)
46338 contents[0] |= 1u << (d->perm[i] / nelt4);
46339 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
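/* contents[K] records which of the four input quarters feed the K-th half
   of the result; a half can be handled only if it needs at most two
   quarters, since the V4DImode permutation below can place just two
   quarters into each lane. */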
46342 for (i = 0; i < 2; ++i)
46344 unsigned int cnt = 0;
46345 for (j = 0; j < 4; ++j)
46346 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46347 return false;
46350 if (d->testing_p)
46351 return true;
46353 dremap = *d;
46354 dremap.vmode = V4DImode;
46355 dremap.nelt = 4;
46356 dremap.target = gen_reg_rtx (V4DImode);
46357 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46358 dremap.op1 = dremap.op0;
46359 dremap.one_operand_p = true;
46360 for (i = 0; i < 2; ++i)
46362 unsigned int cnt = 0;
46363 for (j = 0; j < 4; ++j)
46364 if ((contents[i] & (1u << j)) != 0)
46365 dremap.perm[2 * i + cnt++] = j;
46366 for (; cnt < 2; ++cnt)
46367 dremap.perm[2 * i + cnt] = 0;
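/* dremap gathers, per 128-bit lane of the intermediate vector, the quarters
   that the corresponding half of the result needs; dfinal below is then a
   pure in-lane shuffle of that intermediate. */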
46370 dfinal = *d;
46371 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46372 dfinal.op1 = dfinal.op0;
46373 dfinal.one_operand_p = true;
46374 for (i = 0, j = 0; i < nelt; ++i)
46376 if (i == nelt2)
46377 j = 2;
46378 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46379 if ((d->perm[i] / nelt4) == dremap.perm[j])
46381 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46382 dfinal.perm[i] |= nelt4;
46383 else
46384 gcc_unreachable ();
46387 ok = expand_vec_perm_1 (&dremap);
46388 gcc_assert (ok);
46390 ok = expand_vec_perm_1 (&dfinal);
46391 gcc_assert (ok);
46393 return true;
46396 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46397 a vector permutation using two instructions, vperm2f128 resp.
46398 vperm2i128 followed by any single in-lane permutation. */
46400 static bool
46401 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46403 struct expand_vec_perm_d dfirst, dsecond;
46404 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46405 bool ok;
46407 if (!TARGET_AVX
46408 || GET_MODE_SIZE (d->vmode) != 32
46409 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46410 return false;
46412 dsecond = *d;
46413 dsecond.one_operand_p = false;
46414 dsecond.testing_p = true;
46416 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46417 immediate. For perm < 16 the second permutation uses
46418 d->op0 as first operand, for perm >= 16 it uses d->op1
46419 as first operand. The second operand is the result of
46420 vperm2[fi]128. */
46421 for (perm = 0; perm < 32; perm++)
46423 /* Ignore permutations which do not move anything cross-lane. */
46424 if (perm < 16)
46426 /* The second shuffle for e.g. V4DFmode has
46427 0123 and ABCD operands.
46428 Ignore AB23, as 23 is already in the second lane
46429 of the first operand. */
46430 if ((perm & 0xc) == (1 << 2)) continue;
46431 /* And 01CD, as 01 is in the first lane of the first
46432 operand. */
46433 if ((perm & 3) == 0) continue;
46434 /* And 4567, as then the vperm2[fi]128 doesn't change
46435 anything on the original 4567 second operand. */
46436 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46438 else
46440 /* The second shuffle for e.g. V4DFmode has
46441 4567 and ABCD operands.
46442 Ignore AB67, as 67 is already in the second lane
46443 of the first operand. */
46444 if ((perm & 0xc) == (3 << 2)) continue;
46445 /* And 45CD, as 45 is in the first lane of the first
46446 operand. */
46447 if ((perm & 3) == 2) continue;
46448 /* And 0123, as then the vperm2[fi]128 doesn't change
46449 anything on the original 0123 first operand. */
46450 if ((perm & 0xf) == (1 << 2)) continue;
46453 for (i = 0; i < nelt; i++)
46455 j = d->perm[i] / nelt2;
46456 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46457 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46458 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46459 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46460 else
46461 break;
46464 if (i == nelt)
46466 start_sequence ();
46467 ok = expand_vec_perm_1 (&dsecond);
46468 end_sequence ();
46470 else
46471 ok = false;
46473 if (ok)
46475 if (d->testing_p)
46476 return true;
46478 /* Found a usable second shuffle. dfirst will be
46479 vperm2f128 on d->op0 and d->op1. */
46480 dsecond.testing_p = false;
46481 dfirst = *d;
46482 dfirst.target = gen_reg_rtx (d->vmode);
46483 for (i = 0; i < nelt; i++)
46484 dfirst.perm[i] = (i & (nelt2 - 1))
46485 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46487 canonicalize_perm (&dfirst);
46488 ok = expand_vec_perm_1 (&dfirst);
46489 gcc_assert (ok);
46491 /* And dsecond is some single insn shuffle, taking
46492 d->op0 and result of vperm2f128 (if perm < 16) or
46493 d->op1 and result of vperm2f128 (otherwise). */
46494 if (perm >= 16)
46495 dsecond.op0 = dsecond.op1;
46496 dsecond.op1 = dfirst.target;
46498 ok = expand_vec_perm_1 (&dsecond);
46499 gcc_assert (ok);
46501 return true;
46504 /* For one operand, the only useful vperm2f128 permutation is 0x01
46505 aka lanes swap. */
46506 if (d->one_operand_p)
46507 return false;
46510 return false;
46513 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46514 a two vector permutation using 2 intra-lane interleave insns
46515 and cross-lane shuffle for 32-byte vectors. */
46517 static bool
46518 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46520 unsigned i, nelt;
46521 rtx (*gen) (rtx, rtx, rtx);
46523 if (d->one_operand_p)
46524 return false;
46525 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46527 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46529 else
46530 return false;
46532 nelt = d->nelt;
46533 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46534 return false;
46535 for (i = 0; i < nelt; i += 2)
46536 if (d->perm[i] != d->perm[0] + i / 2
46537 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46538 return false;
46540 if (d->testing_p)
46541 return true;
46543 switch (d->vmode)
46545 case E_V32QImode:
46546 if (d->perm[0])
46547 gen = gen_vec_interleave_highv32qi;
46548 else
46549 gen = gen_vec_interleave_lowv32qi;
46550 break;
46551 case E_V16HImode:
46552 if (d->perm[0])
46553 gen = gen_vec_interleave_highv16hi;
46554 else
46555 gen = gen_vec_interleave_lowv16hi;
46556 break;
46557 case E_V8SImode:
46558 if (d->perm[0])
46559 gen = gen_vec_interleave_highv8si;
46560 else
46561 gen = gen_vec_interleave_lowv8si;
46562 break;
46563 case E_V4DImode:
46564 if (d->perm[0])
46565 gen = gen_vec_interleave_highv4di;
46566 else
46567 gen = gen_vec_interleave_lowv4di;
46568 break;
46569 case E_V8SFmode:
46570 if (d->perm[0])
46571 gen = gen_vec_interleave_highv8sf;
46572 else
46573 gen = gen_vec_interleave_lowv8sf;
46574 break;
46575 case E_V4DFmode:
46576 if (d->perm[0])
46577 gen = gen_vec_interleave_highv4df;
46578 else
46579 gen = gen_vec_interleave_lowv4df;
46580 break;
46581 default:
46582 gcc_unreachable ();
46585 emit_insn (gen (d->target, d->op0, d->op1));
46586 return true;
46589 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46590 a single vector permutation using a single intra-lane vector
46591 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46592 the non-swapped and swapped vectors together. */
46594 static bool
46595 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46597 struct expand_vec_perm_d dfirst, dsecond;
46598 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46599 rtx_insn *seq;
46600 bool ok;
46601 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46603 if (!TARGET_AVX
46604 || TARGET_AVX2
46605 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46606 || !d->one_operand_p)
46607 return false;
46609 dfirst = *d;
46610 for (i = 0; i < nelt; i++)
46611 dfirst.perm[i] = 0xff;
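/* For each element compute J, its position forced into the same 128-bit
   lane as its source element; elements with J != I end up in the wrong lane
   of the intra-lane shuffle and must be taken from the lane-swapped copy,
   which is recorded in the blend mask MSK. */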
46612 for (i = 0, msk = 0; i < nelt; i++)
46614 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46615 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46616 return false;
46617 dfirst.perm[j] = d->perm[i];
46618 if (j != i)
46619 msk |= (1 << i);
46621 for (i = 0; i < nelt; i++)
46622 if (dfirst.perm[i] == 0xff)
46623 dfirst.perm[i] = i;
46625 if (!d->testing_p)
46626 dfirst.target = gen_reg_rtx (dfirst.vmode);
46628 start_sequence ();
46629 ok = expand_vec_perm_1 (&dfirst);
46630 seq = get_insns ();
46631 end_sequence ();
46633 if (!ok)
46634 return false;
46636 if (d->testing_p)
46637 return true;
46639 emit_insn (seq);
46641 dsecond = *d;
46642 dsecond.op0 = dfirst.target;
46643 dsecond.op1 = dfirst.target;
46644 dsecond.one_operand_p = true;
46645 dsecond.target = gen_reg_rtx (dsecond.vmode);
46646 for (i = 0; i < nelt; i++)
46647 dsecond.perm[i] = i ^ nelt2;
46649 ok = expand_vec_perm_1 (&dsecond);
46650 gcc_assert (ok);
46652 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46653 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46654 return true;
46657 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46658 permutation using two vperm2f128, followed by a vshufpd insn blending
46659 the two vectors together. */
46661 static bool
46662 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46664 struct expand_vec_perm_d dfirst, dsecond, dthird;
46665 bool ok;
46667 if (!TARGET_AVX || (d->vmode != V4DFmode))
46668 return false;
46670 if (d->testing_p)
46671 return true;
46673 dfirst = *d;
46674 dsecond = *d;
46675 dthird = *d;
46677 dfirst.perm[0] = (d->perm[0] & ~1);
46678 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46679 dfirst.perm[2] = (d->perm[2] & ~1);
46680 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46681 dsecond.perm[0] = (d->perm[1] & ~1);
46682 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46683 dsecond.perm[2] = (d->perm[3] & ~1);
46684 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46685 dthird.perm[0] = (d->perm[0] % 2);
46686 dthird.perm[1] = (d->perm[1] % 2) + 4;
46687 dthird.perm[2] = (d->perm[2] % 2) + 2;
46688 dthird.perm[3] = (d->perm[3] % 2) + 6;
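/* dfirst and dsecond are lane permutations (vperm2f128 candidates) that pair
   up the required source lanes; dthird is the final vshufpd, picking the even
   or odd double alternately from each of them. */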
46690 dfirst.target = gen_reg_rtx (dfirst.vmode);
46691 dsecond.target = gen_reg_rtx (dsecond.vmode);
46692 dthird.op0 = dfirst.target;
46693 dthird.op1 = dsecond.target;
46694 dthird.one_operand_p = false;
46696 canonicalize_perm (&dfirst);
46697 canonicalize_perm (&dsecond);
46699 ok = expand_vec_perm_1 (&dfirst)
46700 && expand_vec_perm_1 (&dsecond)
46701 && expand_vec_perm_1 (&dthird);
46703 gcc_assert (ok);
46705 return true;
46708 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46709 permutation with two pshufb insns and an ior. We should have already
46710 failed all two instruction sequences. */
46712 static bool
46713 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46715 rtx rperm[2][16], vperm, l, h, op, m128;
46716 unsigned int i, nelt, eltsz;
46718 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46719 return false;
46720 gcc_assert (!d->one_operand_p);
46722 if (d->testing_p)
46723 return true;
46725 nelt = d->nelt;
46726 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46728 /* Generate two permutation masks. If the required element is within
46729 the given vector it is shuffled into the proper lane. If the required
46730 element is in the other vector, force a zero into the lane by setting
46731 bit 7 in the permutation mask. */
46732 m128 = GEN_INT (-128);
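/* A selector byte with bit 7 set makes pshufb write zero to that byte, so
   the two partial results can simply be OR'd together afterwards. */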
46733 for (i = 0; i < nelt; ++i)
46735 unsigned j, e = d->perm[i];
46736 unsigned which = (e >= nelt);
46737 if (e >= nelt)
46738 e -= nelt;
46740 for (j = 0; j < eltsz; ++j)
46742 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46743 rperm[1-which][i*eltsz + j] = m128;
46747 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46748 vperm = force_reg (V16QImode, vperm);
46750 l = gen_reg_rtx (V16QImode);
46751 op = gen_lowpart (V16QImode, d->op0);
46752 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46754 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46755 vperm = force_reg (V16QImode, vperm);
46757 h = gen_reg_rtx (V16QImode);
46758 op = gen_lowpart (V16QImode, d->op1);
46759 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46761 op = d->target;
46762 if (d->vmode != V16QImode)
46763 op = gen_reg_rtx (V16QImode);
46764 emit_insn (gen_iorv16qi3 (op, l, h));
46765 if (op != d->target)
46766 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46768 return true;
46771 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46772 with two vpshufb insns, vpermq and vpor. We should have already failed
46773 all two or three instruction sequences. */
46775 static bool
46776 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46778 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46779 unsigned int i, nelt, eltsz;
46781 if (!TARGET_AVX2
46782 || !d->one_operand_p
46783 || (d->vmode != V32QImode && d->vmode != V16HImode))
46784 return false;
46786 if (d->testing_p)
46787 return true;
46789 nelt = d->nelt;
46790 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46792 /* Generate two permutation masks. If the required element is within
46793 the same lane, it is shuffled in. If the required element is from the
46794 other lane, force a zero by setting bit 7 in the permutation mask.
46795 The other mask has non-negative elements if the element
46796 is requested from the other lane, but it is also moved to the other lane,
46797 so that the result of vpshufb can have the two V2TImode halves
46798 swapped. */
46799 m128 = GEN_INT (-128);
46800 for (i = 0; i < nelt; ++i)
46802 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46803 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46805 for (j = 0; j < eltsz; ++j)
46807 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46808 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46812 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46813 vperm = force_reg (V32QImode, vperm);
46815 h = gen_reg_rtx (V32QImode);
46816 op = gen_lowpart (V32QImode, d->op0);
46817 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46819 /* Swap the 128-bit lanes of h into hp. */
46820 hp = gen_reg_rtx (V4DImode);
46821 op = gen_lowpart (V4DImode, h);
46822 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46823 const1_rtx));
46825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46826 vperm = force_reg (V32QImode, vperm);
46828 l = gen_reg_rtx (V32QImode);
46829 op = gen_lowpart (V32QImode, d->op0);
46830 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46832 op = d->target;
46833 if (d->vmode != V32QImode)
46834 op = gen_reg_rtx (V32QImode);
46835 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46836 if (op != d->target)
46837 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46839 return true;
46842 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46843 and extract-odd permutations of two V32QImode or V16HImode operands
46844 with two vpshufb insns, vpor and vpermq. We should have already
46845 failed all two or three instruction sequences. */
46847 static bool
46848 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46850 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46851 unsigned int i, nelt, eltsz;
46853 if (!TARGET_AVX2
46854 || d->one_operand_p
46855 || (d->vmode != V32QImode && d->vmode != V16HImode))
46856 return false;
46858 for (i = 0; i < d->nelt; ++i)
46859 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46860 return false;
46862 if (d->testing_p)
46863 return true;
46865 nelt = d->nelt;
46866 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46868 /* Generate two permutation masks. In the first permutation mask
46869 the first quarter will contain indexes for the first half
46870 of the op0, the second quarter will contain bit 7 set, third quarter
46871 will contain indexes for the second half of the op0 and the
46872 last quarter bit 7 set. In the second permutation mask
46873 the first quarter will contain bit 7 set, the second quarter
46874 indexes for the first half of the op1, the third quarter bit 7 set
46875 and last quarter indexes for the second half of the op1.
46876 I.e. the first mask e.g. for V32QImode extract even will be:
46877 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46878 (all values masked with 0xf except for -128) and second mask
46879 for extract even will be
46880 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46881 m128 = GEN_INT (-128);
46882 for (i = 0; i < nelt; ++i)
46884 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46885 unsigned which = d->perm[i] >= nelt;
46886 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
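/* XOR-ing the destination byte position with 24 swaps the second and third
   64-bit quarters of the mask, keeping every pshufb access within its own
   128-bit lane; the concluding vpermq { 0, 2, 1, 3 } swaps those quarters
   back into logical order. */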
46888 for (j = 0; j < eltsz; ++j)
46890 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46891 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46895 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46896 vperm = force_reg (V32QImode, vperm);
46898 l = gen_reg_rtx (V32QImode);
46899 op = gen_lowpart (V32QImode, d->op0);
46900 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46902 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46903 vperm = force_reg (V32QImode, vperm);
46905 h = gen_reg_rtx (V32QImode);
46906 op = gen_lowpart (V32QImode, d->op1);
46907 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46909 ior = gen_reg_rtx (V32QImode);
46910 emit_insn (gen_iorv32qi3 (ior, l, h));
46912 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46913 op = gen_reg_rtx (V4DImode);
46914 ior = gen_lowpart (V4DImode, ior);
46915 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46916 const1_rtx, GEN_INT (3)));
46917 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46919 return true;
46922 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46923 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46924 with two "and" and "pack" or two "shift" and "pack" insns. We should
46925 have already failed all two instruction sequences. */
46927 static bool
46928 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46930 rtx op, dop0, dop1, t;
46931 unsigned i, odd, c, s, nelt = d->nelt;
46932 bool end_perm = false;
46933 machine_mode half_mode;
46934 rtx (*gen_and) (rtx, rtx, rtx);
46935 rtx (*gen_pack) (rtx, rtx, rtx);
46936 rtx (*gen_shift) (rtx, rtx, rtx);
46938 if (d->one_operand_p)
46939 return false;
46941 switch (d->vmode)
46943 case E_V8HImode:
46944 /* Required for "pack". */
46945 if (!TARGET_SSE4_1)
46946 return false;
46947 c = 0xffff;
46948 s = 16;
46949 half_mode = V4SImode;
46950 gen_and = gen_andv4si3;
46951 gen_pack = gen_sse4_1_packusdw;
46952 gen_shift = gen_lshrv4si3;
46953 break;
46954 case E_V16QImode:
46955 /* No check as all instructions are SSE2. */
46956 c = 0xff;
46957 s = 8;
46958 half_mode = V8HImode;
46959 gen_and = gen_andv8hi3;
46960 gen_pack = gen_sse2_packuswb;
46961 gen_shift = gen_lshrv8hi3;
46962 break;
46963 case E_V16HImode:
46964 if (!TARGET_AVX2)
46965 return false;
46966 c = 0xffff;
46967 s = 16;
46968 half_mode = V8SImode;
46969 gen_and = gen_andv8si3;
46970 gen_pack = gen_avx2_packusdw;
46971 gen_shift = gen_lshrv8si3;
46972 end_perm = true;
46973 break;
46974 case E_V32QImode:
46975 if (!TARGET_AVX2)
46976 return false;
46977 c = 0xff;
46978 s = 8;
46979 half_mode = V16HImode;
46980 gen_and = gen_andv16hi3;
46981 gen_pack = gen_avx2_packuswb;
46982 gen_shift = gen_lshrv16hi3;
46983 end_perm = true;
46984 break;
46985 default:
46986 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46987 general shuffles. */
46988 return false;
46991 /* Check that permutation is even or odd. */
46992 odd = d->perm[0];
46993 if (odd > 1)
46994 return false;
46996 for (i = 1; i < nelt; ++i)
46997 if (d->perm[i] != 2 * i + odd)
46998 return false;
47000 if (d->testing_p)
47001 return true;
47003 dop0 = gen_reg_rtx (half_mode);
47004 dop1 = gen_reg_rtx (half_mode);
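/* For the even permutation AND each wide element with C to keep its low
   half, for the odd one shift the high half down by S; the pack insn then
   narrows both operands, concatenating them within each 128-bit lane. */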
47005 if (odd == 0)
47007 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47008 t = force_reg (half_mode, t);
47009 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47010 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47012 else
47014 emit_insn (gen_shift (dop0,
47015 gen_lowpart (half_mode, d->op0),
47016 GEN_INT (s)));
47017 emit_insn (gen_shift (dop1,
47018 gen_lowpart (half_mode, d->op1),
47019 GEN_INT (s)));
47021 /* In AVX2 for 256 bit case we need to permute pack result. */
47022 if (TARGET_AVX2 && end_perm)
47024 op = gen_reg_rtx (d->vmode);
47025 t = gen_reg_rtx (V4DImode);
47026 emit_insn (gen_pack (op, dop0, dop1));
47027 emit_insn (gen_avx2_permv4di_1 (t,
47028 gen_lowpart (V4DImode, op),
47029 const0_rtx,
47030 const2_rtx,
47031 const1_rtx,
47032 GEN_INT (3)));
47033 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47035 else
47036 emit_insn (gen_pack (d->target, dop0, dop1));
47038 return true;
47041 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47042 and extract-odd permutations of two V64QI operands
47043 with two "shifts", two "truncs" and one "concat" insns for "odd"
47044 and two "truncs" and one "concat" insn for "even".
47045 We should have already failed all two instruction sequences. */
47047 static bool
47048 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47050 rtx t1, t2, t3, t4;
47051 unsigned i, odd, nelt = d->nelt;
47053 if (!TARGET_AVX512BW
47054 || d->one_operand_p
47055 || d->vmode != V64QImode)
47056 return false;
47058 /* Check that permutation is even or odd. */
47059 odd = d->perm[0];
47060 if (odd > 1)
47061 return false;
47063 for (i = 1; i < nelt; ++i)
47064 if (d->perm[i] != 2 * i + odd)
47065 return false;
47067 if (d->testing_p)
47068 return true;
47071 if (odd)
47073 t1 = gen_reg_rtx (V32HImode);
47074 t2 = gen_reg_rtx (V32HImode);
47075 emit_insn (gen_lshrv32hi3 (t1,
47076 gen_lowpart (V32HImode, d->op0),
47077 GEN_INT (8)));
47078 emit_insn (gen_lshrv32hi3 (t2,
47079 gen_lowpart (V32HImode, d->op1),
47080 GEN_INT (8)));
47082 else
47084 t1 = gen_lowpart (V32HImode, d->op0);
47085 t2 = gen_lowpart (V32HImode, d->op1);
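/* For the odd permutation the shifts above moved every odd byte into the
   low byte of its word; truncating the two V32HImode vectors keeps exactly
   those bytes (or the even bytes when ODD is 0), and the concat joins the
   two results. */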
47088 t3 = gen_reg_rtx (V32QImode);
47089 t4 = gen_reg_rtx (V32QImode);
47090 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47091 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47092 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47094 return true;
47097 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47098 and extract-odd permutations. */
47100 static bool
47101 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47103 rtx t1, t2, t3, t4, t5;
47105 switch (d->vmode)
47107 case E_V4DFmode:
47108 if (d->testing_p)
47109 break;
47110 t1 = gen_reg_rtx (V4DFmode);
47111 t2 = gen_reg_rtx (V4DFmode);
47113 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47114 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47115 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47117 /* Now an unpck[lh]pd will produce the result required. */
47118 if (odd)
47119 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47120 else
47121 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47122 emit_insn (t3);
47123 break;
47125 case E_V8SFmode:
47127 int mask = odd ? 0xdd : 0x88;
47129 if (d->testing_p)
47130 break;
47131 t1 = gen_reg_rtx (V8SFmode);
47132 t2 = gen_reg_rtx (V8SFmode);
47133 t3 = gen_reg_rtx (V8SFmode);
47135 /* Shuffle within the 128-bit lanes to produce:
47136 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47137 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47138 GEN_INT (mask)));
47140 /* Shuffle the lanes around to produce:
47141 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47142 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47143 GEN_INT (0x3)));
47145 /* Shuffle within the 128-bit lanes to produce:
47146 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47147 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47149 /* Shuffle within the 128-bit lanes to produce:
47150 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47151 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47153 /* Shuffle the lanes around to produce:
47154 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47155 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47156 GEN_INT (0x20)));
47158 break;
47160 case E_V2DFmode:
47161 case E_V4SFmode:
47162 case E_V2DImode:
47163 case E_V4SImode:
47164 /* These are always directly implementable by expand_vec_perm_1. */
47165 gcc_unreachable ();
47167 case E_V8HImode:
47168 if (TARGET_SSE4_1)
47169 return expand_vec_perm_even_odd_pack (d);
47170 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47171 return expand_vec_perm_pshufb2 (d);
47172 else
47174 if (d->testing_p)
47175 break;
47176 /* We need 2*log2(N)-1 operations to achieve odd/even
47177 with interleave. */
47178 t1 = gen_reg_rtx (V8HImode);
47179 t2 = gen_reg_rtx (V8HImode);
47180 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47181 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47182 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47183 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47184 if (odd)
47185 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47186 else
47187 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47188 emit_insn (t3);
47190 break;
47192 case E_V16QImode:
47193 return expand_vec_perm_even_odd_pack (d);
47195 case E_V16HImode:
47196 case E_V32QImode:
47197 return expand_vec_perm_even_odd_pack (d);
47199 case E_V64QImode:
47200 return expand_vec_perm_even_odd_trunc (d);
47202 case E_V4DImode:
47203 if (!TARGET_AVX2)
47205 struct expand_vec_perm_d d_copy = *d;
47206 d_copy.vmode = V4DFmode;
47207 if (d->testing_p)
47208 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47209 else
47210 d_copy.target = gen_reg_rtx (V4DFmode);
47211 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47212 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47213 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47215 if (!d->testing_p)
47216 emit_move_insn (d->target,
47217 gen_lowpart (V4DImode, d_copy.target));
47218 return true;
47220 return false;
47223 if (d->testing_p)
47224 break;
47226 t1 = gen_reg_rtx (V4DImode);
47227 t2 = gen_reg_rtx (V4DImode);
47229 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47230 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47231 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47233 /* Now a vpunpck[lh]qdq will produce the result required. */
47234 if (odd)
47235 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47236 else
47237 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47238 emit_insn (t3);
47239 break;
47241 case E_V8SImode:
47242 if (!TARGET_AVX2)
47244 struct expand_vec_perm_d d_copy = *d;
47245 d_copy.vmode = V8SFmode;
47246 if (d->testing_p)
47247 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47248 else
47249 d_copy.target = gen_reg_rtx (V8SFmode);
47250 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47251 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47252 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47254 if (!d->testing_p)
47255 emit_move_insn (d->target,
47256 gen_lowpart (V8SImode, d_copy.target));
47257 return true;
47259 return false;
47262 if (d->testing_p)
47263 break;
47265 t1 = gen_reg_rtx (V8SImode);
47266 t2 = gen_reg_rtx (V8SImode);
47267 t3 = gen_reg_rtx (V4DImode);
47268 t4 = gen_reg_rtx (V4DImode);
47269 t5 = gen_reg_rtx (V4DImode);
47271 /* Shuffle the lanes around into
47272 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47273 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47274 gen_lowpart (V4DImode, d->op1),
47275 GEN_INT (0x20)));
47276 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47277 gen_lowpart (V4DImode, d->op1),
47278 GEN_INT (0x31)));
47280 /* Swap the 2nd and 3rd position in each lane into
47281 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47282 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47283 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47284 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47285 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47287 /* Now a vpunpck[lh]qdq will produce
47288 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47289 if (odd)
47290 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47291 gen_lowpart (V4DImode, t2));
47292 else
47293 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47294 gen_lowpart (V4DImode, t2));
47295 emit_insn (t3);
47296 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47297 break;
47299 default:
47300 gcc_unreachable ();
47303 return true;
47306 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47307 extract-even and extract-odd permutations. */
47309 static bool
47310 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47312 unsigned i, odd, nelt = d->nelt;
47314 odd = d->perm[0];
47315 if (odd != 0 && odd != 1)
47316 return false;
47318 for (i = 1; i < nelt; ++i)
47319 if (d->perm[i] != 2 * i + odd)
47320 return false;
47322 return expand_vec_perm_even_odd_1 (d, odd);
47325 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47326 permutations. We assume that expand_vec_perm_1 has already failed. */
47328 static bool
47329 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47331 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47332 machine_mode vmode = d->vmode;
47333 unsigned char perm2[4];
47334 rtx op0 = d->op0, dest;
47335 bool ok;
47337 switch (vmode)
47339 case E_V4DFmode:
47340 case E_V8SFmode:
47341 /* These are special-cased in sse.md so that we can optionally
47342 use the vbroadcast instruction. They expand to two insns
47343 if the input happens to be in a register. */
47344 gcc_unreachable ();
47346 case E_V2DFmode:
47347 case E_V2DImode:
47348 case E_V4SFmode:
47349 case E_V4SImode:
47350 /* These are always implementable using standard shuffle patterns. */
47351 gcc_unreachable ();
47353 case E_V8HImode:
47354 case E_V16QImode:
47355 /* These can be implemented via interleave. We save one insn by
47356 stopping once we have promoted to V4SImode and then use pshufd. */
47357 if (d->testing_p)
47358 return true;
47361 rtx dest;
47362 rtx (*gen) (rtx, rtx, rtx)
47363 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47364 : gen_vec_interleave_lowv8hi;
47366 if (elt >= nelt2)
47368 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47369 : gen_vec_interleave_highv8hi;
47370 elt -= nelt2;
47372 nelt2 /= 2;
47374 dest = gen_reg_rtx (vmode);
47375 emit_insn (gen (dest, op0, op0));
47376 vmode = get_mode_wider_vector (vmode);
47377 op0 = gen_lowpart (vmode, dest);
47379 while (vmode != V4SImode);
47381 memset (perm2, elt, 4);
47382 dest = gen_reg_rtx (V4SImode);
47383 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47384 gcc_assert (ok);
47385 if (!d->testing_p)
47386 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47387 return true;
47389 case E_V64QImode:
47390 case E_V32QImode:
47391 case E_V16HImode:
47392 case E_V8SImode:
47393 case E_V4DImode:
47394 /* For AVX2 broadcasts of the first element vpbroadcast* or
47395 vpermq should be used by expand_vec_perm_1. */
47396 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47397 return false;
47399 default:
47400 gcc_unreachable ();
47404 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47405 broadcast permutations. */
47407 static bool
47408 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47410 unsigned i, elt, nelt = d->nelt;
47412 if (!d->one_operand_p)
47413 return false;
47415 elt = d->perm[0];
47416 for (i = 1; i < nelt; ++i)
47417 if (d->perm[i] != elt)
47418 return false;
47420 return expand_vec_perm_broadcast_1 (d);
47423 /* Implement arbitrary permutations of two V64QImode operands
47424 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47425 static bool
47426 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47428 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47429 return false;
47431 if (d->testing_p)
47432 return true;
47434 struct expand_vec_perm_d ds[2];
47435 rtx rperm[128], vperm, target0, target1;
47436 unsigned int i, nelt;
47437 machine_mode vmode;
47439 nelt = d->nelt;
47440 vmode = V64QImode;
47442 for (i = 0; i < 2; i++)
47444 ds[i] = *d;
47445 ds[i].vmode = V32HImode;
47446 ds[i].nelt = 32;
47447 ds[i].target = gen_reg_rtx (V32HImode);
47448 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47449 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47452 /* Prepare permutations such that the first one takes care of
47453 putting the even bytes into the right positions or one position
47454 higher (ds[0]) and the second one takes care of
47455 putting the odd bytes into the right positions or one position
47456 lower (ds[1]). */
47458 for (i = 0; i < nelt; i++)
47460 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47461 if (i & 1)
47463 rperm[i] = constm1_rtx;
47464 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47466 else
47468 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47469 rperm[i + 64] = constm1_rtx;
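/* The V32HImode permutations bring the word containing the wanted byte to
   word position i / 2; the vpshufb masks built here then select its low or
   high byte ((i & 14) + (d->perm[i] & 1)), with -1 (bit 7 set) zeroing the
   bytes that the other mask supplies, so the two results can be OR'd. */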
47473 bool ok = expand_vec_perm_1 (&ds[0]);
47474 gcc_assert (ok);
47475 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47477 ok = expand_vec_perm_1 (&ds[1]);
47478 gcc_assert (ok);
47479 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47481 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47482 vperm = force_reg (vmode, vperm);
47483 target0 = gen_reg_rtx (V64QImode);
47484 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47486 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47487 vperm = force_reg (vmode, vperm);
47488 target1 = gen_reg_rtx (V64QImode);
47489 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47491 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47492 return true;
47495 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
47496 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47497 all the shorter instruction sequences. */
47499 static bool
47500 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47502 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47503 unsigned int i, nelt, eltsz;
47504 bool used[4];
47506 if (!TARGET_AVX2
47507 || d->one_operand_p
47508 || (d->vmode != V32QImode && d->vmode != V16HImode))
47509 return false;
47511 if (d->testing_p)
47512 return true;
47514 nelt = d->nelt;
47515 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47517 /* Generate 4 permutation masks. If the required element is within
47518 the same lane, it is shuffled in. If the required element is from the
47519 other lane, force a zero by setting bit 7 in the permutation mask.
47520 The other mask has non-negative elements if the element
47521 is requested from the other lane, but it is also moved to the other lane,
47522 so that the result of vpshufb can have the two V2TImode halves
47523 swapped. */
47524 m128 = GEN_INT (-128);
47525 for (i = 0; i < 32; ++i)
47527 rperm[0][i] = m128;
47528 rperm[1][i] = m128;
47529 rperm[2][i] = m128;
47530 rperm[3][i] = m128;
47532 used[0] = false;
47533 used[1] = false;
47534 used[2] = false;
47535 used[3] = false;
47536 for (i = 0; i < nelt; ++i)
47538 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47539 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47540 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47542 for (j = 0; j < eltsz; ++j)
47543 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47544 used[which] = true;
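/* Masks 0 and 1 shuffle bytes of op0 (in-lane resp. cross-lane elements),
   masks 2 and 3 do the same for op1; only the masks actually used get a
   vpshufb below. */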
47547 for (i = 0; i < 2; ++i)
47549 if (!used[2 * i + 1])
47551 h[i] = NULL_RTX;
47552 continue;
47554 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47555 gen_rtvec_v (32, rperm[2 * i + 1]));
47556 vperm = force_reg (V32QImode, vperm);
47557 h[i] = gen_reg_rtx (V32QImode);
47558 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47559 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47562 /* Swap the 128-bit lanes of h[X]. */
47563 for (i = 0; i < 2; ++i)
47565 if (h[i] == NULL_RTX)
47566 continue;
47567 op = gen_reg_rtx (V4DImode);
47568 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47569 const2_rtx, GEN_INT (3), const0_rtx,
47570 const1_rtx));
47571 h[i] = gen_lowpart (V32QImode, op);
47574 for (i = 0; i < 2; ++i)
47576 if (!used[2 * i])
47578 l[i] = NULL_RTX;
47579 continue;
47581 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47582 vperm = force_reg (V32QImode, vperm);
47583 l[i] = gen_reg_rtx (V32QImode);
47584 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47585 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47588 for (i = 0; i < 2; ++i)
47590 if (h[i] && l[i])
47592 op = gen_reg_rtx (V32QImode);
47593 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47594 l[i] = op;
47596 else if (h[i])
47597 l[i] = h[i];
47600 gcc_assert (l[0] && l[1]);
47601 op = d->target;
47602 if (d->vmode != V32QImode)
47603 op = gen_reg_rtx (V32QImode);
47604 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47605 if (op != d->target)
47606 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47607 return true;
47610 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47611 With all of the interface bits taken care of, perform the expansion
47612 in D and return true on success. */
47614 static bool
47615 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47617 /* Try a single instruction expansion. */
47618 if (expand_vec_perm_1 (d))
47619 return true;
47621 /* Try sequences of two instructions. */
47623 if (expand_vec_perm_pshuflw_pshufhw (d))
47624 return true;
47626 if (expand_vec_perm_palignr (d, false))
47627 return true;
47629 if (expand_vec_perm_interleave2 (d))
47630 return true;
47632 if (expand_vec_perm_broadcast (d))
47633 return true;
47635 if (expand_vec_perm_vpermq_perm_1 (d))
47636 return true;
47638 if (expand_vec_perm_vperm2f128 (d))
47639 return true;
47641 if (expand_vec_perm_pblendv (d))
47642 return true;
47644 /* Try sequences of three instructions. */
47646 if (expand_vec_perm_even_odd_pack (d))
47647 return true;
47649 if (expand_vec_perm_2vperm2f128_vshuf (d))
47650 return true;
47652 if (expand_vec_perm_pshufb2 (d))
47653 return true;
47655 if (expand_vec_perm_interleave3 (d))
47656 return true;
47658 if (expand_vec_perm_vperm2f128_vblend (d))
47659 return true;
47661 /* Try sequences of four instructions. */
47663 if (expand_vec_perm_even_odd_trunc (d))
47664 return true;
47665 if (expand_vec_perm_vpshufb2_vpermq (d))
47666 return true;
47668 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47669 return true;
47671 if (expand_vec_perm_vpermt2_vpshub2 (d))
47672 return true;
47674 /* ??? Look for narrow permutations whose element orderings would
47675 allow the promotion to a wider mode. */
47677 /* ??? Look for sequences of interleave or a wider permute that place
47678 the data into the correct lanes for a half-vector shuffle like
47679 pshuf[lh]w or vpermilps. */
47681 /* ??? Look for sequences of interleave that produce the desired results.
47682 The combinatorics of punpck[lh] get pretty ugly... */
47684 if (expand_vec_perm_even_odd (d))
47685 return true;
47687 /* Even longer sequences. */
47688 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47689 return true;
47691 /* See if we can get the same permutation in different vector integer
47692 mode. */
47693 struct expand_vec_perm_d nd;
47694 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47696 if (!d->testing_p)
47697 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47698 return true;
47701 return false;
47704 /* If a permutation only uses one operand, make it clear. Returns true
47705 if the permutation references both operands. */
47707 static bool
47708 canonicalize_perm (struct expand_vec_perm_d *d)
47710 int i, which, nelt = d->nelt;
47712 for (i = which = 0; i < nelt; ++i)
47713 which |= (d->perm[i] < nelt ? 1 : 2);
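/* Bit 0 of WHICH is set if any element comes from the first operand,
   bit 1 if any element comes from the second operand. */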
47715 d->one_operand_p = true;
47716 switch (which)
47718 default:
47719 gcc_unreachable();
47721 case 3:
47722 if (!rtx_equal_p (d->op0, d->op1))
47724 d->one_operand_p = false;
47725 break;
47727 /* The elements of PERM do not suggest that only the first operand
47728 is used, but both operands are identical. Allow easier matching
47729 of the permutation by folding the permutation into the single
47730 input vector. */
47731 /* FALLTHRU */
47733 case 2:
47734 for (i = 0; i < nelt; ++i)
47735 d->perm[i] &= nelt - 1;
47736 d->op0 = d->op1;
47737 break;
47739 case 1:
47740 d->op1 = d->op0;
47741 break;
47744 return (which == 3);
47747 bool
47748 ix86_expand_vec_perm_const (rtx operands[4])
47750 struct expand_vec_perm_d d;
47751 unsigned char perm[MAX_VECT_LEN];
47752 int i, nelt;
47753 bool two_args;
47754 rtx sel;
47756 d.target = operands[0];
47757 d.op0 = operands[1];
47758 d.op1 = operands[2];
47759 sel = operands[3];
47761 d.vmode = GET_MODE (d.target);
47762 gcc_assert (VECTOR_MODE_P (d.vmode));
47763 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47764 d.testing_p = false;
47766 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47767 gcc_assert (XVECLEN (sel, 0) == nelt);
47768 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47770 for (i = 0; i < nelt; ++i)
47772 rtx e = XVECEXP (sel, 0, i);
47773 int ei = INTVAL (e) & (2 * nelt - 1);
47774 d.perm[i] = ei;
47775 perm[i] = ei;
47778 two_args = canonicalize_perm (&d);
47780 if (ix86_expand_vec_perm_const_1 (&d))
47781 return true;
47783 /* If the selector says both arguments are needed, but the operands are the
47784 same, the above tried to expand with one_operand_p and flattened selector.
47785 If that didn't work, retry without one_operand_p; we succeeded with that
47786 during testing. */
47787 if (two_args && d.one_operand_p)
47789 d.one_operand_p = false;
47790 memcpy (d.perm, perm, sizeof (perm));
47791 return ix86_expand_vec_perm_const_1 (&d);
47794 return false;
47797 /* Implement targetm.vectorize.vec_perm_const_ok. */
47799 static bool
47800 ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
47802 struct expand_vec_perm_d d;
47803 unsigned int i, nelt, which;
47804 bool ret;
47806 d.vmode = vmode;
47807 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47808 d.testing_p = true;
47810 /* Given sufficient ISA support we can just return true here
47811 for selected vector modes. */
47812 switch (d.vmode)
47814 case E_V16SFmode:
47815 case E_V16SImode:
47816 case E_V8DImode:
47817 case E_V8DFmode:
47818 if (TARGET_AVX512F)
47819 /* All implementable with a single vperm[it]2 insn. */
47820 return true;
47821 break;
47822 case E_V32HImode:
47823 if (TARGET_AVX512BW)
47824 /* All implementable with a single vperm[it]2 insn. */
47825 return true;
47826 break;
47827 case E_V64QImode:
47828 if (TARGET_AVX512BW)
47829 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47830 return true;
47831 break;
47832 case E_V8SImode:
47833 case E_V8SFmode:
47834 case E_V4DFmode:
47835 case E_V4DImode:
47836 if (TARGET_AVX512VL)
47837 /* All implementable with a single vperm[it]2 insn. */
47838 return true;
47839 break;
47840 case E_V16HImode:
47841 if (TARGET_AVX2)
47842 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47843 return true;
47844 break;
47845 case E_V32QImode:
47846 if (TARGET_AVX2)
47847 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47848 return true;
47849 break;
47850 case E_V4SImode:
47851 case E_V4SFmode:
47852 case E_V8HImode:
47853 case E_V16QImode:
47854 /* All implementable with a single vpperm insn. */
47855 if (TARGET_XOP)
47856 return true;
47857 /* All implementable with 2 pshufb + 1 ior. */
47858 if (TARGET_SSSE3)
47859 return true;
47860 break;
47861 case E_V2DImode:
47862 case E_V2DFmode:
47863 /* All implementable with shufpd or unpck[lh]pd. */
47864 return true;
47865 default:
47866 return false;
47869 /* Extract the values from SEL into the permutation
47870 array in D. */
47871 for (i = which = 0; i < nelt; ++i)
47873 unsigned char e = sel[i];
47874 gcc_assert (e < 2 * nelt);
47875 d.perm[i] = e;
47876 which |= (e < nelt ? 1 : 2);
47880 /* For all elements from the second vector, fold the elements to the first. */
47880 if (which == 2)
47881 for (i = 0; i < nelt; ++i)
47882 d.perm[i] -= nelt;
47884 /* Check whether the mask can be applied to the vector type. */
47885 d.one_operand_p = (which != 3);
47887 /* Implementable with shufps or pshufd. */
47888 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47889 return true;
47891 /* Otherwise we have to go through the motions and see if we can
47892 figure out how to generate the requested permutation. */
47893 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47894 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47895 if (!d.one_operand_p)
47896 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47898 start_sequence ();
47899 ret = ix86_expand_vec_perm_const_1 (&d);
47900 end_sequence ();
47902 return ret;
47905 void
47906 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47908 struct expand_vec_perm_d d;
47909 unsigned i, nelt;
47911 d.target = targ;
47912 d.op0 = op0;
47913 d.op1 = op1;
47914 d.vmode = GET_MODE (targ);
47915 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47916 d.one_operand_p = false;
47917 d.testing_p = false;
47919 for (i = 0; i < nelt; ++i)
47920 d.perm[i] = i * 2 + odd;
47922 /* We'll either be able to implement the permutation directly... */
47923 if (expand_vec_perm_1 (&d))
47924 return;
47926 /* ... or we use the special-case patterns. */
47927 expand_vec_perm_even_odd_1 (&d, odd);
47930 static void
47931 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47933 struct expand_vec_perm_d d;
47934 unsigned i, nelt, base;
47935 bool ok;
47937 d.target = targ;
47938 d.op0 = op0;
47939 d.op1 = op1;
47940 d.vmode = GET_MODE (targ);
47941 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47942 d.one_operand_p = false;
47943 d.testing_p = false;
47945 base = high_p ? nelt / 2 : 0;
47946 for (i = 0; i < nelt / 2; ++i)
47948 d.perm[i * 2] = i + base;
47949 d.perm[i * 2 + 1] = i + base + nelt;
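/* Even result elements come from op0 and odd ones from op1, both starting
   at BASE: 0 for the low-part interleave, nelt / 2 for the high part. */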
47952 /* Note that for AVX this isn't one instruction. */
47953 ok = ix86_expand_vec_perm_const_1 (&d);
47954 gcc_assert (ok);
47958 /* Expand a vector operation CODE for a V*QImode in terms of the
47959 same operation on V*HImode. */
47961 void
47962 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47964 machine_mode qimode = GET_MODE (dest);
47965 machine_mode himode;
47966 rtx (*gen_il) (rtx, rtx, rtx);
47967 rtx (*gen_ih) (rtx, rtx, rtx);
47968 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47969 struct expand_vec_perm_d d;
47970 bool ok, full_interleave;
47971 bool uns_p = false;
47972 int i;
47974 switch (qimode)
47976 case E_V16QImode:
47977 himode = V8HImode;
47978 gen_il = gen_vec_interleave_lowv16qi;
47979 gen_ih = gen_vec_interleave_highv16qi;
47980 break;
47981 case E_V32QImode:
47982 himode = V16HImode;
47983 gen_il = gen_avx2_interleave_lowv32qi;
47984 gen_ih = gen_avx2_interleave_highv32qi;
47985 break;
47986 case E_V64QImode:
47987 himode = V32HImode;
47988 gen_il = gen_avx512bw_interleave_lowv64qi;
47989 gen_ih = gen_avx512bw_interleave_highv64qi;
47990 break;
47991 default:
47992 gcc_unreachable ();
47995 op2_l = op2_h = op2;
47996 switch (code)
47998 case MULT:
47999 /* Unpack data such that we've got a source byte in each low byte of
48000 each word. We don't care what goes into the high byte of each word.
48001 Rather than trying to get zero in there, it is most convenient to let
48002 it be a copy of the low byte. */
48003 op2_l = gen_reg_rtx (qimode);
48004 op2_h = gen_reg_rtx (qimode);
48005 emit_insn (gen_il (op2_l, op2, op2));
48006 emit_insn (gen_ih (op2_h, op2, op2));
48008 op1_l = gen_reg_rtx (qimode);
48009 op1_h = gen_reg_rtx (qimode);
48010 emit_insn (gen_il (op1_l, op1, op1));
48011 emit_insn (gen_ih (op1_h, op1, op1));
48012 full_interleave = qimode == V16QImode;
48013 break;
48015 case ASHIFT:
48016 case LSHIFTRT:
48017 uns_p = true;
48018 /* FALLTHRU */
48019 case ASHIFTRT:
48020 op1_l = gen_reg_rtx (himode);
48021 op1_h = gen_reg_rtx (himode);
48022 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48023 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48024 full_interleave = true;
48025 break;
48026 default:
48027 gcc_unreachable ();
48030 /* Perform the operation. */
48031 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48032 1, OPTAB_DIRECT);
48033 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48034 1, OPTAB_DIRECT);
48035 gcc_assert (res_l && res_h);
48037 /* Merge the data back into the right place. */
48038 d.target = dest;
48039 d.op0 = gen_lowpart (qimode, res_l);
48040 d.op1 = gen_lowpart (qimode, res_h);
48041 d.vmode = qimode;
48042 d.nelt = GET_MODE_NUNITS (qimode);
48043 d.one_operand_p = false;
48044 d.testing_p = false;
48046 if (full_interleave)
48048 /* For SSE2, we used a full interleave, so the desired
48049 results are in the even elements. */
48050 for (i = 0; i < d.nelt; ++i)
48051 d.perm[i] = i * 2;
48053 else
48055 /* For AVX, the interleave used above was not cross-lane. So the
48056 extraction is of the even elements, but with the second and third quarter swapped.
48057 Happily, that is even one insn shorter than even extraction.
48058 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48059 always first from the first and then from the second source operand;
48060 the index bits above the low 4 bits remain the same.
48061 Thus, for d.nelt == 32 we want permutation
48062 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48063 and for d.nelt == 64 we want permutation
48064 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48065 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48066 for (i = 0; i < d.nelt; ++i)
48067 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
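/* (i * 2) & 14 walks the even positions within a 16 element group,
   (i & 8) ? d.nelt : 0 switches to the second source operand for the upper
   half of each group, and i & ~15 keeps the group offset. */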
48070 ok = ix86_expand_vec_perm_const_1 (&d);
48071 gcc_assert (ok);
48073 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48074 gen_rtx_fmt_ee (code, qimode, op1, op2));
48077 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48078 if op is CONST_VECTOR with all odd elements equal to their
48079 preceding element. */
48081 static bool
48082 const_vector_equal_evenodd_p (rtx op)
48084 machine_mode mode = GET_MODE (op);
48085 int i, nunits = GET_MODE_NUNITS (mode);
48086 if (GET_CODE (op) != CONST_VECTOR
48087 || nunits != CONST_VECTOR_NUNITS (op))
48088 return false;
48089 for (i = 0; i < nunits; i += 2)
48090 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48091 return false;
48092 return true;
48095 void
48096 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48097 bool uns_p, bool odd_p)
48099 machine_mode mode = GET_MODE (op1);
48100 machine_mode wmode = GET_MODE (dest);
48101 rtx x;
48102 rtx orig_op1 = op1, orig_op2 = op2;
48104 if (!nonimmediate_operand (op1, mode))
48105 op1 = force_reg (mode, op1);
48106 if (!nonimmediate_operand (op2, mode))
48107 op2 = force_reg (mode, op2);
48109 /* We only play even/odd games with vectors of SImode. */
48110 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48112 /* If we're looking for the odd results, shift those members down to
48113 the even slots. For some cpus this is faster than a PSHUFD. */
48114 if (odd_p)
48116 /* For XOP use vpmacsdqh, but only for smult, as it is only
48117 signed. */
48118 if (TARGET_XOP && mode == V4SImode && !uns_p)
48120 x = force_reg (wmode, CONST0_RTX (wmode));
48121 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48122 return;
48125 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48126 if (!const_vector_equal_evenodd_p (orig_op1))
48127 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48128 x, NULL, 1, OPTAB_DIRECT);
48129 if (!const_vector_equal_evenodd_p (orig_op2))
48130 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48131 x, NULL, 1, OPTAB_DIRECT);
48132 op1 = gen_lowpart (mode, op1);
48133 op2 = gen_lowpart (mode, op2);
48136 if (mode == V16SImode)
48138 if (uns_p)
48139 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48140 else
48141 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48143 else if (mode == V8SImode)
48145 if (uns_p)
48146 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48147 else
48148 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48150 else if (uns_p)
48151 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48152 else if (TARGET_SSE4_1)
48153 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48154 else
48156 rtx s1, s2, t0, t1, t2;
48158 /* The easiest way to implement this without PMULDQ is to go through
48159 the motions as if we are performing a full 64-bit multiply, except
48160 that we need to do less shuffling of the elements. */
48162 /* Compute the sign-extension, aka highparts, of the two operands. */
48163 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48164 op1, pc_rtx, pc_rtx);
48165 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48166 op2, pc_rtx, pc_rtx);
48168 /* Multiply LO(A) * HI(B), and vice-versa. */
48169 t1 = gen_reg_rtx (wmode);
48170 t2 = gen_reg_rtx (wmode);
48171 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48172 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48174 /* Multiply LO(A) * LO(B). */
48175 t0 = gen_reg_rtx (wmode);
48176 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48178 /* Combine and shift the highparts into place. */
48179 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48180 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48181 1, OPTAB_DIRECT);
48183 /* Combine high and low parts. */
48184 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48185 return;
48187 emit_insn (x);
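/* Illustrative sketch (not part of the compiler): the PMULDQ-less
   fallback above relies on the scalar identity below.  The sign masks
   play the role of S1/S2 and the three unsigned widening multiplies
   correspond to T1, T2 and T0; the function and variable names here
   are hypothetical.

     #include <stdint.h>

     uint64_t
     smul_via_umul (int32_t a, int32_t b)
     {
       uint32_t s1 = -(uint32_t) (a < 0);   // all-ones when a < 0
       uint32_t s2 = -(uint32_t) (b < 0);   // all-ones when b < 0
       uint64_t t1 = (uint64_t) s1 * (uint32_t) b;
       uint64_t t2 = (uint64_t) s2 * (uint32_t) a;
       uint64_t t0 = (uint64_t) (uint32_t) a * (uint32_t) b;
       return t0 + ((t1 + t2) << 32);       // == (uint64_t) ((int64_t) a * b)
     }
*/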
48190 void
48191 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48192 bool uns_p, bool high_p)
48194 machine_mode wmode = GET_MODE (dest);
48195 machine_mode mode = GET_MODE (op1);
48196 rtx t1, t2, t3, t4, mask;
48198 switch (mode)
48200 case E_V4SImode:
48201 t1 = gen_reg_rtx (mode);
48202 t2 = gen_reg_rtx (mode);
48203 if (TARGET_XOP && !uns_p)
48205 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48206 shuffle the elements once so that all elements are in the right
48207 place for immediate use: { A C B D }. */
48208 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48209 const1_rtx, GEN_INT (3)));
48210 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48211 const1_rtx, GEN_INT (3)));
48213 else
48215 /* Put the elements into place for the multiply. */
48216 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48217 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48218 high_p = false;
48220 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48221 break;
48223 case E_V8SImode:
48224 /* Shuffle the elements between the lanes. After this we
48225 have { A B E F | C D G H } for each operand. */
48226 t1 = gen_reg_rtx (V4DImode);
48227 t2 = gen_reg_rtx (V4DImode);
48228 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48229 const0_rtx, const2_rtx,
48230 const1_rtx, GEN_INT (3)));
48231 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48232 const0_rtx, const2_rtx,
48233 const1_rtx, GEN_INT (3)));
48235 /* Shuffle the elements within the lanes. After this we
48236 have { A A B B | C C D D } or { E E F F | G G H H }. */
48237 t3 = gen_reg_rtx (V8SImode);
48238 t4 = gen_reg_rtx (V8SImode);
48239 mask = GEN_INT (high_p
48240 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48241 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48242 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48243 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48245 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48246 break;
48248 case E_V8HImode:
48249 case E_V16HImode:
48250 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48251 uns_p, OPTAB_DIRECT);
48252 t2 = expand_binop (mode,
48253 uns_p ? umul_highpart_optab : smul_highpart_optab,
48254 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48255 gcc_assert (t1 && t2);
48257 t3 = gen_reg_rtx (mode);
48258 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48259 emit_move_insn (dest, gen_lowpart (wmode, t3));
48260 break;
48262 case E_V16QImode:
48263 case E_V32QImode:
48264 case E_V32HImode:
48265 case E_V16SImode:
48266 case E_V64QImode:
48267 t1 = gen_reg_rtx (wmode);
48268 t2 = gen_reg_rtx (wmode);
48269 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48270 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48272 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48273 break;
48275 default:
48276 gcc_unreachable ();
48280 void
48281 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48283 rtx res_1, res_2, res_3, res_4;
48285 res_1 = gen_reg_rtx (V4SImode);
48286 res_2 = gen_reg_rtx (V4SImode);
48287 res_3 = gen_reg_rtx (V2DImode);
48288 res_4 = gen_reg_rtx (V2DImode);
48289 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48290 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48292 /* Move the results in element 2 down to element 1; we don't care
48293 what goes in elements 2 and 3. Then we can merge the parts
48294 back together with an interleave.
48296 Note that two other sequences were tried:
48297 (1) Use interleaves at the start instead of psrldq, which allows
48298 us to use a single shufps to merge things back at the end.
48299 (2) Use shufps here to combine the two vectors, then pshufd to
48300 put the elements in the correct order.
48301 In both cases the cost of the reformatting stall was too high
48302 and the overall sequence slower. */
48304 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48305 const0_rtx, const2_rtx,
48306 const0_rtx, const0_rtx));
48307 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48308 const0_rtx, const2_rtx,
48309 const0_rtx, const0_rtx));
48310 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48312 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48315 void
48316 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48318 machine_mode mode = GET_MODE (op0);
48319 rtx t1, t2, t3, t4, t5, t6;
48321 if (TARGET_AVX512DQ && mode == V8DImode)
48322 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48323 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48324 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48325 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48326 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48327 else if (TARGET_XOP && mode == V2DImode)
48329 /* op1: A,B,C,D, op2: E,F,G,H */
48330 op1 = gen_lowpart (V4SImode, op1);
48331 op2 = gen_lowpart (V4SImode, op2);
48333 t1 = gen_reg_rtx (V4SImode);
48334 t2 = gen_reg_rtx (V4SImode);
48335 t3 = gen_reg_rtx (V2DImode);
48336 t4 = gen_reg_rtx (V2DImode);
48338 /* t1: B,A,D,C */
48339 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48340 GEN_INT (1),
48341 GEN_INT (0),
48342 GEN_INT (3),
48343 GEN_INT (2)));
48345 /* t2: (B*E),(A*F),(D*G),(C*H) */
48346 emit_insn (gen_mulv4si3 (t2, t1, op2));
48348 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48349 emit_insn (gen_xop_phadddq (t3, t2));
48351 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48352 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48354 /* Multiply the low parts and add everything together. */
48355 t5 = gen_reg_rtx (V2DImode);
48356 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48357 gen_lowpart (V4SImode, op1),
48358 gen_lowpart (V4SImode, op2)));
48359 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48362 else
48364 machine_mode nmode;
48365 rtx (*umul) (rtx, rtx, rtx);
48367 if (mode == V2DImode)
48369 umul = gen_vec_widen_umult_even_v4si;
48370 nmode = V4SImode;
48372 else if (mode == V4DImode)
48374 umul = gen_vec_widen_umult_even_v8si;
48375 nmode = V8SImode;
48377 else if (mode == V8DImode)
48379 umul = gen_vec_widen_umult_even_v16si;
48380 nmode = V16SImode;
48382 else
48383 gcc_unreachable ();
48386 /* Multiply low parts. */
48387 t1 = gen_reg_rtx (mode);
48388 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48390 /* Shift input vectors right 32 bits so we can multiply high parts. */
48391 t6 = GEN_INT (32);
48392 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48393 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48395 /* Multiply high parts by low parts. */
48396 t4 = gen_reg_rtx (mode);
48397 t5 = gen_reg_rtx (mode);
48398 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48399 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48401 /* Combine and shift the highparts back. */
48402 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48403 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48405 /* Combine high and low parts. */
48406 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48409 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48410 gen_rtx_MULT (mode, op1, op2));
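/* Illustrative sketch (not part of the compiler): the generic branch
   above implements the usual 64x64 low-half multiply out of 32x32->64
   unsigned multiplies; the scalar equivalent is below, where lo, hi1
   and hi2 correspond to the roles of t1, t4 and t5.  Names are
   hypothetical.

     #include <stdint.h>

     uint64_t
     mul64_via_32 (uint64_t a, uint64_t b)
     {
       uint64_t lo  = (uint64_t) (uint32_t) a * (uint32_t) b;
       uint64_t hi1 = (uint64_t) (uint32_t) (a >> 32) * (uint32_t) b;
       uint64_t hi2 = (uint64_t) (uint32_t) (b >> 32) * (uint32_t) a;
       return lo + ((hi1 + hi2) << 32);   // == a * b modulo 2**64
     }
*/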
48413 /* Return true if control transfer instruction INSN
48414 should be encoded with the bnd prefix.
48415 If INSN is NULL then return true when control
48416 transfer instructions should be prefixed with
48417 bnd by default for the current function. */
48419 bool
48420 ix86_bnd_prefixed_insn_p (rtx insn)
48422 /* For call insns check special flag. */
48423 if (insn && CALL_P (insn))
48425 rtx call = get_call_rtx_from (insn);
48426 if (call)
48427 return CALL_EXPR_WITH_BOUNDS_P (call);
48430 /* All other insns are prefixed only if function is instrumented. */
48431 return chkp_function_instrumented_p (current_function_decl);
48434 /* Return true if control transfer instruction INSN
48435 should be encoded with the notrack prefix. */
48437 static bool
48438 ix86_notrack_prefixed_insn_p (rtx insn)
48440 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48441 return false;
48443 if (CALL_P (insn))
48445 rtx call = get_call_rtx_from (insn);
48446 gcc_assert (call != NULL_RTX);
48447 rtx addr = XEXP (call, 0);
48449 /* Do not emit 'notrack' if it's not an indirect call. */
48450 if (MEM_P (addr)
48451 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48452 return false;
48453 else
48454 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48457 if (JUMP_P (insn) && !flag_cet_switch)
48459 rtx target = JUMP_LABEL (insn);
48460 if (target == NULL_RTX || ANY_RETURN_P (target))
48461 return false;
48463 /* Check the jump is a switch table. */
48464 rtx_insn *label = as_a<rtx_insn *> (target);
48465 rtx_insn *table = next_insn (label);
48466 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48467 return false;
48468 else
48469 return true;
48471 return false;
48474 /* Calculate integer abs() using only SSE2 instructions. */
48476 void
48477 ix86_expand_sse2_abs (rtx target, rtx input)
48479 machine_mode mode = GET_MODE (target);
48480 rtx tmp0, tmp1, x;
48482 switch (mode)
48484 /* For 32-bit signed integer X, the best way to calculate the absolute
48485 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48486 case E_V4SImode:
48487 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48488 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48489 NULL, 0, OPTAB_DIRECT);
48490 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48491 NULL, 0, OPTAB_DIRECT);
48492 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48493 target, 0, OPTAB_DIRECT);
48494 break;
48496 /* For 16-bit signed integer X, the best way to calculate the absolute
48497 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48498 case E_V8HImode:
48499 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48501 x = expand_simple_binop (mode, SMAX, tmp0, input,
48502 target, 0, OPTAB_DIRECT);
48503 break;
48505 /* For 8-bit signed integer X, the best way to calculate the absolute
48506 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48507 as SSE2 provides the PMINUB insn. */
48508 case E_V16QImode:
48509 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48511 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48512 target, 0, OPTAB_DIRECT);
48513 break;
48515 default:
48516 gcc_unreachable ();
48519 if (x != target)
48520 emit_move_insn (target, x);
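/* Illustrative sketch (not part of the compiler): scalar equivalents of
   the three SSE2 absolute-value expansions used above, ignoring the
   usual INT_MIN corner case.  Function names are hypothetical.

     #include <stdint.h>

     static int32_t
     abs_v4si_style (int32_t x)   // shift / xor / subtract
     {
       int32_t s = x >> 31;       // arithmetic shift: 0 or -1
       return (x ^ s) - s;
     }

     static int16_t
     abs_v8hi_style (int16_t x)   // PMAXSW idea: max (x, -x)
     {
       int16_t n = (int16_t) -x;
       return x > n ? x : n;
     }

     static uint8_t
     abs_v16qi_style (int8_t x)   // PMINUB idea: min ((uint8_t) x, (uint8_t) -x)
     {
       uint8_t a = (uint8_t) x, b = (uint8_t) -x;
       return a < b ? a : b;
     }
*/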
48523 /* Expand an extract from a vector register through pextr insn.
48524 Return true if successful. */
48526 bool
48527 ix86_expand_pextr (rtx *operands)
48529 rtx dst = operands[0];
48530 rtx src = operands[1];
48532 unsigned int size = INTVAL (operands[2]);
48533 unsigned int pos = INTVAL (operands[3]);
48535 if (SUBREG_P (dst))
48537 /* Reject non-lowpart subregs. */
48538 if (SUBREG_BYTE (dst) > 0)
48539 return false;
48540 dst = SUBREG_REG (dst);
48543 if (SUBREG_P (src))
48545 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48546 src = SUBREG_REG (src);
48549 switch (GET_MODE (src))
48551 case E_V16QImode:
48552 case E_V8HImode:
48553 case E_V4SImode:
48554 case E_V2DImode:
48555 case E_V1TImode:
48556 case E_TImode:
48558 machine_mode srcmode, dstmode;
48559 rtx d, pat;
48561 if (!int_mode_for_size (size, 0).exists (&dstmode))
48562 return false;
48564 switch (dstmode)
48566 case E_QImode:
48567 if (!TARGET_SSE4_1)
48568 return false;
48569 srcmode = V16QImode;
48570 break;
48572 case E_HImode:
48573 if (!TARGET_SSE2)
48574 return false;
48575 srcmode = V8HImode;
48576 break;
48578 case E_SImode:
48579 if (!TARGET_SSE4_1)
48580 return false;
48581 srcmode = V4SImode;
48582 break;
48584 case E_DImode:
48585 gcc_assert (TARGET_64BIT);
48586 if (!TARGET_SSE4_1)
48587 return false;
48588 srcmode = V2DImode;
48589 break;
48591 default:
48592 return false;
48595 /* Reject extractions from misaligned positions. */
48596 if (pos & (size-1))
48597 return false;
48599 if (GET_MODE (dst) == dstmode)
48600 d = dst;
48601 else
48602 d = gen_reg_rtx (dstmode);
48604 /* Construct insn pattern. */
48605 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48606 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48608 /* Let the rtl optimizers know about the zero extension performed. */
48609 if (dstmode == QImode || dstmode == HImode)
48611 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48612 d = gen_lowpart (SImode, d);
48615 emit_insn (gen_rtx_SET (d, pat));
48617 if (d != dst)
48618 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48619 return true;
48622 default:
48623 return false;
48627 /* Expand an insert into a vector register through pinsr insn.
48628 Return true if successful. */
48630 bool
48631 ix86_expand_pinsr (rtx *operands)
48633 rtx dst = operands[0];
48634 rtx src = operands[3];
48636 unsigned int size = INTVAL (operands[1]);
48637 unsigned int pos = INTVAL (operands[2]);
48639 if (SUBREG_P (dst))
48641 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48642 dst = SUBREG_REG (dst);
48645 switch (GET_MODE (dst))
48647 case E_V16QImode:
48648 case E_V8HImode:
48649 case E_V4SImode:
48650 case E_V2DImode:
48651 case E_V1TImode:
48652 case E_TImode:
48654 machine_mode srcmode, dstmode;
48655 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48656 rtx d;
48658 if (!int_mode_for_size (size, 0).exists (&srcmode))
48659 return false;
48661 switch (srcmode)
48663 case E_QImode:
48664 if (!TARGET_SSE4_1)
48665 return false;
48666 dstmode = V16QImode;
48667 pinsr = gen_sse4_1_pinsrb;
48668 break;
48670 case E_HImode:
48671 if (!TARGET_SSE2)
48672 return false;
48673 dstmode = V8HImode;
48674 pinsr = gen_sse2_pinsrw;
48675 break;
48677 case E_SImode:
48678 if (!TARGET_SSE4_1)
48679 return false;
48680 dstmode = V4SImode;
48681 pinsr = gen_sse4_1_pinsrd;
48682 break;
48684 case E_DImode:
48685 gcc_assert (TARGET_64BIT);
48686 if (!TARGET_SSE4_1)
48687 return false;
48688 dstmode = V2DImode;
48689 pinsr = gen_sse4_1_pinsrq;
48690 break;
48692 default:
48693 return false;
48696 /* Reject insertions to misaligned positions. */
48697 if (pos & (size-1))
48698 return false;
48700 if (SUBREG_P (src))
48702 unsigned int srcpos = SUBREG_BYTE (src);
48704 if (srcpos > 0)
48706 rtx extr_ops[4];
48708 extr_ops[0] = gen_reg_rtx (srcmode);
48709 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48710 extr_ops[2] = GEN_INT (size);
48711 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48713 if (!ix86_expand_pextr (extr_ops))
48714 return false;
48716 src = extr_ops[0];
48718 else
48719 src = gen_lowpart (srcmode, SUBREG_REG (src));
48722 if (GET_MODE (dst) == dstmode)
48723 d = dst;
48724 else
48725 d = gen_reg_rtx (dstmode);
48727 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48728 gen_lowpart (srcmode, src),
48729 GEN_INT (1 << (pos / size))));
48730 if (d != dst)
48731 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48732 return true;
48735 default:
48736 return false;
48740 /* This function returns the calling ABI specific va_list type node.
48741 It returns the FNDECL specific va_list type. */
48743 static tree
48744 ix86_fn_abi_va_list (tree fndecl)
48746 if (!TARGET_64BIT)
48747 return va_list_type_node;
48748 gcc_assert (fndecl != NULL_TREE);
48750 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48751 return ms_va_list_type_node;
48752 else
48753 return sysv_va_list_type_node;
48756 /* Returns the canonical va_list type specified by TYPE. If there
48757 is no valid TYPE provided, it returns NULL_TREE. */
48759 static tree
48760 ix86_canonical_va_list_type (tree type)
48762 if (TARGET_64BIT)
48764 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48765 return ms_va_list_type_node;
48767 if ((TREE_CODE (type) == ARRAY_TYPE
48768 && integer_zerop (array_type_nelts (type)))
48769 || POINTER_TYPE_P (type))
48771 tree elem_type = TREE_TYPE (type);
48772 if (TREE_CODE (elem_type) == RECORD_TYPE
48773 && lookup_attribute ("sysv_abi va_list",
48774 TYPE_ATTRIBUTES (elem_type)))
48775 return sysv_va_list_type_node;
48778 return NULL_TREE;
48781 return std_canonical_va_list_type (type);
48784 /* Iterate through the target-specific builtin types for va_list.
48785 IDX denotes the iterator, *PTREE is set to the result type of
48786 the va_list builtin, and *PNAME to its internal type.
48787 Returns zero if there is no element for this index, otherwise
48788 IDX should be increased upon the next call.
48789 Note, do not iterate a base builtin's name like __builtin_va_list.
48790 Used from c_common_nodes_and_builtins. */
48792 static int
48793 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48795 if (TARGET_64BIT)
48797 switch (idx)
48799 default:
48800 break;
48802 case 0:
48803 *ptree = ms_va_list_type_node;
48804 *pname = "__builtin_ms_va_list";
48805 return 1;
48807 case 1:
48808 *ptree = sysv_va_list_type_node;
48809 *pname = "__builtin_sysv_va_list";
48810 return 1;
48814 return 0;
48817 #undef TARGET_SCHED_DISPATCH
48818 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48819 #undef TARGET_SCHED_DISPATCH_DO
48820 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48821 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48822 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48823 #undef TARGET_SCHED_REORDER
48824 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48825 #undef TARGET_SCHED_ADJUST_PRIORITY
48826 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48827 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48828 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48829 ix86_dependencies_evaluation_hook
48832 /* Implementation of the reassociation_width target hook used by the
48833 reassoc phase to identify the parallelism level in a reassociated
48834 tree. The statement's tree_code is passed in OPC. The argument's type
48835 is passed in MODE. */
48837 static int
48838 ix86_reassociation_width (unsigned int op, machine_mode mode)
48840 int width = 1;
48841 /* Vector part. */
48842 if (VECTOR_MODE_P (mode))
48844 int div = 1;
48845 if (INTEGRAL_MODE_P (mode))
48846 width = ix86_cost->reassoc_vec_int;
48847 else if (FLOAT_MODE_P (mode))
48848 width = ix86_cost->reassoc_vec_fp;
48850 if (width == 1)
48851 return 1;
48853 /* Integer vector instructions execute in the FP unit
48854 and can execute 3 additions and one multiplication per cycle. */
48855 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48856 && op != PLUS && op != MINUS)
48857 return 1;
48859 /* Account for targets that split wide vectors into multiple parts. */
48860 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48861 div = GET_MODE_BITSIZE (mode) / 128;
48862 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48863 div = GET_MODE_BITSIZE (mode) / 64;
48864 width = (width + div - 1) / div;
48866 /* Scalar part. */
48867 else if (INTEGRAL_MODE_P (mode))
48868 width = ix86_cost->reassoc_int;
48869 else if (FLOAT_MODE_P (mode))
48870 width = ix86_cost->reassoc_fp;
48872 /* Avoid using too many registers in 32-bit mode. */
48873 if (!TARGET_64BIT && width > 2)
48874 width = 2;
48875 return width;
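/* Illustrative worked example (not part of the compiler): on a target
   with TARGET_AVX128_OPTIMAL, a 256-bit FP vector mode gives
   div = 256 / 128 = 2, so an assumed cost-table width of 4 is reduced to
   (4 + 2 - 1) / 2 = 2 parallel chains; in 32-bit mode the result is then
   clamped to at most 2.  The starting width of 4 is a hypothetical
   input, not taken from any particular tuning table.  */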
48878 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48879 place emms and femms instructions. */
48881 static machine_mode
48882 ix86_preferred_simd_mode (scalar_mode mode)
48884 if (!TARGET_SSE)
48885 return word_mode;
48887 switch (mode)
48889 case E_QImode:
48890 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48891 return V64QImode;
48892 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48893 return V32QImode;
48894 else
48895 return V16QImode;
48897 case E_HImode:
48898 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48899 return V32HImode;
48900 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48901 return V16HImode;
48902 else
48903 return V8HImode;
48905 case E_SImode:
48906 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48907 return V16SImode;
48908 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48909 return V8SImode;
48910 else
48911 return V4SImode;
48913 case E_DImode:
48914 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48915 return V8DImode;
48916 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48917 return V4DImode;
48918 else
48919 return V2DImode;
48921 case E_SFmode:
48922 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48923 return V16SFmode;
48924 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48925 return V8SFmode;
48926 else
48927 return V4SFmode;
48929 case E_DFmode:
48930 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48931 return V8DFmode;
48932 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48933 return V4DFmode;
48934 else if (TARGET_SSE2)
48935 return V2DFmode;
48936 /* FALLTHRU */
48938 default:
48939 return word_mode;
48943 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
48944 vectors. If AVX512F is enabled then try vectorizing with 512bit,
48945 256bit and 128bit vectors. */
48947 static unsigned int
48948 ix86_autovectorize_vector_sizes (void)
48950 unsigned int bytesizes = 0;
48952 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48953 bytesizes |= (64 | 32 | 16);
48954 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48955 bytesizes |= (32 | 16);
48957 return bytesizes;
48960 /* Implementation of targetm.vectorize.get_mask_mode. */
48962 static opt_machine_mode
48963 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
48965 unsigned elem_size = vector_size / nunits;
48967 /* Scalar mask case. */
48968 if ((TARGET_AVX512F && vector_size == 64)
48969 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48971 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48972 return smallest_int_mode_for_size (nunits);
48975 scalar_int_mode elem_mode
48976 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48978 gcc_assert (elem_size * nunits == vector_size);
48980 return mode_for_vector (elem_mode, nunits);
48985 /* Return the class of registers which could be used for a pseudo of MODE
48986 and of class RCLASS for spilling instead of memory. Return NO_REGS
48987 if it is not possible or not profitable. */
48989 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48991 static reg_class_t
48992 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48994 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48995 && TARGET_SSE2
48996 && TARGET_INTER_UNIT_MOVES_TO_VEC
48997 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48998 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48999 && INTEGER_CLASS_P (rclass))
49000 return ALL_SSE_REGS;
49001 return NO_REGS;
49004 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49005 but returns a lower bound. */
49007 static unsigned int
49008 ix86_max_noce_ifcvt_seq_cost (edge e)
49010 bool predictable_p = predictable_edge_p (e);
49012 enum compiler_param param
49013 = (predictable_p
49014 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49015 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49017 /* If we have a parameter set, use that, otherwise take a guess using
49018 BRANCH_COST. */
49019 if (global_options_set.x_param_values[param])
49020 return PARAM_VALUE (param);
49021 else
49022 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49025 /* Return true if SEQ is a good candidate as a replacement for the
49026 if-convertible sequence described in IF_INFO. */
49028 static bool
49029 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49031 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49033 int cmov_cnt = 0;
49034 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49035 Maybe we should allow even more conditional moves as long as they
49036 are used far enough not to stall the CPU, or also consider
49037 IF_INFO->TEST_BB succ edge probabilities. */
49038 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49040 rtx set = single_set (insn);
49041 if (!set)
49042 continue;
49043 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49044 continue;
49045 rtx src = SET_SRC (set);
49046 machine_mode mode = GET_MODE (src);
49047 if (GET_MODE_CLASS (mode) != MODE_INT
49048 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49049 continue;
49050 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49051 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49052 continue;
49053 /* insn is CMOV or FCMOV. */
49054 if (++cmov_cnt > 1)
49055 return false;
49058 return default_noce_conversion_profitable_p (seq, if_info);
49061 /* Implement targetm.vectorize.init_cost. */
49063 static void *
49064 ix86_init_cost (struct loop *)
49066 unsigned *cost = XNEWVEC (unsigned, 3);
49067 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49068 return cost;
49071 /* Implement targetm.vectorize.add_stmt_cost. */
49073 static unsigned
49074 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49075 struct _stmt_vec_info *stmt_info, int misalign,
49076 enum vect_cost_model_location where)
49078 unsigned *cost = (unsigned *) data;
49079 unsigned retval = 0;
49081 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49082 int stmt_cost = - 1;
49084 if ((kind == vector_stmt || kind == scalar_stmt)
49085 && stmt_info
49086 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49088 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49089 bool fp = false;
49090 machine_mode mode = TImode;
49092 if (vectype != NULL)
49094 fp = FLOAT_TYPE_P (vectype);
49095 mode = TYPE_MODE (vectype);
49097 /*machine_mode inner_mode = mode;
49098 if (VECTOR_MODE_P (mode))
49099 inner_mode = GET_MODE_INNER (mode);*/
49101 switch (subcode)
49103 case PLUS_EXPR:
49104 case POINTER_PLUS_EXPR:
49105 case MINUS_EXPR:
49106 if (kind == scalar_stmt)
49108 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49109 stmt_cost = ix86_cost->addss;
49110 else if (X87_FLOAT_MODE_P (mode))
49111 stmt_cost = ix86_cost->fadd;
49112 else
49113 stmt_cost = ix86_cost->add;
49115 else
49116 stmt_cost = ix86_vec_cost (mode,
49117 fp ? ix86_cost->addss
49118 : ix86_cost->sse_op,
49119 true);
49120 break;
49122 case MULT_EXPR:
49123 case WIDEN_MULT_EXPR:
49124 case MULT_HIGHPART_EXPR:
49125 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49126 break;
49127 case FMA_EXPR:
49128 stmt_cost = ix86_vec_cost (mode,
49129 mode == SFmode ? ix86_cost->fmass
49130 : ix86_cost->fmasd,
49131 true);
49132 break;
49133 case NEGATE_EXPR:
49134 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49135 stmt_cost = ix86_cost->sse_op;
49136 else if (X87_FLOAT_MODE_P (mode))
49137 stmt_cost = ix86_cost->fchs;
49138 else if (VECTOR_MODE_P (mode))
49139 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49140 else
49141 stmt_cost = ix86_cost->add;
49142 break;
49143 case TRUNC_DIV_EXPR:
49144 case CEIL_DIV_EXPR:
49145 case FLOOR_DIV_EXPR:
49146 case ROUND_DIV_EXPR:
49147 case TRUNC_MOD_EXPR:
49148 case CEIL_MOD_EXPR:
49149 case FLOOR_MOD_EXPR:
49150 case RDIV_EXPR:
49151 case ROUND_MOD_EXPR:
49152 case EXACT_DIV_EXPR:
49153 stmt_cost = ix86_division_cost (ix86_cost, mode);
49154 break;
49156 case RSHIFT_EXPR:
49157 case LSHIFT_EXPR:
49158 case LROTATE_EXPR:
49159 case RROTATE_EXPR:
49161 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49162 stmt_cost = ix86_shift_rotate_cost
49163 (ix86_cost, mode,
49164 TREE_CODE (op2) == INTEGER_CST,
49165 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49166 true, false, false, NULL, NULL);
49168 break;
49169 case NOP_EXPR:
49170 stmt_cost = 0;
49171 break;
49173 case BIT_IOR_EXPR:
49174 case ABS_EXPR:
49175 case MIN_EXPR:
49176 case MAX_EXPR:
49177 case BIT_XOR_EXPR:
49178 case BIT_AND_EXPR:
49179 case BIT_NOT_EXPR:
49180 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49181 stmt_cost = ix86_cost->sse_op;
49182 else if (VECTOR_MODE_P (mode))
49183 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49184 else
49185 stmt_cost = ix86_cost->add;
49186 break;
49187 default:
49188 break;
49191 if (stmt_cost == -1)
49192 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49194 /* Penalize DFmode vector operations for Bonnell. */
49195 if (TARGET_BONNELL && kind == vector_stmt
49196 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49197 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49199 /* Statements in an inner loop relative to the loop being
49200 vectorized are weighted more heavily. The value here is
49201 arbitrary and could potentially be improved with analysis. */
49202 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49203 count *= 50; /* FIXME. */
49205 retval = (unsigned) (count * stmt_cost);
49207 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49208 for Silvermont, as it has an out-of-order integer pipeline and can execute
49209 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49210 if ((TARGET_SILVERMONT || TARGET_INTEL)
49211 && stmt_info && stmt_info->stmt)
49213 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49214 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49215 retval = (retval * 17) / 10;
49218 cost[where] += retval;
49220 return retval;
49223 /* Implement targetm.vectorize.finish_cost. */
49225 static void
49226 ix86_finish_cost (void *data, unsigned *prologue_cost,
49227 unsigned *body_cost, unsigned *epilogue_cost)
49229 unsigned *cost = (unsigned *) data;
49230 *prologue_cost = cost[vect_prologue];
49231 *body_cost = cost[vect_body];
49232 *epilogue_cost = cost[vect_epilogue];
49235 /* Implement targetm.vectorize.destroy_cost_data. */
49237 static void
49238 ix86_destroy_cost_data (void *data)
49240 free (data);
49243 /* Validate target specific memory model bits in VAL. */
49245 static unsigned HOST_WIDE_INT
49246 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49248 enum memmodel model = memmodel_from_int (val);
49249 bool strong;
49251 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49252 |MEMMODEL_MASK)
49253 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49255 warning (OPT_Winvalid_memory_model,
49256 "unknown architecture specific memory model");
49257 return MEMMODEL_SEQ_CST;
49259 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49260 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49262 warning (OPT_Winvalid_memory_model,
49263 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49264 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49266 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49268 warning (OPT_Winvalid_memory_model,
49269 "HLE_RELEASE not used with RELEASE or stronger memory model");
49270 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49272 return val;
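/* Illustrative sketch (not part of the compiler): the HLE bits checked
   above are the ones user code passes via the __ATOMIC_HLE_ACQUIRE /
   __ATOMIC_HLE_RELEASE macros, e.g. for an elided spin lock built on
   the __atomic builtins (compile with -mhle; variable and function
   names are hypothetical).

     static int lock;

     void
     take_lock (void)
     {
       while (__atomic_exchange_n (&lock, 1,
                                   __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
         ;   // spin until the lock is free
     }

     void
     drop_lock (void)
     {
       __atomic_store_n (&lock, 0,
                         __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
     }
*/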
49275 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49276 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49277 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49278 or the number of vecsize_mangle variants that should be emitted. */
49280 static int
49281 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49282 struct cgraph_simd_clone *clonei,
49283 tree base_type, int num)
49285 int ret = 1;
49287 if (clonei->simdlen
49288 && (clonei->simdlen < 2
49289 || clonei->simdlen > 1024
49290 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49292 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49293 "unsupported simdlen %d", clonei->simdlen);
49294 return 0;
49297 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49298 if (TREE_CODE (ret_type) != VOID_TYPE)
49299 switch (TYPE_MODE (ret_type))
49301 case E_QImode:
49302 case E_HImode:
49303 case E_SImode:
49304 case E_DImode:
49305 case E_SFmode:
49306 case E_DFmode:
49307 /* case E_SCmode: */
49308 /* case E_DCmode: */
49309 break;
49310 default:
49311 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49312 "unsupported return type %qT for simd\n", ret_type);
49313 return 0;
49316 tree t;
49317 int i;
49319 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49320 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49321 switch (TYPE_MODE (TREE_TYPE (t)))
49323 case E_QImode:
49324 case E_HImode:
49325 case E_SImode:
49326 case E_DImode:
49327 case E_SFmode:
49328 case E_DFmode:
49329 /* case E_SCmode: */
49330 /* case E_DCmode: */
49331 break;
49332 default:
49333 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49334 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49335 return 0;
49338 if (!TREE_PUBLIC (node->decl))
49340 /* If the function isn't exported, we can pick up just one ISA
49341 for the clones. */
49342 if (TARGET_AVX512F)
49343 clonei->vecsize_mangle = 'e';
49344 else if (TARGET_AVX2)
49345 clonei->vecsize_mangle = 'd';
49346 else if (TARGET_AVX)
49347 clonei->vecsize_mangle = 'c';
49348 else
49349 clonei->vecsize_mangle = 'b';
49350 ret = 1;
49352 else
49354 clonei->vecsize_mangle = "bcde"[num];
49355 ret = 4;
49357 clonei->mask_mode = VOIDmode;
49358 switch (clonei->vecsize_mangle)
49360 case 'b':
49361 clonei->vecsize_int = 128;
49362 clonei->vecsize_float = 128;
49363 break;
49364 case 'c':
49365 clonei->vecsize_int = 128;
49366 clonei->vecsize_float = 256;
49367 break;
49368 case 'd':
49369 clonei->vecsize_int = 256;
49370 clonei->vecsize_float = 256;
49371 break;
49372 case 'e':
49373 clonei->vecsize_int = 512;
49374 clonei->vecsize_float = 512;
49375 if (TYPE_MODE (base_type) == QImode)
49376 clonei->mask_mode = DImode;
49377 else
49378 clonei->mask_mode = SImode;
49379 break;
49381 if (clonei->simdlen == 0)
49383 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49384 clonei->simdlen = clonei->vecsize_int;
49385 else
49386 clonei->simdlen = clonei->vecsize_float;
49387 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49389 else if (clonei->simdlen > 16)
49391 /* For compatibility with ICC, use the same upper bounds
49392 for simdlen. In particular, for CTYPE below, use the return type,
49393 unless the function returns void, in which case use the characteristic
49394 type. If it is possible for the given SIMDLEN to pass a CTYPE value
49395 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49396 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
49397 emit the corresponding clone. */
49398 tree ctype = ret_type;
49399 if (TREE_CODE (ret_type) == VOID_TYPE)
49400 ctype = base_type;
49401 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49402 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49403 cnt /= clonei->vecsize_int;
49404 else
49405 cnt /= clonei->vecsize_float;
49406 if (cnt > (TARGET_64BIT ? 16 : 8))
49408 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49409 "unsupported simdlen %d", clonei->simdlen);
49410 return 0;
49413 return ret;
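/* Illustrative worked example (not part of the compiler), using an
   assumed float base type: for a non-exported clone on an AVX-512
   target the 'e' mangle is chosen, vecsize_float is 512, and with
   simdlen == 0 the default becomes 512 / 32 = 16 lanes.  With an
   explicit simdlen(64), a float return type and the 'b' (128-bit)
   variant, cnt = 32 * 64 / 128 = 16, which exceeds the 32-bit limit
   of 8, so that clone is warned about and not emitted.  */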
49416 /* Add target attribute to SIMD clone NODE if needed. */
49418 static void
49419 ix86_simd_clone_adjust (struct cgraph_node *node)
49421 const char *str = NULL;
49422 gcc_assert (node->decl == cfun->decl);
49423 switch (node->simdclone->vecsize_mangle)
49425 case 'b':
49426 if (!TARGET_SSE2)
49427 str = "sse2";
49428 break;
49429 case 'c':
49430 if (!TARGET_AVX)
49431 str = "avx";
49432 break;
49433 case 'd':
49434 if (!TARGET_AVX2)
49435 str = "avx2";
49436 break;
49437 case 'e':
49438 if (!TARGET_AVX512F)
49439 str = "avx512f";
49440 break;
49441 default:
49442 gcc_unreachable ();
49444 if (str == NULL)
49445 return;
49446 push_cfun (NULL);
49447 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49448 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49449 gcc_assert (ok);
49450 pop_cfun ();
49451 ix86_reset_previous_fndecl ();
49452 ix86_set_current_function (node->decl);
49455 /* If SIMD clone NODE can't be used in a vectorized loop
49456 in the current function, return -1; otherwise return the badness of using it
49457 (0 if it is the most desirable from the vecsize_mangle point of view, 1
49458 slightly less desirable, etc.). */
49460 static int
49461 ix86_simd_clone_usable (struct cgraph_node *node)
49463 switch (node->simdclone->vecsize_mangle)
49465 case 'b':
49466 if (!TARGET_SSE2)
49467 return -1;
49468 if (!TARGET_AVX)
49469 return 0;
49470 return TARGET_AVX2 ? 2 : 1;
49471 case 'c':
49472 if (!TARGET_AVX)
49473 return -1;
49474 return TARGET_AVX2 ? 1 : 0;
49475 case 'd':
49476 if (!TARGET_AVX2)
49477 return -1;
49478 return 0;
49479 case 'e':
49480 if (!TARGET_AVX512F)
49481 return -1;
49482 return 0;
49483 default:
49484 gcc_unreachable ();
49488 /* This function adjusts the unroll factor based on
49489 the hardware capabilities. For example, bdver3 has
49490 a loop buffer which makes unrolling of smaller
49491 loops less important. This function decides the
49492 unroll factor using the number of memory references
49493 (the value 32 is used) as a heuristic. */
49495 static unsigned
49496 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49498 basic_block *bbs;
49499 rtx_insn *insn;
49500 unsigned i;
49501 unsigned mem_count = 0;
49503 if (!TARGET_ADJUST_UNROLL)
49504 return nunroll;
49506 /* Count the number of memory references within the loop body.
49507 This value determines the unrolling factor for bdver3 and bdver4
49508 architectures. */
49509 subrtx_iterator::array_type array;
49510 bbs = get_loop_body (loop);
49511 for (i = 0; i < loop->num_nodes; i++)
49512 FOR_BB_INSNS (bbs[i], insn)
49513 if (NONDEBUG_INSN_P (insn))
49514 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49515 if (const_rtx x = *iter)
49516 if (MEM_P (x))
49518 machine_mode mode = GET_MODE (x);
49519 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49520 if (n_words > 4)
49521 mem_count += 2;
49522 else
49523 mem_count += 1;
49525 free (bbs);
49527 if (mem_count && mem_count <= 32)
49528 return 32 / mem_count;
49530 return nunroll;
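/* Illustrative worked example (not part of the compiler): a loop body
   with 8 counted memory references gets an unroll factor of
   32 / 8 = 4; a reference wider than 4 words counts twice, and a loop
   with more than 32 references keeps the caller-supplied NUNROLL.  */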
49534 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49536 static bool
49537 ix86_float_exceptions_rounding_supported_p (void)
49539 /* For x87 floating point with standard excess precision handling,
49540 there is no adddf3 pattern (since x87 floating point only has
49541 XFmode operations) so the default hook implementation gets this
49542 wrong. */
49543 return TARGET_80387 || TARGET_SSE_MATH;
49546 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49548 static void
49549 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49551 if (!TARGET_80387 && !TARGET_SSE_MATH)
49552 return;
49553 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49554 if (TARGET_80387)
49556 tree fenv_index_type = build_index_type (size_int (6));
49557 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49558 tree fenv_var = create_tmp_var_raw (fenv_type);
49559 TREE_ADDRESSABLE (fenv_var) = 1;
49560 tree fenv_ptr = build_pointer_type (fenv_type);
49561 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49562 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49563 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49564 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49565 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49566 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49567 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49568 tree hold_fnclex = build_call_expr (fnclex, 0);
49569 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49570 NULL_TREE, NULL_TREE);
49571 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49572 hold_fnclex);
49573 *clear = build_call_expr (fnclex, 0);
49574 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49575 tree fnstsw_call = build_call_expr (fnstsw, 0);
49576 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49577 sw_var, fnstsw_call);
49578 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49579 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49580 exceptions_var, exceptions_x87);
49581 *update = build2 (COMPOUND_EXPR, integer_type_node,
49582 sw_mod, update_mod);
49583 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49584 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49586 if (TARGET_SSE_MATH)
49588 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49589 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49590 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49591 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49592 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49593 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49594 mxcsr_orig_var, stmxcsr_hold_call);
49595 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49596 mxcsr_orig_var,
49597 build_int_cst (unsigned_type_node, 0x1f80));
49598 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49599 build_int_cst (unsigned_type_node, 0xffffffc0));
49600 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49601 mxcsr_mod_var, hold_mod_val);
49602 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49603 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49604 hold_assign_orig, hold_assign_mod);
49605 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49606 ldmxcsr_hold_call);
49607 if (*hold)
49608 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49609 else
49610 *hold = hold_all;
49611 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49612 if (*clear)
49613 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49614 ldmxcsr_clear_call);
49615 else
49616 *clear = ldmxcsr_clear_call;
49617 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49618 tree exceptions_sse = fold_convert (integer_type_node,
49619 stxmcsr_update_call);
49620 if (*update)
49622 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49623 exceptions_var, exceptions_sse);
49624 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49625 exceptions_var, exceptions_mod);
49626 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49627 exceptions_assign);
49629 else
49630 *update = build2 (MODIFY_EXPR, integer_type_node,
49631 exceptions_var, exceptions_sse);
49632 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49633 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49634 ldmxcsr_update_call);
49636 tree atomic_feraiseexcept
49637 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49638 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49639 1, exceptions_var);
49640 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49641 atomic_feraiseexcept_call);
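/* Illustrative sketch (not part of the compiler): the SSE "hold"
   sequence built above corresponds to the intrinsic-level code below:
   save MXCSR, mask all exceptions (bits 7-12, 0x1f80) and clear the
   sticky status flags (bits 0-5, hence the & 0xffffffc0), then reload
   it.  The function and variable names are hypothetical.

     #include <xmmintrin.h>

     unsigned int
     fenv_hold_sse (void)
     {
       unsigned int orig = _mm_getcsr ();             // stmxcsr
       _mm_setcsr ((orig | 0x1f80u) & 0xffffffc0u);   // ldmxcsr
       return orig;                                   // kept for the later restore
     }
*/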
49644 /* Return the mode to be used for bounds, or VOIDmode
49645 if bounds are not supported. */
49647 static machine_mode
49648 ix86_mpx_bound_mode ()
49650 /* Do not support pointer checker if MPX
49651 is not enabled. */
49652 if (!TARGET_MPX)
49654 if (flag_check_pointer_bounds)
49655 warning (0, "Pointer Checker requires MPX support on this target."
49656 " Use -mmpx options to enable MPX.");
49657 return VOIDmode;
49660 return BNDmode;
49663 /* Return constant used to statically initialize constant bounds.
49665 This function is used to create special bound values. For now
49666 only INIT bounds and NONE bounds are expected. More special
49667 values may be added later. */
49669 static tree
49670 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49672 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49673 : build_zero_cst (pointer_sized_int_node);
49674 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49675 : build_minus_one_cst (pointer_sized_int_node);
49677 /* This function is supposed to be used to create INIT and
49678 NONE bounds only. */
49679 gcc_assert ((lb == 0 && ub == -1)
49680 || (lb == -1 && ub == 0));
49682 return build_complex (NULL, low, high);
49685 /* Generate a list of statements STMTS to initialize pointer bounds
49686 variable VAR with bounds LB and UB. Return the number of generated
49687 statements. */
49689 static int
49690 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49692 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49693 tree lhs, modify, var_p;
49695 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49696 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49698 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49699 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49700 append_to_statement_list (modify, stmts);
49702 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49703 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49704 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49705 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49706 append_to_statement_list (modify, stmts);
49708 return 2;
49711 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49712 /* For i386, a common symbol is local only for non-PIE binaries. For
49713 x86-64, a common symbol is local only for non-PIE binaries or if the
49714 linker supports copy relocations in PIE binaries. */
49716 static bool
49717 ix86_binds_local_p (const_tree exp)
49719 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49720 (!flag_pic
49721 || (TARGET_64BIT
49722 && HAVE_LD_PIE_COPYRELOC != 0)));
49724 #endif
49726 /* If MEM is in the form of [base+offset], extract the two parts
49727 of the address into BASE and OFFSET; otherwise return false. */
49729 static bool
49730 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49732 rtx addr;
49734 gcc_assert (MEM_P (mem));
49736 addr = XEXP (mem, 0);
49738 if (GET_CODE (addr) == CONST)
49739 addr = XEXP (addr, 0);
49741 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49743 *base = addr;
49744 *offset = const0_rtx;
49745 return true;
49748 if (GET_CODE (addr) == PLUS
49749 && (REG_P (XEXP (addr, 0))
49750 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49751 && CONST_INT_P (XEXP (addr, 1)))
49753 *base = XEXP (addr, 0);
49754 *offset = XEXP (addr, 1);
49755 return true;
49758 return false;
49761 /* Given OPERANDS of consecutive load/store, check if we can merge
49762 them into a move-multiple. LOAD is true if they are load instructions.
49763 MODE is the mode of the memory operands. */
49765 bool
49766 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49767 machine_mode mode)
49769 HOST_WIDE_INT offval_1, offval_2, msize;
49770 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49772 if (load)
49774 mem_1 = operands[1];
49775 mem_2 = operands[3];
49776 reg_1 = operands[0];
49777 reg_2 = operands[2];
49779 else
49781 mem_1 = operands[0];
49782 mem_2 = operands[2];
49783 reg_1 = operands[1];
49784 reg_2 = operands[3];
49787 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49789 if (REGNO (reg_1) != REGNO (reg_2))
49790 return false;
49792 /* Check if the addresses are in the form of [base+offset]. */
49793 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49794 return false;
49795 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49796 return false;
49798 /* Check if the bases are the same. */
49799 if (!rtx_equal_p (base_1, base_2))
49800 return false;
49802 offval_1 = INTVAL (offset_1);
49803 offval_2 = INTVAL (offset_2);
49804 msize = GET_MODE_SIZE (mode);
49805 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
49806 if (offval_1 + msize != offval_2)
49807 return false;
49809 return true;
49812 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49814 static bool
49815 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49816 optimization_type opt_type)
49818 switch (op)
49820 case asin_optab:
49821 case acos_optab:
49822 case log1p_optab:
49823 case exp_optab:
49824 case exp10_optab:
49825 case exp2_optab:
49826 case expm1_optab:
49827 case ldexp_optab:
49828 case scalb_optab:
49829 case round_optab:
49830 return opt_type == OPTIMIZE_FOR_SPEED;
49832 case rint_optab:
49833 if (SSE_FLOAT_MODE_P (mode1)
49834 && TARGET_SSE_MATH
49835 && !flag_trapping_math
49836 && !TARGET_SSE4_1)
49837 return opt_type == OPTIMIZE_FOR_SPEED;
49838 return true;
49840 case floor_optab:
49841 case ceil_optab:
49842 case btrunc_optab:
49843 if (SSE_FLOAT_MODE_P (mode1)
49844 && TARGET_SSE_MATH
49845 && !flag_trapping_math
49846 && TARGET_SSE4_1)
49847 return true;
49848 return opt_type == OPTIMIZE_FOR_SPEED;
49850 case rsqrt_optab:
49851 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49853 default:
49854 return true;
49858 /* Address space support.
49860 This is not "far pointers" in the 16-bit sense, but an easy way
49861 to use %fs and %gs segment prefixes. Therefore:
49863 (a) All address spaces have the same modes,
49864 (b) All address spaces have the same address forms,
49865 (c) While %fs and %gs are technically subsets of the generic
49866 address space, they are probably not subsets of each other.
49867 (d) Since we have no access to the segment base register values
49868 without resorting to a system call, we cannot convert a
49869 non-default address space to a default address space.
49870 Therefore we do not claim %fs or %gs are subsets of generic.
49872 Therefore we can (mostly) use the default hooks. */
49874 /* All use of segmentation is assumed to make address 0 valid. */
49876 static bool
49877 ix86_addr_space_zero_address_valid (addr_space_t as)
49879 return as != ADDR_SPACE_GENERIC;
49882 static void
49883 ix86_init_libfuncs (void)
49885 if (TARGET_64BIT)
49887 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49888 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49890 else
49892 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49893 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49896 #if TARGET_MACHO
49897 darwin_rename_builtins ();
49898 #endif
49901 /* Generate call to __divmoddi4. */
49903 static void
49904 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49905 rtx op0, rtx op1,
49906 rtx *quot_p, rtx *rem_p)
49908 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49910 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49911 mode,
49912 op0, GET_MODE (op0),
49913 op1, GET_MODE (op1),
49914 XEXP (rem, 0), Pmode);
49915 *quot_p = quot;
49916 *rem_p = rem;
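/* Illustrative sketch (not part of the compiler): the libcall emitted
   above has the libgcc prototype shown below; the quotient is the
   return value and the remainder is stored through the third argument
   (the SLOT_TEMP stack slot above).

     long long __divmoddi4 (long long a, long long b, long long *rem);

     // e.g.  long long r, q = __divmoddi4 (7, 3, &r);   yields q == 2, r == 1
*/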
49919 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49920 FPU, assume that the fpcw is set to extended precision; when using
49921 only SSE, rounding is correct; when using both SSE and the FPU,
49922 the rounding precision is indeterminate, since either may be chosen
49923 apparently at random. */
49925 static enum flt_eval_method
49926 ix86_excess_precision (enum excess_precision_type type)
49928 switch (type)
49930 case EXCESS_PRECISION_TYPE_FAST:
49931 /* The fastest type to promote to will always be the native type,
49932 whether that occurs with implicit excess precision or
49933 otherwise. */
49934 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49935 case EXCESS_PRECISION_TYPE_STANDARD:
49936 case EXCESS_PRECISION_TYPE_IMPLICIT:
49937 /* Otherwise, the excess precision we want when we are
49938 in a standards compliant mode, and the implicit precision we
49939 provide would be identical were it not for the unpredictable
49940 cases. */
49941 if (!TARGET_80387)
49942 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49943 else if (!TARGET_MIX_SSE_I387)
49945 if (!TARGET_SSE_MATH)
49946 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49947 else if (TARGET_SSE2)
49948 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49951 /* If we are in standards compliant mode, but we know we will
49952 calculate in unpredictable precision, return
49953 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49954 excess precision if the target can't guarantee it will honor
49955 it. */
49956 return (type == EXCESS_PRECISION_TYPE_STANDARD
49957 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49958 : FLT_EVAL_METHOD_UNPREDICTABLE);
49959 default:
49960 gcc_unreachable ();
49963 return FLT_EVAL_METHOD_UNPREDICTABLE;
49966 /* Target-specific selftests. */
49968 #if CHECKING_P
49970 namespace selftest {
49972 /* Verify that hard regs are dumped as expected (in compact mode). */
49974 static void
49975 ix86_test_dumping_hard_regs ()
49977 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49978 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49981 /* Test dumping an insn with repeated references to the same SCRATCH,
49982 to verify the rtx_reuse code. */
49984 static void
49985 ix86_test_dumping_memory_blockage ()
49987 set_new_first_and_last_insn (NULL, NULL);
49989 rtx pat = gen_memory_blockage ();
49990 rtx_reuse_manager r;
49991 r.preprocess (pat);
49993 /* Verify that the repeated references to the SCRATCH show the use of
49994 reuse IDs. The first should be prefixed with a reuse ID,
49995 and the second should be dumped as a "reuse_rtx" of that ID.
49996 The expected string assumes Pmode == DImode. */
49997 if (Pmode == DImode)
49998 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49999 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
50000 " (unspec:BLK [\n"
50001 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
50002 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
50005 /* Verify loading an RTL dump; specifically a dump of copying
50006 a param on x86_64 from a hard reg into the frame.
50007 This test is target-specific since the dump contains target-specific
50008 hard reg names. */
50010 static void
50011 ix86_test_loading_dump_fragment_1 ()
50013 rtl_dump_test t (SELFTEST_LOCATION,
50014 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
50016 rtx_insn *insn = get_insn_by_uid (1);
50018 /* The block structure and indentation here is purely for
50019 readability; it mirrors the structure of the rtx. */
50020 tree mem_expr;
50022 rtx pat = PATTERN (insn);
50023 ASSERT_EQ (SET, GET_CODE (pat));
50025 rtx dest = SET_DEST (pat);
50026 ASSERT_EQ (MEM, GET_CODE (dest));
50027 /* Verify the "/c" was parsed. */
50028 ASSERT_TRUE (RTX_FLAG (dest, call));
50029 ASSERT_EQ (SImode, GET_MODE (dest));
50031 rtx addr = XEXP (dest, 0);
50032 ASSERT_EQ (PLUS, GET_CODE (addr));
50033 ASSERT_EQ (DImode, GET_MODE (addr));
50035 rtx lhs = XEXP (addr, 0);
50036 /* Verify that the "frame" REG was consolidated. */
50037 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
50040 rtx rhs = XEXP (addr, 1);
50041 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
50042 ASSERT_EQ (-4, INTVAL (rhs));
50045 /* Verify the "[1 i+0 S4 A32]" was parsed. */
50046 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
50047 /* "i" should have been handled by synthesizing a global int
50048 variable named "i". */
50049 mem_expr = MEM_EXPR (dest);
50050 ASSERT_NE (mem_expr, NULL);
50051 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
50052 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
50053 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
50054 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
50055 /* "+0". */
50056 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
50057 ASSERT_EQ (0, MEM_OFFSET (dest));
50058 /* "S4". */
50059 ASSERT_EQ (4, MEM_SIZE (dest));
50060 /* "A32. */
50061 ASSERT_EQ (32, MEM_ALIGN (dest));
50064 rtx src = SET_SRC (pat);
50065 ASSERT_EQ (REG, GET_CODE (src));
50066 ASSERT_EQ (SImode, GET_MODE (src));
50067 ASSERT_EQ (5, REGNO (src));
50068 tree reg_expr = REG_EXPR (src);
50069 /* "i" here should point to the same var as for the MEM_EXPR. */
50070 ASSERT_EQ (reg_expr, mem_expr);
50075 /* Verify that the RTL loader copes with a call_insn dump.
50076 This test is target-specific since the dump contains a target-specific
50077 hard reg name. */
50079 static void
50080 ix86_test_loading_call_insn ()
50082 /* The test dump includes register "xmm0", which requires TARGET_SSE
50083 to exist. */
50084 if (!TARGET_SSE)
50085 return;
50087 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
50089 rtx_insn *insn = get_insns ();
50090 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
50092 /* "/j". */
50093 ASSERT_TRUE (RTX_FLAG (insn, jump));
50095 rtx pat = PATTERN (insn);
50096 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
50098 /* Verify REG_NOTES. */
50100 /* "(expr_list:REG_CALL_DECL". */
50101 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
50102 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
50103 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
50105 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
50106 rtx_expr_list *note1 = note0->next ();
50107 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
50109 ASSERT_EQ (NULL, note1->next ());
50112 /* Verify CALL_INSN_FUNCTION_USAGE. */
50114 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
50115 rtx_expr_list *usage
50116 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
50117 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
50118 ASSERT_EQ (DFmode, GET_MODE (usage));
50119 ASSERT_EQ (USE, GET_CODE (usage->element ()));
50120 ASSERT_EQ (NULL, usage->next ());
50124 /* Verify that the RTL loader copes with a dump from print_rtx_function.
50125 This test is target-specific since the dump contains target-specific
50126 hard reg names. */
50128 static void
50129 ix86_test_loading_full_dump ()
50131 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
50133 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50135 rtx_insn *insn_1 = get_insn_by_uid (1);
50136 ASSERT_EQ (NOTE, GET_CODE (insn_1));
50138 rtx_insn *insn_7 = get_insn_by_uid (7);
50139 ASSERT_EQ (INSN, GET_CODE (insn_7));
50140 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
50142 rtx_insn *insn_15 = get_insn_by_uid (15);
50143 ASSERT_EQ (INSN, GET_CODE (insn_15));
50144 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
50146 /* Verify crtl->return_rtx. */
50147 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
50148 ASSERT_EQ (0, REGNO (crtl->return_rtx));
50149 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
50152 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
50153 In particular, verify that it correctly loads the 2nd operand.
50154 This test is target-specific since these are machine-specific
50155 operands (and enums). */
50157 static void
50158 ix86_test_loading_unspec ()
50160 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
50162 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50164 ASSERT_TRUE (cfun);
50166 /* Test of an UNSPEC. */
50167 rtx_insn *insn = get_insns ();
50168 ASSERT_EQ (INSN, GET_CODE (insn));
50169 rtx set = single_set (insn);
50170 ASSERT_NE (NULL, set);
50171 rtx dst = SET_DEST (set);
50172 ASSERT_EQ (MEM, GET_CODE (dst));
50173 rtx src = SET_SRC (set);
50174 ASSERT_EQ (UNSPEC, GET_CODE (src));
50175 ASSERT_EQ (BLKmode, GET_MODE (src));
50176 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
50178 rtx v0 = XVECEXP (src, 0, 0);
50180 /* Verify that the two uses of the first SCRATCH have pointer
50181 equality. */
50182 rtx scratch_a = XEXP (dst, 0);
50183 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
50185 rtx scratch_b = XEXP (v0, 0);
50186 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
50188 ASSERT_EQ (scratch_a, scratch_b);
50190 /* Verify that the two mems are thus treated as equal. */
50191 ASSERT_TRUE (rtx_equal_p (dst, v0));
50193 /* Verify that the insn is recognized. */
50194 ASSERT_NE (-1, recog_memoized (insn));
50196 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
50197 insn = NEXT_INSN (insn);
50198 ASSERT_EQ (INSN, GET_CODE (insn));
50200 set = single_set (insn);
50201 ASSERT_NE (NULL, set);
50203 src = SET_SRC (set);
50204 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
50205 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
50208 /* Run all target-specific selftests. */
50210 static void
50211 ix86_run_selftests (void)
50213 ix86_test_dumping_hard_regs ();
50214 ix86_test_dumping_memory_blockage ();
50216 /* Various tests of loading RTL dumps, here because they contain
50217 ix86-isms (e.g. names of hard regs). */
50218 ix86_test_loading_dump_fragment_1 ();
50219 ix86_test_loading_call_insn ();
50220 ix86_test_loading_full_dump ();
50221 ix86_test_loading_unspec ();
50224 } // namespace selftest
50226 #endif /* CHECKING_P */
50228 /* Initialize the GCC target structure. */
50229 #undef TARGET_RETURN_IN_MEMORY
50230 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50232 #undef TARGET_LEGITIMIZE_ADDRESS
50233 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50235 #undef TARGET_ATTRIBUTE_TABLE
50236 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50237 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50238 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50239 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50240 # undef TARGET_MERGE_DECL_ATTRIBUTES
50241 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50242 #endif
50244 #undef TARGET_COMP_TYPE_ATTRIBUTES
50245 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50247 #undef TARGET_INIT_BUILTINS
50248 #define TARGET_INIT_BUILTINS ix86_init_builtins
50249 #undef TARGET_BUILTIN_DECL
50250 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50251 #undef TARGET_EXPAND_BUILTIN
50252 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50254 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50255 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50256 ix86_builtin_vectorized_function
50258 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50259 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50261 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50262 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50264 #undef TARGET_BUILTIN_RECIPROCAL
50265 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50267 #undef TARGET_ASM_FUNCTION_EPILOGUE
50268 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50270 #undef TARGET_ENCODE_SECTION_INFO
50271 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50272 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50273 #else
50274 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50275 #endif
50277 #undef TARGET_ASM_OPEN_PAREN
50278 #define TARGET_ASM_OPEN_PAREN ""
50279 #undef TARGET_ASM_CLOSE_PAREN
50280 #define TARGET_ASM_CLOSE_PAREN ""
50282 #undef TARGET_ASM_BYTE_OP
50283 #define TARGET_ASM_BYTE_OP ASM_BYTE
50285 #undef TARGET_ASM_ALIGNED_HI_OP
50286 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50287 #undef TARGET_ASM_ALIGNED_SI_OP
50288 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50289 #ifdef ASM_QUAD
50290 #undef TARGET_ASM_ALIGNED_DI_OP
50291 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50292 #endif
50294 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50295 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50297 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50298 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50300 #undef TARGET_ASM_UNALIGNED_HI_OP
50301 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50302 #undef TARGET_ASM_UNALIGNED_SI_OP
50303 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50304 #undef TARGET_ASM_UNALIGNED_DI_OP
50305 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50307 #undef TARGET_PRINT_OPERAND
50308 #define TARGET_PRINT_OPERAND ix86_print_operand
50309 #undef TARGET_PRINT_OPERAND_ADDRESS
50310 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50311 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50312 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50313 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50314 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50316 #undef TARGET_SCHED_INIT_GLOBAL
50317 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50318 #undef TARGET_SCHED_ADJUST_COST
50319 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50320 #undef TARGET_SCHED_ISSUE_RATE
50321 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50322 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50323 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50324 ia32_multipass_dfa_lookahead
50325 #undef TARGET_SCHED_MACRO_FUSION_P
50326 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50327 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50328 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50330 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50331 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50333 #undef TARGET_MEMMODEL_CHECK
50334 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50336 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50337 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50339 #ifdef HAVE_AS_TLS
50340 #undef TARGET_HAVE_TLS
50341 #define TARGET_HAVE_TLS true
50342 #endif
50343 #undef TARGET_CANNOT_FORCE_CONST_MEM
50344 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50345 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50346 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50348 #undef TARGET_DELEGITIMIZE_ADDRESS
50349 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50351 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
50352 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
50354 #undef TARGET_MS_BITFIELD_LAYOUT_P
50355 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50357 #if TARGET_MACHO
50358 #undef TARGET_BINDS_LOCAL_P
50359 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50360 #else
50361 #undef TARGET_BINDS_LOCAL_P
50362 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50363 #endif
50364 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50365 #undef TARGET_BINDS_LOCAL_P
50366 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50367 #endif
50369 #undef TARGET_ASM_OUTPUT_MI_THUNK
50370 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50371 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50372 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50374 #undef TARGET_ASM_FILE_START
50375 #define TARGET_ASM_FILE_START x86_file_start
50377 #undef TARGET_OPTION_OVERRIDE
50378 #define TARGET_OPTION_OVERRIDE ix86_option_override
50380 #undef TARGET_REGISTER_MOVE_COST
50381 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50382 #undef TARGET_MEMORY_MOVE_COST
50383 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50384 #undef TARGET_RTX_COSTS
50385 #define TARGET_RTX_COSTS ix86_rtx_costs
50386 #undef TARGET_ADDRESS_COST
50387 #define TARGET_ADDRESS_COST ix86_address_cost
50389 #undef TARGET_FLAGS_REGNUM
50390 #define TARGET_FLAGS_REGNUM FLAGS_REG
50391 #undef TARGET_FIXED_CONDITION_CODE_REGS
50392 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50393 #undef TARGET_CC_MODES_COMPATIBLE
50394 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50396 #undef TARGET_MACHINE_DEPENDENT_REORG
50397 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50399 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50400 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50402 #undef TARGET_BUILD_BUILTIN_VA_LIST
50403 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50405 #undef TARGET_FOLD_BUILTIN
50406 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50408 #undef TARGET_GIMPLE_FOLD_BUILTIN
50409 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50411 #undef TARGET_COMPARE_VERSION_PRIORITY
50412 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50414 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50415 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50416 ix86_generate_version_dispatcher_body
50418 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50419 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50420 ix86_get_function_versions_dispatcher
50422 #undef TARGET_ENUM_VA_LIST_P
50423 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50425 #undef TARGET_FN_ABI_VA_LIST
50426 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50428 #undef TARGET_CANONICAL_VA_LIST_TYPE
50429 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50431 #undef TARGET_EXPAND_BUILTIN_VA_START
50432 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50434 #undef TARGET_MD_ASM_ADJUST
50435 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50437 #undef TARGET_C_EXCESS_PRECISION
50438 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
50439 #undef TARGET_PROMOTE_PROTOTYPES
50440 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50441 #undef TARGET_SETUP_INCOMING_VARARGS
50442 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50443 #undef TARGET_MUST_PASS_IN_STACK
50444 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50445 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
50446 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
50447 #undef TARGET_FUNCTION_ARG_ADVANCE
50448 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50449 #undef TARGET_FUNCTION_ARG
50450 #define TARGET_FUNCTION_ARG ix86_function_arg
50451 #undef TARGET_INIT_PIC_REG
50452 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50453 #undef TARGET_USE_PSEUDO_PIC_REG
50454 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50455 #undef TARGET_FUNCTION_ARG_BOUNDARY
50456 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50457 #undef TARGET_PASS_BY_REFERENCE
50458 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50459 #undef TARGET_INTERNAL_ARG_POINTER
50460 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50461 #undef TARGET_UPDATE_STACK_BOUNDARY
50462 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50463 #undef TARGET_GET_DRAP_RTX
50464 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50465 #undef TARGET_STRICT_ARGUMENT_NAMING
50466 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50467 #undef TARGET_STATIC_CHAIN
50468 #define TARGET_STATIC_CHAIN ix86_static_chain
50469 #undef TARGET_TRAMPOLINE_INIT
50470 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50471 #undef TARGET_RETURN_POPS_ARGS
50472 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50474 #undef TARGET_WARN_FUNC_RETURN
50475 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
50477 #undef TARGET_LEGITIMATE_COMBINED_INSN
50478 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50480 #undef TARGET_ASAN_SHADOW_OFFSET
50481 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50483 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50484 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50486 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50487 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50489 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50490 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50492 #undef TARGET_C_MODE_FOR_SUFFIX
50493 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50495 #ifdef HAVE_AS_TLS
50496 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50497 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50498 #endif
50500 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50501 #undef TARGET_INSERT_ATTRIBUTES
50502 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50503 #endif
50505 #undef TARGET_MANGLE_TYPE
50506 #define TARGET_MANGLE_TYPE ix86_mangle_type
50508 #undef TARGET_STACK_PROTECT_GUARD
50509 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50511 #if !TARGET_MACHO
50512 #undef TARGET_STACK_PROTECT_FAIL
50513 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50514 #endif
50516 #undef TARGET_FUNCTION_VALUE
50517 #define TARGET_FUNCTION_VALUE ix86_function_value
50519 #undef TARGET_FUNCTION_VALUE_REGNO_P
50520 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50522 #undef TARGET_PROMOTE_FUNCTION_MODE
50523 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50525 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50526 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50528 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50529 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50531 #undef TARGET_INSTANTIATE_DECLS
50532 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50534 #undef TARGET_SECONDARY_RELOAD
50535 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50536 #undef TARGET_SECONDARY_MEMORY_NEEDED
50537 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50538 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50539 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50541 #undef TARGET_CLASS_MAX_NREGS
50542 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50544 #undef TARGET_PREFERRED_RELOAD_CLASS
50545 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50546 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50547 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50548 #undef TARGET_CLASS_LIKELY_SPILLED_P
50549 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50551 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50552 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50553 ix86_builtin_vectorization_cost
50554 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50555 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50556 ix86_vectorize_vec_perm_const_ok
50557 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50558 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50559 ix86_preferred_simd_mode
50560 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50561 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50562 ix86_autovectorize_vector_sizes
50563 #undef TARGET_VECTORIZE_GET_MASK_MODE
50564 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50565 #undef TARGET_VECTORIZE_INIT_COST
50566 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50567 #undef TARGET_VECTORIZE_ADD_STMT_COST
50568 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50569 #undef TARGET_VECTORIZE_FINISH_COST
50570 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50571 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50572 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50574 #undef TARGET_SET_CURRENT_FUNCTION
50575 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50577 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50578 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50580 #undef TARGET_OPTION_SAVE
50581 #define TARGET_OPTION_SAVE ix86_function_specific_save
50583 #undef TARGET_OPTION_RESTORE
50584 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50586 #undef TARGET_OPTION_POST_STREAM_IN
50587 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50589 #undef TARGET_OPTION_PRINT
50590 #define TARGET_OPTION_PRINT ix86_function_specific_print
50592 #undef TARGET_OPTION_FUNCTION_VERSIONS
50593 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50595 #undef TARGET_CAN_INLINE_P
50596 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50598 #undef TARGET_LEGITIMATE_ADDRESS_P
50599 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50601 #undef TARGET_REGISTER_PRIORITY
50602 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50604 #undef TARGET_REGISTER_USAGE_LEVELING_P
50605 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50607 #undef TARGET_LEGITIMATE_CONSTANT_P
50608 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50610 #undef TARGET_COMPUTE_FRAME_LAYOUT
50611 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50613 #undef TARGET_FRAME_POINTER_REQUIRED
50614 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50616 #undef TARGET_CAN_ELIMINATE
50617 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50619 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50620 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50622 #undef TARGET_ASM_CODE_END
50623 #define TARGET_ASM_CODE_END ix86_code_end
50625 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50626 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50628 #undef TARGET_CANONICALIZE_COMPARISON
50629 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50631 #undef TARGET_LOOP_UNROLL_ADJUST
50632 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50634 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50635 #undef TARGET_SPILL_CLASS
50636 #define TARGET_SPILL_CLASS ix86_spill_class
50638 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50639 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50640 ix86_simd_clone_compute_vecsize_and_simdlen
50642 #undef TARGET_SIMD_CLONE_ADJUST
50643 #define TARGET_SIMD_CLONE_ADJUST \
50644 ix86_simd_clone_adjust
50646 #undef TARGET_SIMD_CLONE_USABLE
50647 #define TARGET_SIMD_CLONE_USABLE \
50648 ix86_simd_clone_usable
50650 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50651 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50652 ix86_float_exceptions_rounding_supported_p
50654 #undef TARGET_MODE_EMIT
50655 #define TARGET_MODE_EMIT ix86_emit_mode_set
50657 #undef TARGET_MODE_NEEDED
50658 #define TARGET_MODE_NEEDED ix86_mode_needed
50660 #undef TARGET_MODE_AFTER
50661 #define TARGET_MODE_AFTER ix86_mode_after
50663 #undef TARGET_MODE_ENTRY
50664 #define TARGET_MODE_ENTRY ix86_mode_entry
50666 #undef TARGET_MODE_EXIT
50667 #define TARGET_MODE_EXIT ix86_mode_exit
50669 #undef TARGET_MODE_PRIORITY
50670 #define TARGET_MODE_PRIORITY ix86_mode_priority
50672 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50673 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50675 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50676 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50678 #undef TARGET_STORE_BOUNDS_FOR_ARG
50679 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50681 #undef TARGET_LOAD_RETURNED_BOUNDS
50682 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50684 #undef TARGET_STORE_RETURNED_BOUNDS
50685 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50687 #undef TARGET_CHKP_BOUND_MODE
50688 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50690 #undef TARGET_BUILTIN_CHKP_FUNCTION
50691 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50693 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50694 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50696 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50697 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50699 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50700 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50702 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50703 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50705 #undef TARGET_OFFLOAD_OPTIONS
50706 #define TARGET_OFFLOAD_OPTIONS \
50707 ix86_offload_options
50709 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50710 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50712 #undef TARGET_OPTAB_SUPPORTED_P
50713 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50715 #undef TARGET_HARD_REGNO_SCRATCH_OK
50716 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50718 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50719 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50721 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50722 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50724 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50725 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50727 #undef TARGET_INIT_LIBFUNCS
50728 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50730 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50731 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50733 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50734 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50736 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50737 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50739 #undef TARGET_HARD_REGNO_NREGS
50740 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50741 #undef TARGET_HARD_REGNO_MODE_OK
50742 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50744 #undef TARGET_MODES_TIEABLE_P
50745 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50747 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50748 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50749 ix86_hard_regno_call_part_clobbered
50751 #undef TARGET_CAN_CHANGE_MODE_CLASS
50752 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50754 #undef TARGET_STATIC_RTX_ALIGNMENT
50755 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
50756 #undef TARGET_CONSTANT_ALIGNMENT
50757 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50759 #undef TARGET_EMPTY_RECORD_P
50760 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
50762 #undef TARGET_WARN_PARAMETER_PASSING_ABI
50763 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
50765 #if CHECKING_P
50766 #undef TARGET_RUN_TARGET_SELFTESTS
50767 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50768 #endif /* #if CHECKING_P */
50770 struct gcc_target targetm = TARGET_INITIALIZER;
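The long run of #undef/#define pairs above works because target-def.h supplies a default definition for every TARGET_* hook macro and defines TARGET_INITIALIZER as an aggregate built from those macros; whichever definitions are live when targetm is instantiated end up in the vector that generic code calls through. A minimal standalone sketch of that mechanism, using made-up names throughout (none of these identifiers are GCC API), follows.

/* Standalone sketch with made-up names, not GCC code: the
   override-then-aggregate pattern behind TARGET_INITIALIZER.  */
#include <stdio.h>

struct example_target
{
  int (*issue_rate) (void);
};

static int default_issue_rate (void) { return 1; }
static int port_issue_rate (void) { return 4; }

/* The "target-def.h" part: a default binding for the hook, plus the
   aggregate initializer written in terms of the hook macro.  */
#define EXAMPLE_SCHED_ISSUE_RATE default_issue_rate
#define EXAMPLE_INITIALIZER { EXAMPLE_SCHED_ISSUE_RATE }

/* The "i386.c" part: override the default before instantiating the
   hook vector; the macro is expanded only at the point of use.  */
#undef EXAMPLE_SCHED_ISSUE_RATE
#define EXAMPLE_SCHED_ISSUE_RATE port_issue_rate

static struct example_target example_targetm = EXAMPLE_INITIALIZER;

int
main (void)
{
  /* Prints 4: the port's override, not the default, fills the slot.  */
  printf ("issue rate: %d\n", example_targetm.issue_rate ());
  return 0;
}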
50772 #include "gt-i386.h"